1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "IGC/common/StringMacros.hpp"
10 #include "EmitVISAPass.hpp"
11 #include "CISABuilder.hpp"
12 #include "VertexShaderCodeGen.hpp"
13 #include "GeometryShaderCodeGen.hpp"
14 #include "PixelShaderCodeGen.hpp"
15 #include "OpenCLKernelCodeGen.hpp"
16 #include "ComputeShaderCodeGen.hpp"
17 #include "HullShaderCodeGen.hpp"
18 #include "DomainShaderCodeGen.hpp"
19 #include "DeSSA.hpp"
20 #include "messageEncoding.hpp"
21 #include "PayloadMapping.hpp"
22 #include "VectorProcess.hpp"
23 #include "ShaderCodeGen.hpp"
24 #include "common/allocator.h"
25 #include "common/debug/Dump.hpp"
27 #include "common/igc_regkeys.hpp"
28 #include "common/Stats.hpp"
29 #include "Compiler/CISACodeGen/helper.h"
30 #include "Compiler/DebugInfo/ScalarVISAModule.h"
31 #include "common/secure_mem.h"
32 #include "DebugInfo/VISAIDebugEmitter.hpp"
33 #include "DebugInfo/EmitterOpts.hpp"
34 #include "GenISAIntrinsics/GenIntrinsicInst.h"
35 #include "AdaptorCommon/ImplicitArgs.hpp"
36 #include "Compiler/IGCPassSupport.h"
37 #include "common/LLVMWarningsPush.hpp"
38 #include "llvmWrapper/IR/Instructions.h"
39 #include "llvmWrapper/IR/DerivedTypes.h"
40 #include "llvm/Support/Path.h"
41 #include "llvm/Support/FormattedStream.h"
42 #include "llvm/IR/AssemblyAnnotationWriter.h"
43 #include "llvmWrapper/IR/Intrinsics.h"
44 #include "common/LLVMWarningsPop.hpp"
45 #include "Probe/Assertion.h"
46 
47 #include <fstream>
48 
49 using namespace llvm;
50 using namespace IGC;
51 using namespace IGC::IGCMD;
52 using namespace std;
53 
54 char EmitPass::ID = 0;
55 
56 /// Divide N into multiples of M (M must be a power of two), and the remainder into
57 /// M/2, M/4, ..., 1. Each chunk occupies two elements in execsizeSeq: the first one
58 /// is the execsize, and the second one the starting offset.
59 /// For example, with M = 16 and N = 47:
60 ///  {16, 0}, {16, 16}, {8, 32}, {4, 40}, {2, 44}, {1, 45}
61 static void splitIntoPowerOfTwo(SmallVector<uint32_t, 16>& execsizeSeq, uint32_t N, uint32_t M)
62 {
63     // Max execution size is 16; the whole multiples of M are emitted as SIMD16 chunks (M is expected to be 16).
64     int n = (int)N / (int)M;
65     uint32_t offset = 0;
66     for (int i = 0; i < n; ++i) {
67         execsizeSeq.push_back(16);
68         execsizeSeq.push_back(offset);
69         offset += 16;
70     }
71 
72     int m = (int)(N % M);
73     for (uint32_t s = M/2; m > 0; s = s / 2)
74     {
75         if (m >= (int)s)
76         {
77             execsizeSeq.push_back(s);
78             execsizeSeq.push_back(offset);
79             offset += s;
80             m -= s;
81         }
82     }
83 }
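// Illustrative sketch only (not called by the pass): how a caller might consume the
// (execsize, offset) pairs produced by splitIntoPowerOfTwo(). The emitChunk callback
// name is hypothetical.
//
//   SmallVector<uint32_t, 16> seq;
//   splitIntoPowerOfTwo(seq, 47, 16);
//   for (uint32_t i = 0; i < seq.size(); i += 2)
//   {
//       uint32_t execSize = seq[i];     // 16, 16, 8, 4, 2, 1
//       uint32_t offset   = seq[i + 1]; //  0, 16, 32, 40, 44, 45
//       emitChunk(execSize, offset);    // emit one operation covering execSize lanes at 'offset'
//   }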
84 
85 namespace IGC
86 {
87     class VisaIdAnnotator : public llvm::AssemblyAnnotationWriter
88     {
89         DenseMap<const Value*, uint32_t> m_rootToVISAId;
90         DenseMap<const BasicBlock*, uint32_t> m_blockId;
91 
92     public:
93         VisaIdAnnotator() {}
94 
95         void emitBasicBlockStartAnnot(const BasicBlock* BB, formatted_raw_ostream& OS) override
96         {
97             OS << "; BB";
98             if (m_blockId.count(BB)) {
99                 OS << m_blockId[BB] << " ";
100             }
101             OS << ":\n";
102         }
103 
104         void printInfoComment(const Value& V, formatted_raw_ostream& OS) override
105         {
106             if (m_rootToVISAId.count(&V))
107                 OS << "\t\t; visa id: " << m_rootToVISAId[&V];
108         }
109 
110         void trackVisaId(const Instruction* I, uint32_t vid) { m_rootToVISAId[I] = vid; }
111         void trackBlockId(const BasicBlock* BB, uint32_t bbid) { m_blockId[BB] = bbid; }
112     };
113 }
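// For illustration: with the annotator above, lines of the visa.ll dump are expected to
// look roughly like the sketch below (exact spacing depends on the LLVM IR printer).
//
//   ; BB3 :
//   %add = add i32 %a, %b		; visa id: 42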
114 
115 
116 EmitPass::EmitPass(CShaderProgram::KernelShaderMap& shaders, SIMDMode mode, bool canAbortOnSpill, ShaderDispatchMode shaderMode, PSSignature* pSignature)
117     : FunctionPass(ID),
118     m_SimdMode(mode),
119     m_ShaderDispatchMode(shaderMode),
120     m_shaders(shaders),
121     m_currShader(nullptr),
122     m_encoder(nullptr),
123     m_canAbortOnSpill(canAbortOnSpill),
124     m_roundingMode_FP(ERoundingMode::ROUND_TO_NEAREST_EVEN),
125     m_roundingMode_FPCvtInt(ERoundingMode::ROUND_TO_ZERO),
126     m_pSignature(pSignature),
127     m_isDuplicate(false)
128 {
129     //Before calling getAnalysisUsage() for EmitPass, the passes that it depends on need to be initialized
130     initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
131     initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
132     initializeCodeGenPatternMatchPass(*PassRegistry::getPassRegistry());
133     initializeDeSSAPass(*PassRegistry::getPassRegistry());
134     initializeBlockCoalescingPass(*PassRegistry::getPassRegistry());
135     initializeCoalescingEnginePass(*PassRegistry::getPassRegistry());
136     initializeMetaDataUtilsWrapperPass(*PassRegistry::getPassRegistry());
137     initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
138     initializeVariableReuseAnalysisPass(*PassRegistry::getPassRegistry());
139     initializeLiveVariablesPass(*PassRegistry::getPassRegistry());
140 }
141 
142 EmitPass::~EmitPass()
143 {
144 }
145 
146 // Switch to the payload section.
147 // When switching to the payload section, the code redirects vKernel to point to the payload section.
148 // m_destination (the LiveOut of interpolation) will be allocated before compiling the kernel.
149 void EmitPass::ContextSwitchPayloadSection(bool first)
150 {
151     if (m_encoder->IsCodePatchCandidate())
152     {
153         if (first)
154         {
155             m_tmpDest = m_destination;
156         }
157         m_isDuplicate = first ? m_currShader->AppendPayloadSetup(m_destination) : false;
158         // When duplication happens, multiple instructions in divergent branches write to the same VR.
159         if (m_isDuplicate)
160         {
161             auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
162             CVariable* src = m_destination;
163             uint16_t size = m_destination->IsUniform() ? numLanes(uniformSIMDMode) :
164                 numLanes(m_currShader->m_SIMDSize);
165             CVariable* newSource = m_currShader->GetNewVariable(
166                     size,
167                     src->GetType(),
168                     EALIGN_GRF,
169                     m_destination->IsUniform(),
170                     src->getName());
171             m_currShader->AppendPayloadSetup(newSource);
172             m_destination = newSource;
173         }
174         m_encoder->SetPayloadSectionAsPrimary();
175     }
176 }
177 
178 void EmitPass::ContextSwitchShaderBody(bool last)
179 {
180     if (m_encoder->IsCodePatchCandidate())
181     {
182         m_encoder->SetPayloadSectionAsSecondary();
183         if (last && m_isDuplicate)
184         {
185             m_encoder->Copy(m_tmpDest, m_destination);
186             m_encoder->Push();
187             m_destination = m_tmpDest;
188         }
189     }
190 }
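// Rough usage sketch (an assumption about how the two helpers above pair up, not a
// verbatim call site): payload-setup code is emitted between the two context switches so
// that it lands in the payload section when code patching is enabled.
//
//   ContextSwitchPayloadSection(/*first*/ true);
//   // ... emit instructions that set up m_destination in the payload section ...
//   ContextSwitchShaderBody(/*last*/ true); // copies back into m_tmpDest if duplication happened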
191 
192 bool EmitPass::isHalfGRFReturn(CVariable* dst, SIMDMode simdMode)
193 {
194     auto typeSize = CEncoder::GetCISADataTypeSize(dst->GetType());
195     return simdMode == m_currShader->m_Platform->getMinDispatchMode() &&
196         typeSize == 2 && !dst->isUnpacked();
197 }
198 
199 static bool DefReachUseWithinLevel(llvm::Value* def, const llvm::Instruction* use, uint level)
200 {
201     if (level == 0 || !def || !use)
202         return false;
203     for (auto useIter = def->user_begin(), E = def->user_end(); useIter != E; ++useIter)
204     {
205         llvm::Instruction* useInst = dyn_cast<llvm::Instruction>(*useIter);
206         if (useInst)
207         {
208             if (useInst == use)
209                 return true;
210             else
211             {
212                 if (DefReachUseWithinLevel(useInst, use, level - 1))
213                     return true;
214             }
215         }
216     }
217     return false;
218 }
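// Example for the helper above: given a def-use chain a -> b -> c (b uses a, c uses b),
// DefReachUseWithinLevel(a, c, 2) returns true, while DefReachUseWithinLevel(a, c, 1)
// returns false because only a's direct users are inspected at level 1.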
219 
220 uint EmitPass::DecideInstanceAndSlice(const llvm::BasicBlock& blk, SDAG& sdag, bool& slicing)
221 {
222     m_encoder->SetSubSpanDestination(false);
223     uint numInstance = m_currShader->m_numberInstance;
224 
225     slicing = (m_SimdMode == SIMDMode::SIMD32);  // set to false if we don't want slicing
226 
227     bool hasValidDestination = (sdag.m_root->getType()->getTypeID() != llvm::Type::VoidTyID);
228 
229     // Disable for struct type destinations
230     if (sdag.m_root->getType()->isStructTy())
231     {
232         hasValidDestination = false;
233     }
234 
235     if (hasValidDestination)
236     {
237         m_destination = GetSymbol(sdag.m_root);
238         numInstance = m_destination->GetNumberInstance();
239 
240         if (m_pattern->IsSubspanUse(sdag.m_root))
241         {
242             m_encoder->SetSubSpanDestination(true);
243         }
244 
245         if (isa<CmpInst>(sdag.m_root))
246         {
247             if (DefReachUseWithinLevel(sdag.m_root, blk.getTerminator(), 4))
248                 slicing = false;
249         }
250         else if (IsUniformAtomic(sdag.m_root))
251         {
252             numInstance = 1;
253             slicing = false;
254         }
255         else if (IsAtomicIntrinsic(GetOpCode(sdag.m_root)))
256         {
257             slicing = false;
258         }
259         else if (IsMediaIOIntrinsic(sdag.m_root))
260         {
261             numInstance = 1;
262             slicing = false;
263         }
264         else if (getGRFSize() != 32 && IsSIMDBlockIntrinsic(sdag.m_root))
265         {
266             numInstance = 1;
267             slicing = false;
268         }
269         else if (IsSubGroupIntrinsicWithSimd32Implementation(GetOpCode(sdag.m_root)))
270         {
271             numInstance = 1;
272             slicing = false;
273         }
274         else if (m_destination->IsUniform())
275         {
276             // If this uniform value is involved in a phi-congruent class, its
277             // live interval changes with slicing. Therefore, we need to stop slicing.
278             // \todo: is it a good idea to pre-schedule all uniform operations to the beginning of the block?
279             if (m_deSSA->getRootValue(sdag.m_root))
280                 slicing = false;
281         }
282     }
283     else
284     {
285         m_destination = nullptr;
286         if (StoreInst * ST = dyn_cast<StoreInst>(sdag.m_root))
287         {
288             // Limit to OpenCL so far as it has uniform load/store support.
289             if (isUniformStoreOCL(ST))
290                 numInstance = 1;
291             slicing = false;
292         }
293         else if (sdag.m_root->isTerminator())
294         {
295             numInstance = 1;
296             slicing = false;
297         }
298         else if (m_currShader->GetIsUniform(sdag.m_root))
299         {
300             numInstance = 1;
301             // If this uniform value is involved in a phi-congruent class, its
302             // live interval changes with slicing. Therefore, we need to stop slicing.
303             // \todo: is it a good idea to pre-schedule all uniform operations to the beginning of the block?
304             if (m_deSSA->getRootValue(sdag.m_root))
305                 slicing = false;
306         }
307         else if (llvm::GenIntrinsicInst * pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(sdag.m_root))
308         {
309             GenISAIntrinsic::ID id = pIntrinsic->getIntrinsicID();
310             if (id == GenISAIntrinsic::GenISA_threadgroupbarrier ||
311                 id == GenISAIntrinsic::GenISA_memoryfence ||
312                 id == GenISAIntrinsic::GenISA_flushsampler ||
313                 id == GenISAIntrinsic::GenISA_typedmemoryfence ||
314                 id == GenISAIntrinsic::GenISA_vaErode ||
315                 id == GenISAIntrinsic::GenISA_vaDilate ||
316                 id == GenISAIntrinsic::GenISA_vaMinMax ||
317                 id == GenISAIntrinsic::GenISA_vaMinMaxFilter ||
318                 id == GenISAIntrinsic::GenISA_vaConvolve ||
319                 id == GenISAIntrinsic::GenISA_vaConvolveGRF_16x1 ||
320                 id == GenISAIntrinsic::GenISA_vaConvolveGRF_16x4 ||
321                 id == GenISAIntrinsic::GenISA_vaCentroid ||
322                 id == GenISAIntrinsic::GenISA_vaBoolSum ||
323                 id == GenISAIntrinsic::GenISA_vaBoolCentroid ||
324                 id == GenISAIntrinsic::GenISA_MediaBlockWrite ||
325                 id == GenISAIntrinsic::GenISA_eu_thread_pause ||
326                 id == GenISAIntrinsic::GenISA_simdBlockWrite ||
327                 id == GenISAIntrinsic::GenISA_simdBlockWriteBindless)
328             {
329                 numInstance = 1;
330                 slicing = false;
331             }
332         }
333     }
334 
335     if (CallInst * callInst = dyn_cast<CallInst>(sdag.m_root))
336     {
337         // Disable slicing for function calls
338         Function* F = dyn_cast<Function>(IGCLLVM::getCalledValue(callInst));
339         if (!F || F->hasFnAttribute("visaStackCall"))
340         {
341             numInstance = 1;
342             slicing = false;
343         }
344     }
345     return numInstance;
346 }
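// Informal example of the decision above (derived from the code, not a separate spec):
// a non-uniform SIMD32 ALU instruction with a destination typically returns
// numInstance == m_destination->GetNumberInstance() (usually 2) with slicing allowed,
// whereas a GenISA_threadgroupbarrier, a uniform atomic, media-block I/O, a terminator,
// or a (stack) call forces numInstance == 1 and disables slicing.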
347 
348 bool EmitPass::IsUndefOrZeroImmediate(const Value* value)
349 {
350     if (isUndefOrConstInt0(value))
351     {
352         return true;
353     }
354 
355     if (const llvm::ConstantFP* CFP = llvm::dyn_cast<llvm::ConstantFP>(value))
356     {
357         APInt api = CFP->getValueAPF().bitcastToAPInt();
358         if (api.getZExtValue() == 0)
359         {
360             return true;
361         }
362     }
363     return false;
364 }
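// Example for the check above: +0.0f bit-casts to 0x00000000 and is treated as a zero
// immediate, while -0.0f bit-casts to 0x80000000 and is not; undef and integer 0 are
// handled by isUndefOrConstInt0().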
365 
366 bool EmitPass::setCurrentShader(llvm::Function* F)
367 {
368     llvm::Function* Kernel = F;
369     if (m_FGA)
370     {
371         if (!m_FGA->getModule())
372         {
373             m_FGA->rebuild(F->getParent());
374         }
375         auto FG = m_FGA->getGroup(F);
376         if (!FG)
377         {
378             return false;
379         }
380         Kernel = FG->getHead();
381     }
382     else
383     {
384         // no analysis result available.
385         m_FGA = nullptr;
386     }
387 
388     auto Iter = m_shaders.find(Kernel);
389     if (Iter == m_shaders.end())
390     {
391         return false;
392     }
393     m_currShader = Iter->second->GetOrCreateShader(m_SimdMode, m_ShaderDispatchMode);
394     m_encoder = &(m_currShader->GetEncoder());
395     return true;
396 }
397 
398 bool EmitPass::compileSymbolTableKernel(llvm::Function* F)
399 {
400     IGC_ASSERT(IGC::isIntelSymbolTableVoidProgram(F));
401 
402     // Check if there are external functions attached
403     if ((m_FGA && m_FGA->getGroup(F) && !m_FGA->getGroup(F)->isSingle()))
404     {
405         return true;
406     }
407     // Check if there are global symbols attached
408     else if (!m_moduleMD->inlineProgramScopeOffsets.empty())
409     {
410         for (auto it : m_moduleMD->inlineProgramScopeOffsets)
411         {
412             GlobalVariable* pGlobal = it.first;
413             // Export the symbol if the global has external or common linkage
414             if (m_moduleMD->compOpt.EnableTakeGlobalAddress && (pGlobal->hasCommonLinkage() || pGlobal->hasExternalLinkage()))
415             {
416                 return true;
417             }
418 
419             // Remove dead users at this point
420             pGlobal->removeDeadConstantUsers();
421 
422             // Check if relocation is required by checking uses
423             for (auto user : pGlobal->users())
424             {
425                 if (isa<Instruction>(user))
426                 {
427                     return true;
428                 }
429             }
430         }
431     }
432     // Check if symbols are required for imported function calls
433     else
434     {
435         for (auto& FI : F->getParent()->getFunctionList())
436         {
437             if (FI.isDeclaration() &&
438                 FI.hasFnAttribute("referenced-indirectly") &&
439                 !FI.use_empty())
440             {
441                 return true;
442             }
443         }
444     }
445     return false;
446 }
447 
448 void EmitPass::CreateKernelShaderMap(CodeGenContext* ctx, MetaDataUtils* pMdUtils, llvm::Function& F)
449 {
450     /* Moving CShaderProgram instantiation to EmitPass from codegen */
451     // Instantiate CShaderProgram and create map only if m_shaders is empty
452     if (m_shaders.empty())
453     {
454         /* OpenCL shader */
455         if (ctx->type == ShaderType::OPENCL_SHADER)
456         {
457             for (auto i = pMdUtils->begin_FunctionsInfo(), e = pMdUtils->end_FunctionsInfo(); i != e; ++i)
458             {
459                 Function* pFunc = i->first;
460                 // Skip non-kernel functions.
461                 if (!isEntryFunc(pMdUtils, pFunc))
462                     continue;
463 
464                 if (ctx->m_retryManager.kernelSet.empty() ||
465                     ctx->m_retryManager.kernelSet.count(pFunc->getName().str()))
466                 {
467                     m_shaders[pFunc] = new CShaderProgram(ctx, pFunc);
468                     COMPILER_SHADER_STATS_INIT(m_shaders[pFunc]->m_shaderStats);
469                 }
470             }
471         }
472         /* Pixel Shader */
473         else if (ctx->type == ShaderType::PIXEL_SHADER)
474         {
475             Function* coarsePhase = nullptr;
476             Function* pixelPhase = nullptr;
477             NamedMDNode* coarseNode = ctx->getModule()->getNamedMetadata(NAMED_METADATA_COARSE_PHASE);
478             NamedMDNode* pixelNode = ctx->getModule()->getNamedMetadata(NAMED_METADATA_PIXEL_PHASE);
479             if (coarseNode)
480             {
481                 coarsePhase = mdconst::dyn_extract<Function>(coarseNode->getOperand(0)->getOperand(0));
482             }
483             if (pixelNode)
484             {
485                 pixelPhase = mdconst::dyn_extract<Function>(pixelNode->getOperand(0)->getOperand(0));
486             }
487             if (coarsePhase && pixelPhase)
488             {
489                 //Multi stage PS
490                 CShaderProgram* pProgram = new CShaderProgram(ctx, &F);
491                 CPixelShader* pProgram8 =
492                     static_cast<CPixelShader*>(pProgram->GetOrCreateShader(SIMDMode::SIMD8));
493                 CPixelShader* pProgram16 =
494                     static_cast<CPixelShader*>(pProgram->GetOrCreateShader(SIMDMode::SIMD16));
495                 pProgram8->SetPSSignature(m_pSignature);
496                 pProgram16->SetPSSignature(m_pSignature);
497                 m_shaders[&F] = pProgram;
498                 COMPILER_SHADER_STATS_INIT(pProgram->m_shaderStats);
499             }
500             else
501             {
502                 // Single PS
503                 // Assuming single shader information in metadata
504                 Function* pFunc = getUniqueEntryFunc(pMdUtils, ctx->getModuleMetaData());
505 
506                 CShaderProgram* pProgram = new CShaderProgram(ctx, pFunc);
507                 m_shaders[pFunc] = pProgram;
508                 COMPILER_SHADER_STATS_INIT(pProgram->m_shaderStats);
509 
510             }
511         }
512         /* All other shader types */
513         else
514         {
515             for (auto i = pMdUtils->begin_FunctionsInfo(), e = pMdUtils->end_FunctionsInfo(); i != e; ++i)
516             {
517                 Function* pFunc = i->first;
518                 // Skip non-entry functions.
519                 if (!isEntryFunc(pMdUtils, pFunc))
520                 {
521                     continue;
522                 }
523                 m_shaders[pFunc] = new CShaderProgram(ctx, pFunc);
524                 COMPILER_SHADER_STATS_INIT(m_shaders[pFunc]->m_shaderStats);
525             }
526         }
527     }
528 }
529 
530 bool EmitPass::runOnFunction(llvm::Function& F)
531 {
532     m_currFuncHasSubroutine = false;
533 
534     m_pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
535     MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
536     if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
537     {
538         return false;
539     }
540     m_moduleMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
541 
542     CreateKernelShaderMap(m_pCtx, pMdUtils, F);
543 
544     m_FGA = getAnalysisIfAvailable<GenXFunctionGroupAnalysis>();
545 
546     if ((IsStage1BestPerf(m_pCtx->m_CgFlag, m_pCtx->m_StagingCtx) ||
547         IGC_IS_FLAG_ENABLED(ForceBestSIMD)) &&
548         m_SimdMode == SIMDMode::SIMD8)
549     {
550         /* Don't do SIMD8 if SIMD16 has no spill */
551         auto Iter = m_shaders.find(&F);
552         if (Iter == m_shaders.end())
553         {
554             return false;
555         }
556 
557         CShader * simd16Program = Iter->second->GetShader(SIMDMode::SIMD16);
558         if (simd16Program &&
559             simd16Program->ProgramOutput()->m_programBin != 0 &&
560             simd16Program->ProgramOutput()->m_scratchSpaceUsedBySpills == 0)
561             return false;
562     }
563 
564     if (!setCurrentShader(&F))
565     {
566         return false;
567     }
568 
569     // Dummy program is only used for symbol table info, check if compilation is required
570     if (IGC::isIntelSymbolTableVoidProgram(&F))
571     {
572         if (!compileSymbolTableKernel(&F))
573         {
574             return false;
575         }
576     }
577 
578     m_DL = &F.getParent()->getDataLayout();
579     m_pattern = &getAnalysis<CodeGenPatternMatch>();
580     m_deSSA = &getAnalysis<DeSSA>();
581     m_blockCoalescing = &getAnalysis<BlockCoalescing>();
582     m_CE = &getAnalysis<CoalescingEngine>();
583     m_VRA = &getAnalysis<VariableReuseAnalysis>();
584 
585     m_currShader->SetUniformHelper(&getAnalysis<WIAnalysis>());
586     m_currShader->SetCodeGenHelper(m_pattern);
587     m_currShader->SetDominatorTreeHelper(&getAnalysis<DominatorTreeWrapperPass>().getDomTree());
588     m_currShader->SetMetaDataUtils(getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils());
589     m_currShader->SetShaderSpecificHelper(this);
590     m_currShader->SetDataLayout(m_DL);
591     m_currShader->SetFunctionGroupAnalysis(m_FGA);
592     m_currShader->SetPushInfoHelper(&(m_moduleMD->pushInfo));
593     m_currShader->SetVariableReuseAnalysis(m_VRA);
594     if (IGC_IS_FLAG_DISABLED(DisableDeSSA))
595     {
596         m_currShader->SetDeSSAHelper(m_deSSA);
597     }
598     //Add CCtuple root variables.
599     if (IGC_IS_FLAG_DISABLED(DisablePayloadCoalescing)) {
600         m_currShader->SetCoalescingEngineHelper(m_CE);
601     }
602 
603 
604     CShader* prevShader = m_pCtx->m_prevShader;
605     bool isFuncGroupHead = !m_FGA || m_FGA->isGroupHead(&F);
606     bool hasStackCall = m_FGA && m_FGA->getGroup(&F) && m_FGA->getGroup(&F)->hasStackCall();
607     if (isFuncGroupHead)
608     {
609         if (hasStackCall)
610         {
611             m_currShader->SetHasStackCalls();
612         }
613         if (isIntelSymbolTableVoidProgram(&F))
614         {
615             m_currShader->SetIsIntelSymbolTableVoidProgram();
616         }
617 
618         m_currShader->InitEncoder(m_SimdMode, m_canAbortOnSpill, m_ShaderDispatchMode);
619         // Pre-analysis pass to be executed before call to visa builder so we can pass scratch space offset
620         m_currShader->PreAnalysisPass();
621         if (!m_currShader->CompileSIMDSize(m_SimdMode, *this, F))
622         {
623             return false;
624         }
625 
626         VISAKernel* prevKernel = nullptr;
627 
628         if (prevShader &&
629             m_currShader->IsPatchablePS() &&
630             m_encoder->GetSimdSize() == prevShader->GetEncoder().GetSimdSize() &&
631             prevShader->GetEncoder().IsCodePatchCandidate() &&
632             prevShader->ProgramOutput()->m_programBin &&
633             prevShader->ProgramOutput()->m_scratchSpaceUsedBySpills == 0)
634         {
635             prevKernel = prevShader->GetEncoder().GetVISAKernel();
636             m_encoder->SetPayloadEnd(prevShader->GetEncoder().GetPayloadEnd());
637         }
638 
639         if (IGC_GET_FLAG_VALUE(CodePatch) &&
640             ((!m_pCtx->hash.nosHash) || IGC_GET_FLAG_VALUE(CodePatch) > CodePatch_Enable_NoLTO) &&
641             m_currShader->IsPatchablePS() &&
642             m_SimdMode == SIMDMode::SIMD16 &&
643             (m_ShaderDispatchMode != ShaderDispatchMode::NOT_APPLICABLE || prevKernel) &&
644             (IGC_GET_FLAG_VALUE(CodePatchLimit) == 0 || 2 <= IGC_GET_FLAG_VALUE(CodePatchLimit)))
645         {
646             m_encoder->SetIsCodePatchCandidate(true);
647 
648             // FIXME: Skip corner cases for now. Remove this later.
649             for (uint i = 0; i < m_pattern->m_numBlocks && m_encoder->IsCodePatchCandidate(); i++)
650             {
651                 SBasicBlock& block = m_pattern->m_blocks[i];
652                 auto I = block.m_dags.rbegin(), E = block.m_dags.rend();
653                 while (I != E && m_encoder->IsCodePatchCandidate())
654                 {
655                     Instruction* llvmInst = I->m_root;
656                     if (llvmInst->getOpcode() == Instruction::Call)
657                     {
658                         if (GenIntrinsicInst * I = dyn_cast<GenIntrinsicInst>(llvmInst))
659                         {
660                             switch(I->getIntrinsicID())
661                             {
662                                 case GenISAIntrinsic::GenISA_PullSampleIndexBarys:
663                                     {
664                                         if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_PullSampleIndex) {
665                                             m_encoder->SetIsCodePatchCandidate(false);
666                                         }
667                                     }
668                                     break;
669                                 case GenISAIntrinsic::GenISA_PullSnappedBarys:
670                                     {
671                                         if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_PullSnapped) {
672                                             m_encoder->SetIsCodePatchCandidate(false);
673                                         }
674                                     }
675                                     break;
676                                 case GenISAIntrinsic::GenISA_PullCentroidBarys:
677                                     {
678                                         if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_PullCentroid) {
679                                             m_encoder->SetIsCodePatchCandidate(false);
680                                         }
681                                     }
682                                     break;
683                                 case GenISAIntrinsic::GenISA_DCL_SystemValue:
684                                     {
685                                         // This is where we will have ZWDelta
686                                         if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_ZWDelta &&
687                                             m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
688                                         {
689                                             CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
690                                             SGVUsage usage = (SGVUsage)llvm::cast<llvm::ConstantInt>(I->getOperand(0))->getZExtValue();
691                                             if (usage == POSITION_Z &&
692                                                 (psProgram->GetPhase() == PSPHASE_PIXEL || psProgram->GetPhase() == PSPHASE_COARSE))
693                                             {
694                                                 m_encoder->SetIsCodePatchCandidate(false);
695                                             }
696                                         }
697                                     }
698                                     break;
699                                 default:
700                                     break;
701                             }
702                         }
703                     }
704                     ++I;
705                 }
706             }
707             if ((IGC_GET_FLAG_VALUE(CodePatchFilter) & (0x1 << 0x4)) &&
708                     m_pCtx->platform.getPlatformInfo().eProductFamily == IGFX_ALDERLAKE_P) {
709                 m_encoder->SetIsCodePatchCandidate(false);
710             }
711         }
712         else
713         {
714             m_encoder->SetIsCodePatchCandidate(false);
715         }
716 
717         // Check if the function, or the FG, has inline asm calls.
718         // We need this to set the correct builder mode to parse inline asm.
719         bool hasInlineAsmCall = m_pCtx->m_instrTypes.hasInlineAsm &&
720             m_pCtx->m_DriverInfo.SupportInlineAssembly() &&
721             (!m_FGA ? IGC::hasInlineAsmInFunc(F) : m_FGA->getGroup(&F)->hasInlineAsm());
722 
723         // call builder after pre-analysis pass where scratchspace offset to VISA is calculated
724         m_encoder->InitEncoder(m_canAbortOnSpill, hasStackCall, hasInlineAsmCall, prevKernel);
725         initDefaultRoundingMode();
726         m_currShader->PreCompile();
727 
728         // initialize stack if having stack usage
729         bool hasVLA = (m_FGA && m_FGA->getGroup(&F) && m_FGA->getGroup(&F)->hasVariableLengthAlloca()) || F.hasFnAttribute("hasVLA");
730         if (hasStackCall || hasVLA)
731         {
732             m_encoder->InitFuncAttribute(&F, true);
733             InitializeKernelStack(&F);
734         }
735         if (m_encoder->IsCodePatchCandidate())
736         {
737             m_currShader->SplitPayloadFromShader(&F);
738         }
739         m_currShader->AddPrologue();
740     }
741     else
742     {
743         // If kernel function is not compiled for the SIMD size then VISABuilder==nullptr
744         if (m_currShader->GetEncoder().GetVISABuilder() == nullptr)
745         {
746             return false;
747         }
748         if (!m_currShader->CompileSIMDSize(m_SimdMode, *this, F))
749         {
750             return false;
751         }
752         m_currShader->BeginFunction(&F);
753         if (m_FGA && m_FGA->useStackCall(&F))
754         {
755             m_encoder->InitFuncAttribute(&F, false);
756             emitStackFuncEntry(&F);
757         }
758     }
759 
760     // Only apply WA to OCL shaders with stackcall enabled
761     // TODO: Remove this WA once vISA handles the register copy
762     bool needKernelArgOverrideWA = isFuncGroupHead && hasStackCall && m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER;
763     if (needKernelArgOverrideWA)
764     {
765         // Requires early payload allocation to know the kernel arg offsets
766         m_currShader->CacheArgumentsList();
767         m_currShader->MapPushedInputs();
768         m_currShader->AllocatePayload();
769 
770         // This WA copies kernel args allocated at or beyond r26.0 into temp registers when stackcalls are enabled.
771         // Since the vISA stackcall ABI predefines the argument register as r26.0, if the payload is larger than
772         // 26 GRFs then making a stackcall would overwrite the payload registers.
773         const int visaStackCallArgRegStart = 26;
774         static const int64_t maxGRFOffset = visaStackCallArgRegStart * m_currShader->getGRFSize();
775         llvm::IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt());
776         for (auto& arg : F.args())
777         {
778             // Skip unused arguments
779             if (arg.user_empty()) continue;
780 
781             Argument* kernArg = &arg;
782             CVariable* kernArgV = m_currShader->GetSymbol(kernArg);
783             // Get the allocated payload offset for this kernel arg
784             int64_t offset = m_currShader->GetKernelArgOffset(kernArgV);
785             // If kernel payload size exceeds maxGRFOffset, we must copy the kernel args into another register.
786             if (offset >= maxGRFOffset)
787             {
788                 // Create a dummy instruction using RTV, just so we can use the LLVM replaceAllUsesWith to replace the kernelArg usages.
789                 Function* pFunc = GenISAIntrinsic::getDeclaration(F.getParent(), GenISAIntrinsic::GenISA_RuntimeValue, kernArg->getType());
790                 Value* tempCall = builder.CreateCall(pFunc, builder.getInt32(kernArg->getArgNo()), "kernArgCopy");
791                 kernArg->replaceAllUsesWith(tempCall);
792 
793                 // Create another CVar to hold the copied kernelArg, and map it to the dummy instruction.
794                 // When doing vISA codegen, all usages of the dummy instruction will get the value of the copied kernelArg.
795                 CVariable* copiedArg = m_currShader->GetNewVariable(kernArgV);
796                 emitCopyAll(copiedArg, kernArgV, kernArg->getType());
797                 m_currShader->UpdateSymbolMap(tempCall, copiedArg);
798                 // Temp instruction needs the same uniform analysis attribute as kernel arg
799                 m_currShader->SetDependency(tempCall, m_currShader->GetDependency(kernArg));
800             }
801         }
802     }
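    // Worked example for the WA above (assuming 32-byte GRFs): maxGRFOffset = 26 * 32 =
    // 832 bytes, so a kernel argument whose payload offset is 832 bytes (r26.0) or higher
    // is copied into a fresh variable and its uses are redirected to the copy, keeping it
    // clear of the vISA stackcall argument registers.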
803 
804 
805     if (IGC_IS_FLAG_ENABLED(DumpHasNonKernelArgLdSt)) {
806         ModuleMetaData* modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
807         FunctionMetaData* funcMD = &modMD->FuncMD[&F];
808         if (hasStackCall || m_currFuncHasSubroutine) {
809             // conservatively set hasNonKernelArgLoad/Store/Atomic to true
810             funcMD->hasNonKernelArgLoad = true;
811             funcMD->hasNonKernelArgStore = true;
812             funcMD->hasNonKernelArgAtomic = true;
813         }
814         // then write the result to the shader
815         if (m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER) {
816             COpenCLKernel* kernel = static_cast<COpenCLKernel*>(m_currShader);
817             kernel->m_kernelInfo.m_hasNonKernelArgLoad = funcMD->hasNonKernelArgLoad? 1 : 0;
818             kernel->m_kernelInfo.m_hasNonKernelArgStore = funcMD->hasNonKernelArgStore? 1 : 0;
819             kernel->m_kernelInfo.m_hasNonKernelArgAtomic = funcMD->hasNonKernelArgAtomic? 1 : 0;
820         }
821     }
822 
823     // Create a symbol relocation entry for each symbol used by F
824     emitSymbolRelocation(F);
825 
826     m_VRA->BeginFunction(&F, numLanes(m_SimdMode));
827     if (isFuncGroupHead)
828     {
829         Function* Entry = m_currShader->entry;
830         // owned by m_pDebugEmitter
831         const bool IsPrimary = true;
832         auto vMod = IGC::ScalarVisaModule::BuildNew(m_currShader, Entry, IsPrimary);
833         IGC::DebugEmitterOpts DebugOpts;
834         DebugOpts.DebugEnabled = DebugInfoData::hasDebugInfo(m_currShader);
835         DebugOpts.EnableGTLocationDebugging = IGC_IS_FLAG_ENABLED(EnableGTLocationDebugging);
836         DebugOpts.UseOffsetInLocation = IGC_IS_FLAG_ENABLED(UseOffsetInLocation);
837         DebugOpts.EmitDebugLoc = IGC_IS_FLAG_ENABLED(EmitDebugLoc);
838         DebugOpts.EmitOffsetInDbgLoc = IGC_IS_FLAG_ENABLED(EmitOffsetInDbgLoc);
839         DebugOpts.ZeBinCompatible = IGC_IS_FLAG_ENABLED(ZeBinCompatibleDebugging) && IGC_IS_FLAG_ENABLED(EnableZEBinary);
840         DebugOpts.EnableRelocation = IGC_IS_FLAG_ENABLED(EnableRelocations) || DebugOpts.ZeBinCompatible;
841         DebugOpts.EnforceAMD64Machine = IGC_IS_FLAG_ENABLED(DebugInfoEnforceAmd64EM) || DebugOpts.ZeBinCompatible;
842         DebugOpts.EnableDebugInfoValidation = IGC_IS_FLAG_ENABLED(DebugInfoValidation);
843         m_pDebugEmitter = IDebugEmitter::Create();
844         m_pDebugEmitter->Initialize(std::move(vMod), DebugOpts);
845     }
846 
847     IGC_ASSERT(m_pDebugEmitter);
848 
849     if (DebugInfoData::hasDebugInfo(m_currShader))
850     {
851         m_currShader->GetDebugInfoData().m_pShader = m_currShader;
852         m_currShader->GetDebugInfoData().m_pDebugEmitter = m_pDebugEmitter;
853 
854         const bool IsPrimary = isFuncGroupHead;
855         m_pDebugEmitter->resetModule(
856             IGC::ScalarVisaModule::BuildNew(m_currShader, &F, IsPrimary));
857     }
858 
859     // We only invoke EndEncodingMark() to update the last VISA id.
860     m_pDebugEmitter->EndEncodingMark();
861 
862     phiMovToBB.clear();
863     unsigned int lineNo = 0;
864     bool disableSlicing =
865         IGC_IS_FLAG_ENABLED(DisableSIMD32Slicing) ||
866         !m_currShader->GetContext()->m_retryManager.AllowSimd32Slicing() ||
867         m_currShader->GetContext()->getModuleMetaData()->compOpt.OptDisable ||
868         m_pattern->m_samplertoRenderTargetEnable;
869 
870     IGC::Debug::Dump* llvmtoVISADump = nullptr;
871     if (IGC_IS_FLAG_ENABLED(ShaderDumpEnable))
872     {
873         auto name = IGC::Debug::GetDumpNameObj(m_currShader, "visa.ll");
874         // If the function is in a function group, set the postfix string of
875         // DumpName as "entry name" + "_f" + "id".
876         if (m_FGA && !m_FGA->isGroupHead(&F)) {
877           FunctionGroup* group = m_FGA->getGroup(&F);
878           // To align with visa suffixing, make id start from 0.
879           unsigned id = -1;
880           for (auto it = group->begin(), ie = group->end(); it != ie; ++it)
881           {
882               if (*it == &F)
883                   break;
884               ++id;
885           }
886           std::string postfix = group->getHead()->getName().str() + "_f" + std::to_string(id);
887           name = name.PostFix(postfix);
888         }
889         if (name.allow())
890             llvmtoVISADump = new IGC::Debug::Dump(name, IGC::Debug::DumpType::PASS_IR_TEXT);
891     }
892     VisaIdAnnotator VidAnnotator;  // for visa.ll dump
893     StringRef curSrcFile, curSrcDir;
894 
895     for (uint i = 0; i < m_pattern->m_numBlocks; i++)
896     {
897         SBasicBlock& block = m_pattern->m_blocks[i];
898         block.m_activeMask = nullptr;   // clear for each SIMD size
899         m_currentBlock = i;
900         if (m_blockCoalescing->IsEmptyBlock(block.bb))
901         {
902             continue;
903         }
904 
905         if (llvmtoVISADump)
906         {
907             VidAnnotator.trackBlockId(block.bb, i);
908         }
909 
910         if (i != 0)
911         {
912             m_pDebugEmitter->BeginEncodingMark();
913             // create a label
914             m_encoder->Label(block.id);
915             m_encoder->Push();
916             m_pDebugEmitter->EndEncodingMark();
917         }
918 
919         // remove cached per lane offset variables if any.
920         PerLaneOffsetVars.clear();
921 
922         // Variable reuse per-block states.
923         VariableReuseAnalysis::EnterBlockRAII EnterBlock(m_VRA, block.bb);
924 
925         // go through the list in reverse order
926         auto I = block.m_dags.rbegin(), E = block.m_dags.rend();
927         while (I != E)
928         {
929             Instruction* llvmInst = I->m_root;
930             if (llvmInst->getDebugLoc())
931             {
932                 unsigned int curLineNumber = llvmInst->getDebugLoc().getLine();
933                 auto&& srcFile = llvmInst->getDebugLoc()->getScope()->getFilename();
934                 auto&& srcDir = llvmInst->getDebugLoc()->getScope()->getDirectory();
935                 if (!curSrcFile.equals(srcFile) || !curSrcDir.equals(srcDir))
936                 {
937                     curSrcFile = srcFile;
938                     curSrcDir = srcDir;
939                     m_pDebugEmitter->BeginEncodingMark();
940                     llvm::SmallVector<char, 1024> fileName;
941                     llvm::sys::path::append(fileName, curSrcDir);
942                     llvm::sys::path::append(fileName, curSrcFile);
943                     std::string fileNameStr(fileName.begin(), fileName.end());
944                     m_encoder->File(fileNameStr);
945                     m_pDebugEmitter->EndEncodingMark();
946                 }
947                 if (curLineNumber != lineNo)
948                 {
949                      m_pDebugEmitter->BeginEncodingMark();
950                      m_encoder->Loc(curLineNumber);
951                      m_pDebugEmitter->EndEncodingMark();
952                      lineNo = curLineNumber;
953                 }
954             }
955 
956             bool slicing = false;
957             uint numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
958             IGC_ASSERT(numInstance == 1 || numInstance == 2);
959 
960             if (slicing && !disableSlicing)
961             {
962                  m_pDebugEmitter->BeginEncodingMark();
963                  I = emitInSlice(block, I);
964                  m_pDebugEmitter->EndEncodingMark();
965                  llvmInst = I->m_root;
966             }
967 
968             if (I != E)
969             {
970                 m_pDebugEmitter->BeginInstruction(llvmInst);
971 
972                 // before inserting the terminator, initialize constant pool & insert the de-ssa moves
973                 if (isa<BranchInst>(llvmInst))
974                 {
975                     m_encoder->SetSecondHalf(false);
976                     // insert constant initializations.
977                     InitConstant(block.bb);
978                     // Insert lifetime start if there are any
979                     emitLifetimeStartAtEndOfBB(block.bb);
980                     // insert the de-ssa movs.
981                     MovPhiSources(block.bb);
982                 }
983 
984                 // If slicing happens, then recalculate the number of instances.
985                 if (slicing)
986                 {
987                     numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
988                 }
989 
990                 if (llvmtoVISADump)
991                 {
992                     VidAnnotator.trackVisaId(llvmInst, m_encoder->GetVISAKernel()->getvIsaInstCount() + 1);
993                 }
994 
995                 // Insert lifetime start if legal. Note that m_destination
996                 // shall be nullptr if this instruction has no dst.
997                 emitLifetimeStart(m_destination, block.bb, llvmInst, true);
998 
999                 DstModifier init;
1000                 if (numInstance < 2)
1001                 {
1002                     m_encoder->SetSecondHalf(false);
1003                     I->m_pattern->Emit(this, init);
1004                     ++I;
1005                 }
1006                 else
1007                 {
1008                     m_encoder->SetSecondHalf(false);
1009                     I->m_pattern->Emit(this, init);
1010                     m_encoder->SetSecondHalf(true);
1011                     I->m_pattern->Emit(this, init);
1012                     ++I;
1013                 }
1014                 m_pDebugEmitter->EndInstruction(llvmInst);
1015             }
1016         }
1017     }
1018 
1019     if (llvmtoVISADump)
1020     {
1021         F.print(llvmtoVISADump->stream(), &VidAnnotator);
1022         delete llvmtoVISADump;
1023     }
1024 
1025     if (m_FGA && !m_FGA->useStackCall(&F))
1026     {
1027         BasicBlock* exitBB = &*(F.getBasicBlockList().rbegin());
1028         if (IGC_IS_FLAG_ENABLED(ForceSubReturn) &&
1029             !isa_and_nonnull<ReturnInst>(exitBB->getTerminator()))
1030         {
1031             // No return, generate dummy return for each subroutine to meet visa requirement.
1032             m_encoder->SubroutineRet(nullptr, &F);
1033             m_encoder->Push();
1034         }
1035     }
1036 
1037     if (isFuncGroupHead)
1038     {
1039         if (!needKernelArgOverrideWA)
1040         {
1041             // Cache the arguments list into a vector for faster access
1042             m_currShader->CacheArgumentsList();
1043             // Associates values pushed to CVariable
1044             m_currShader->MapPushedInputs();
1045             // Allocate the thread payload
1046             m_currShader->AllocatePayload();
1047         }
1048 
1049         if (m_currShader->ProgramOutput()->m_scratchSpaceUsedBySpills)
1050         {
1051             if (IGC_GET_FLAG_VALUE(CodePatchExperiments))
1052             {
1053                 errs() << "Skip Prologue : " << m_encoder->GetShaderName() << "\n";
1054             }
1055             return false;
1056         }
1057         if (m_encoder->IsCodePatchCandidate())
1058         {
1059             if (IGC_GET_FLAG_VALUE(CodePatchLimit) >= 2)
1060             {
1061                 IGC_SET_FLAG_VALUE(CodePatchLimit, IGC_GET_FLAG_VALUE(CodePatchLimit) - 1);
1062             }
1063             if (IGC_GET_FLAG_VALUE(CodePatchExperiments))
1064             {
1065                 errs() << IGC_GET_FLAG_VALUE(CodePatchLimit) << " Prologue/CodePatch : " << m_encoder->GetShaderName() << "\n";
1066             }
1067         }
1068         else
1069         {
1070             if (IGC_GET_FLAG_VALUE(CodePatchExperiments))
1071             {
1072                 errs() << IGC_GET_FLAG_VALUE(CodePatchLimit) << " not : " << m_encoder->GetShaderName() << "\n";
1073             }
1074         }
1075     }
1076 
1077     if (m_currShader->GetDebugInfoData().m_pDebugEmitter)
1078     {
1079         if (IGC_IS_FLAG_ENABLED(UseOffsetInLocation))
1080         {
1081             if (IGC::ForceAlwaysInline() ||
1082                 ((OpenCLProgramContext*)(m_currShader->GetContext()))->m_InternalOptions.KernelDebugEnable)
1083             {
1084                 DebugInfoData::markOutput(F, m_currShader, m_pDebugEmitter);
1085             }
1086             ScalarVisaModule* scVISAMod = (ScalarVisaModule*)(m_pDebugEmitter->getCurrentVISA());
1087             if (!scVISAMod->getPerThreadOffset() && m_currShader->hasFP())
1088             {
1089                 // Stack calls are in use. Nothing needs to be marked as Output.
1090                 // Only setting the frame pointer is required for debug info when stack calls are in use.
1091                 scVISAMod->setFramePtr(m_currShader->GetFP());
1092             }
1093         }
1094         else
1095         {
1096             m_currShader->GetDebugInfoData().markOutput(F, m_currShader);
1097         }
1098 
1099         m_currShader->GetDebugInfoData().addVISAModule(&F, m_pDebugEmitter->getCurrentVISA());
1100         m_currShader->GetDebugInfoData().transferMappings(F);
1101     }
1102 
1103     // Compile only when this is the last function for this kernel.
1104     bool finalize = (!m_FGA || m_FGA->isGroupTail(&F));
1105     bool destroyVISABuilder = false;
1106     if (finalize)
1107     {
1108         destroyVISABuilder = true;
1109         // We only need one symbol table per module. If there are multiple entry functions, only create a symbol
1110         // table for the dummy kernel with indirect functions attached.
1111         bool compileWithSymbolTable = false;
1112         Function* currHead = m_FGA ? m_FGA->getGroupHead(&F) : &F;
1113         if (IGC::isIntelSymbolTableVoidProgram(currHead))
1114         {
1115             compileWithSymbolTable = true;
1116         }
1117         m_encoder->Compile(compileWithSymbolTable);
1118         m_pCtx->m_prevShader = m_currShader;
1119 
1120         if (hasStackCall)
1121         {
1122             // Disable retry when stackcalls are present
1123             m_pCtx->m_retryManager.Disable();
1124         }
1125     }
1126 
1127     if (destroyVISABuilder)
1128     {
1129         if (!m_currShader->GetDebugInfoData().m_pDebugEmitter)
1130         {
1131             IDebugEmitter::Release(m_pDebugEmitter);
1132         }
1133 
1134         if (!m_encoder->IsCodePatchCandidate() ||
1135             m_encoder->HasPrevKernel() ||
1136             !m_currShader->ProgramOutput()->m_programBin ||
1137             m_currShader->ProgramOutput()->m_scratchSpaceUsedBySpills)
1138         {
1139             m_pCtx->m_prevShader = nullptr;
1140             // Postpone destroying VISA builder to
1141             // after emitting debug info and passing context for code patching
1142             m_encoder->DestroyVISABuilder();
1143         }
1144         if (m_encoder->IsCodePatchCandidate() && m_encoder->HasPrevKernel())
1145         {
1146             prevShader->GetEncoder().DestroyVISABuilder();
1147         }
1148     }
1149 
1150     if ((m_currShader->GetShaderType() == ShaderType::COMPUTE_SHADER ||
1151         m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER) &&
1152         m_currShader->m_Platform->supportDisableMidThreadPreemptionSwitch() &&
1153         IGC_IS_FLAG_ENABLED(EnableDisableMidThreadPreemptionOpt) &&
1154         (m_currShader->GetContext()->m_instrTypes.numLoopInsts == 0) &&
1155         (m_currShader->ProgramOutput()->m_InstructionCount < IGC_GET_FLAG_VALUE(MidThreadPreemptionDisableThreshold)))
1156     {
1157         if (m_currShader->GetShaderType() == ShaderType::COMPUTE_SHADER)
1158         {
1159             CComputeShader* csProgram = static_cast<CComputeShader*>(m_currShader);
1160             csProgram->SetDisableMidthreadPreemption();
1161         }
1162         else
1163         {
1164             COpenCLKernel* kernel = static_cast<COpenCLKernel*>(m_currShader);
1165             kernel->SetDisableMidthreadPreemption();
1166         }
1167     }
1168 
1169     if (IGC_IS_FLAG_ENABLED(ForceBestSIMD))
1170     {
1171         return false;
1172     }
1173 
1174     if (m_SimdMode == SIMDMode::SIMD16 &&
1175         this->m_ShaderDispatchMode == ShaderDispatchMode::NOT_APPLICABLE &&
1176         IsStage1BestPerf(m_pCtx->m_CgFlag, m_pCtx->m_StagingCtx))
1177     {
1178         m_pCtx->m_doSimd32Stage2 = m_currShader->CompileSIMDSize(SIMDMode::SIMD32, *this, F);
1179     }
1180 
1181     if (m_SimdMode == SIMDMode::SIMD8 &&
1182         IsStage1FastCompile(m_pCtx->m_CgFlag, m_pCtx->m_StagingCtx))
1183     {
1184         m_pCtx->m_doSimd16Stage2 = m_currShader->CompileSIMDSize(SIMDMode::SIMD16, *this, F);
1185         m_pCtx->m_doSimd32Stage2 = m_currShader->CompileSIMDSize(SIMDMode::SIMD32, *this, F);
1186     }
1187 
1188     return false;
1189 }
1190 
1191 // Emit code in slice starting from (reverse) iterator I. Return the iterator to
1192 // the next pattern to emit.
1193 SBasicBlock::reverse_iterator
1194 EmitPass::emitInSlice(SBasicBlock& block, SBasicBlock::reverse_iterator I)
1195 {
1196     auto sliceBegin = I;
1197     auto sliceIter = I;
1198     auto E = block.m_dags.rend();
1199     DstModifier init;
1200 
1201     bool slicing = true;
1202     m_encoder->SetSecondHalf(false);  // the 1st-half slice for simd32
1203     while (slicing)
1204     {
1205         emitLifetimeStart(m_destination, block.bb, (*sliceIter).m_root, false);
1206 
1207         (*sliceIter).m_pattern->Emit(this, init);
1208         ++sliceIter;
1209         slicing = false;
1210         if (sliceIter != E)
1211         {
1212             unsigned numInstance = DecideInstanceAndSlice(*(block.bb), (*sliceIter), slicing);
1213             IGC_ASSERT(numInstance == 1 || numInstance == 2);
1214         }
1215     }
1216 
1217     // Store the point slicing stops at.
1218     auto sliceEnd = sliceIter;
1219 
1220     m_encoder->SetSecondHalf(true);  // the 2nd-half slice for simd32
1221     for (sliceIter = sliceBegin; sliceIter != sliceEnd; ++sliceIter)
1222     {
1223         unsigned numInstance = DecideInstanceAndSlice(*(block.bb), (*sliceIter), slicing);
1224         // uniform ops are only emitted once
1225         if (numInstance > 1)
1226         {
1227             emitLifetimeStart(m_destination, block.bb, (*sliceIter).m_root, false);
1228 
1229             (*sliceIter).m_pattern->Emit(this, init);
1230         }
1231     }
1232 
1233     return sliceEnd;
1234 }
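// Emission-order example for emitInSlice() (it follows directly from the two loops above):
// for a sliceable run i1, i2, i3 in a SIMD32 shader, the generated order is
//   i1(first half), i2(first half), i3(first half), i1(second half), i2(second half), i3(second half)
// rather than interleaving the two halves per instruction; uniform ops in the run are
// emitted only during the first-half pass.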
1235 
1236 /// Insert moves at the end of the basic block to replace the phi nodes of the successors.
1237 // This is a special case in which we want to relocate the phi-movs
1238 // unconditionally. Two functions, isCandidateIfStmt() and
1239 // canRelocatePhiMov(), are used to check if this is the special
1240 // case as below:
1241 //
1242 //  x.1 = ...
1243 //  ...
1244 //  H: br i1 %cond, OtherBB, phiMovBB   // target BBs interchangeable
1245 //  OtherBB:
1246 //     x.0 = ...
1247 //     br phiBB
1248 //  phiMovBB:
1249 //     <empty BB>
1250 //     br phiBB
1251 //  phiBB:
1252 //     phi x = [x.0, OtherBB] [ x.1, phiMovBB]
1253 //
1254 // Normally, a phi-mov is to be inserted into phiMovBB. This optimization
1255 // relocates the phi-mov to H so that we have if-then-endif rather than
1256 // if-then-else-endif. To keep it simple and correct, the following
1257 // conditions are required:
1258 //     1. 'if' branch isn't uniform. (If uniform, it is probably not beneficial
1259 //        to move phi-mov to H)
1260 //     2. either x.0 is defined in otherBB or a phi-mov must be inserted
1261 //        in the otherBB.
1262 // With this, phi-mov can be relocated to H without using predicate.
1263 //
1264 
1265 // canRelocatePhiMov() checks if all phi-mov to phiMovBB can be relocated.
1266 bool EmitPass::canRelocatePhiMov(
1267     llvm::BasicBlock* otherBB,
1268     llvm::BasicBlock* phiMovBB,
1269     llvm::BasicBlock* phiBB)
1270 {
1271     // Threshold for phi-mov relocation
1272     const int CMAX_PHI_COUNT = 6;
1273 
1274     int n = 0;
1275     for (auto I = phiBB->begin(), E = phiBB->end(); I != E; ++I)
1276     {
1277         llvm::PHINode* PN = llvm::dyn_cast<llvm::PHINode>(I);
1278         if (!PN)
1279         {
1280             break;
1281         }
1282 
1283         CVariable* dst = m_currShader->GetSymbol(PN);
1284         for (uint i = 0, e = PN->getNumOperands(); i != e; ++i)
1285         {
1286             Value* V = PN->getOperand(i);
1287             CVariable* src = m_currShader->GetSymbol(V);
1288             if (PN->getIncomingBlock(i) == phiMovBB)
1289             {
1290                 if (dst != src)
1291                 {
1292                     int numElt = 1;
1293                     if (IGCLLVM::FixedVectorType * vTy = dyn_cast<IGCLLVM::FixedVectorType>(PN->getType()))
1294                     {
1295                         numElt = int_cast<int>(vTy->getNumElements());
1296                     }
1297                     // Conservatively assume the number of mov's is 'numElt'.
1298                     n += numElt;
1299                 }
1300             }
1301             else
1302             {
1303                 // For case with PN->getIncomingBlock(i) == otherBB
1304                 Instruction* Inst = dyn_cast<Instruction>(V);
1305                 if (Inst && Inst->getParent() != otherBB && (dst == src))
1306                 {
1307                     // This is the case that x and x.1 are coalesced, in which
1308                     // we cannot move phi-mov from emptyBB to H, as doing so
1309                     // will clobber x.1 (x.1 and x are the same virtual reg).
1310                     // [Can move it up with predicate always, but need to check
1311                     //  doing so would give us perf benefit.]
1312                     //           x.1 = ...
1313                     //           ...
1314                     //        H: br c, B0, B1
1315                     //  otherBB:
1316                     //           <...>
1317                     //           br phiBB
1318                     //  emptyBB:
1319                     //           br phiBB
1320                     //    phiBB:
1321                     //           phi x = [x.0  emptyBB] [x.1 otherBB]
1322                     return false;
1323                 }
1324             }
1325         }
1326     }
1327     if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
1328     {
1329         n = (2 * n);
1330     }
1331     return (n > 0) && (n < CMAX_PHI_COUNT);
1332 }
1333 
1334 // Check if 'ifBB' is the If BB of an if-then-else pattern in which both then & else
1335 // are single BBs and one of them is empty. It also makes sure the branch is not
1336 // uniform.  If 'ifBB' is such a BB, the function returns true with emptyBB and otherBB
1337 // set to the then & else blocks.
1338 bool EmitPass::isCandidateIfStmt(
1339     llvm::BasicBlock* ifBB, llvm::BasicBlock*& otherBB, llvm::BasicBlock*& emptyBB)
1340 {
1341     llvm::BranchInst* Br = dyn_cast<llvm::BranchInst>(ifBB->getTerminator());
1342     if (!Br || Br->getNumSuccessors() != 2 ||
1343         m_currShader->GetIsUniform(Br->getCondition()))
1344     {
1345         return false;
1346     }
1347 
1348     llvm::BasicBlock* S0 = Br->getSuccessor(0), * S1 = Br->getSuccessor(1);
1349     IGCLLVM::TerminatorInst* T0 = S0->getTerminator(), * T1 = S1->getTerminator();
1350     IGC_ASSERT_MESSAGE(nullptr != T1, "BB is missing a terminator!");
1351     IGC_ASSERT_MESSAGE(nullptr != T0, "BB is missing a terminator!");
1352     bool  isMatch =
1353         S0->getSinglePredecessor() == ifBB && S1->getSinglePredecessor() == ifBB &&
1354         T0->getNumSuccessors() == 1 && T1->getNumSuccessors() == 1 &&
1355         T0->getSuccessor(0) == T1->getSuccessor(0) &&
1356         (S0->size() > 1 || S1->size() > 1) &&    // only one empty block
1357         (S0->size() == 1 || S1->size() == 1);
1358     if (isMatch)
1359     {
1360         if (S0->size() == 1)
1361         {
1362             emptyBB = S0;
1363             otherBB = S1;
1364         }
1365         else
1366         {
1367             emptyBB = S1;
1368             otherBB = S0;
1369         }
1370     }
1371     return isMatch;
1372 }
1373 
1374 /// Insert moves at the end of the basic block to replace the phi node of the successors
1375 void EmitPass::MovPhiSources(llvm::BasicBlock* aBB)
1376 {
1377     // collect all the src-side phi-moves, then find a good order for emission
1378     struct PhiSrcMoveInfo {
1379         CVariable* dstCVar;
1380         CVariable* srcCVar;
1381         Value* dstRootV; // root value of dst (dessa)
1382         Value* srcRootV; // root value of src (dessa)
1383     };
1384     BumpPtrAllocator phiAllocator;
1385     std::list<PhiSrcMoveInfo*> phiSrcDstList;
1386     std::vector<std::pair<CVariable*, CVariable*>> emitList;
1387     std::map<CVariable*, unsigned int> dstVTyMap;
1388     llvm::BasicBlock* bb = aBB;
1389     IGCLLVM::TerminatorInst* TI = aBB->getTerminator();
1390     IGC_ASSERT(nullptr != TI);
1391 
1392     // main code to generate phi-mov
1393     for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ)
1394     {
1395         llvm::BasicBlock* Succ = TI->getSuccessor(succ);
1396         for (auto II = Succ->begin(), IE = Succ->end(); II != IE; ++II)
1397         {
1398             llvm::PHINode* PN = llvm::dyn_cast<llvm::PHINode>(II);
1399             if (!PN)
1400             {
1401                 break;
1402             }
1403             if (PN->use_empty())
1404             {
1405                 continue;
1406             }
1407             for (uint i = 0, e = PN->getNumOperands(); i != e; ++i)
1408             {
1409                 if (PN->getIncomingBlock(i) == bb)
1410                 {
1411                     Value* Src = PN->getOperand(i);
1412 
1413                     Value* dstRootV = m_deSSA ? m_deSSA->getRootValue(PN) : PN;
1414                     Value* srcRootV = m_deSSA ? m_deSSA->getRootValue(Src) : Src;
1415                     dstRootV = dstRootV ? dstRootV : PN;
1416                     srcRootV = srcRootV ? srcRootV : Src;
1417                     // To check whether a src-side phi-mov is needed, we must use the dessa
1418                     // rootValue instead of the CVariable, as value aliasing in dessa
1419                     // may give the same variable two different CVariables.
1420                     if (dstRootV != srcRootV)
1421                     {
1422                         PhiSrcMoveInfo* phiInfo = new (phiAllocator) PhiSrcMoveInfo();
1423                         phiInfo->dstCVar = m_currShader->GetSymbol(PN);
1424                         phiInfo->srcCVar = m_currShader->GetSymbol(Src);
1425                         phiInfo->dstRootV = dstRootV;
1426                         phiInfo->srcRootV = srcRootV;
1427                         phiSrcDstList.push_back(phiInfo);
1428 
1429                         int numElt = 0;
1430                         if (IGCLLVM::FixedVectorType * vTy = dyn_cast<IGCLLVM::FixedVectorType>(PN->getType()))
1431                         {
1432                             numElt = int_cast<int>(vTy->getNumElements());
1433                         }
1434                         dstVTyMap.insert(std::pair<CVariable*, unsigned int>(phiInfo->dstCVar, numElt));
1435                     }
1436                 }
1437             }
1438         }
1439     }
1440 
1441     // Find a good order for src-side phi-moves.
1442     //
1443     // PHI copies are parallel copies. Here, we need to serialize those copies
1444     // so that no copy clobbers a value that a later copy still needs to read.
1445     //     For example,
1446     //        (phi_1, phi_2) = (a, phi_1)
1447     //     ==>
1448     //        phi_2 = phi_1
1449     //        phi_1 = a
1450     // If there is a cycle, have to insert a temp copy to break the cycle (see below)
1451     while (!phiSrcDstList.empty())
1452     {
1453         // The search must not deadlock, i.e. it must be able to find one entry to emit on every iteration.
1454         auto It = phiSrcDstList.begin();
1455         auto Et = phiSrcDstList.end();
1456         for (; It != Et; ++It)
1457         {
1458             auto Cmp = [&](const PhiSrcMoveInfo* Val)
1459             {
1460                 return Val->srcRootV == (*It)->dstRootV;
1461             };
1462 
1463             if (0 == std::count_if (phiSrcDstList.begin(), phiSrcDstList.end(), Cmp))
1464             {
1465                 break;
1466             }
1467         }
1468         if (It == Et)
1469         {
1470             // Found a cyclic phi-move dependency. Pick the first one (any one
1471             // should do) and create a temp to break the dependence cycle.
1472             // (Note that there is no self-cycle.)
1473             // For example,
1474             //    (phi_1, phi_2) = (phi_2, phi_1)
1475             //  ==>
1476             //    t = phi_1
1477             //    phi_1 = phi_2
1478             //    phi_2 = t
1479 
1480             // After the temp copy of the 1st entry's dst is inserted,
1481             // the entry becomes the one to be added into emitList.
1482             It = phiSrcDstList.begin();
1483 
1484             Value* dRootV = (*It)->dstRootV;
1485             CVariable* D1 = (*It)->dstCVar;
1486             CVariable* T = m_currShader->GetNewVariable(D1);
1487             dstVTyMap[T] = dstVTyMap[D1];
1488             emitList.push_back(std::pair<CVariable*, CVariable*>(D1, T));
1489 
1490             // Replace with T all src that is equal to D1 (start from It+1)
1491             auto LI = It, LE = phiSrcDstList.end();
1492             for (++LI; LI != LE; ++LI)
1493             {
1494                 PhiSrcMoveInfo* phiinfo = *LI;
1495                 if (phiinfo->srcRootV == dRootV) {
1496                     CVariable* sVar = phiinfo->srcCVar;
1497                     CVariable* nVar;
1498                     if (sVar->GetType() != T->GetType()) {
1499                         nVar = m_currShader->GetNewAlias(
1500                             T, sVar->GetType(), 0, sVar->GetNumberElement());
1501                     }
1502                     else {
1503                         nVar = T;
1504                     }
1505                     phiinfo->srcCVar = nVar;
1506                 }
1507             }
1508         }
1509         IGC_ASSERT(It != Et);
1510         emitList.push_back(std::pair<CVariable*, CVariable*>((*It)->srcCVar, (*It)->dstCVar));
1511         phiSrcDstList.erase(It);
1512     }
1513     // emit the src-side phi-moves
1514     for (unsigned i = 0, e = int_cast<unsigned>(emitList.size()); i != e; ++i)
1515     {
1516         CVariable* dst = emitList[i].second;
1517         CVariable* src = emitList[i].first;
1518 
1519         for (uint instance = 0; instance < dst->GetNumberInstance(); instance++)
1520         {
1521             m_encoder->SetSecondHalf(instance == 1 ? true : false);
1522             unsigned int numVTyElt = dstVTyMap[dst];
1523             if (numVTyElt > 0)
1524             {
1525                 emitVectorCopy(dst, src, numVTyElt);
1526             }
1527             else
1528             {
1529                 m_encoder->Copy(dst, src);
1530                 m_encoder->Push();
1531             }
1532         }
1533     }
1534 }
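
// Illustrative sketch, kept out of the build: the same parallel-copy
// sequentialization that MovPhiSources() performs above, restated on a toy
// register file of plain ints. The function and register names are made up
// for this sketch; they are not IGC APIs.
#if 0
#include <cassert>
#include <list>
#include <utility>
#include <vector>

// Each pending copy is (dstReg, srcReg). All copies are conceptually parallel,
// so a copy may only be emitted once no remaining copy still reads its dst.
// A cycle is broken by saving one dst into a fresh temporary register.
static std::vector<std::pair<int, int>>
sequentializeParallelCopy(std::list<std::pair<int, int>> pending, int nextTemp)
{
    std::vector<std::pair<int, int>> emitted;   // serialized (dst, src) moves
    while (!pending.empty())
    {
        auto it = pending.begin();
        for (; it != pending.end(); ++it)
        {
            bool dstStillRead = false;
            for (const auto& other : pending)
                if (other.second == it->first) { dstStillRead = true; break; }
            if (!dstStillRead)
                break;                          // safe to emit this copy now
        }
        if (it == pending.end())
        {
            // Every dst is still read by some src: a cycle. Save the first
            // entry's dst into a temp and redirect its readers to the temp.
            it = pending.begin();
            const int savedDst = it->first;
            const int temp = nextTemp++;
            emitted.push_back({ temp, savedDst });
            for (auto& other : pending)
                if (other.second == savedDst) other.second = temp;
        }
        emitted.push_back(*it);
        pending.erase(it);
    }
    return emitted;
}

// Example: the parallel swap (r1, r2) = (r2, r1) needs a temp (r100 here):
//   r100 = r1;  r1 = r2;  r2 = r100;
static void parallelCopyExample()
{
    const auto moves = sequentializeParallelCopy({ { 1, 2 }, { 2, 1 } }, 100);
    assert(moves.size() == 3);
    (void)moves;
}
#endif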
1535 
1536 void EmitPass::InitConstant(llvm::BasicBlock* BB)
1537 {
1538     for (auto& I : m_pattern->ConstantPlacement)
1539     {
1540         if (I.second != BB)
1541             continue;
1542         Constant* C = I.first;
1543         CVariable* Dst = m_currShader->lookupConstantInPool(C);
1544         if (Dst)
1545             continue;
1546         Dst = m_currShader->GetConstant(C);
1547         if (!C->getType()->isVectorTy()) {
1548             CVariable* Imm = Dst;
1549             Dst = m_currShader->GetNewVector(C);
1550             m_encoder->Copy(Dst, Imm);
1551             m_encoder->Push();
1552         }
1553         m_currShader->addConstantInPool(C, Dst);
1554     }
1555 }
1556 
1557 void EmitPass::emitLifetimeStartAtEndOfBB(BasicBlock* BB)
1558 {
1559     if (m_pCtx->getVectorCoalescingControl() == 0) {
1560         return;
1561     }
1562 
1563     auto II = m_VRA->m_LifetimeAtEndOfBB.find(BB);
1564     if (II != m_VRA->m_LifetimeAtEndOfBB.end())
1565     {
1566         TinyPtrVector<Value*>& ARVs = II->second;
1567         for (int i = 0, sz = (int)ARVs.size(); i < sz; ++i)
1568         {
1569             Value* RootVal = ARVs[i];
1570             CVariable* Var = GetSymbol(RootVal);
1571 
1572             // vISA info inst, no m_encoder->Push() needed.
1573             m_encoder->Lifetime(LIFETIME_START, Var);
1574         }
1575     }
1576 }
1577 
1578 std::pair<Value*, Value*> EmitPass::getPairOutput(Value* V) const {
1579     auto I = m_pattern->PairOutputMap.find(V);
1580     IGC_ASSERT(I != m_pattern->PairOutputMap.end());
1581     return std::make_pair(I->second.first, I->second.second);
1582 }
1583 
1584 void EmitPass::emitGradientX(const SSource& source, const DstModifier& modifier)
1585 {
1586     CVariable* src = GetSrcVariable(source);
1587     if (src->IsUniform())
1588     {
1589         m_encoder->SetSrcModifier(1, EMOD_NEG);
1590         m_encoder->Add(m_destination, src, src);
1591         m_encoder->Push();
1592     }
1593     else
1594     {
1595         // we need to combine negation with the existing source modifiers
1596         // to implement subtraction of values correct also for neg, abs, negabs
1597         const e_modifier src_mod0 = source.mod;
1598         const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
1599         m_encoder->SetSrcModifier(0, src_mod0);
1600         m_encoder->SetSrcModifier(1, src_mod1);
1601         m_encoder->SetDstModifier(modifier);
1602         // set the regioning to get isa instruction
1603         // add dst0.0<1>:f   src0.1<4;4,0>:f   -src0.0<4;4,0>:f
1604         m_encoder->SetSrcRegion(0, 4, 4, 0);
1605         m_encoder->SetSrcRegion(1, 4, 4, 0);
1606         m_encoder->SetSrcSubReg(0, 1);
1607         m_encoder->SetSrcSubReg(1, 0);
1608         m_encoder->Add(m_destination, src, src);
1609         m_encoder->Push();
1610     }
1611 }
1612 
1613 void EmitPass::emitGradientY(const SSource& source, const DstModifier& modifier)
1614 {
1615     CVariable* src = GetSrcVariable(source);
1616     if (src->IsUniform())
1617     {
1618         m_encoder->SetSrcModifier(1, EMOD_NEG);
1619         m_encoder->Add(m_destination, src, src);
1620         m_encoder->Push();
1621     }
1622     else
1623     {
1624         const e_modifier src_mod0 = source.mod;
1625         const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
1626         m_encoder->SetSrcModifier(0, src_mod0);
1627         m_encoder->SetSrcModifier(1, src_mod1);
1628         m_encoder->SetDstModifier(modifier);
1629         // set the regioning to get isa instruction
1630         // add dst0.0<1>:f   src0.1<4;4,0>:f   -src0.0<4;4,0>:f
1631         m_encoder->SetSrcRegion(0, 4, 4, 0);
1632         m_encoder->SetSrcRegion(1, 4, 4, 0);
1633         m_encoder->SetSrcSubReg(0, 2);
1634         m_encoder->SetSrcSubReg(1, 0);
1635         m_encoder->Add(m_destination, src, src);
1636         m_encoder->Push();
1637     }
1638 }
1639 
1640 void EmitPass::emitGradientXFine(const SSource& source, const DstModifier& modifier)
1641 {
1642     CVariable* src = GetSrcVariable(source);
1643     if (src->IsUniform())
1644     {
1645         m_encoder->SetSrcModifier(1, EMOD_NEG);
1646         m_encoder->Add(m_destination, src, src);
1647         m_encoder->Push();
1648     }
1649     else
1650     {
1651         const e_modifier src_mod0 = source.mod;
1652         const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
1653         m_encoder->SetSrcModifier(0, src_mod0);
1654         m_encoder->SetSrcModifier(1, src_mod1);
1655         m_encoder->SetDstModifier(modifier);
1656         // set the regioning to get isa instruction
1657         // add dst0.0<1>:f   src0.1<2;2,0>:f   -src0.0<2;2,0>:f
1658         m_encoder->SetSrcRegion(0, 2, 2, 0);
1659         m_encoder->SetSrcRegion(1, 2, 2, 0);
1660         m_encoder->SetSrcSubReg(0, 1);
1661         m_encoder->SetSrcSubReg(1, 0);
1662         m_encoder->Add(m_destination, src, src);
1663         m_encoder->Push();
1664     }
1665 }
1666 
1667 /// Computes derivatives with respect to screen space by subtracting values for
1668 /// adjacent pixels in vertical direction.
1669 /// Consider the following four pixels:
1670 /// +----+----+
1671 /// | P0 | P1 |
1672 /// +----+----+
1673 /// | P2 | P3 |
1674 /// +----+----+
1675 ///
1676 /// then gradient_y_fine for scalar attribute A of pixel P0 will be P0.A - P2.A
1677 /// The same value will be used for P2 since the spec leaves the freedom of
1678 /// choosing the quad alignment. The same goes for P1 and P3.
1679 ///
1680 /// Now, if we look at the attribute A as laid out in a SIMD register, we have
1681 ///
1682 /// src0 =  A : |    |    |    |    | P3.A | P2.A | P1.A | P0.A |
1683 ///
1684 /// and the result register should contain
1685 ///
1686 /// dst0 = dy : |    |    |    |    |  q   |  t   |   q  |   t  |
1687 ///
1688 /// where t = P0.A - P2.A and q = P1.A - P3.A
1689 ///
1690 /// The upper half of GRF also contains data for another separate set of four pixels.
1691 ///
1692 /// We compute the result by the following sequence of instructions
1693 ///
1694 /// add (4)  dst0.0<1>:f src0.0<0; 2, 1>:f -src0.2<0; 2, 1>:f   // lower half
1695 /// add (4)  dst0.4<1>:f src0.4<0; 2, 1>:f -src0.6<0; 2, 1>:f   // upper half
1696 ///
1697 /// and if we are in simd16 mode, we need two more instructions
1698 /// if (simd16)
1699 /// {
1700 ///    add (4)   dst0.8<1>:f  src0.8<0; 2, 1>:f -src0.10<0; 2, 1>:f
1701 ///    add (4)  dst0.12<1>:f src0.12<0; 2, 1>:f -src0.14<0; 2, 1>:f
1702 /// }
1703 ///
1704 /// Note: Since the source llvm instruction may contain source modifier (abs, neg, negabs)
1705 /// we need to read them and flip the sign of the second isa source accordingly.
1706 ///////////////////////////////////////////////////////////////////////////////
1707 void EmitPass::emitGradientYFine(const SSource& source, const DstModifier& modifier)
1708 {
1709     CVariable* src = GetSrcVariable(source);
1710     if (src->IsUniform())
1711     {
1712         m_encoder->SetSrcModifier(1, EMOD_NEG);
1713         m_encoder->Add(m_destination, src, src);
1714         m_encoder->Push();
1715     }
1716     else
1717     {
1718         CVariable* temp = m_currShader->GetNewVariable(m_destination);
1719         const e_modifier src_mod0 = source.mod;
1720         const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
1721 
1722         m_encoder->SetSimdSize(SIMDMode::SIMD4);
1723         m_encoder->SetSrcModifier(0, src_mod0);
1724         m_encoder->SetSrcRegion(0, 0, 2, 1);
1725         m_encoder->SetSrcSubReg(0, 2);
1726 
1727         m_encoder->SetSrcModifier(1, src_mod1);
1728         m_encoder->SetSrcRegion(1, 0, 2, 1);
1729         m_encoder->SetSrcSubReg(1, 0);
1730         m_encoder->SetNoMask();
1731 
1732         m_encoder->SetDstModifier(modifier);
1733         m_encoder->SetDstSubReg(0);
1734         m_encoder->Add(temp, src, src);
1735         m_encoder->Push();
1736 
1737         m_encoder->SetSimdSize(SIMDMode::SIMD4);
1738         m_encoder->SetSrcModifier(0, src_mod0);
1739         m_encoder->SetSrcRegion(0, 0, 2, 1);
1740         m_encoder->SetSrcSubReg(0, 6);
1741 
1742         m_encoder->SetSrcModifier(1, src_mod1);
1743         m_encoder->SetSrcRegion(1, 0, 2, 1);
1744         m_encoder->SetSrcSubReg(1, 4);
1745         m_encoder->SetNoMask();
1746 
1747 
1748         m_encoder->SetDstModifier(modifier);
1749         m_encoder->SetDstSubReg(4);
1750         m_encoder->Add(temp, src, src);
1751         m_encoder->Push();
1752 
1753         if (m_currShader->m_SIMDSize == SIMDMode::SIMD16 || m_currShader->m_SIMDSize == SIMDMode::SIMD32)
1754         {
1755             m_encoder->SetSimdSize(SIMDMode::SIMD4);
1756             m_encoder->SetSrcModifier(0, src_mod0);
1757             m_encoder->SetSrcRegion(0, 0, 2, 1);
1758             m_encoder->SetSrcSubReg(0, 10);
1759 
1760             m_encoder->SetSrcModifier(1, src_mod1);
1761             m_encoder->SetSrcRegion(1, 0, 2, 1);
1762             m_encoder->SetSrcSubReg(1, 8);
1763             m_encoder->SetNoMask();
1764 
1765             m_encoder->SetDstModifier(modifier);
1766             m_encoder->SetDstSubReg(8);
1767             m_encoder->Add(temp, src, src);
1768             m_encoder->Push();
1769 
1770             m_encoder->SetSimdSize(SIMDMode::SIMD4);
1771             m_encoder->SetSrcModifier(0, src_mod0);
1772             m_encoder->SetSrcRegion(0, 0, 2, 1);
1773             m_encoder->SetSrcSubReg(0, 14);
1774 
1775             m_encoder->SetSrcModifier(1, src_mod1);
1776             m_encoder->SetSrcRegion(1, 0, 2, 1);
1777             m_encoder->SetSrcSubReg(1, 12);
1778 
1779             m_encoder->SetNoMask();
1780             m_encoder->SetDstModifier(modifier);
1781             m_encoder->SetDstSubReg(12);
1782             m_encoder->Add(temp, src, src);
1783             m_encoder->Push();
1784         }
1785 
1786         if (m_currShader->m_SIMDSize == SIMDMode::SIMD32)
1787         {
1788             m_encoder->SetSimdSize(SIMDMode::SIMD4);
1789             m_encoder->SetSrcModifier(0, src_mod0);
1790             m_encoder->SetSrcRegion(0, 0, 2, 1);
1791             m_encoder->SetSrcSubReg(0, 18);
1792 
1793             m_encoder->SetSrcModifier(1, src_mod1);
1794             m_encoder->SetSrcRegion(1, 0, 2, 1);
1795             m_encoder->SetSrcSubReg(1, 16);
1796             m_encoder->SetNoMask();
1797 
1798             m_encoder->SetDstModifier(modifier);
1799             m_encoder->SetDstSubReg(16);
1800             m_encoder->Add(temp, src, src);
1801             m_encoder->Push();
1802 
1803             m_encoder->SetSimdSize(SIMDMode::SIMD4);
1804             m_encoder->SetSrcModifier(0, src_mod0);
1805             m_encoder->SetSrcRegion(0, 0, 2, 1);
1806             m_encoder->SetSrcSubReg(0, 22);
1807 
1808             m_encoder->SetSrcModifier(1, src_mod1);
1809             m_encoder->SetSrcRegion(1, 0, 2, 1);
1810             m_encoder->SetSrcSubReg(1, 20);
1811             m_encoder->SetNoMask();
1812 
1813             m_encoder->SetDstModifier(modifier);
1814             m_encoder->SetDstSubReg(20);
1815             m_encoder->Add(temp, src, src);
1816             m_encoder->Push();
1817 
1818 
1819             m_encoder->SetSimdSize(SIMDMode::SIMD4);
1820             m_encoder->SetSrcModifier(0, src_mod0);
1821             m_encoder->SetSrcRegion(0, 0, 2, 1);
1822             m_encoder->SetSrcSubReg(0, 26);
1823 
1824             m_encoder->SetSrcModifier(1, src_mod1);
1825             m_encoder->SetSrcRegion(1, 0, 2, 1);
1826             m_encoder->SetSrcSubReg(1, 24);
1827             m_encoder->SetNoMask();
1828 
1829             m_encoder->SetDstModifier(modifier);
1830             m_encoder->SetDstSubReg(24);
1831             m_encoder->Add(temp, src, src);
1832             m_encoder->Push();
1833 
1834             m_encoder->SetSimdSize(SIMDMode::SIMD4);
1835             m_encoder->SetSrcModifier(0, src_mod0);
1836             m_encoder->SetSrcRegion(0, 0, 2, 1);
1837             m_encoder->SetSrcSubReg(0, 30);
1838 
1839             m_encoder->SetSrcModifier(1, src_mod1);
1840             m_encoder->SetSrcRegion(1, 0, 2, 1);
1841             m_encoder->SetSrcSubReg(1, 28);
1842 
1843             m_encoder->SetNoMask();
1844             m_encoder->SetDstModifier(modifier);
1845             m_encoder->SetDstSubReg(28);
1846             m_encoder->Add(temp, src, src);
1847             m_encoder->Push();
1848         }
1849 
1850         m_encoder->Copy(m_destination, temp);
1851         m_encoder->Push();
1852     }
1853 }
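
// Illustrative sketch, kept out of the build: a CPU analogy of the quad-based
// derivatives emitted above, using the P0..P3 layout and the sign convention
// from the comment before emitGradientYFine(). The function names are made up
// for this sketch; they are not IGC APIs.
#if 0
#include <cassert>

// Lanes hold one 2x2 quad as { P0, P1, P2, P3 }. Coarse ddx broadcasts a single
// horizontal difference to the whole quad; fine ddy uses one vertical
// difference per column, shared by the two pixels of that column.
static void quadGradients(const float a[4], float ddxCoarse[4], float ddyFine[4])
{
    const float dx = a[1] - a[0];      // P1.A - P0.A, broadcast to all 4 lanes
    for (int i = 0; i < 4; ++i)
        ddxCoarse[i] = dx;

    const float t = a[0] - a[2];       // P0.A - P2.A, used for P0 and P2
    const float q = a[1] - a[3];       // P1.A - P3.A, used for P1 and P3
    ddyFine[0] = t; ddyFine[2] = t;
    ddyFine[1] = q; ddyFine[3] = q;
}

static void quadGradientExample()
{
    const float a[4] = { 1.0f, 3.0f, 5.0f, 9.0f };
    float dx[4], dy[4];
    quadGradients(a, dx, dy);
    assert(dx[0] == 2.0f && dy[0] == -4.0f && dy[1] == -6.0f);
}
#endif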
1854 
1855 void EmitPass::EmitAluIntrinsic(llvm::CallInst* I, const SSource source[2], const DstModifier& modifier)
1856 {
1857     if (GenIntrinsicInst * CI = dyn_cast<GenIntrinsicInst>(I))
1858     {
1859         switch (CI->getIntrinsicID())
1860         {
1861         case GenISAIntrinsic::GenISA_GradientX:
1862             emitGradientX(source[0], modifier);
1863             break;
1864         case GenISAIntrinsic::GenISA_GradientXfine:
1865             emitGradientXFine(source[0], modifier);
1866             break;
1867         case GenISAIntrinsic::GenISA_GradientY:
1868             emitGradientY(source[0], modifier);
1869             break;
1870         case GenISAIntrinsic::GenISA_GradientYfine:
1871             emitGradientYFine(source[0], modifier);
1872             break;
1873         default:
1874             // no special handling
1875             EmitSimpleAlu(I, source, modifier);
1876             break;
1877         }
1878     }
1879     else if (IntrinsicInst * CI = dyn_cast<IntrinsicInst>(I))
1880     {
1881         switch (CI->getIntrinsicID())
1882         {
1883         case Intrinsic::ctlz:
1884             // Throw away source[1]: for ctlz this is a flag we don't care about.
1885             emitCtlz(source[0]);
1886             break;
1887         default:
1888             // no special handling
1889             EmitSimpleAlu(I, source, modifier);
1890             break;
1891         }
1892     }
1893 }
1894 
1895 // These helper functions are used only by this file. If other files need them,
1896 // they should be moved to helper.cpp.
1897 static e_predicate GetPredicate(llvm::CmpInst::Predicate predicate)
1898 {
1899     switch (predicate)
1900     {
1901     case llvm::CmpInst::ICMP_UGT:
1902     case llvm::CmpInst::ICMP_SGT:
1903     case llvm::CmpInst::FCMP_UGT:
1904     case llvm::CmpInst::FCMP_OGT:
1905         return EPREDICATE_GT;
1906     case llvm::CmpInst::ICMP_UGE:
1907     case llvm::CmpInst::ICMP_SGE:
1908     case llvm::CmpInst::FCMP_UGE:
1909     case llvm::CmpInst::FCMP_OGE:
1910         return EPREDICATE_GE;
1911     case llvm::CmpInst::ICMP_ULT:
1912     case llvm::CmpInst::ICMP_SLT:
1913     case llvm::CmpInst::FCMP_ULT:
1914     case llvm::CmpInst::FCMP_OLT:
1915         return EPREDICATE_LT;
1916     case llvm::CmpInst::ICMP_ULE:
1917     case llvm::CmpInst::ICMP_SLE:
1918     case llvm::CmpInst::FCMP_ULE:
1919     case llvm::CmpInst::FCMP_OLE:
1920         return EPREDICATE_LE;
1921     case llvm::CmpInst::ICMP_EQ:
1922     case llvm::CmpInst::FCMP_UEQ:
1923     case llvm::CmpInst::FCMP_OEQ:
1924         return EPREDICATE_EQ;
1925     case llvm::CmpInst::ICMP_NE:
1926     case llvm::CmpInst::FCMP_UNE:
1927         return EPREDICATE_NE;
1928     default:
1929         break;
1930     }
1931     IGC_ASSERT(0);
1932     return EPREDICATE_EQ;
1933 }
1934 
1935 static VISA_Type GetUnsignedType(VISA_Type type)
1936 {
1937     switch (type)
1938     {
1939     case ISA_TYPE_Q:
1940     case ISA_TYPE_UQ:
1941         return ISA_TYPE_UQ;
1942     case ISA_TYPE_D:
1943     case ISA_TYPE_UD:
1944         return ISA_TYPE_UD;
1945     case ISA_TYPE_W:
1946     case ISA_TYPE_UW:
1947         return ISA_TYPE_UW;
1948     case ISA_TYPE_B:
1949     case ISA_TYPE_UB:
1950         return ISA_TYPE_UB;
1951     default:
1952         IGC_ASSERT(0);
1953         break;
1954     }
1955     return ISA_TYPE_UD;
1956 }
1957 
1958 static VISA_Type GetSignedType(VISA_Type type)
1959 {
1960     switch (type)
1961     {
1962     case ISA_TYPE_Q:
1963     case ISA_TYPE_UQ:
1964         return ISA_TYPE_Q;
1965     case ISA_TYPE_D:
1966     case ISA_TYPE_UD:
1967         return ISA_TYPE_D;
1968     case ISA_TYPE_W:
1969     case ISA_TYPE_UW:
1970         return ISA_TYPE_W;
1971     case ISA_TYPE_B:
1972     case ISA_TYPE_UB:
1973         return ISA_TYPE_B;
1974     default:
1975         IGC_ASSERT(0);
1976         break;
1977     }
1978     return ISA_TYPE_D;
1979 }
1980 
1981 static VISA_Type GetUnsignedIntegerType(VISA_Type type)
1982 {
1983     switch (type)
1984     {
1985     case ISA_TYPE_Q:
1986     case ISA_TYPE_UQ:
1987         return ISA_TYPE_UQ;
1988     case ISA_TYPE_D:
1989     case ISA_TYPE_UD:
1990         return ISA_TYPE_UD;
1991     case ISA_TYPE_W:
1992     case ISA_TYPE_UW:
1993         return ISA_TYPE_UW;
1994     case ISA_TYPE_B:
1995     case ISA_TYPE_UB:
1996         return ISA_TYPE_UB;
1997     case ISA_TYPE_DF:
1998         return ISA_TYPE_UQ;
1999     case ISA_TYPE_F:
2000         return ISA_TYPE_UD;
2001     case ISA_TYPE_HF:
2002         return ISA_TYPE_UW;
2003     default:
2004         IGC_ASSERT(0);
2005         break;
2006     }
2007     return ISA_TYPE_UD;
2008 }
2009 
2010 static uint64_t getFPOne(VISA_Type Ty)
2011 {
2012     switch (Ty)
2013     {
2014     case ISA_TYPE_DF:   return 0x3FF0000000000000;
2015     case ISA_TYPE_F:    return 0x3F800000;
2016     case ISA_TYPE_BF:   return 0x3F80;
2017     case ISA_TYPE_HF:   return 0x3C00;
2018     default: break;
2019     }
2020     IGC_ASSERT_MESSAGE(0, "unknown floating type!");
2021     return ~0U;
2022 }
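
// Quick sanity check, kept out of the build: verifies two of the bit patterns
// returned by getFPOne() above (1.0f and 1.0). The half (0x3C00) and bfloat16
// (0x3F80) encodings follow the same sign/exponent/mantissa layout but have no
// built-in C++ type to check against here. The function name is made up for
// this sketch.
#if 0
#include <cassert>
#include <cstdint>
#include <cstring>

static void checkFPOneEncodings()
{
    float f = 1.0f;
    uint32_t fbits = 0;
    std::memcpy(&fbits, &f, sizeof(fbits));
    assert(fbits == 0x3F800000u);               // matches the ISA_TYPE_F entry

    double d = 1.0;
    uint64_t dbits = 0;
    std::memcpy(&dbits, &d, sizeof(dbits));
    assert(dbits == 0x3FF0000000000000ull);     // matches the ISA_TYPE_DF entry
}
#endif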
2023 
2024 CVariable* EmitPass::GetSrcVariable(const SSource& source, bool fromConstPool)
2025 {
2026     CVariable* src = m_currShader->GetSymbol(source.value, fromConstPool);
2027     // Change the type of source if needed.
2028     if (source.type != ISA_TYPE_NUM && source.type != src->GetType())
2029     {
2030         if (src->IsImmediate()) {
2031             src = m_currShader->ImmToVariable(src->GetImmediateValue(), source.type);
2032         }
2033         else {
2034             src = m_currShader->GetNewAlias(src, source.type, 0, src->GetNumberElement());
2035         }
2036     }
2037     return src;
2038 }
2039 
2040 void EmitPass::SetSourceModifiers(unsigned int sourceIndex, const SSource& source)
2041 {
2042     if (source.mod != EMOD_NONE)
2043     {
2044         m_encoder->SetSrcModifier(sourceIndex, source.mod);
2045     }
2046 
2047     int numberOfLanes = 0;
2048     if (m_currShader->GetIsUniform(source.value))
2049     {
2050         numberOfLanes = 1;
2051     }
2052     else
2053     {
2054         numberOfLanes = numLanes(m_currShader->m_SIMDSize);
2055     }
2056     int calculated_offset = source.SIMDOffset * numberOfLanes + source.elementOffset;
2057     m_encoder->SetSrcSubReg(sourceIndex, calculated_offset);
2058 
2059     if (source.region_set)
2060     {
2061         m_encoder->SetSrcRegion(sourceIndex, source.region[0], source.region[1], source.region[2], source.instance);
2062     }
2063 }
2064 
2065 void EmitPass::EmitSimpleAlu(Instruction* inst, const SSource sources[2], const DstModifier& modifier)
2066 {
2067     EmitSimpleAlu(GetOpCode(inst), sources, modifier);
2068 }
2069 
2070 void EmitPass::EmitSimpleAlu(Instruction* inst, CVariable* dst, CVariable* src0, CVariable* src1)
2071 {
2072     EmitSimpleAlu(GetOpCode(inst), dst, src0, src1);
2073 }
2074 
2075 void EmitPass::EmitSimpleAlu(EOPCODE opCode, const SSource sources[2], const DstModifier& modifier)
2076 {
2077     CVariable* srcs[2] = { nullptr, nullptr };
2078 
2079     srcs[0] = GetSrcVariable(sources[0], sources[0].fromConstantPool);
2080     SetSourceModifiers(0, sources[0]);
2081 
2082     if (sources[1].value)
2083     {
2084         srcs[1] = GetSrcVariable(sources[1], sources[1].fromConstantPool);
2085         SetSourceModifiers(1, sources[1]);
2086     }
2087     m_encoder->SetDstModifier(modifier);
2088     EmitSimpleAlu(opCode, m_destination, srcs[0], srcs[1]);
2089 }
2090 
2091 void EmitPass::EmitSimpleAlu(EOPCODE opCode, CVariable* dst, CVariable* src0, CVariable* src1)
2092 {
2093     switch (opCode)
2094     {
2095     case llvm_fmul:
2096     case llvm_mul:
2097         m_encoder->Mul(dst, src0, src1);
2098         break;
2099     case llvm_fdiv:
2100         m_encoder->Div(dst, src0, src1);
2101         break;
2102     case llvm_fadd:
2103     case llvm_add:
2104         m_encoder->Add(dst, src0, src1);
2105         break;
2106     case llvm_cos:
2107         m_encoder->Cos(dst, src0);
2108         break;
2109     case llvm_sin:
2110         m_encoder->Sin(dst, src0);
2111         break;
2112     case llvm_log:
2113         m_encoder->Log(dst, src0);
2114         break;
2115     case llvm_exp:
2116         m_encoder->Exp(dst, src0);
2117         break;
2118     case llvm_pow:
2119         m_encoder->Pow(dst, src0, src1);
2120         break;
2121     case llvm_sqrt:
2122         m_encoder->Sqrt(dst, src0);
2123         break;
2124     case llvm_rsq:
2125         m_encoder->Rsqrt(dst, src0);
2126         break;
2127     case llvm_floor:
2128         m_encoder->Floor(dst, src0);
2129         break;
2130     case llvm_ceil:
2131         m_encoder->Ceil(dst, src0);
2132         break;
2133     case llvm_round_z:
2134         m_encoder->Truncate(dst, src0);
2135         break;
2136     case llvm_roundne:
2137         m_encoder->RoundNE(dst, src0);
2138         break;
2139     case llvm_imulh:
2140         m_encoder->MulH(dst, src0, src1);
2141         break;
2142     case llvm_umulh:
2143     {
2144         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2145         src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2146         dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2147         m_encoder->MulH(dst, src0, src1);
2148     }
2149     break;
2150     case llvm_sext:
2151     {
2152         if (src0->GetType() == ISA_TYPE_BOOL)
2153         {
2154             CVariable* minusone = m_currShader->ImmToVariable(-1, dst->GetType());
2155             CVariable* zero = m_currShader->ImmToVariable(0, dst->GetType());
2156             m_encoder->Select(src0, dst, minusone, zero);
2157         }
2158         else
2159         {
2160             m_encoder->Cast(dst, src0);
2161         }
2162     }
2163     break;
2164     case llvm_zext:
2165     {
2166         if (src0->GetType() == ISA_TYPE_BOOL)
2167         {
2168             CVariable* one = m_currShader->ImmToVariable(1, dst->GetType());
2169             CVariable* zero = m_currShader->ImmToVariable(0, dst->GetType());
2170             m_encoder->Select(src0, dst, one, zero);
2171         }
2172         else
2173         {
2174             src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2175             m_encoder->Cast(dst, src0);
2176         }
2177     }
2178     break;
2179     case llvm_trunc:
2180     case llvm_fptrunc:
2181     case llvm_fpext:
2182     case llvm_fptosi:
2183     case llvm_fptoui:
2184         if (dst->GetType() == ISA_TYPE_BOOL)
2185         {
2186             m_encoder->Cmp(EPREDICATE_NE, dst, src0, m_currShader->ImmToVariable(0, src0->GetType()));
2187         }
2188         else
2189         {
2190             if (opCode == llvm_fptoui)
2191             {
2192                 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2193             }
2194             m_encoder->Cast(dst, src0);
2195         }
2196         break;
2197     case llvm_sitofp:
2198     case llvm_uitofp:
2199         if (src0->GetType() == ISA_TYPE_BOOL)
2200         {
2201             CVariable* one = m_currShader->ImmToVariable(getFPOne(dst->GetType()), dst->GetType());
2202             CVariable* zero = m_currShader->ImmToVariable(0, dst->GetType());
2203             m_encoder->Select(src0, dst, one, zero);
2204         }
2205         else
2206         {
2207             if (opCode == llvm_uitofp)
2208             {
2209                 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2210             }
2211             m_encoder->Cast(dst, src0);
2212         }
2213         break;
2214     case llvm_xor:
2215         m_encoder->Xor(dst, src0, src1);
2216         break;
2217     case llvm_or:
2218         m_encoder->Or(dst, src0, src1);
2219         break;
2220     case llvm_and:
2221         m_encoder->And(dst, src0, src1);
2222         break;
2223     case llvm_udiv:
2224     {
2225         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2226         src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2227         dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2228         m_encoder->Div(dst, src0, src1);
2229     }
2230     break;
2231     case llvm_sdiv:
2232         m_encoder->Div(dst, src0, src1);
2233         break;
2234     case llvm_urem:
2235     {
2236         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2237         src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2238         dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2239         m_encoder->Mod(dst, src0, src1);
2240     }
2241     break;
2242     case llvm_srem:
2243         m_encoder->Mod(dst, src0, src1);
2244         break;
2245     case llvm_shl:
2246         m_encoder->Shl(dst, src0, src1);
2247         break;
2248     case llvm_ishr:
2249         m_encoder->IShr(dst, src0, src1);
2250         break;
2251     case llvm_ushr:
2252     {
2253         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2254         m_encoder->Shr(dst, src0, src1);
2255     }
2256     break;
2257     case llvm_min:
2258         m_encoder->Min(dst, src0, src1);
2259         break;
2260     case llvm_max:
2261         m_encoder->Max(dst, src0, src1);
2262         break;
2263     case llvm_uaddc:
2264     {
2265         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2266         src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2267         dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2268         m_encoder->UAddC(dst, src0, src1);
2269     }
2270     break;
2271     case llvm_usubb:
2272     {
2273         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2274         src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2275         dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2276         m_encoder->USubB(dst, src0, src1);
2277     }
2278     break;
2279     case llvm_bfrev:
2280         m_encoder->Bfrev(dst, src0);
2281         break;
2282     case llvm_cbit: {
2283         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2284         dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2285         if (dst->GetType() == ISA_TYPE_UD) {
2286             m_encoder->CBit(dst, src0);
2287             break;
2288         }
2289         if (dst->GetType() == ISA_TYPE_UW || dst->GetType() == ISA_TYPE_UB) {
2290             // vISA only supports a UD destination. Use a UD temporary and
2291             // truncate from it.
2292             CVariable* tmp
2293                 = m_currShader->GetNewVariable(
2294                     dst->GetNumberElement(),
2295                     ISA_TYPE_UD,
2296                     dst->IsUniform() ? EALIGN_DWORD : EALIGN_GRF,
2297                     dst->IsUniform(),
2298                     dst->getName());
2299             m_encoder->CBit(tmp, src0);
2300             m_encoder->Push();
2301             m_encoder->Cast(dst, tmp);
2302             break;
2303         }
2304         IGC_ASSERT(dst->GetType() == ISA_TYPE_UQ);
2305         // TODO: So far, 64-bit popcnt is handled in LLVM IR as follows:
2306         // dst = popcnt.32(src & 0xFFFFFFFF);
2307         // dst += popcnt.32(src >> 32);
2308         // We could do the same thing here if the original sequence in LLVM IR
2309         // cannot be translated efficiently.
2310         IGC_ASSERT_MESSAGE(0, "NOT IMPLEMENTED YET!");
2311         break;
2312     }
2313     case llvm_ieee_sqrt:
2314         m_encoder->IEEESqrt(dst, src0);
2315         break;
2316     case llvm_ieee_divide:
2317         m_encoder->IEEEDivide(dst, src0, src1);
2318         break;
2319     default:
2320         //need support
2321         IGC_ASSERT(0);
2322         break;
2323     }
2324     m_encoder->Push();
2325 }
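
// Illustrative sketch, kept out of the build: the 64-bit popcount decomposition
// mentioned in the llvm_cbit TODO above, i.e. counting the two 32-bit halves
// separately and adding the results. The helper names are made up for this
// sketch; they are not IGC APIs.
#if 0
#include <cassert>
#include <cstdint>

static uint32_t popcount32(uint32_t v)
{
    uint32_t n = 0;
    for (; v != 0; v &= v - 1)   // clears the lowest set bit per iteration
        ++n;
    return n;
}

static uint32_t popcount64ViaTwo32(uint64_t v)
{
    return popcount32(static_cast<uint32_t>(v & 0xFFFFFFFFull)) +
           popcount32(static_cast<uint32_t>(v >> 32));
}

static void popcount64Example()
{
    assert(popcount64ViaTwo32(0xF0F0F0F0F0F0F0F0ull) == 32);
    assert(popcount64ViaTwo32(0x8000000000000001ull) == 2);
}
#endif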
2326 
2327 void EmitPass::EmitMinMax(bool isMin, bool isUnsigned, const SSource sources[2], const DstModifier& modifier) {
2328     EOPCODE opCode = isMin ? llvm_min : llvm_max;
2329     CVariable* srcs[2] = { nullptr, nullptr };
2330     CVariable* dst = m_destination;
2331     srcs[0] = GetSrcVariable(sources[0]);
2332     srcs[1] = GetSrcVariable(sources[1]);
2333     SetSourceModifiers(0, sources[0]);
2334     SetSourceModifiers(1, sources[1]);
2335     m_encoder->SetDstModifier(modifier);
2336     if (isUnsigned) {
2337         srcs[0] = m_currShader->BitCast(srcs[0], GetUnsignedType(srcs[0]->GetType()));
2338         srcs[1] = m_currShader->BitCast(srcs[1], GetUnsignedType(srcs[1]->GetType()));
2339         dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
2340     }
2341     EmitSimpleAlu(opCode, dst, srcs[0], srcs[1]);
2342 }
2343 
2344 void IGC::EmitPass::EmitUAdd(llvm::BinaryOperator* inst, const DstModifier& modifier)
2345 {
2346     // This emit function should be called only if saturation is enabled. Otherwise, the signedness of
2347     // the instruction plays no role in how the addition is computed.
2348     IGC_ASSERT(modifier.sat == true);
2349     CVariable* srcs[2] = { GetSymbol(inst->getOperand(0)), GetSymbol(inst->getOperand(1)) };
2350 
2351     // create new aliases for the operands and the destination
2352     srcs[0] = m_currShader->BitCast(srcs[0], GetUnsignedType(srcs[0]->GetType()));
2353     srcs[1] = m_currShader->BitCast(srcs[1], GetUnsignedType(srcs[1]->GetType()));
2354     CVariable* dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
2355     m_encoder->SetDstModifier(modifier);
2356 
2357     EmitSimpleAlu(EOPCODE::llvm_add, dst, srcs[0], srcs[1]);
2358 }
2359 
2360 void EmitPass::EmitFullMul32(bool isUnsigned, const SSource sources[2], const DstModifier& dstMod) {
2361     CVariable* srcs[2] = { nullptr, nullptr };
2362     srcs[0] = GetSrcVariable(sources[0]);
2363     srcs[1] = GetSrcVariable(sources[1]);
2364     SetSourceModifiers(0, sources[0]);
2365     SetSourceModifiers(1, sources[1]);
2366     m_encoder->SetDstModifier(dstMod);
2367     if (isUnsigned) {
2368         srcs[0] = m_currShader->BitCast(srcs[0], GetUnsignedType(srcs[0]->GetType()));
2369         srcs[1] = m_currShader->BitCast(srcs[1], GetUnsignedType(srcs[1]->GetType()));
2370     }
2371     // Emit *D x *D -> *Q supported by Gen
2372     EmitSimpleAlu(llvm_mul, m_destination, srcs[0], srcs[1]);
2373 }
2374 
2375 void EmitPass::EmitFPToIntWithSat(bool isUnsigned, bool needBitCast, VISA_Type type, const SSource& source, const DstModifier& dstMod) {
2376     EOPCODE op = isUnsigned ? llvm_fptoui : llvm_fptosi;
2377 
2378     CVariable* dst = m_destination;
2379     if (type != m_destination->GetType()) {
2380         dst = m_currShader->GetNewVariable(
2381             dst->GetNumberElement(), type,
2382             m_currShader->getGRFAlignment(),
2383             dst->IsUniform(), m_destination->getName());
2384     }
2385     else if (needBitCast) {
2386         dst = m_currShader->BitCast(dst, GetUnsignedIntegerType(dst->GetType()));
2387     }
2388     DstModifier satDstMod = dstMod;
2389     satDstMod.sat = true;
2390     m_encoder->SetDstModifier(satDstMod);
2391 
2392     CVariable* src = GetSrcVariable(source);
2393     SetSourceModifiers(0, source);
2394     EmitSimpleAlu(op, dst, src, nullptr);
2395     if (type != m_destination->GetType()) {
2396         CVariable* tmp = m_currShader->BitCast(dst, GetUnsignedType(type));
2397         dst = m_destination;
2398         if (needBitCast) {
2399             dst = m_currShader->BitCast(dst, GetUnsignedIntegerType(dst->GetType()));
2400         }
2401         m_encoder->Cast(dst, tmp);
2402     }
2403 }
2404 
2405 void EmitPass::EmitIntegerTruncWithSat(bool isSignedDst, bool isSignedSrc, const SSource& source, const DstModifier& dstMod) {
2406     CVariable* dst = m_destination;
2407     if (!isSignedDst) {
2408         dst = m_currShader->BitCast(dst, GetUnsignedIntegerType(dst->GetType()));
2409     }
2410     DstModifier satDstMod = dstMod;
2411     satDstMod.sat = true;
2412     m_encoder->SetDstModifier(satDstMod);
2413 
2414     CVariable* src = GetSrcVariable(source);
2415     if (!isSignedSrc) {
2416         src = m_currShader->BitCast(src, GetUnsignedIntegerType(src->GetType()));
2417     }
2418     m_encoder->SetSrcModifier(0, source.mod);
2419 
2420     m_encoder->Cast(dst, src);
2421     m_encoder->Push();
2422 }
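
// Illustrative sketch, kept out of the build: a scalar model of what the
// saturating cast in EmitIntegerTruncWithSat() produces for one concrete
// combination (signed 32-bit source, unsigned 16-bit destination): values are
// clamped to the destination range instead of wrapping. The helper names are
// made up for this sketch.
#if 0
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint16_t truncSatI32ToU16(int32_t v)
{
    const int32_t lo = 0;
    const int32_t hi = 0xFFFF;
    return static_cast<uint16_t>(std::min(std::max(v, lo), hi));
}

static void truncSatExample()
{
    assert(truncSatI32ToU16(-5)     == 0);        // clamped at the bottom
    assert(truncSatI32ToU16(70000)  == 0xFFFF);   // clamped at the top
    assert(truncSatI32ToU16(0x1234) == 0x1234);   // in-range value unchanged
}
#endif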
2423 
2424 void EmitPass::EmitInsertValueToStruct(llvm::InsertValueInst* II, bool forceVectorInit, const DstModifier& DstMod)
2425 {
2426     Value* structOp = II->getOperand(0);
2427     StructType* sTy = dyn_cast<StructType>(structOp->getType());
2428     auto& DL = II->getParent()->getParent()->getParent()->getDataLayout();
2429     const StructLayout* SL = DL.getStructLayout(sTy);
2430 
2431     // Get the source operand to insert
2432     CVariable* SrcV = GetSymbol(II->getOperand(1));
2433 
2434     if (forceVectorInit)
2435     {
2436         IGC_ASSERT(isa<Constant>(structOp) || structOp->getValueID() == Value::UndefValueVal);
2437     }
2438     // Get the dst struct variable, or create one with constant values initialized if it does not exist
2439     CVariable* DstV = m_currShader->GetStructVariable(II, forceVectorInit);
2440 
2441     IGC_ASSERT_MESSAGE((!SrcV->IsUniform() && DstV->IsUniform()) == false, "Can't insert vector value into a scalar struct!");
2442 
2443     // Copy source value into the struct offset
2444     unsigned idx = *II->idx_begin();
2445     unsigned elementOffset = (unsigned)SL->getElementOffset(idx);
2446     unsigned nLanes = DstV->IsUniform() ? 1 : numLanes(m_currShader->m_dispatchSize);
2447     CVariable* elementDst = nullptr;
2448     if (SrcV->IsUniform())
2449         elementDst = m_currShader->GetNewAlias(DstV, SrcV->GetType(), elementOffset * nLanes, SrcV->GetNumberElement() * nLanes);
2450     else
2451         elementDst = m_currShader->GetNewAlias(DstV, SrcV->GetType(), elementOffset * nLanes, SrcV->GetNumberElement());
2452 
2453     emitCopyAll(elementDst, SrcV, sTy->getStructElementType(idx));
2454 }
2455 
2456 void EmitPass::EmitExtractValueFromStruct(llvm::ExtractValueInst* EI, const DstModifier& DstMod)
2457 {
2458     CVariable* SrcV = GetSymbol(EI->getOperand(0));
2459     unsigned idx = *EI->idx_begin();
2460     StructType* sTy = dyn_cast<StructType>(EI->getOperand(0)->getType());
2461     auto& DL = m_currShader->entry->getParent()->getDataLayout();
2462     const StructLayout* SL = DL.getStructLayout(sTy);
2463 
2464     // For extract value, src and dest should share uniformity
2465     IGC_ASSERT(nullptr != m_destination);
2466     IGC_ASSERT(nullptr != SrcV);
2467     IGC_ASSERT(m_destination->IsUniform() == SrcV->IsUniform());
2468 
2469     bool isUniform = SrcV->IsUniform();
2470     unsigned nLanes = isUniform ? 1 : numLanes(m_currShader->m_dispatchSize);
2471     unsigned elementOffset = (unsigned)SL->getElementOffset(idx) * nLanes;
2472     SrcV = m_currShader->GetNewAlias(SrcV, m_destination->GetType(), elementOffset, m_destination->GetNumberElement(), isUniform);
2473 
2474     // Copy from struct to dest
2475     emitCopyAll(m_destination, SrcV, sTy->getStructElementType(idx));
2476 }
2477 
2478 void EmitPass::EmitAddPair(GenIntrinsicInst* GII, const SSource Sources[4], const DstModifier& DstMod) {
2479     Value* L=nullptr, * H=nullptr;
2480     std::tie(L, H) = getPairOutput(GII);
2481     CVariable* Lo = L ? GetSymbol(L) : nullptr;
2482     CVariable* Hi = H ? GetSymbol(H) : nullptr;
2483     IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2484 
2485     CVariable* L0 = GetSrcVariable(Sources[0]);
2486     CVariable* H0 = GetSrcVariable(Sources[1]);
2487     CVariable* L1 = GetSrcVariable(Sources[2]);
2488     CVariable* H1 = GetSrcVariable(Sources[3]);
2489     for (unsigned srcId = 0; srcId < 4; ++srcId) {
2490         SetSourceModifiers(srcId, Sources[srcId]);
2491     }
2492 
2493     m_encoder->AddPair(Lo, Hi, L0, H0, L1, H1);
2494     m_encoder->Push();
2495 }
2496 
2497 void EmitPass::EmitSubPair(GenIntrinsicInst* GII, const SSource Sources[4], const DstModifier& DstMod) {
2498     Value* L = nullptr, * H = nullptr;
2499     std::tie(L, H) = getPairOutput(GII);
2500     CVariable* Lo = L ? GetSymbol(L) : nullptr;
2501     CVariable* Hi = H ? GetSymbol(H) : nullptr;
2502     IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2503 
2504     CVariable* L0 = GetSrcVariable(Sources[0]);
2505     CVariable* H0 = GetSrcVariable(Sources[1]);
2506     CVariable* L1 = GetSrcVariable(Sources[2]);
2507     CVariable* H1 = GetSrcVariable(Sources[3]);
2508 
2509     m_encoder->SubPair(Lo, Hi, L0, H0, L1, H1);
2510     m_encoder->Push();
2511 }
2512 
2513 void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], const DstModifier& DstMod) {
2514     Value* L = nullptr, * H = nullptr;
2515     std::tie(L, H) = getPairOutput(GII);
2516     CVariable* Lo = L ? GetSymbol(L) : nullptr;
2517     CVariable* Hi = H ? GetSymbol(H) : nullptr;
2518     IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2519 
2520     CVariable* L0 = GetSrcVariable(Sources[0]);
2521     CVariable* H0 = GetSrcVariable(Sources[1]);
2522     CVariable* L1 = GetSrcVariable(Sources[2]);
2523     CVariable* H1 = GetSrcVariable(Sources[3]);
2524 
2525     // Use `UD` for Lo(s).
2526     if (Lo && Lo->GetType() != ISA_TYPE_UD) Lo = m_currShader->BitCast(Lo, ISA_TYPE_UD);
2527     if (L0->GetType() != ISA_TYPE_UD) L0 = m_currShader->BitCast(L0, ISA_TYPE_UD);
2528     if (L1->GetType() != ISA_TYPE_UD) L1 = m_currShader->BitCast(L1, ISA_TYPE_UD);
2529 
2530     if (Lo == nullptr && Hi == nullptr)
2531     {
2532         return;
2533     }
2534 
2535     if (Lo != nullptr && Hi == nullptr)
2536     {
2537         // Lo = A * B
2538         m_encoder->Mul(Lo, L0, L1);
2539         m_encoder->Push();
2540         return;
2541     }
2542 
2543     // Algorithm:
2544     //    AB   - L0, L1
2545     //    CD   - H0, H1
2546     //   ----
2547     //     E
2548     //    F
2549     //    G
2550     //   H     - 'H' spills into bit 65 - only needed if overflow detection is required
2551     // --------
2552     // dstLow = E
2553     // dstHigh = F + G + carry
2554 
2555     CVariable* dstHiTmp = m_currShader->GetNewVariable(
2556         Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(), Hi->getName());
2557 
2558     if (Lo == nullptr && Hi != nullptr)
2559     {
2560         // Cr = carry(A * B)
2561         m_encoder->MulH(dstHiTmp, L0, L1);
2562         m_encoder->Push();
2563     }
2564     else
2565     {
2566         // For platforms that do not natively support DW-DW multiply, use the vISA madw instruction instead of mul/mulh to get better performance.
2567         if (m_currShader->m_Platform->noNativeDwordMulSupport())
2568         {
2569             // (Cr, E) = A * B
2570             // The dst size should be GRF-aligned and doubled, as it holds both the low and the high results.
2571             // We must make the dst element count numDWPerGRF-aligned. For example, if the madw is SIMD1,
2572             // the dst has only 1 DW of low result in one GRF and only 1 DW of high result in another GRF, so we
2573             // must size the dst as (numDWPerGRF * 2) elements, not 2 DW elements. This is required by madw.
2574             auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
2575             auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF);
2576             CVariable* DstTmp = m_currShader->GetNewVariable(
2577                 numElements * 2, ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
2578                 CName(Lo->getName(), "int64Tmp"));
2579             CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
2580             m_encoder->Madw(DstTmp, L0, L1, zero);
2581 
2582             // dstLow = E
2583             m_encoder->SetSrcRegion(0, 1, 1, 0);
2584             m_encoder->Copy(Lo, DstTmp);
2585             m_encoder->Push();
2586 
2587             // dstHigh = Cr
2588             uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
2589             m_encoder->SetSrcSubVar(0, regOffset);
2590             m_encoder->SetSrcRegion(0, 1, 1, 0);
2591             m_encoder->Copy(dstHiTmp, DstTmp);
2592             m_encoder->Push();
2593         }
2594         else
2595         {
2596             // E = A * B
2597             m_encoder->Mul(Lo, L0, L1);
2598             m_encoder->Push();
2599 
2600             // Cr = carry(A * B)
2601             m_encoder->MulH(dstHiTmp, L0, L1);
2602             m_encoder->Push();
2603         }
2604     }
2605 
2606     // F = A * D
2607     CVariable* T0 = m_currShader->GetNewVariable(
2608         Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(),
2609         CName(Hi->getName(), "int64HiTmp"));
2610     m_encoder->Mul(T0, L0, H1);
2611     m_encoder->Push();
2612 
2613     // dstHigh = Cr + F
2614     m_encoder->Add(dstHiTmp, dstHiTmp, T0);
2615     m_encoder->Push();
2616 
2617     // G = B * C
2618     m_encoder->Mul(T0, L1, H0);
2619     m_encoder->Push();
2620 
2621     // dstHigh = Cr + F + G
2622     m_encoder->Add(Hi, dstHiTmp, T0);
2623     m_encoder->Push();
2624 }
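
// Illustrative sketch, kept out of the build: a scalar model of the schoolbook
// decomposition used by EmitMulPair() above, building a 64x64 -> 64-bit
// multiply from 32-bit halves. E is the low partial product, Cr its carry
// (high half), and F/G are the two cross terms; the H term only affects bits
// above 63 and is dropped. The function names are made up for this sketch.
#if 0
#include <cassert>
#include <cstdint>

static void mul64From32(uint32_t l0, uint32_t h0, uint32_t l1, uint32_t h1,
                        uint32_t& lo, uint32_t& hi)
{
    const uint64_t e = static_cast<uint64_t>(l0) * l1;   // E with its carry Cr
    lo = static_cast<uint32_t>(e);                       // dstLow  = E
    const uint32_t cr = static_cast<uint32_t>(e >> 32);
    const uint32_t f = static_cast<uint32_t>(static_cast<uint64_t>(l0) * h1); // low half of A*D
    const uint32_t g = static_cast<uint32_t>(static_cast<uint64_t>(l1) * h0); // low half of B*C
    hi = cr + f + g;                                     // dstHigh = Cr + F + G
}

static void mul64Example()
{
    const uint64_t a = 0x0000000123456789ull;
    const uint64_t b = 0x00000000ABCDEF01ull;
    uint32_t lo = 0, hi = 0;
    mul64From32(static_cast<uint32_t>(a), static_cast<uint32_t>(a >> 32),
                static_cast<uint32_t>(b), static_cast<uint32_t>(b >> 32),
                lo, hi);
    const uint64_t ref = a * b;                          // wraps modulo 2^64
    assert(lo == static_cast<uint32_t>(ref));
    assert(hi == static_cast<uint32_t>(ref >> 32));
}
#endif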
2625 
2626 void EmitPass::EmitPtrToPair(GenIntrinsicInst* GII, const SSource Sources[1], const DstModifier& DstMod) {
2627     Value* L = nullptr, * H = nullptr;
2628     std::tie(L, H) = getPairOutput(GII);
2629     CVariable* Lo = L ? GetSymbol(L) : nullptr;
2630     CVariable* Hi = H ? GetSymbol(H) : nullptr;
2631     IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2632 
2633     CVariable* Src = GetSrcVariable(Sources[0]);
2634     Src = m_currShader->BitCast(Src, m_destination->GetType());
2635 
2636     unsigned AS = Sources[0].value->getType()->getPointerAddressSpace();
2637     bool isPtr32 = m_currShader->GetContext()->getRegisterPointerSizeInBits(AS) == 32;
2638 
2639     if (Lo) {
2640         if (isPtr32) {
2641             m_encoder->Cast(Lo, Src);
2642             m_encoder->Push();
2643         }
2644         else {
2645             if (!Src->IsUniform())
2646                 m_encoder->SetSrcRegion(0, 2, 1, 0);
2647             m_encoder->SetSrcSubReg(0, 0);
2648             m_encoder->Copy(Lo, Src);
2649             m_encoder->Push();
2650         }
2651     }
2652 
2653     if (Hi) {
2654         if (isPtr32) {
2655             Src = m_currShader->ImmToVariable(0, m_destination->GetType());
2656             m_encoder->Cast(Hi, Src);
2657             m_encoder->Push();
2658         }
2659         else {
2660             if (!Src->IsUniform())
2661                 m_encoder->SetSrcRegion(0, 2, 1, 0);
2662             m_encoder->SetSrcSubReg(0, 1);
2663             m_encoder->Copy(Hi, Src);
2664             m_encoder->Push();
2665         }
2666     }
2667 }
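
// Illustrative sketch, kept out of the build: a scalar model of EmitPtrToPair()
// for the 64-bit pointer case, splitting the pointer bits into a low and a high
// 32-bit half (the stride-2 regioning above picks out these two dwords per
// lane). Assumes 64-bit pointers; the helper names are made up for this sketch.
#if 0
#include <cassert>
#include <cstdint>
#include <cstring>

static void splitPtrToPair(const void* p, uint32_t& lo, uint32_t& hi)
{
    uint64_t bits = 0;
    std::memcpy(&bits, &p, sizeof(bits));   // assumes sizeof(void*) == 8
    lo = static_cast<uint32_t>(bits);
    hi = static_cast<uint32_t>(bits >> 32);
}

static void ptrToPairExample()
{
    int x = 0;
    uint32_t lo = 0, hi = 0;
    splitPtrToPair(&x, lo, hi);
    const uint64_t rebuilt = (static_cast<uint64_t>(hi) << 32) | lo;
    void* q = nullptr;
    std::memcpy(&q, &rebuilt, sizeof(q));
    assert(q == static_cast<void*>(&x));    // lo/hi round-trip the pointer
}
#endif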
2668 
2669 
2670 void EmitPass::EmitSIToFPZExt(const SSource& source, const DstModifier& dstMod) {
2671     CVariable* flag = GetSrcVariable(source);
2672     CVariable* one = m_currShader->ImmToVariable(getFPOne(m_destination->GetType()), m_destination->GetType());
2673     CVariable* zero = m_currShader->ImmToVariable(0, m_destination->GetType());
2674     m_encoder->SetDstModifier(dstMod);
2675     m_encoder->Select(flag, m_destination, one, zero);
2676     m_encoder->Push();
2677 }
2678 
2679 void EmitPass::emitCtlz(const SSource& source)
2680 {
2681     // This does not go through the standard EmitAluIntrinsic pass because
2682     // that creates a redundant SetP due to an unused i1 literal.
2683     CVariable* src = GetSrcVariable(source);
2684     src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
2685     CVariable* dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
2686     SetSourceModifiers(0, source);
2687     m_encoder->Ctlz(dst, src);
2688     m_encoder->Push();
2689 }
2690 
2691 void EmitPass::emitVMESendIME2(GenIntrinsicInst* inst) {
2692     CVariable* inputVar = GetSymbol(inst->getArgOperand(0));
2693     CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(1));
2694     CVariable* refImgBTI = GetSymbol(inst->getArgOperand(2));
2695     CVariable* bwdRefImgBTI = GetSymbol(inst->getArgOperand(3));
2696     const COMMON_ISA_VME_STREAM_MODE streamMode = (COMMON_ISA_VME_STREAM_MODE)(cast<ConstantInt>(inst->getArgOperand(4))->getZExtValue());
2697 
2698     const bool isDualRef = refImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue();
2699     // If the BTIs aren't consecutive then we can't do VME.
2700     if (isDualRef)
2701     {
2702         IGC_ASSERT_MESSAGE(refImgBTI->GetImmediateValue() + 1 == bwdRefImgBTI->GetImmediateValue(), "refImg BTI and bwdRefImg BTI are not consecutive!");
2703     }
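    // Illustrative note (not in the original): if refImg is bound at BTI 2, the
    // dual-ref path expects bwdRefImg at BTI 3; the message descriptor below encodes
    // only the source BTI, so the backward reference is presumably addressed as the
    // forward reference BTI + 1.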
2704 
2705     uint32_t regs2snd = 4 + 2;
2706     uint32_t regs2rcv = CShader::GetIMEReturnPayloadSize(inst);
2707 
2708     if ((streamMode == VME_STREAM_IN) || (streamMode == VME_STREAM_IN_OUT))
2709     {
2710         regs2snd += 2;
2711         if (isDualRef)
2712         {
2713             regs2snd += 2;
2714         }
2715     }
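    // Worked example (illustrative): with stream-in enabled and dual references this
    // gives regs2snd = (4 + 2) + 2 + 2 = 10 payload GRFs; with streaming disabled and
    // a single reference it stays at the base 6.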
2716 
2717     // TODO: this may waste registers. We could allocate the payload during the evaluation
2718     //       stage, but that would require initializing and copying the payload.
2719     //       Revisit once initial VME support is done.
2720     if (inputVar->GetSize() > (regs2snd * getGRFSize()))
2721     {
2722         inputVar = m_currShader->GetNewAlias(inputVar, ISA_TYPE_UD, 0, regs2snd * 8);
2723     }
2724 
2725     CVariable* outputVar = m_destination;
2726 
2727     if (outputVar->GetSize() > (regs2rcv * getGRFSize()))
2728     {
2729         outputVar = m_currShader->GetNewAlias(outputVar, ISA_TYPE_UD, 0, regs2rcv * 8);
2730     }
2731 
2732     const uint32_t desc = VMEDescriptor(streamMode, (uint32_t)(srcImgBTI->GetImmediateValue()),
2733         EU_GEN7_5_VME_MESSAGE_IME, regs2snd, regs2rcv);
2734 
2735     CVariable* messDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
2736 
2737     m_encoder->Send(outputVar, inputVar, EU_MESSAGE_TARGET_SFID_VME, messDesc, false);
2738     m_encoder->Push();
2739 }
2740 
emitVMESendIME(GenIntrinsicInst * inst)2741 void EmitPass::emitVMESendIME(GenIntrinsicInst* inst) {
2742     const bool has_bwd_ref_image = inst->getIntrinsicID() == GenISAIntrinsic::GenISA_vmeSendIME2;
2743     CVariable* outputVar = GetSymbol(inst->getArgOperand(0));
2744 
2745     CVariable* uniInputVar = GetSymbol(inst->getArgOperand(1));
2746     CVariable* imeInputVar = GetSymbol(inst->getArgOperand(2));
2747 
2748     CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(3));
2749     CVariable* refImgBTI = GetSymbol(inst->getArgOperand(4));
2750     CVariable* bwdRefImgBTI = has_bwd_ref_image ? GetSymbol(inst->getArgOperand(5)) : nullptr;
2751     // If the BTIs aren't consecutive then we can't do VME.
2752     IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == refImgBTI->GetImmediateValue(), "srcImg BTI and refImg BTI are not consecutive!");
2753     if (bwdRefImgBTI != nullptr) {
2754         IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 2 == bwdRefImgBTI->GetImmediateValue(), "srcImg BTI and bwdRefImg BTI are not consecutive!");
2755     }
2756 
2757     uint rest_opnd_idx_base = has_bwd_ref_image ? 6 : 5;
2758 
2759     CVariable* ref0Var = GetSymbol(inst->getArgOperand(rest_opnd_idx_base));
2760     CVariable* ref1Var = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 1));
2761     CVariable* costCenterVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 2));
2762 
2763     // These are raw operands, so make sure they are GRF-aligned
2764     ref0Var = ReAlignUniformVariable(ref0Var, EALIGN_GRF);
2765     ref1Var = ReAlignUniformVariable(ref1Var, EALIGN_GRF);
2766 
2767     // costCenterVar needs to be 1 GRF. If it is uniform, extend it to 1 GRF [bdw+]
2768     if (costCenterVar->IsUniform())
2769     {
2770         VISA_Type costVisaTy = costCenterVar->GetType();
2771         IGC_ASSERT_MESSAGE(SIZE_DWORD == CEncoder::GetCISADataTypeSize(costVisaTy),
2772             "VME IME's cost center var has wrong type!");
2773         CVariable* newVar = m_currShader->GetNewVariable(8, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
2774 
2775         m_encoder->SetNoMask();
2776         m_encoder->SetSimdSize(SIMDMode::SIMD8);
2777         m_encoder->Copy(newVar, costCenterVar);
2778         m_encoder->Push();
2779 
2780         costCenterVar = newVar;
2781     }
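    // The block above is effectively a uniform-to-GRF broadcast: a NoMask SIMD8 mov
    // replicates the single cost-center dword across one full register (assuming a
    // 32-byte GRF, i.e. 8 dwords), giving the 1-GRF operand shape the VME message expects.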
2782 
2783     unsigned char streamMode = VME_STREAM_DISABLE;
2784     unsigned char searchControlMode = VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START;
2785 
2786     // Force-write the costCenter here. Ideally uniInputVar would be set up before calling
2787     // emitVMESendIME so we don't burn movs on every call, but CM relies on this path for now.
2788     // Fix later.
2789     {
2790         CVariable* uniAlias = m_currShader->GetNewAlias(uniInputVar, ISA_TYPE_UD, 3 * getGRFSize(), 8);
2791         m_encoder->SetNoMask();
2792         m_encoder->SetSrcRegion(0, 0, 1, 0);
2793         m_encoder->SetSimdSize(SIMDMode::SIMD8);
2794         m_encoder->Copy(uniAlias, costCenterVar);
2795         m_encoder->Push();
2796     }
2797 
2798     m_encoder->SetNoMask();
2799     m_encoder->SendVmeIme(srcImgBTI,
2800         streamMode,
2801         searchControlMode,
2802         uniInputVar,
2803         imeInputVar,
2804         ref0Var,
2805         ref1Var,
2806         costCenterVar,
2807         outputVar);
2808     m_encoder->Push();
2809     return;
2810 }
2811 
emitVMESendFBR(GenIntrinsicInst * inst)2812 void EmitPass::emitVMESendFBR(GenIntrinsicInst* inst) {
2813     CVariable* outputVar = GetSymbol(inst->getArgOperand(0));
2814 
2815     CVariable* uniInputVar = GetSymbol(inst->getArgOperand(1));
2816     CVariable* fbrInputVar = GetSymbol(inst->getArgOperand(2));
2817 
2818     CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(3));
2819     CVariable* refImgBTI = GetSymbol(inst->getArgOperand(4));
2820     // If the BTIs aren't consecutive then we can't do VME.
2821     IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == refImgBTI->GetImmediateValue(), "srcImg BTI and refImg BTI are not consecutive!");
2822 
2823     const uint rest_opnd_idx_base = 5;
2824     CVariable* FBRMbModeVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base));
2825     CVariable* FBRSubMbShapeVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 1));
2826     CVariable* FBRSubPredModeVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 2));
2827 
2828     m_encoder->SendVmeFbr(srcImgBTI, uniInputVar, fbrInputVar, FBRMbModeVar, FBRSubMbShapeVar, FBRSubPredModeVar, outputVar);
2829     m_encoder->Push();
2830     return;
2831 }
2832 
emitVMESendFBR2(GenIntrinsicInst * inst)2833 void EmitPass::emitVMESendFBR2(GenIntrinsicInst* inst) {
2834     CVariable* inputVar = GetSymbol(inst->getArgOperand(0));
2835     CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(1));
2836     CVariable* refImgBTI = GetSymbol(inst->getArgOperand(2));
2837     CVariable* bwdRefImgBTI = GetSymbol(inst->getArgOperand(3));
2838 
2839     const bool isDualRef = refImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue();
2840     // If the BTIs aren't consecutive then we can't do VME.
2841     if (isDualRef)
2842     {
2843         IGC_ASSERT_MESSAGE(refImgBTI->GetImmediateValue() + 1 == bwdRefImgBTI->GetImmediateValue(), "refImg BTI and bwdRefImg BTI are not consecutive!");
2844     }
2845 
2846     const uint32_t regs2rcv = (7 + 0), regs2snd = (4 + 4);
2847     const uint32_t desc = VMEDescriptor(VME_STREAM_DISABLE, (uint32_t)(srcImgBTI->GetImmediateValue()),
2848         EU_GEN7_5_VME_MESSAGE_FBR, regs2snd, regs2rcv);
2849 
2850     CVariable* messDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
2851 
2852     CVariable* outputVar = m_destination;
2853 
2854     if (outputVar->GetSize() > (regs2rcv * getGRFSize()))
2855     {
2856         outputVar = m_currShader->GetNewAlias(outputVar, ISA_TYPE_UD, 0, regs2rcv * 8);
2857     }
2858 
2859     m_encoder->Send(outputVar, inputVar, EU_MESSAGE_TARGET_SFID_CRE, messDesc, false);
2860     m_encoder->Push();
2861 
2862     return;
2863 }
2864 
emitVMESendSIC(GenIntrinsicInst * inst)2865 void EmitPass::emitVMESendSIC(GenIntrinsicInst* inst)
2866 {
2867     CVariable* outputVar = GetSymbol(inst->getArgOperand(0));
2868     CVariable* uniInputVar = GetSymbol(inst->getArgOperand(1));
2869     CVariable* sicInputVar = GetSymbol(inst->getArgOperand(2));
2870     CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(3));
2871     CVariable* ref0ImgBTI = GetSymbol(inst->getArgOperand(4));
2872     CVariable* ref1ImgBTI = GetSymbol(inst->getArgOperand(5));
2873     // If the BTIs aren't consecutive then we can't do VME.
2874     IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == ref0ImgBTI->GetImmediateValue(), "srcImg BTI and ref0Img BTI are not consecutive!");
2875     // In the non-bidirectional case, we just pass the same reference image into the
2876     // forward and backward slots.
2877     if (ref0ImgBTI->GetImmediateValue() != ref1ImgBTI->GetImmediateValue())
2878     {
2879         IGC_ASSERT_MESSAGE(ref0ImgBTI->GetImmediateValue() + 1 == ref1ImgBTI->GetImmediateValue(), "ref0Img BTI and ref1Img BTI are not consecutive!");
2880     }
2881 
2882     m_encoder->SendVmeSic(srcImgBTI, uniInputVar, sicInputVar, outputVar);
2883     m_encoder->Push();
2884 }
2885 
emitVMESendSIC2(GenIntrinsicInst * inst)2886 void EmitPass::emitVMESendSIC2(GenIntrinsicInst* inst)
2887 {
2888     CVariable* inputVar = GetSymbol(inst->getArgOperand(0));
2889     CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(1));
2890     CVariable* fwdRefImgBTI = GetSymbol(inst->getArgOperand(2));
2891     CVariable* bwdRefImgBTI = GetSymbol(inst->getArgOperand(3));
2892 
2893     const bool isDualRef = fwdRefImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue();
2894     // If the BTIs aren't consecutive then we can't do VME.
2895     if (isDualRef)
2896     {
2897         IGC_ASSERT_MESSAGE(fwdRefImgBTI->GetImmediateValue() + 1 == bwdRefImgBTI->GetImmediateValue(), "refImg BTI and bwdRefImg BTI are not consecutive!");
2898     }
2899 
2900     // If the BTIs aren't consecutive then we can't do VME. This check only applies
2901     // when either fwdRefImg or bwdRefImg is present.
2902     if (srcImgBTI->GetImmediateValue() != fwdRefImgBTI->GetImmediateValue())
2903     {
2904         IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == fwdRefImgBTI->GetImmediateValue(), "srcImg BTI and refImg BTI are not consecutive!");
2905 
2906         if (fwdRefImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue())
2907         {
2908             IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 2 == bwdRefImgBTI->GetImmediateValue(), "srcImg BTI and bwdRefImg BTI are not consecutive!");
2909         }
2910     }
2911 
2912     const uint32_t regs2rcv = (7 + 0), regs2snd = (4 + 4);
2913     const uint32_t desc = VMEDescriptor(VME_STREAM_DISABLE, (uint32_t)(srcImgBTI->GetImmediateValue()),
2914         EU_GEN7_5_VME_MESSAGE_SIC, regs2snd, regs2rcv);
2915 
2916     CVariable* messDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
2917 
2918     CVariable* outputVar = m_destination;
2919 
2920     if (outputVar->GetSize() > (regs2rcv * getGRFSize()))
2921     {
2922         outputVar = m_currShader->GetNewAlias(outputVar, ISA_TYPE_UD, 0, regs2rcv * 8);
2923     }
2924 
2925     m_encoder->Send(outputVar, inputVar, EU_MESSAGE_TARGET_SFID_CRE, messDesc, false);
2926     m_encoder->Push();
2927 
2928     return;
2929 }
2930 
emitCreateMessagePhases(GenIntrinsicInst * inst)2931 void EmitPass::emitCreateMessagePhases(GenIntrinsicInst* inst) {
2932     IGC_ASSERT_MESSAGE((m_destination->GetType() == ISA_TYPE_UD || m_destination->GetType() == ISA_TYPE_D), "Destination type is expected to be UD or D!");
2933     IGC_ASSERT_MESSAGE(isa<ConstantInt>(inst->getArgOperand(0)), "Num phases expected to be const!");
2934     unsigned int numPhases = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(0))->getZExtValue());
2935 
2936     const uint16_t numSimdLanes = numLanes(m_SimdMode);
2937     IGC_ASSERT(0 < numSimdLanes);
2938     unsigned int numWideSimdIters = numPhases * 8 / numSimdLanes;
2939     unsigned int remSimd8Iters = (numPhases * 8 % numSimdLanes) / 8;
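    // Worked example (illustrative, one phase = 8 dwords): with numPhases = 5 under
    // SIMD16, numWideSimdIters = 5 * 8 / 16 = 2 (zeroing phases 0-3) and
    // remSimd8Iters = (40 % 16) / 8 = 1 (zeroing phase 4 with a SIMD8 mov).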
2940 
2941     // Zero as many message phases as possible using the widest SIMD
2942     for (unsigned int i = 0; i < numWideSimdIters; ++i) {
2943         CVariable* messagePhase = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UD, i * numSimdLanes * SIZE_DWORD, numSimdLanes);
2944 
2945         m_encoder->SetNoMask();
2946         m_encoder->SetSimdSize(m_SimdMode);
2947         m_encoder->Copy(messagePhase, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
2948         m_encoder->Push();
2949     }
2950 
2951     // Zero the remaining message phases using SIMD8
2952     for (unsigned int i = 0; i < remSimd8Iters; ++i) {
2953         CVariable* messagePhase = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UD, (i * 8 + numWideSimdIters * numSimdLanes) * SIZE_DWORD, numLanes(SIMDMode::SIMD8));
2954 
2955         m_encoder->SetNoMask();
2956         m_encoder->SetSimdSize(SIMDMode::SIMD8);
2957         m_encoder->Copy(messagePhase, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
2958         m_encoder->Push();
2959     }
2960 }
2961 
GetTypeFromSize(unsigned size)2962 static VISA_Type GetTypeFromSize(unsigned size)
2963 {
2964     switch (size)
2965     {
2966     case 1:
2967         return ISA_TYPE_UB;
2968     case 2:
2969         return ISA_TYPE_UW;
2970     case 4:
2971         return ISA_TYPE_UD;
2972     case 8:
2973         return ISA_TYPE_UQ;
2974     default:
2975         IGC_ASSERT_MESSAGE(0, "unknown size");
2976         return ISA_TYPE_UD;
2977     }
2978 }
2979 
emitSimdMediaRegionCopy(llvm::GenIntrinsicInst * inst)2980 void EmitPass::emitSimdMediaRegionCopy(llvm::GenIntrinsicInst* inst)
2981 {
2982     CVariable* pDst = GetSymbol(inst->getArgOperand(0));
2983     unsigned dbyteoffset = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
2984     unsigned dstride = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
2985     unsigned dnumelem = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(3))->getZExtValue());
2986     CVariable* pSrc = GetSymbol(inst->getArgOperand(4));
2987     Value* sbyteoffset = inst->getArgOperand(5);
2988     unsigned vstride = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(6))->getZExtValue());
2989     unsigned width = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(7))->getZExtValue());
2990     unsigned hstride = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(8))->getZExtValue());
2991     unsigned typesize = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(9))->getZExtValue());
2992     unsigned execsize = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(10))->getZExtValue());
2993     unsigned snumelem = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(11))->getZExtValue());
2994 
2995     CVariable* pDstOffset = m_currShader->GetNewAlias(pDst, GetTypeFromSize(typesize), (uint16_t)dbyteoffset, (uint16_t)dnumelem);
2996 
2997     auto setup = [&]()
2998     {
2999         m_encoder->SetSimdSize(lanesToSIMDMode(execsize));
3000         m_encoder->SetNoMask();
3001         m_encoder->SetDstRegion(dstride);
3002         m_encoder->SetSrcRegion(0, vstride, width, hstride);
3003     };
3004 
3005     if (isa<ConstantInt>(sbyteoffset))
3006     {
3007         CVariable* pSrcOffset = m_currShader->GetNewAlias(
3008             pSrc,
3009             GetTypeFromSize(typesize),
3010             int_cast<uint16_t>(cast<ConstantInt>(sbyteoffset)->getZExtValue()),
3011             (uint16_t)snumelem);
3012 
3013         setup();
3014         m_encoder->Copy(pDstOffset, pSrcOffset);
3015         m_encoder->Push();
3016     }
3017     else
3018     {
3019         CVariable* pSrcOffset = m_currShader->GetNewAddressVariable(
3020             1,
3021             GetTypeFromSize(typesize),
3022             true,
3023             false,
3024             inst->getName());
3025 
3026         m_encoder->AddrAdd(pSrcOffset, pSrc, m_currShader->BitCast(GetSymbol(sbyteoffset), ISA_TYPE_UW));
3027         setup();
3028         m_encoder->Copy(pDstOffset, pSrcOffset);
3029         m_encoder->Push();
3030     }
3031 }
3032 
emitExtractMVAndSAD(llvm::GenIntrinsicInst * inst)3033 void EmitPass::emitExtractMVAndSAD(llvm::GenIntrinsicInst* inst)
3034 {
3035     CVariable* pMV = GetSymbol(inst->getArgOperand(0));
3036     CVariable* pSAD = GetSymbol(inst->getArgOperand(1));
3037     CVariable* pResult = GetSymbol(inst->getArgOperand(2));
3038     CVariable* pBlockType = GetSymbol(inst->getArgOperand(3));
3039 
3040     // W5.0 - W5.7 from Return Data Message Phases (InterDistortion)
3041     CVariable* pDist = m_currShader->GetNewAlias(pResult, ISA_TYPE_UW, 5 * getGRFSize(), 16);
3042     CVariable* pSADAlias = m_currShader->GetNewAlias(pSAD, ISA_TYPE_UW, 0, 16);
3043 
3044     CVariable* pFlag = m_currShader->GetNewVariable(
3045         16,
3046         ISA_TYPE_BOOL,
3047         EALIGN_GRF,
3048         CName::NONE);
3049 
3050     auto EmitCmp = [&](unsigned imm)
3051     {
3052         m_encoder->SetSimdSize(SIMDMode::SIMD16);
3053         m_encoder->SetNoMask();
3054         m_encoder->Cmp(EPREDICATE_EQ, pFlag, pBlockType, m_currShader->ImmToVariable(imm, ISA_TYPE_UD));
3055         m_encoder->Push();
3056     };
3057 
3058     // block type == 0 (16x16)
3059     EmitCmp(0);
3060 
3061 
3062     // Only one SAD, replicate it across.
3063     // (+f1.1) mov (16) r16.0<1>:uw r73.0<0;1,0>:uw { Align1, H1, NoMask }
3064     m_encoder->SetPredicate(pFlag);
3065     m_encoder->SetNoMask();
3066     m_encoder->SetSrcRegion(0, 0, 1, 0);
3067     m_encoder->SetSimdSize(SIMDMode::SIMD16);
3068     m_encoder->Copy(pSADAlias, pDist);
3069     m_encoder->Push();
3070 
3071     // block type == 1 (8x8)
3072     EmitCmp(1);
3073 
3074     // 4 SADs, copy each one 4 times.
3075     // (+f1.1) mov(4) r16.12<1>:uw r73.12<0;1,0>:uw { Align1, Q1, NoMask }
3076     // (+f1.1) mov(4) r16.8<1>:uw r73.8<0;1,0>:uw { Align1, Q1, NoMask }
3077     // (+f1.1) mov(4) r16.4<1>:uw r73.4<0;1,0>:uw { Align1, Q1, NoMask }
3078     // (+f1.1) mov(4) r16.0<1>:uw r73.0<0;1,0>:uw { Align1, Q1, NoMask }
3079     for (int i = 0; i < 4; i++)
3080     {
3081         m_encoder->SetPredicate(pFlag);
3082         m_encoder->SetNoMask();
3083         m_encoder->SetSrcRegion(0, 0, 1, 0);
3084         m_encoder->SetSimdSize(SIMDMode::SIMD4);
3085         CVariable* pDistOffset = m_currShader->GetNewAlias(pDist, ISA_TYPE_UW, i * 8, 4);
3086         CVariable* pSADOffset = m_currShader->GetNewAlias(pSADAlias, ISA_TYPE_UW, i * 8, 4);
3087         m_encoder->Copy(pSADOffset, pDistOffset);
3088         m_encoder->Push();
3089     }
3090 
3091     // block type == 2 (4x4)
3092     EmitCmp(2);
3093 
3094     // All 16 SADs are present, copy them over.
3095     // (+f1.1) mov (16) r16.0<1>:uw r73.0<8;8,1>:uw {Align1, H1, NoMask}
3096     m_encoder->SetPredicate(pFlag);
3097     m_encoder->SetNoMask();
3098     m_encoder->SetSimdSize(SIMDMode::SIMD16);
3099     m_encoder->Copy(pSADAlias, pDist);
3100     m_encoder->Push();
3101 
3102     // Copy over MVs
3103     for (int i = 0; i < 2; i++)
3104     {
3105         CVariable* pResultOffset = m_currShader->GetNewAlias(pResult, ISA_TYPE_UD,
3106             (1 * getGRFSize()) + (2 * i * getGRFSize()),
3107             16);
3108         CVariable* pMVOffset = m_currShader->GetNewAlias(pMV, ISA_TYPE_UD,
3109             2 * i * getGRFSize(),
3110             16);
3111         m_encoder->SetNoMask();
3112         m_encoder->SetSimdSize(SIMDMode::SIMD16);
3113         m_encoder->Copy(pMVOffset, pResultOffset);
3114         m_encoder->Push();
3115     }
3116 }
3117 
emitCmpSADs(llvm::GenIntrinsicInst * inst)3118 void EmitPass::emitCmpSADs(llvm::GenIntrinsicInst* inst)
3119 {
3120     // When called, this builtin compares two SAD values
3121     // and takes the minimum of the two.  The MV associated
3122     // with the minimum SAD is selected as well.
3123     CVariable* pMVCurr = GetSymbol(inst->getArgOperand(0));
3124     CVariable* pSADCurr = GetSymbol(inst->getArgOperand(1));
3125     CVariable* pMVMin = GetSymbol(inst->getArgOperand(2));
3126     CVariable* pSADMin = GetSymbol(inst->getArgOperand(3));
3127 
3128     CVariable* pFlag = m_currShader->GetNewVariable(
3129         16,
3130         ISA_TYPE_BOOL,
3131         EALIGN_GRF,
3132         CName::NONE);
3133 
3134     CVariable* pSADCurrAlias = m_currShader->GetNewAlias(pSADCurr, ISA_TYPE_UW, 0, 16);
3135     CVariable* pSADMinAlias = m_currShader->GetNewAlias(pSADMin, ISA_TYPE_UW, 0, 16);
3136 
3137     m_encoder->SetNoMask();
3138     m_encoder->SetSimdSize(SIMDMode::SIMD16);
3139     m_encoder->Cmp(EPREDICATE_LT, pFlag, pSADCurrAlias, pSADMinAlias);
3140     m_encoder->Push();
3141 
3142     // Collect the SADs
3143     m_encoder->SetNoMask();
3144     m_encoder->SetSimdSize(SIMDMode::SIMD16);
3145     m_encoder->Select(pFlag, pSADMinAlias, pSADCurrAlias, pSADMinAlias);
3146     m_encoder->Push();
3147 
3148     // Collect the MVs
3149     if (m_currShader->m_Platform->hasNoFullI64Support()) {
3150         CVariable* pMVMinAlias = m_currShader->GetNewAlias(pMVMin, ISA_TYPE_UD, 0, 32);
3151         CVariable* pMVCurrAlias = m_currShader->GetNewAlias(pMVCurr, ISA_TYPE_UD, 0, 32);
3152 
3153         //(W&fX.X) mov(8|M0) r(DST).0<2>:f    r(SRC).0<2;1,0>:f
3154         m_encoder->SetNoMask();
3155         m_encoder->SetSimdSize(SIMDMode::SIMD8);
3156         m_encoder->SetSrcRegion(0, 2, 1, 0);
3157         m_encoder->SetSrcRegion(1, 2, 1, 0);
3158         m_encoder->SetDstRegion(2);
3159         m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3160         m_encoder->Push();
3161 
3162         //(W&fX.X) mov(8|M0) r(DST).1<2>:f    r(SRC).1<2;1,0>:f
3163         m_encoder->SetNoMask();
3164         m_encoder->SetSimdSize(SIMDMode::SIMD8);
3165         m_encoder->SetSrcRegion(0, 2, 1, 0);
3166         m_encoder->SetSrcRegion(1, 2, 1, 0);
3167         m_encoder->SetDstRegion(2);
3168         m_encoder->SetSrcSubReg(0, 1);
3169         m_encoder->SetSrcSubReg(1, 1);
3170         m_encoder->SetDstSubReg(1);
3171         m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3172         m_encoder->Push();
3173 
3174         //(W&fX.X) mov(8|M8) r(DST+2).0<2>:f    r(SRC+2).0<2;1,0>:f
3175         m_encoder->SetNoMask();
3176         m_encoder->SetSimdSize(SIMDMode::SIMD8);
3177         m_encoder->SetMask(EMASK_Q2);
3178         m_encoder->SetSrcSubVar(0, 2);
3179         m_encoder->SetSrcSubVar(1, 2);
3180         m_encoder->SetDstSubVar(2);
3181         m_encoder->SetSrcRegion(0, 2, 1, 0);
3182         m_encoder->SetSrcRegion(1, 2, 1, 0);
3183         m_encoder->SetDstRegion(2);
3184         m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3185         m_encoder->Push();
3186 
3187         //(W&fX.X) mov(8|M8) r(DST+2).1<2>:f    r(SRC+2).1<2;1,0>:f
3188         m_encoder->SetNoMask();
3189         m_encoder->SetSimdSize(SIMDMode::SIMD8);
3190         m_encoder->SetMask(EMASK_Q2);
3191         m_encoder->SetSrcSubVar(0, 2);
3192         m_encoder->SetSrcSubVar(1, 2);
3193         m_encoder->SetDstSubVar(2);
3194         m_encoder->SetSrcRegion(0, 2, 1, 0);
3195         m_encoder->SetSrcRegion(1, 2, 1, 0);
3196         m_encoder->SetDstRegion(2);
3197         m_encoder->SetSrcSubReg(0, 1);
3198         m_encoder->SetSrcSubReg(1, 1);
3199         m_encoder->SetDstSubReg(1);
3200         m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3201         m_encoder->Push();
3202     }
3203     else {
3204         CVariable* pMVCurrAlias = m_currShader->GetNewAlias(pMVCurr, ISA_TYPE_UQ, 0, 16);
3205         CVariable* pMVMinAlias = m_currShader->GetNewAlias(pMVMin, ISA_TYPE_UQ, 0, 16);
3206 
3207         m_encoder->SetNoMask();
3208         m_encoder->SetSimdSize(SIMDMode::SIMD16);
3209         m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3210         m_encoder->Push();
3211     }
3212 }
3213 
SameVar(CVariable * A,CVariable * B)3214 static bool SameVar(CVariable* A, CVariable* B)
3215 {
3216     A = (A->GetAlias() && A->GetAliasOffset() == 0) ? A->GetAlias() : A;
3217     B = (B->GetAlias() && B->GetAliasOffset() == 0) ? B->GetAlias() : B;
3218 
3219     return A == B;
3220 }
3221 
emitSimdSetMessagePhase(llvm::GenIntrinsicInst * inst)3222 void EmitPass::emitSimdSetMessagePhase(llvm::GenIntrinsicInst* inst) {
3223     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3224     const uint32_t phaseIndex = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3225     const uint32_t numPhases = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3226     const uint32_t dstSubReg = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(3))->getZExtValue());
3227     const uint32_t numLanesPerPhase = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(4))->getZExtValue());
3228     const SIMDMode simdMode = lanesToSIMDMode(numLanesPerPhase);
3229     Value* value = inst->getArgOperand(5);
3230     const uint16_t eltSizeInBytes = (uint16_t)m_DL->getTypeSizeInBits(value->getType()) / 8;
3231     const uint16_t numEltsPerPhase = getGRFSize() / eltSizeInBytes;
3232     const VISA_Type type = GetTypeFromSize(eltSizeInBytes);
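    // For example (illustrative): with 4-byte elements and a 32-byte GRF,
    // numEltsPerPhase = 8, so each iteration of the loop below copies one GRF-sized
    // phase into the destination at offset (i + phaseIndex) * getGRFSize().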
3233 
3234     CVariable* val = GetSymbol(value);
3235 
3236     if (!SameVar(m_destination, messagePhases))
3237     {
3238         emitCopyAll(m_destination, messagePhases, inst->getArgOperand(0)->getType());
3239     }
3240 
3241     for (uint32_t i = 0; i < numPhases; ++i) {
3242         CVariable* src = val->IsUniform() ? val : m_currShader->GetNewAlias(val, type, i * getGRFSize(), numEltsPerPhase);
3243         CVariable* dst = m_currShader->GetNewAlias(m_destination, type, (i + phaseIndex) * getGRFSize(), numEltsPerPhase);
3244 
3245         m_encoder->SetNoMask();
3246         m_encoder->SetSimdSize(simdMode);
3247         m_encoder->SetDstSubReg(dstSubReg);
3248         if (!val->IsUniform())
3249             m_encoder->SetSrcRegion(0, 0, numEltsPerPhase, 1);
3250         m_encoder->Copy(dst, src);
3251         m_encoder->Push();
3252     }
3253 
3254     return;
3255 }
3256 
emitBroadcastMessagePhase(llvm::GenIntrinsicInst * inst)3257 void EmitPass::emitBroadcastMessagePhase(llvm::GenIntrinsicInst* inst) {
3258     const uint16_t eltSizeInBytes = (uint16_t)m_DL->getTypeSizeInBits(inst->getType()) / 8;
3259     const uint32_t width = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(3))->getZExtValue());
3260     emitGetMessagePhaseType(inst, GetTypeFromSize(eltSizeInBytes), width);
3261 }
3262 
emitSimdGetMessagePhase(llvm::GenIntrinsicInst * inst)3263 void EmitPass::emitSimdGetMessagePhase(llvm::GenIntrinsicInst* inst) {
3264     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3265     const uint32_t phaseIndex = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3266     const uint32_t numPhases = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3267     const uint16_t eltSizeInBytes = (uint16_t)m_DL->getTypeSizeInBits(inst->getType()) / 8;
3268     const uint16_t numEltsPerPhase = getGRFSize() / eltSizeInBytes;
3269     const VISA_Type type = GetTypeFromSize(eltSizeInBytes);
3270     SIMDMode simdMode = SIMDMode::UNKNOWN;
3271 
3272     if (eltSizeInBytes == 8) {
3273         simdMode = SIMDMode::SIMD4;
3274     }
3275     else if (eltSizeInBytes == 4) {
3276         simdMode = SIMDMode::SIMD8;
3277     }
3278     else if (eltSizeInBytes == 2) {
3279         simdMode = SIMDMode::SIMD16;
3280     }
3281     else {
3282         IGC_ASSERT_MESSAGE(0, "Unhandled data type");
3283     }
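    // The SIMD width is picked so that one copy moves exactly one phase:
    // 4 x 8B = 8 x 4B = 16 x 2B = 32 bytes (assuming a 32-byte GRF per message phase).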
3284 
3285     for (uint32_t i = 0; i < numPhases; ++i) {
3286         CVariable* src = m_currShader->GetNewAlias(messagePhases, type, (i + phaseIndex) * getGRFSize(), numEltsPerPhase);
3287         CVariable* dst = m_currShader->GetNewAlias(m_destination, type, i * getGRFSize(), numEltsPerPhase);
3288 
3289         m_encoder->SetNoMask();
3290         m_encoder->SetSimdSize(simdMode);
3291         m_encoder->SetSrcRegion(0, 0, numEltsPerPhase, 1);
3292         m_encoder->Copy(dst, src);
3293         m_encoder->Push();
3294     }
3295 
3296     return;
3297 }
3298 
emitGetMessagePhaseType(llvm::GenIntrinsicInst * inst,VISA_Type type,uint32_t width)3299 void EmitPass::emitGetMessagePhaseType(llvm::GenIntrinsicInst* inst, VISA_Type type, uint32_t width) {
3300     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3301     unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3302     unsigned int phaseSubindex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3303 
3304     IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3305 
3306     CVariable* messagePhaseElem = m_currShader->GetNewAlias(messagePhases, type, phaseIndex * getGRFSize(), 1);
3307 
3308     m_encoder->SetNoMask();
3309     m_encoder->SetSrcRegion(0, 0, width, 1);
3310     m_encoder->SetSrcSubReg(0, phaseSubindex);
3311 
3312     m_encoder->Copy(m_destination, messagePhaseElem);
3313     m_encoder->Push();
3314 }
3315 
emitGetMessagePhaseX(llvm::GenIntrinsicInst * inst)3316 void EmitPass::emitGetMessagePhaseX(llvm::GenIntrinsicInst* inst) {
3317     unsigned size = inst->getType()->getScalarSizeInBits() / 8;
3318     emitGetMessagePhaseType(inst, GetTypeFromSize(size), /* width */ 1);
3319 }
3320 
emitSetMessagePhaseType_legacy(GenIntrinsicInst * inst,VISA_Type type)3321 void EmitPass::emitSetMessagePhaseType_legacy(GenIntrinsicInst* inst, VISA_Type type)
3322 {
3323     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3324     unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3325     unsigned int phaseSubindex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3326     CVariable* val = GetSymbol(inst->getArgOperand(3));
3327 
3328     CVariable* messagePhaseElem = m_currShader->GetNewAlias(messagePhases, type, phaseIndex * getGRFSize(), 1);
3329     m_encoder->SetSimdSize(SIMDMode::SIMD1);
3330     m_encoder->SetNoMask();
3331     m_encoder->SetSrcRegion(0, 0, 1, 0);
3332     m_encoder->SetDstSubReg(phaseSubindex);
3333     m_encoder->Copy(messagePhaseElem, val);
3334     m_encoder->Push();
3335 }
3336 
emitSetMessagePhaseType(GenIntrinsicInst * inst,VISA_Type type)3337 void EmitPass::emitSetMessagePhaseType(GenIntrinsicInst* inst, VISA_Type type) {
3338     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3339     unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3340     unsigned int phaseSubindex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3341     CVariable* val = GetSymbol(inst->getArgOperand(3));
3342 
3343     IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3344 
3345     if (!SameVar(m_destination, messagePhases))
3346     {
3347         emitCopyAll(m_destination, messagePhases, inst->getArgOperand(0)->getType());
3348     }
3349 
3350     CVariable* messagePhaseElem = m_currShader->GetNewAlias(m_destination, type, phaseIndex * getGRFSize(), 1);
3351     m_encoder->SetSimdSize(SIMDMode::SIMD1);
3352     m_encoder->SetNoMask();
3353     m_encoder->SetSrcRegion(0, 0, 1, 0);
3354     m_encoder->SetDstSubReg(phaseSubindex);
3355     m_encoder->Copy(messagePhaseElem, val);
3356     m_encoder->Push();
3357 }
3358 
emitSetMessagePhaseX_legacy(GenIntrinsicInst * inst)3359 void EmitPass::emitSetMessagePhaseX_legacy(GenIntrinsicInst* inst)
3360 {
3361     Type* pTy = inst->getArgOperand(inst->getNumArgOperands() - 1)->getType();
3362     unsigned size = pTy->getScalarSizeInBits() / 8;
3363     emitSetMessagePhaseType_legacy(inst, GetTypeFromSize(size));
3364 }
3365 
emitSetMessagePhaseX(GenIntrinsicInst * inst)3366 void EmitPass::emitSetMessagePhaseX(GenIntrinsicInst* inst) {
3367     Type* pTy = inst->getArgOperand(inst->getNumArgOperands() - 1)->getType();
3368     unsigned size = pTy->getScalarSizeInBits() / 8;
3369     emitSetMessagePhaseType(inst, GetTypeFromSize(size));
3370 }
3371 
emitGetMessagePhase(llvm::GenIntrinsicInst * inst)3372 void EmitPass::emitGetMessagePhase(llvm::GenIntrinsicInst* inst) {
3373     if (isa<UndefValue>(inst->getArgOperand(0)))
3374         return;
3375 
3376     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3377     unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3378 
3379     IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3380 
3381     CVariable* messagePhase = m_currShader->GetNewAlias(messagePhases, ISA_TYPE_UD, phaseIndex * getGRFSize(), 8);
3382     m_encoder->SetSimdSize(SIMDMode::SIMD8);
3383     m_encoder->SetNoMask();
3384     m_encoder->Copy(m_destination, messagePhase);
3385     m_encoder->Push();
3386 }
3387 
emitSetMessagePhase_legacy(llvm::GenIntrinsicInst * inst)3388 void EmitPass::emitSetMessagePhase_legacy(llvm::GenIntrinsicInst* inst)
3389 {
3390     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3391     unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3392     CVariable* val = GetSymbol(inst->getArgOperand(2));
3393 
3394     CVariable* messagePhase = m_currShader->GetNewAlias(messagePhases, ISA_TYPE_UD, phaseIndex * getGRFSize(), 8);
3395     m_encoder->SetSimdSize(SIMDMode::SIMD8);
3396     m_encoder->SetNoMask();
3397     m_encoder->Copy(messagePhase, val);
3398     m_encoder->Push();
3399 }
3400 
emitSetMessagePhase(llvm::GenIntrinsicInst * inst)3401 void EmitPass::emitSetMessagePhase(llvm::GenIntrinsicInst* inst) {
3402     CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3403     unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3404     CVariable* val = GetSymbol(inst->getArgOperand(2));
3405 
3406     IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3407 
3408     if (!SameVar(m_destination, messagePhases))
3409     {
3410         emitCopyAll(m_destination, messagePhases, inst->getArgOperand(0)->getType());
3411     }
3412 
3413     CVariable* messagePhase = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UD, phaseIndex * getGRFSize(), 8);
3414     m_encoder->SetSimdSize(SIMDMode::SIMD8);
3415     m_encoder->SetNoMask();
3416     m_encoder->Copy(messagePhase, val);
3417     m_encoder->Push();
3418 }
3419 
3420 // VA
emitVideoAnalyticSLM(llvm::GenIntrinsicInst * inst,const DWORD responseLen)3421 void EmitPass::emitVideoAnalyticSLM(llvm::GenIntrinsicInst* inst, const DWORD responseLen)
3422 {
3423     int argNum = 0;
3424     CVariable* outputVar = GetSymbol(inst->getArgOperand(argNum++));
3425     CVariable* coords = GetSymbol(inst->getArgOperand(argNum++));
3426     CVariable* size = NULL;
3427 
3428     IGC_ASSERT_MESSAGE(!(m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_encoder->IsSecondHalf()), "VA Intrinsics are simd independent");
3429     GenISAIntrinsic::ID id = inst->getIntrinsicID();
3430     if (id == GenISAIntrinsic::GenISA_vaCentroid ||
3431         id == GenISAIntrinsic::GenISA_vaBoolCentroid ||
3432         id == GenISAIntrinsic::GenISA_vaBoolSum)
3433     {
3434         size = GetSymbol(inst->getArgOperand(argNum++));
3435     }
3436 
3437     CVariable* srcImg = GetSymbol(inst->getArgOperand(argNum++));
3438 
3439     // So far we support only one VA function per kernel, and other sample
3440     // messages are not supported when there is a VA function within the kernel.
3441     // So, for now, it should be fine to always use sampler 0 for VA functions.
3442     DWORD samplerIndex = 0;
3443     CVariable* sampler = m_currShader->ImmToVariable(samplerIndex, ISA_TYPE_UD);
3444 
3445     uint16_t newNumElems = int_cast<uint16_t>(responseLen * getGRFSize() / SIZE_DWORD);
3446 
3447     CVariable* vaResult = m_currShader->GetNewVariable(
3448         newNumElems,
3449         ISA_TYPE_UD,
3450         outputVar->GetAlign(),
3451         false,
3452         CName::NONE);
3453 
3454     if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_vaConvolve)
3455     {
3456         CVariable* convResult = m_currShader->GetNewAlias(
3457             vaResult,
3458             ISA_TYPE_UW,
3459             0,
3460             newNumElems * 2);
3461 
3462         m_encoder->SendVideoAnalytic(inst, convResult, coords, size, srcImg, sampler);
3463     }
3464     else
3465     {
3466         m_encoder->SendVideoAnalytic(inst, vaResult, coords, size, srcImg, sampler);
3467     }
3468     m_encoder->Push();
3469 
3470     // Data port write msg header:
3471     DWORD msgLen = 2;
3472     DWORD resLen = 0;
3473     bool headerPresent = false;
3474     bool endOfThread = false;
3475     DWORD messageSpecificControl = encodeMessageSpecificControlForReadWrite(
3476         EU_DATA_PORT_WRITE_MESSAGE_TYPE_UNTYPED_SURFACE_WRITE,
3477         CHANNEL_MASK_R,
3478         SIMDMode::SIMD8);
3479     bool invalidateAfterReadEnable = false;
3480     DWORD btiIndex = SLM_BTI;
3481 
3482     DWORD descValue = DataPortWrite(
3483         msgLen,
3484         resLen,
3485         headerPresent,
3486         endOfThread,
3487         EU_DATA_PORT_WRITE_MESSAGE_TYPE_UNTYPED_SURFACE_WRITE,
3488         messageSpecificControl,
3489         invalidateAfterReadEnable,
3490         btiIndex);
3491 
3492     DWORD exDescValue = EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1;
3493 
3494     CVariable* desc = m_currShader->ImmToVariable(descValue, ISA_TYPE_UD);
3495     CVariable* exdesc = m_currShader->ImmToVariable(exDescValue, ISA_TYPE_UD);
3496 
3497     CVariable* storeMessage = m_currShader->GetNewVariable(
3498         2 * getGRFSize() / SIZE_DWORD,
3499         ISA_TYPE_UD,
3500         outputVar->GetAlign(),
3501         false,
3502         CName::NONE);
3503 
3504     m_encoder->SetSimdSize(SIMDMode::SIMD8);
3505     m_encoder->SetNoMask();
3506     m_encoder->Cast(storeMessage, m_currShader->ImmToVariable(0x76543210, ISA_TYPE_V));
3507     m_encoder->Shl(storeMessage, storeMessage, m_currShader->ImmToVariable(2, ISA_TYPE_UD));
3508     m_encoder->Push();
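    // Sketch of what the two instructions above build (assuming a 32-byte GRF): the
    // packed-vector immediate 0x76543210:v expands to per-lane values 0..7, and the
    // shift left by 2 turns them into dword byte offsets 0, 4, ..., 28. The Add of
    // 0x20 in the loop below then advances the SLM write offsets by one GRF (32 bytes)
    // per response register stored.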
3509 
3510     for (DWORD i = 0; i < responseLen; i++)
3511     {
3512         if (i > 0)
3513         {
3514             m_encoder->SetSimdSize(SIMDMode::SIMD8);
3515             m_encoder->SetNoMask();
3516             m_encoder->Add(storeMessage, storeMessage, m_currShader->ImmToVariable(0x20, ISA_TYPE_UD));
3517             m_encoder->Push();
3518         }
3519 
3520         m_encoder->SetSimdSize(SIMDMode::SIMD8);
3521         m_encoder->SetNoMask();
3522         m_encoder->SetDstSubVar(1);
3523         m_encoder->SetSrcSubVar(0, i);
3524         m_encoder->Copy(storeMessage, vaResult);
3525 
3526         m_encoder->Send(NULL, storeMessage,
3527             EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, exdesc, desc, false);
3528         m_encoder->Push();
3529     }
3530 
3531     return;
3532 }
3533 
emitVideoAnalyticGRF(llvm::GenIntrinsicInst * inst,const DWORD responseLen)3534 void EmitPass::emitVideoAnalyticGRF(llvm::GenIntrinsicInst* inst, const DWORD responseLen)
3535 {
3536     CVariable* dst = m_destination;
3537     int argNum = 0;
3538     CVariable* coords = GetSymbol(inst->getArgOperand(argNum++));
3539 
3540     // So far we support only one VA function per kernel, and other sample
3541     // messages are not supported when there is a VA function within the kernel.
3542     // So, for now, it should be fine to always use sampler 0 for VA functions.
3543     CVariable* sampler = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
3544     CVariable* srcImg = GetSymbol(inst->getArgOperand(argNum++));
3545 
3546     m_encoder->SendVideoAnalytic(inst, dst, coords, nullptr, srcImg, sampler);
3547     m_encoder->Push();
3548 }
3549 
BinaryUnary(llvm::Instruction * inst,const SSource source[2],const DstModifier & modifier)3550 void EmitPass::BinaryUnary(llvm::Instruction* inst, const SSource source[2], const DstModifier& modifier)
3551 {
3552     switch (inst->getOpcode())
3553     {
3554     case Instruction::FCmp:
3555     case Instruction::ICmp:
3556         Cmp(cast<CmpInst>(inst)->getPredicate(), source, modifier);
3557         break;
3558     case Instruction::Sub:
3559     case Instruction::FSub:
3560         Sub(source, modifier);
3561         break;
3562     case Instruction::FDiv:
3563         FDiv(source, modifier);
3564         break;
3565     case Instruction::Xor:
3566         Xor(source, modifier);
3567         break;
3568     case Instruction::Mul:
3569         Mul(source, modifier);
3570         break;
3571     case Instruction::Call:
3572         EmitAluIntrinsic(cast<CallInst>(inst), source, modifier);
3573         break;
3574     default:
3575         // other instructions don't need special handling
3576         EmitSimpleAlu(inst, source, modifier);
3577         break;
3578     }
3579 }
3580 
Sub(const SSource sources[2],const DstModifier & modifier)3581 void EmitPass::Sub(const SSource sources[2], const DstModifier& modifier)
3582 {
3583     CVariable* src0 = GetSrcVariable(sources[0]);
3584     CVariable* src1 = GetSrcVariable(sources[1]);
3585     e_modifier mod1 = CombineModifier(EMOD_NEG, sources[1].mod);
3586 
3587     m_encoder->SetDstModifier(modifier);
3588     SetSourceModifiers(0, sources[0]);
3589     SetSourceModifiers(1, sources[1]);
3590     // override modifier of source 1
3591     m_encoder->SetSrcModifier(1, mod1);
3592     m_encoder->Add(m_destination, src0, src1);
3593     m_encoder->Push();
3594 
3595 }
3596 
Mul64(CVariable * dst,CVariable * src[2],SIMDMode simdMode,bool noMask) const3597 void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool noMask) const
3598 {
3599     auto EncoderInit = [this, simdMode, noMask]()->void
3600     {
3601         m_encoder->SetSimdSize(simdMode);
3602         if (noMask)
3603         {
3604             m_encoder->SetNoMask();
3605         }
3606     };
3607 
3608     // Mul64 does not write to m_destination!
3609 
3610     IGC_ASSERT_MESSAGE((src[1]->GetType() == ISA_TYPE_Q) || (src[1]->GetType() == ISA_TYPE_UQ),
3611         "Cannot multiply a qword by a non-qword type");
3612 
3613     // The signedness of the hi-part type should be the same as that
3614     // of the original destination type.
3615     VISA_Type hiType;
3616     if (dst->GetType() == ISA_TYPE_Q)
3617         hiType = ISA_TYPE_D;
3618     else
3619         hiType = ISA_TYPE_UD;
3620 
3621     // Figure out what the hi and what the lo part of each source is.
3622     // For non-uniforms, this requires an unpack.
3623     CVariable* srcLo[2], * srcHi[2];
3624     for (int i = 0; i < 2; ++i)
3625     {
3626         CVariable* srcAsUD;
3627         if (src[i]->IsUniform())
3628         {
3629             if (src[i]->IsImmediate())
3630             {
3631                 srcLo[i] = m_currShader->ImmToVariable((uint)src[i]->GetImmediateValue(), ISA_TYPE_UD);
3632                 srcHi[i] = m_currShader->ImmToVariable(src[i]->GetImmediateValue() >> 32, hiType);
3633             }
3634             else
3635             {
3636                 srcAsUD = m_currShader->BitCast(src[i], ISA_TYPE_UD);
3637                 srcLo[i] = m_currShader->GetNewAlias(srcAsUD, ISA_TYPE_UD, 0, 1);
3638                 srcHi[i] = m_currShader->GetNewAlias(srcAsUD, hiType, SIZE_DWORD, 1);
3639             }
3640         }
3641         else
3642         {
3643             srcAsUD = m_currShader->BitCast(src[i], ISA_TYPE_UD);
3644             //TODO: Would it be better for these two to be consecutive?
3645             srcLo[i] = m_currShader->GetNewVariable(
3646                 src[i]->GetNumberElement(),
3647                 ISA_TYPE_UD, EALIGN_GRF, false,
3648                 CName(src[i]->getName(), i == 0 ? "Lo0" : "Lo1"));
3649             srcHi[i] = m_currShader->GetNewVariable(src[i]->GetNumberElement(),
3650                 hiType, EALIGN_GRF, false,
3651                 CName(src[i]->getName(), i == 0 ? "Hi0" : "Hi1"));
3652             EncoderInit();
3653             m_encoder->SetSrcRegion(0, 2, 1, 0);
3654             m_encoder->Copy(srcLo[i], srcAsUD);
3655             m_encoder->Push();
3656 
3657             EncoderInit();
3658             m_encoder->SetSrcSubReg(0, 1);
3659             m_encoder->SetSrcRegion(0, 2, 1, 0);
3660             m_encoder->Copy(srcHi[i], srcAsUD);
3661             m_encoder->Push();
3662 
3663         }
3664     }
3665 
3666     //Now, generate the required sequence of multiplies and adds
3667     TODO("Do not generate intermediate multiplies by constant 0 or 1.");
3668     TODO("Do smarter pattern matching to look for non-constant zexted/sexted sources.");
3669 
3670     CVariable* dstLo, * dstHi, * dstHiTemp;
3671     dstLo = m_currShader->GetNewVariable(dst->GetNumberElement(),
3672         ISA_TYPE_UD, m_destination->GetAlign(), dst->IsUniform(),
3673         CName(m_destination->getName(), "int64Lo"));
3674     dstHi = m_currShader->GetNewVariable(dst->GetNumberElement(),
3675         hiType, m_destination->GetAlign(), dst->IsUniform(),
3676         CName(m_destination->getName(), "int64Hi"));
3677     dstHiTemp = m_currShader->GetNewVariable(dst->GetNumberElement(),
3678         hiType, m_destination->GetAlign(), dst->IsUniform(),
3679         CName(m_destination->getName(), "int64HiTmp"));
3680 
3681 
3682     //
3683     // Algorithm:
3684     //   - Break the 64 bit sources into 32bit low/high halves.
3685     //   - Perform multiplication "by hand"
3686     //
3687     //    AB   - srcLo[0], srcLo[1]
3688     //    CD   - srcHi[0], srcHi[1]
3689     //   ----
3690     //     E
3691     //    F
3692     //    G
3693     //   H     - 'H' spills into bit 65 - only needed if overflow detection is required
3694     // --------
3695     // dstLow = E
3696     // dstHigh = F + G + carry
3697 
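    // Worked example (illustrative): for src0 = 0x0000000200000003 (C = 2, A = 3) and
    // src1 = 0x0000000400000005 (D = 4, B = 5):
    //   E = lo(A*B) = 15, Cr = hi(A*B) = 0, F = A*D = 12, G = C*B = 10
    //   dstLow = 15, dstHigh = 0 + 12 + 10 = 22  =>  0x000000160000000F,
    // i.e. the full product truncated to 64 bits (H = C*D falls entirely above bit 63
    // and is dropped).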
3698     // For platforms that do not natively support DW-DW multiply, use the vISA madw instruction instead of mul/mulh for better performance.
3699     if (m_currShader->m_Platform->noNativeDwordMulSupport())
3700     {
3701         // (Cr, E) = A * B
3702         EncoderInit();
3703         // The dst size should be GRF-aligned and doubled, since it holds both the low and high results.
3704         // The dst element count must be numDWPerGRF-aligned. For example, if the madw is SIMD1,
3705         // the dst has only 1 DW of low result in one GRF and only 1 DW of high result in another GRF,
3706         // so the dst must be sized as (numDWPerGRF * 2) elements, not 2 DW elements. This is required by madw.
3707         auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
3708         auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF);
3709         CVariable* dstTmp = m_currShader->GetNewVariable(
3710             numElements * 2, ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
3711             CName(m_destination->getName(), "int64Tmp"));
3712         CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
3713         m_encoder->Madw(dstTmp, srcLo[0], srcLo[1], zero);
3714 
3715         // copy low of A*B to dstLo
3716         EncoderInit();
3717         m_encoder->SetSrcRegion(0, 1, 1, 0);
3718         m_encoder->Copy(dstLo, dstTmp);
3719         m_encoder->Push();
3720 
3721         // copy high of A*B to dstHi
3722         EncoderInit();
3723         uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
3724         m_encoder->SetSrcSubVar(0, regOffset);
3725         m_encoder->SetSrcRegion(0, 1, 1, 0);
3726         m_encoder->Copy(dstHi, dstTmp);
3727         m_encoder->Push();
3728     }
3729     else
3730     {
3731         // E = A * B
3732         EncoderInit();
3733         m_encoder->Mul(dstLo, srcLo[0], srcLo[1]);
3734         m_encoder->Push();
3735 
3736         // Cr = carry(A * B)
3737         EncoderInit();
3738         m_encoder->MulH(dstHi, srcLo[0], srcLo[1]);
3739         m_encoder->Push();
3740     }
3741 
3742     // F = A * D
3743     EncoderInit();
3744     m_encoder->Mul(dstHiTemp, srcLo[0], srcHi[1]);
3745     m_encoder->Push();
3746 
3747     // dstHigh = Cr + F
3748     EncoderInit();
3749     m_encoder->Add(dstHi, dstHi, dstHiTemp);
3750     m_encoder->Push();
3751 
3752     // G = C * B
3753     EncoderInit();
3754     m_encoder->Mul(dstHiTemp, srcHi[0], srcLo[1]);
3755     m_encoder->Push();
3756 
3757     // dstHigh = (Cr + F) + G
3758     EncoderInit();
3759     m_encoder->Add(dstHi, dstHi, dstHiTemp);
3760     m_encoder->Push();
3761 
3762     //And now, pack the result
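    // (Illustrative) The two strided moves below interleave the halves: the destination
    // is viewed as dwords with stride 2, so even subregisters receive dstLo and odd
    // subregisters receive dstHi, reassembling each 64-bit lane.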
3763     CVariable* dstAsUD = m_currShader->BitCast(dst, ISA_TYPE_UD);
3764     EncoderInit();
3765     m_encoder->SetDstRegion(2);
3766     m_encoder->Copy(dstAsUD, dstLo);
3767     m_encoder->Push();
3768 
3769     EncoderInit();
3770     m_encoder->SetDstRegion(2);
3771     m_encoder->SetDstSubReg(1);
3772     m_encoder->Copy(dstAsUD, dstHi);
3773     m_encoder->Push();
3774 }
3775 
Mul(const SSource sources[2],const DstModifier & modifier)3776 void EmitPass::Mul(const SSource sources[2], const DstModifier& modifier)
3777 {
3778     CVariable* src[2];
3779     for (int i = 0; i < 2; ++i)
3780     {
3781         src[i] = GetSrcVariable(sources[i]);
3782     }
3783 
3784     // Only i64 muls need special handling, otherwise go back to standard flow
3785     VISA_Type srcType = src[0]->GetType();
3786     if (srcType != ISA_TYPE_Q && srcType != ISA_TYPE_UQ)
3787     {
3788         Binary(EOPCODE_MUL, sources, modifier);
3789     }
3790     else
3791     {
3792         Mul64(m_destination, src, m_currShader->m_SIMDSize);
3793     }
3794 }
3795 
FDiv(const SSource sources[2],const DstModifier & modifier)3796 void EmitPass::FDiv(const SSource sources[2], const DstModifier& modifier)
3797 {
3798     if (isOne(sources[0].value))
3799     {
3800         Unary(EOPCODE_INV, &sources[1], modifier);
3801     }
3802     else
3803     {
3804         Binary(EOPCODE_DIV, sources, modifier);
3805     }
3806 }
3807 
isConstantAllOnes(const Value * V)3808 static inline bool isConstantAllOnes(const Value* V)
3809 {
3810     if (const Constant * C = dyn_cast<Constant>(V))
3811         return C->isAllOnesValue();
3812     return false;
3813 }
3814 
Xor(const SSource sources[2],const DstModifier & modifier)3815 void EmitPass::Xor(const SSource sources[2], const DstModifier& modifier)
3816 {
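    // x ^ ~0 == ~x, so when either operand is an all-ones constant we can emit a
    // single NOT instead of an XOR.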
3817     if (isConstantAllOnes(sources[0].value))
3818     {
3819         Unary(EOPCODE_NOT, &sources[1], modifier);
3820     }
3821     else if (isConstantAllOnes(sources[1].value))
3822     {
3823         Unary(EOPCODE_NOT, &sources[0], modifier);
3824     }
3825     else
3826     {
3827         Binary(EOPCODE_XOR, sources, modifier);
3828     }
3829 }
3830 
Cmp(llvm::CmpInst::Predicate pred,const SSource sources[2],const DstModifier & modifier)3831 void EmitPass::Cmp(llvm::CmpInst::Predicate pred, const SSource sources[2], const DstModifier& modifier)
3832 {
3833     IGC_ASSERT(modifier.sat == false);
3834     IGC_ASSERT(modifier.flag == nullptr);
3835     IGC_ASSERT(nullptr != m_destination);
3836 
3837     e_predicate predicate = GetPredicate(pred);
3838 
3839     CVariable* src0 = GetSrcVariable(sources[0], sources[0].fromConstantPool);
3840     CVariable* src1 = GetSrcVariable(sources[1], sources[1].fromConstantPool);
3841 
3842     if (IsUnsignedCmp(pred))
3843     {
3844         src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
3845         src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
3846     }
3847     else if (IsSignedCmp(pred))
3848     {
3849         src0 = m_currShader->BitCast(src0, GetSignedType(src0->GetType()));
3850         src1 = m_currShader->BitCast(src1, GetSignedType(src1->GetType()));
3851     }
3852 
3853     CVariable* dst = m_destination;
3854     if (m_destination->GetType() != ISA_TYPE_BOOL && dst->GetType() != src0->GetType())
3855     {
3856         IGC_ASSERT_MESSAGE(CEncoder::GetCISADataTypeSize(dst->GetType()) == CEncoder::GetCISADataTypeSize(src0->GetType()),
3857             "Cmp to GRF must have the same size for source and destination");
3858         dst = m_currShader->BitCast(m_destination, src0->GetType());
3859     }
3860 
3861     SetSourceModifiers(0, sources[0]);
3862     SetSourceModifiers(1, sources[1]);
3863     m_encoder->Cmp(predicate, dst, src0, src1);
3864     m_encoder->Push();
3865 }
3866 
Frc(const SSource & source,const DstModifier & modifier)3867 void EmitPass::Frc(const SSource& source, const DstModifier& modifier)
3868 {
3869     Unary(EOPCODE_FRC, &source, modifier);
3870 }
3871 
Floor(const SSource & source,const DstModifier & modifier)3872 void EmitPass::Floor(const SSource& source, const DstModifier& modifier)
3873 {
3874     Unary(EOPCODE_RNDD, &source, modifier);
3875 }
3876 
Mov(const SSource & source,const DstModifier & modifier)3877 void EmitPass::Mov(const SSource& source, const DstModifier& modifier)
3878 {
3879     Unary(EOPCODE_MOV, &source, modifier);
3880 }
3881 
Rsqrt(const SSource & source,const DstModifier & modifier)3882 void EmitPass::Rsqrt(const SSource& source, const DstModifier& modifier)
3883 {
3884     Unary(EOPCODE_RSQRT, &source, modifier);
3885 }
3886 
Sqrt(const SSource & source,const DstModifier & modifier)3887 void EmitPass::Sqrt(const SSource& source, const DstModifier& modifier)
3888 {
3889     Unary(EOPCODE_SQRT, &source, modifier);
3890 }
3891 
Mad(const SSource sources[3],const DstModifier & modifier)3892 void EmitPass::Mad(const SSource sources[3], const DstModifier& modifier)
3893 {
3894     Tenary(EOPCODE_MAD, sources, modifier);
3895 }
3896 
Lrp(const SSource sources[3],const DstModifier & modifier)3897 void EmitPass::Lrp(const SSource sources[3], const DstModifier& modifier)
3898 {
3899     Tenary(EOPCODE_LRP, sources, modifier);
3900 }
3901 
Pow(const SSource sources[2],const DstModifier & modifier)3902 void EmitPass::Pow(const SSource sources[2], const DstModifier& modifier)
3903 {
3904     Binary(EOPCODE_POW, sources, modifier);
3905 }
3906 
Avg(const SSource sources[2],const DstModifier & modifier)3907 void EmitPass::Avg(const SSource sources[2], const DstModifier& modifier)
3908 {
3909     Binary(EOPCODE_AVG, sources, modifier);
3910 }
3911 
Tenary(e_opcode opCode,const SSource sources[3],const DstModifier & modifier)3912 void EmitPass::Tenary(e_opcode opCode, const SSource sources[3], const DstModifier& modifier)
3913 {
3914     Alu<3>(opCode, sources, modifier);
3915 }
3916 
Binary(e_opcode opCode,const SSource sources[2],const DstModifier & modifier)3917 void EmitPass::Binary(e_opcode opCode, const SSource sources[2], const DstModifier& modifier)
3918 {
3919     Alu<2>(opCode, sources, modifier);
3920 }
3921 
Unary(e_opcode opCode,const SSource sources[1],const DstModifier & modifier)3922 void EmitPass::Unary(e_opcode opCode, const SSource sources[1], const DstModifier& modifier)
3923 {
3924     Alu<1>(opCode, sources, modifier);
3925 }
3926 
3927 template<int N>
Alu(e_opcode opCode,const SSource sources[N],const DstModifier & modifier)3928 void EmitPass::Alu(e_opcode opCode, const SSource sources[N], const DstModifier& modifier)
3929 {
3930 
3931     CVariable* srcs[3] = { nullptr, nullptr, nullptr };
3932     for (uint i = 0; i < N; i++)
3933     {
3934         bool fromConstantPool = sources[i].fromConstantPool;
3935         srcs[i] = GetSrcVariable(sources[i], fromConstantPool);
3936         SetSourceModifiers(i, sources[i]);
3937     }
3938     m_encoder->SetDstModifier(modifier);
3939     m_encoder->GenericAlu(opCode, m_destination, srcs[0], srcs[1], srcs[2]);
3940     m_encoder->Push();
3941 }
3942 
Bfn(uint8_t booleanFuncCtrl,const SSource sources[3],const DstModifier & modifier)3943 void EmitPass::Bfn(uint8_t booleanFuncCtrl, const SSource sources[3], const DstModifier& modifier)
3944 {
3945     CVariable* srcs[3] = { nullptr, nullptr, nullptr };
3946     // Currently we only generate BFN when it has 3 sources, even though BFN can
3947     // also operate on just 2 sources.
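    // Note (general background, hedged): booleanFuncCtrl is an 8-bit truth table that
    // selects which bitwise boolean function of the three sources BFN computes; the
    // exact bit-to-minterm mapping is defined by the vISA/HW spec and not restated here.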
3948     for (uint i = 0; i < 3; i++)
3949     {
3950         bool fromConstantPool = sources[i].fromConstantPool;
3951         srcs[i] = GetSrcVariable(sources[i], fromConstantPool);
3952     }
3953     m_encoder->SetDstModifier(modifier);
3954     m_encoder->Bfn(booleanFuncCtrl, m_destination, srcs[0], srcs[1], srcs[2]);
3955     m_encoder->Push();
3956 }
3957 
CmpBfn(llvm::CmpInst::Predicate predicate,const SSource cmpSources[2],uint8_t booleanFuncCtrl,const SSource bfnSources[3],const DstModifier & modifier)3958 void EmitPass::CmpBfn(llvm::CmpInst::Predicate predicate, const SSource cmpSources[2], uint8_t booleanFuncCtrl,
3959     const SSource bfnSources[3], const DstModifier& modifier)
3960 {
3961     // Cmp
3962     e_predicate pred = GetPredicate(predicate);
3963     CVariable* cmpSrc0 = GetSrcVariable(cmpSources[0]);
3964     CVariable* cmpSrc1 = GetSrcVariable(cmpSources[1]);
3965     CVariable* cmpDst = m_currShader->GetNewVariable(m_destination);
3966 
3967     if (IsUnsignedCmp(predicate))
3968     {
3969         cmpSrc0 = m_currShader->BitCast(cmpSrc0, GetUnsignedType(cmpSrc0->GetType()));
3970         cmpSrc1 = m_currShader->BitCast(cmpSrc1, GetUnsignedType(cmpSrc1->GetType()));
3971     }
3972     else if (IsSignedCmp(predicate))
3973     {
3974         cmpSrc0 = m_currShader->BitCast(cmpSrc0, GetSignedType(cmpSrc0->GetType()));
3975         cmpSrc1 = m_currShader->BitCast(cmpSrc1, GetSignedType(cmpSrc1->GetType()));
3976     }
3977 
3978     if (cmpDst->GetType() != cmpSrc0->GetType())
3979     {
3980         cmpDst = m_currShader->BitCast(cmpDst, cmpSrc0->GetType());
3981     }
3982 
3983     SetSourceModifiers(0, cmpSources[0]);
3984     SetSourceModifiers(1, cmpSources[1]);
3985     m_encoder->Cmp(pred, cmpDst, cmpSrc0, cmpSrc1);
3986     m_encoder->Push();
3987 
3988     // BFN
3989     CVariable* bfnSrc1 = GetSrcVariable(bfnSources[1], bfnSources[1].fromConstantPool);
3990     CVariable* bfnSrc2 = GetSrcVariable(bfnSources[2], bfnSources[2].fromConstantPool);
3991     if (cmpDst->GetType() != bfnSrc1->GetType())
3992     {
3993         cmpDst = m_currShader->BitCast(cmpDst, bfnSrc1->GetType());
3994     }
3995     m_encoder->Bfn(booleanFuncCtrl, m_destination, cmpDst, bfnSrc1, bfnSrc2);
3996     m_encoder->Push();
3997 }
3998 
3999 void EmitPass::Select(const SSource sources[3], const DstModifier& modifier)
4000 {
4001     IGC_ASSERT(modifier.flag == nullptr);
4002     IGC_ASSERT(sources[0].mod == EMOD_NONE);
4003 
4004     CVariable* flag = GetSrcVariable(sources[0]);
4005 
4006     bool fromConstantPool = sources[1].fromConstantPool;
4007     CVariable* src0 = GetSrcVariable(sources[1], fromConstantPool);
4008 
4009     fromConstantPool = sources[2].fromConstantPool;
4010     CVariable* src1 = GetSrcVariable(sources[2], fromConstantPool);
4011 
4012     SetSourceModifiers(0, sources[1]);
4013     SetSourceModifiers(1, sources[2]);
4014     m_encoder->SetDstModifier(modifier);
4015     m_encoder->SetPredicateMode(modifier.predMode);
4016 
4017     m_encoder->Select(flag, m_destination, src0, src1);
4018     m_encoder->Push();
4019 
4020 }
4021 
4022 void EmitPass::PredAdd(const SSource& pred, bool invert, const SSource sources[2], const DstModifier& modifier)
4023 {
4024     IGC_ASSERT(modifier.flag == nullptr);
4025     CVariable* flag = GetSrcVariable(pred);
4026     CVariable* src0 = GetSrcVariable(sources[0]);
4027     CVariable* src1 = GetSrcVariable(sources[1]);
4028 
4029     // base condition
4030     SetSourceModifiers(0, sources[0]);
4031     m_encoder->Copy(m_destination, src0);
4032     m_encoder->Push();
4033 
4034     // predicate add
4035     SetSourceModifiers(1, sources[1]);
4036     m_encoder->SetDstModifier(modifier);
4037     m_encoder->SetPredicateMode(modifier.predMode);
4038     m_encoder->SetInversePredicate(invert);
4039     m_encoder->PredAdd(flag, m_destination, m_destination, src1);
4040     m_encoder->Push();
4041 }
4042 
4043 void EmitPass::emitOutput(llvm::GenIntrinsicInst* inst)
4044 {
4045     ShaderOutputType outputType =
4046         (ShaderOutputType)llvm::cast<llvm::ConstantInt>(inst->getOperand(4))->getZExtValue();
4047     if (outputType == SHADER_OUTPUT_TYPE_OMASK)
4048     {
4049         CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4050         IGC_ASSERT_MESSAGE(psProgram->GetPhase() == PSPHASE_COARSE,
4051             "oMask intrinsics should be left only for coarse phase");
4052         if (!psProgram->IsLastPhase())
4053         {
4054             CVariable* oMask = GetSymbol(inst->getOperand(0));
4055             CVariable* temp =
4056                 m_currShader->GetNewVariable(numLanes(m_SimdMode), oMask->GetType(), EALIGN_GRF, inst->getName());
4057             m_encoder->Copy(temp, oMask);
4058             oMask = temp;
4059             psProgram->SetCoarseoMask(oMask);
4060         }
4061     }
4062     else
4063     {
4064         IGC_ASSERT_MESSAGE(0, "output not supported");
4065     }
4066 }
4067 
4068 
4069 void EmitPass::emitPSInputMADHalf(llvm::Instruction* inst)
4070 {
4071     //create the payload and do interpolation
4072     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4073     uint setupIndex = 0;
4074     e_interpolation mode;
4075 
4076     setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4077     mode = (e_interpolation)llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
4078     CVariable* baryVar = nullptr;
4079 
4080 
4081     // mov SIMD4 deltas into tmp
4082     // mov (4) r0.0<1>:hf r15.0<4;4,1>:f {Align1, Q1, NoMask} // #??:$26:%29
4083     CVariable* tmpDeltaDst = nullptr;
4084 
4085     //inputVar
4086     /*
4087         For SIMD8 mode we generate mixed-mode instructions,
4088         so we do not generate a down conversion for the
4089         deltas.
4090     */
4091     if (psProgram->LowerPSInput())
4092     {
4093         tmpDeltaDst = psProgram->GetInputDeltaLowered(setupIndex);
4094         baryVar = psProgram->GetBaryRegLoweredHalf(mode);
4095     }
4096     else
4097     {
4098         tmpDeltaDst = psProgram->GetInputDelta(setupIndex);
4099         baryVar = psProgram->GetBaryReg(mode);
4100     }
4101     ContextSwitchPayloadSection();
4102     //dst:hf = src1 * src0 + src3
4103     //dst = p    * u    + r
4104     //mad (16) r20.0.xyzw:hf r0.3.r:hf r0.0.r:hf r12.0.xyzw:hf {Align16, H1} // #??:$31:%209
4105     m_encoder->SetSrcSubReg(1, 0);
4106     m_encoder->SetSrcSubReg(2, 3);
4107     m_encoder->Mad(m_destination, baryVar, tmpDeltaDst, tmpDeltaDst);
4108     m_encoder->Push();
4109 
4110     //dst:hf = src1 * src0 + src3
4111     //dst = q    * v    + dst
4112     //mad(16) r20.0.xyzw:hf r20.0.xyzw : hf r0.1.r : hf r18.0.xyzw : hf{ Align16, H1 } // #??:$32:%210
4113     //if we are down-converting, the bary coordinate values will be packed
4114     m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize));
4115     m_encoder->SetSrcSubReg(1, 1);
4116 
4117     m_encoder->Mad(m_destination, baryVar, tmpDeltaDst, m_destination);
4118     m_encoder->Push();
4119     ContextSwitchShaderBody();
4120 }
4121 
4122 void EmitPass::emitPSInputCst(llvm::Instruction* inst)
4123 {
4124     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4125     unsigned int inputIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4126     psProgram->MarkConstantInterpolation(inputIndex);
4127     unsigned int setupIndex = psProgram->getSetupIndex(inputIndex);
4128     CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4129     // temp variable should be the same type as the destination
4130     // This is where we emit the MOV into the payload
4131     ContextSwitchPayloadSection();
4132     {
4133         // A0 vertex data are in Rp.{3 + 4*n}
4134         m_encoder->SetSrcRegion(0, 0, 1, 0);
4135         m_encoder->SetSrcSubReg(0, 3);
4136         m_encoder->Cast(m_destination, inputVar);
4137         m_encoder->Push();
4138     }
4139 
4140     ContextSwitchShaderBody();
4141 }
4142 
4143 
4144 void EmitPass::emitPSInput(llvm::Instruction* inst)
4145 {
4146     e_interpolation mode = (e_interpolation)llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
4147     if (mode == EINTERPOLATION_CONSTANT)
4148     {
4149         emitPSInputCst(inst);
4150     }
4151     else if (inst->getType()->isHalfTy())
4153     {
4154         emitPSInputMADHalf(inst);
4155     }
4156     else
4157     {
4158         emitPSInputPln(inst);
4159     }
4160 }
4161 
4162 void EmitPass::emitPlnInterpolation(CVariable* baryVar, CVariable* inputvar)
4163 {
4164     unsigned int numPln = 1;
4165 
4166     for (unsigned int i = 0; i < numPln; i++)
4167     {
4168         // plane will access 4 operands
4169         m_encoder->SetSrcRegion(0, 0, 4, 1);
4170         m_encoder->Pln(m_destination, inputvar, baryVar);
4171         m_encoder->Push();
4172     }
4173 }
4174 
4175 void EmitPass::emitPSInputPln(llvm::Instruction* inst)
4176 {
4177     //create the payload and do interpolation
4178     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4179     uint setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4180     // temp variable should be the same type as the destination
4181     CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4182     e_interpolation mode = (e_interpolation)llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
4183     // need to do interpolation unless we do constant interpolation
4184     CVariable* baryVar = psProgram->GetBaryReg(mode);
4185 
4186     ContextSwitchPayloadSection();
4187     emitPlnInterpolation(baryVar, inputVar);
4188     ContextSwitchShaderBody();
4189 }
4190 
4191 void EmitPass::emitEvalAttribute(llvm::GenIntrinsicInst* inst)
4192 {
4193     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4194     // temp variable should be the same type as the destination
4195     bool perspective = cast<ConstantInt>(inst->getOperand(inst->getNumArgOperands() - 1))->getZExtValue() != 0;
4196     EU_PIXEL_INTERPOLATOR_INTERPOLATION_MODE interpolationMode =
4197         perspective ? EU_PI_MESSAGE_PERSPECTIVE_INTERPOLATION : EU_PI_MESSAGE_LINEAR_INTERPOLATION;
4198     if (interpolationMode == EU_PI_MESSAGE_LINEAR_INTERPOLATION)
4199     {
4200         // Workaround for the driver interface: request noperspective barys so the driver enables noperspective interpolation
4201         psProgram->GetBaryReg(EINTERPOLATION_LINEARNOPERSPECTIVE);
4202     }
4203     uint exDesc = EU_GEN7_MESSAGE_TARGET_PIXEL_INTERPOLATOR;
4204     EU_PIXEL_INTERPOLATOR_SIMD_MODE executionMode = pixelInterpolatorSimDMode(m_currShader->m_SIMDSize);
4205     uint responseLength = executionMode ? 4 : 2;
4206     if (getGRFSize() != 32)
4207     {
4208         responseLength /= 2;
4209     }
4210     uint messageLength = 1;
4211     CVariable* payload = nullptr;
4212     uint desc = 0;
4213     CVariable* messDesc = nullptr;
4214     switch (inst->getIntrinsicID())
4215     {
4216     case GenISAIntrinsic::GenISA_PullSampleIndexBarys:
4217     {
4218         payload = m_currShader->GetNewVariable(
4219             messageLength * (getGRFSize() >> 2),
4220             ISA_TYPE_D, EALIGN_GRF, inst->getName());
4221         uint sampleindex = 0;
4222         desc = PixelInterpolator(
4223             messageLength,
4224             responseLength,
4225             m_encoder->IsSecondHalf() ? 1 : 0,
4226             executionMode,
4227             EU_PI_MESSAGE_EVAL_SAMPLE_POSITION,
4228             interpolationMode,
4229             sampleindex);
4230 
4231         if (ConstantInt * index = dyn_cast<ConstantInt>(inst->getOperand(0)))
4232         {
4233             sampleindex = (uint)index->getZExtValue();
4234             desc = desc | (sampleindex << 4);
4235             messDesc = psProgram->ImmToVariable(desc, ISA_TYPE_UD);
4236 
4237             m_encoder->Send(m_destination, payload, exDesc, messDesc, false);
4238             m_encoder->Push();
4239         }
4240         else
4241         {
4242             ResourceDescriptor resource;
4243             CVariable* flag = nullptr;
4244             uint label;
4245             bool needLoop;
4246             CVariable* uniformId;
4247 
4248             SamplerDescriptor sampler = getSampleIDVariable(inst->getOperand(0));
4249             needLoop = ResourceLoopHeader(resource, sampler, flag, label);
4250             uniformId = sampler.m_sampler;
4251 
4252             messDesc = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
4253 
4254             CVariable* idxShift = m_currShader->GetNewVariable(uniformId);
4255             m_encoder->Shl(idxShift, uniformId, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
4256             m_encoder->Or(messDesc, m_currShader->ImmToVariable(desc, ISA_TYPE_UD), idxShift);
4257             m_encoder->Push();
4258 
4259             m_encoder->SetPredicate(flag);
4260             m_encoder->Send(m_destination, payload, exDesc, messDesc, false);
4261             m_encoder->Push();
4262 
4263             ResourceLoopBackEdge(needLoop, flag, label);
4264         }
4265     }
4266     break;
4267 
4268     case GenISAIntrinsic::GenISA_PullSnappedBarys:
4269     case GenISAIntrinsic::GenISA_PullCentroidBarys:
4270     {
4271         uint offsetX = 0;
4272         uint offsetY = 0;
4273         bool offsetIsConst = true;
4274         auto messageType = EU_PI_MESSAGE_EVAL_CENTROID_POSITION;
4275         auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
4276         if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_PullSnappedBarys)
4277         {
4278             offsetIsConst = false;
4279             auto xCstOffset = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0));
4280             auto yCstOffset = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(1));
4281             if (xCstOffset && yCstOffset)
4282             {
4283                 offsetIsConst = true;
4284                 offsetX = (uint) xCstOffset->getZExtValue();
4285                 offsetY = (uint) yCstOffset->getZExtValue();
4286             }
4287 
4288             messageType = offsetIsConst && psProgram->GetPhase() != PSPHASE_COARSE ?
4289                 EU_PI_MESSAGE_EVAL_PER_MESSAGE_OFFSET :
4290                 EU_PI_MESSAGE_EVAL_PER_SLOT_OFFSET;
4291         }
4292         if (offsetIsConst && psProgram->GetPhase() != PSPHASE_COARSE)
4293         {
4294             payload = m_currShader->GetNewVariable(
4295                 messageLength * numDWPerGRF, ISA_TYPE_D, EALIGN_GRF, inst->getName());
4296             desc = PixelInterpolator(
4297                 messageLength,
4298                 responseLength,
4299                 m_encoder->IsSecondHalf() ? 1 : 0,
4300                 executionMode,
4301                 messageType,
4302                 interpolationMode,
4303                 offsetX,
4304                 offsetY);
4305         }
4306         else
4307         {
4308             IGC_ASSERT(messageType != EU_PI_MESSAGE_EVAL_CENTROID_POSITION);
4309             IGC_ASSERT(numDWPerGRF);
4310 
4311             messageLength = 2 * numLanes(m_currShader->m_SIMDSize) / numDWPerGRF;
4312             payload = m_currShader->GetNewVariable(
4313                 messageLength * (getGRFSize() >> 2), ISA_TYPE_D, EALIGN_GRF, inst->getName());
4314             desc = PixelInterpolator(
4315                 messageLength,
4316                 responseLength,
4317                 m_encoder->IsSecondHalf() ? 1 : 0,
4318                 psProgram->GetPhase() == PSPHASE_COARSE,
4319                 executionMode,
4320                 messageType,
4321                 interpolationMode);
4322             CVariable* XOffset = GetSymbol(inst->getOperand(0));
4323             CVariable* YOffset = GetSymbol(inst->getOperand(1));
4324             m_encoder->Copy(payload, XOffset);
4325             m_encoder->Push();
4326 
4327             m_encoder->SetDstSubVar(numLanes(m_currShader->m_SIMDSize) / numDWPerGRF);
4328             m_encoder->Copy(payload, YOffset);
4329             m_encoder->Push();
4330         }
4331         messDesc = psProgram->ImmToVariable(desc, ISA_TYPE_UD);
4332     }
4333 
4334     m_encoder->Send(m_destination, payload, exDesc, messDesc, false);
4335     m_encoder->Push();
4336     break;
4337 
4338     default:
4339         IGC_ASSERT(0);
4340         break;
4341     }
4342 }
4343 
4344 void EmitPass::emitInterpolate(llvm::GenIntrinsicInst* inst)
4345 {
4346     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4347     CVariable* barys = GetSymbol(inst->getOperand(1));
4348     uint setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4349     // temp variable should be the same type as the destination
4350     CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4351 
4352     {
4353         ContextSwitchPayloadSection();
4354         emitPlnInterpolation(barys, inputVar);
4355         ContextSwitchShaderBody();
4356     }
4357 }
4358 
4359 void EmitPass::emitInterpolate2(llvm::GenIntrinsicInst* inst)
4360 {
4361     CVariable* inputVar = GetSymbol(inst->getOperand(0));
4362     CVariable* barys = GetSymbol(inst->getOperand(1));
4363     emitPlnInterpolation(barys, inputVar);
4364 }
4365 
4366 void EmitPass::emitInterpolant(llvm::GenIntrinsicInst* inst)
4367 {
4368     uint setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4369     auto psProgram = static_cast<CPixelShader*>(m_currShader);
4370     CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4371     m_encoder->SetSrcRegion(0, 4, 4, 1);
4372     m_encoder->SetSimdSize(SIMDMode::SIMD4);
4373     m_encoder->SetNoMask();
4374     m_encoder->Copy(m_destination, inputVar);
4375     m_encoder->Push();
4376 }
4377 
4378 void EmitPass::emitDSInput(llvm::Instruction* pInst)
4379 {
4380     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::DOMAIN_SHADER);
4381     CVariable* dst = m_destination;
4382 
4383     CDomainShader* dsProgram = static_cast<CDomainShader*>(m_currShader);
4384     // only pulled inputs reach here
4385     QuadEltUnit globalOffset(0);
4386     llvm::Value* pPayloadInputIdx = pInst->getOperand(0);
4387     llvm::ConstantInt* pConstIntPayloadVar = llvm::dyn_cast_or_null<llvm::ConstantInt>(pPayloadInputIdx);
4388     uint32_t elmIdx = 0;
4389 
4390     if (pConstIntPayloadVar != nullptr)
4391     {
4392         elmIdx = int_cast<uint32_t>(cast<llvm::ConstantInt>(pConstIntPayloadVar)->getZExtValue());
4393 
4394         CVariable* inputVar = dsProgram->GetInputDelta(elmIdx);
4395         if (dsProgram->GetShaderDispatchMode() == ShaderDispatchMode::DUAL_PATCH)
4396         {
4397             m_encoder->SetSrcSubReg(0, elmIdx % 4);
4398             m_encoder->SetSrcRegion(0, 4, 4, 0);
4399         }
4400         m_encoder->Copy(dst, inputVar);
4401         m_encoder->Push();
4402     }
4403     else
4404     {
4405         IGC_ASSERT_MESSAGE(0, "Only constant payload input variable index handled");
4406     }
4407 }
4408 
4409 void EmitPass::emitInput(llvm::Instruction* inst)
4410 {
4411     switch (m_currShader->GetShaderType())
4412     {
4413     case ShaderType::PIXEL_SHADER:
4414         emitPSInput(inst);
4415         break;
4416     case ShaderType::DOMAIN_SHADER:
4417         emitDSInput(inst);
4418         break;
4419     default:
4420         IGC_ASSERT(0);
4421         break;
4422     }
4423 }
4424 
4425 void EmitPass::emitcycleCounter(llvm::Instruction* inst)
4426 {
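    // The timestamp counter is read in two pieces: the first copy moves TSC element 0
    // into dst.0, and the second (src subreg 1, dst subreg 1) moves element 1, presumably
    // the high dword of the 64-bit counter.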
4427     CVariable* dst = m_destination;
4428     m_encoder->Copy(dst, m_currShader->GetTSC());
4429     m_encoder->Push();
4430     m_encoder->SetSrcSubReg(0, 1);
4431     m_encoder->SetDstSubReg(1);
4432     m_encoder->Copy(dst, m_currShader->GetTSC());
4433     m_encoder->Push();
4434 }
4435 
4436 void EmitPass::emitSetDebugReg(llvm::Instruction* inst)
4437 {
4438     Value* src0 = inst->getOperand(0);
4439     if (!isa<UndefValue>(src0))
4440     {
4441         // write dbg0.0
4442         CVariable* src = GetSymbol(src0);
4443         IGC_ASSERT(nullptr != src);
4444         IGC_ASSERT(src->IsUniform());
4445         m_encoder->SetDstSubReg(0);
4446         m_encoder->Copy(m_currShader->GetDBG(), src);
4447         m_encoder->Push();
4448     }
4449 
4450     // read dbg0.1
4451     m_encoder->SetSrcSubReg(0, 1);
4452     m_encoder->SetSrcRegion(0, 0, 1, 0);
4453     m_encoder->Copy(m_destination, m_currShader->GetDBG());
4454     m_encoder->Push();
4455 }
4456 
4457 CVariable* EmitPass::ComputeSampleIntOffset(llvm::Instruction* sample, uint sourceIndex)
4458 {
4459     // The (u,v,r) offsets are encoded in SamplerMessageHeader::DW2
4460     // as [11:8], [7:4], [3:0] bitfields, respectively. Format: S3.
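    // For example, assuming the operands are passed in (u, v, r) order as described
    // above: u = 1, v = -2, r = 0 packs as (0x1 << 8) | (0xE << 4) | 0x0 = 0x1E0.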
4461     uint offset = 0;
4462     bool dynamicOffset = false;
4463     for (uint i = 0; i < 3; i++)
4464     {
4465         if (ConstantInt * immOffset = dyn_cast<ConstantInt>(sample->getOperand(sourceIndex + i)))
4466         {
4467             uint channelOffset = static_cast<uint>(immOffset->getZExtValue());
4468             offset = (offset << 4) | (channelOffset & 0xf);
4469         }
4470         else
4471         {
4472             dynamicOffset = true;
4473         }
4474     }
4475     CVariable* packedOffset = m_currShader->ImmToVariable(offset, ISA_TYPE_UW);
4476     if (dynamicOffset)
4477     {
4478         CVariable* tempPackedOffset = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, "PackedOffset");
4479         for (uint i = 0; i < 3; i++)
4480         {
4481             if (!isa<ConstantInt>(sample->getOperand(sourceIndex + i)))
4482             {
4483                 CVariable* offsetV = GetSymbol(sample->getOperand(sourceIndex + i));
4484                 if (!offsetV->IsUniform())
4485                 {
4486                     offsetV = UniformCopy(offsetV);
4487                 }
4488 
4489                 // Offset is only 4 bits, mask off remaining bits
4490                 CVariable* offsetBits = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, "PackedOffset");
4491                 m_encoder->And(offsetBits, offsetV, m_currShader->ImmToVariable(0xF, ISA_TYPE_UW));
4492                 if (i != 2)
4493                 {
4494                     m_encoder->Shl(offsetBits, offsetBits, m_currShader->ImmToVariable(4 * (2 - i), ISA_TYPE_UW));
4495                 }
4496                 if (packedOffset->IsImmediate() && packedOffset->GetImmediateValue() == 0)
4497                 {
4498                     packedOffset = offsetBits;
4499                 }
4500                 else
4501                 {
4502                     m_encoder->Or(tempPackedOffset, packedOffset, offsetBits);
4503                     packedOffset = tempPackedOffset;
4504                 }
4505             }
4506         }
4507     }
4508     return packedOffset;
4509 }
4510 
4511 // simple helper to reorder the ld input operands depending on the hardware generation
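// For example, with oldLoad == true the mapping is 0 -> 0, 1 -> 2, 2 -> 1, and any
// other index is passed through unchanged.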
4512 uint CorrectLdIndex(uint i, bool oldLoad)
4513 {
4514     uint index = i;
4515     if (oldLoad)
4516     {
4517         if (i == 1)
4518         {
4519             index = 2;
4520         }
4521         else if (i == 2)
4522         {
4523             index = 1;
4524         }
4525     }
4526     return index;
4527 }
4528 
4529 CVariable* EmitPass::IndexableResourceIndex(CVariable* indexVar, uint btiIndex)
4530 {
4531     CVariable* bti = m_currShader->ImmToVariable(btiIndex, ISA_TYPE_UD);
4532     CVariable* dst = m_currShader->GetNewVariable(indexVar);
4533     m_encoder->Add(dst, indexVar, bti);
4534     m_encoder->Push();
4535     return dst;
4536 }
4537 
4538 void EmitPass::PackSIMD8HFRet(CVariable* dst)
4539 {
4540     // the extra moves will be cleaned up by vISA
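    // In the temporary 'dst' each channel starts at a stride of 2 * numLanePerChannel
    // elements; the copies below compact the channels to a numLanePerChannel stride
    // in m_destination.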
4541     auto numLanePerChannel = numLanes(m_currShader->m_Platform->getMinDispatchMode());
4542     for (uint16_t n = 0; n < m_destination->GetNumberElement() / numLanePerChannel; n++)
4543     {
4544         m_encoder->SetDstSubReg(n * numLanePerChannel);
4545         m_encoder->SetSrcSubReg(0, n * numLanePerChannel * 2);
4546         m_encoder->Copy(m_destination, dst);
4547         m_encoder->Push();
4548     }
4549 }
4550 
4551 
4552 void EmitPass::emitLdInstruction(llvm::Instruction* inst)
4553 {
4554     uint numOperands = inst->getNumOperands();
4555     IGC_ASSERT_MESSAGE(7 < numOperands, "Wrong number of operands");
4556     IGC_ASSERT_MESSAGE(numOperands < 10, "Wrong number of operands");
4557 
4558     const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
4559     IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
4560 
4561     EOPCODE opCode = GetOpCode(inst);
4562     //Subtract the offset and resource operands to get
4563     //the number of texture coordinates and the index of the texture operand
4564     uint numSources = numOperands - 5;
4565     uint textureArgIdx = numOperands - 5;
4566 
4567     ResourceDescriptor resource;
4568     Value* ptr = inst->getOperand(textureArgIdx);
4569     resource = GetResourceVariable(ptr);
4570     uint offsetSourceIndex = numSources + 1;
4571     CVariable* offset = ComputeSampleIntOffset(inst, offsetSourceIndex);
4572 
4573     SmallVector<CVariable*, 4> payload;
4574 
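    // Drop trailing source operands that are the immediate 0 so they are not
    // included in the send payload.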
4575     for (uint i = numSources - 1; i > 0; i--)
4576     {
4577         uint index = CorrectLdIndex(i, m_currShader->m_Platform->hasOldLdOrder());
4578         CVariable* src = GetSymbol(inst->getOperand(index));
4579         if (!(src->IsImmediate() && src->GetImmediateValue() == 0))
4580         {
4581             break;
4582         }
4583         numSources--;
4584     }
4585 
4586     bool zeroLOD = false;
4587     //SKL+ new message ld_lz
4588     if (numSources > 2 &&
4589         m_currShader->m_Platform->supportSampleAndLd_lz())
4590     {
4591         // Check if lod is 0
4592         CVariable* src = GetSymbol(inst->getOperand(2));
4593         if (src->IsImmediate() && src->GetImmediateValue() == 0)
4594         {
4595             zeroLOD = true;
4596             numSources--;
4597         }
4598     }
4599 
4600     //create send payload for numSources
4601     for (uint i = 0; i < numSources; i++)
4602     {
4603         uint index = i;
4604         //no difference in ld_lz between SKL+ and BDW
4605         if (!zeroLOD)
4606         {
4607             index = CorrectLdIndex(i, m_currShader->m_Platform->hasOldLdOrder());
4608         }
4609         if (zeroLOD && index == 2)
4610         {
4611             //3D resources skip lod and read z coordinate
4612             index = 3;
4613         }
4614         CVariable* src = GetSymbol(inst->getOperand(index));
4615         if (src->IsUniform())
4616         {
4617             auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
4618             uint16_t size = m_destination->IsUniform() ? numLanes(uniformSIMDMode) :
4619                 numLanes(m_currShader->m_SIMDSize);
4620             CVariable* newSource = m_currShader->GetNewVariable(
4621                 size,
4622                 src->GetType(),
4623                 EALIGN_GRF,
4624                 m_destination->IsUniform(),
4625                 src->getName());
4626             m_encoder->SetUniformSIMDSize(uniformSIMDMode);
4627             m_encoder->Copy(newSource, src);
4628             m_encoder->Push();
4629             src = newSource;
4630         }
4631         payload.push_back(src);
4632 
4633     }
4634 
4635     //When the sampler output is a 16-bit float, the hardware doesn't pack the output in SIMD8 mode.
4636     //Hence the movs below to repack that layout in SIMD8 mode.
4637     bool needPacking = false;
4638     CVariable* dst = m_destination;
4639     SIMDMode simdSize = m_currShader->m_SIMDSize;
4640     {
4641         if (dst->IsUniform())
4642         {
4643             simdSize = m_currShader->m_Platform->getMinDispatchMode();
4644             unsigned short numberOfElement = dst->GetNumberElement() * numLanes(simdSize);
4645             numberOfElement = CEncoder::GetCISADataTypeSize(dst->GetType()) == 2 ? numberOfElement * 2 : numberOfElement;
4646             dst = m_currShader->GetNewVariable(
4647                 numberOfElement, dst->GetType(), EALIGN_GRF, dst->IsUniform(), dst->getName());
4648         }
4649         else
4650         {
4651             needPacking = isHalfGRFReturn(m_destination, m_SimdMode);
4652             if (needPacking)
4653             {
4654                 dst = m_currShader->GetNewVariable(
4655                     m_destination->GetNumberElement() * 2, m_destination->GetType(), EALIGN_GRF, dst->IsUniform(), dst->getName());
4656             }
4657         }
4658     }
4659 
4660     bool feedbackEnable = writeMask.isSet(4);
4661     uint label = 0;
4662     CVariable* flag = nullptr;
4663     bool needLoop = ResourceLoopHeader(resource, flag, label);
4664     m_encoder->SetPredicate(flag);
4665     if (m_destination->IsUniform())
4666     {
4667         m_encoder->SetUniformSIMDSize(m_currShader->m_Platform->getMinDispatchMode());
4668     }
4669     m_encoder->Load(
4670         opCode,
4671         writeMask.getEM(),
4672         offset,
4673         resource,
4674         numSources,
4675         dst,
4676         payload,
4677         zeroLOD,
4678         feedbackEnable);
4679     m_encoder->Push();
4680     if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
4681     {
4682         CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
4683         m_encoder->Cast(m_currShader->GetNULL(), tempdest);
4684         m_encoder->Push();
4685         m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
4686         m_encoder->Push();
4687     }
4688     ResourceLoopBackEdge(needLoop, flag, label);
4689 
4690     {
4691         if (m_destination->IsUniform())
4692         {
4693             // if dst is uniform, we simply copy the first lane of each channel (including feedback enable if present)
4694             // to the packed m_destination.
4695             // Note that there's no need to handle feedback enable specially
4696             for (unsigned int i = 0; i < m_destination->GetNumberElement(); i++)
4697             {
4698                 m_encoder->SetSrcRegion(0, 0, 1, 0);
4699                 m_encoder->SetSrcSubVar(0, i);
4700                 m_encoder->SetDstSubReg(i);
4701                 m_encoder->Copy(m_destination, dst);
4702                 m_encoder->Push();
4703             }
4704         }
4705         else
4706         {
4707             if (needPacking)
4708             {
4709                 PackSIMD8HFRet(dst);
4710             }
4711 
4712             if (feedbackEnable)
4713             {
4714                 emitFeedbackEnable();
4715             }
4716         }
4717     }
4718 }
4719 
4720 /// \brief Returns the offset increment in bytes, given the value's type.
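/// For example, a float value in SIMD16 yields 16 * 4 = 64 bytes, while a half value
/// in SIMD8 yields 8 * 2 = 16 bytes, which is then padded to 32 below.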
4721 static int GetOffsetIncrement(const DataLayout* m_DL, SIMDMode simdMode, Value* val)
4722 {
4723     int inc;
4724     inc = int_cast<int>(numLanes(simdMode) * (unsigned int)m_DL->getTypeAllocSize(val->getType()));
4725     if (val->getType()->isHalfTy() && simdMode == SIMDMode::SIMD8)
4726     {
4727         //The alloc size for half float is 2, so in SIMD8 mode we would get an increment of 16,
4728         //but the payload slot needs to be padded to a full 32 bytes.
4729         IGC_ASSERT(inc <= 16);
4730         inc *= 2;
4731     }
4732     return inc;
4733 }
4734 
4735 ///
4736 template <typename T>
4737 bool EmitPass::interceptRenderTargetWritePayloadCoalescing(
4738     T* inst,
4739     CVariable** src,
4740     CVariable*& source0Alpha,
4741     CVariable*& oMaskOpnd,
4742     CVariable*& outputDepthOpnd,
4743     CVariable*& vStencilOpnd,
4744     DenseMap<Value*, CVariable**>& valueToVariableMap)
4745 {
4746     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4747 
4748     //check coalescing
4749     CoalescingEngine::CCTuple* ccTuple = nullptr;
4750     m_CE->SetCurrentPart(inst, 0);
4751     const uint numOperands = m_CE->GetNumPayloadElements(inst);
4752     Value* dummyValPtr = nullptr;
4753     int payloadToCCTupleRelativeOffset = 0;
4754 
4755     ccTuple = m_CE->IsAnyValueCoalescedInCCTuple(inst,
4756         numOperands,
4757         //out:
4758         payloadToCCTupleRelativeOffset,
4759         dummyValPtr);
4760     bool payloadCovered = m_CE->IsPayloadCovered(inst, ccTuple, numOperands, payloadToCCTupleRelativeOffset);
4761     if (!payloadCovered) {
4762         return false;
4763     }
4764 
4765     //This check is necessary, since IsPayloadCovered does not check the non-homogeneous part.
4766     if (m_CE->HasNonHomogeneousPayloadElements(inst) &&
4767         !ccTuple->HasNonHomogeneousElements())
4768     {
4769         return false;
4770     }
4771 
4772 
4773     if (ccTuple->HasNonHomogeneousElements())
4774     {
4775         if (m_CE->GetLeftReservedOffset(ccTuple->GetRoot(), m_currShader->m_SIMDSize) <
4776             m_CE->GetLeftReservedOffset(inst, m_currShader->m_SIMDSize))
4777         {
4778             return false;
4779         }
4780         if (payloadToCCTupleRelativeOffset)
4781         {
4782             return false;
4783         }
4784     }
4785 
4786     IGC_ASSERT(ccTuple);
4787     CVariable* rootPayloadVar = m_currShader->LazyCreateCCTupleBackingVariable(ccTuple);
4788 
4789     //Elements are processed in the payload slot order.
4790     //Homogeneous part is looked-up through payload coalescing methods.
4791     //Payload layout for RT writer: s0Alpha oM [R G B A] sZ oS
4792     //Payload layout for dual source RT writer: oM [R0 G0 B0 A0 R1 G1 B1 A1] sZ oS
4793     int offset = 0;
4794     if (RTWriteHasSource0Alpha(inst, m_moduleMD))
4795     {
4796         IGC_ASSERT(ccTuple->HasNonHomogeneousElements());
4797 
4798         VISA_Type vType = m_currShader->GetType(inst->getSource0Alpha()->getType());
4799 
4800         IGC_ASSERT(source0Alpha == nullptr);
4801         CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, vType, (uint16_t)offset, 0);
4802         m_encoder->Copy(temp, GetSymbol(inst->getSource0Alpha()));
4803         m_encoder->Push();
4804         source0Alpha = temp;
4805     }
4806 
4807     if (ccTuple->HasNonHomogeneousElements())
4808     {
4809         IGC_ASSERT_MESSAGE(ccTuple->GetRoot(), "in other words, there is a 'supremum' element");
4810         IGC_ASSERT(llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::isa<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()));
4811         if (llvm::RTWritIntrinsic * rtwi = llvm::dyn_cast<llvm::RTWritIntrinsic>(ccTuple->GetRoot()))
4812         {
4813             if (RTWriteHasSource0Alpha(rtwi, m_moduleMD))
4814             {
4815                 //This is a stronger condition than querying 'inst' only, since root represents
4816                 //the whole group of 'non-homogeneous' parts. E.g. it might turn out, that this
4817                 //instruction does not have src0 alpha, but it was coalesced in a group that has
4818                 //at least one src0 alpha. Thus, we need to take that src0 alpha into account
4819                 //when computing 'left' reserved offset.
4820                 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, rtwi->getSource0Alpha());
4821             }
4822         }
4823         else if (llvm::RTDualBlendSourceIntrinsic * dsrtwi = llvm::dyn_cast<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()))
4824         {
4825             IGC_ASSERT_MESSAGE(!RTWriteHasSource0Alpha(dsrtwi, m_moduleMD), "dual-source doesn't support Source0Alpha");
4826         }
4827     }
4828 
4829     if (inst->hasMask())
4830     {
4831         IGC_ASSERT(!DoesRTWriteSrc0AlphaBelongToHomogeneousPart(inst, m_moduleMD));
4832         IGC_ASSERT(oMaskOpnd == nullptr);
4833 
4834         CVariable* oLocalMaskOpnd = GetSymbol(inst->getOMask());
4835         oLocalMaskOpnd = psProgram->BitCast(oLocalMaskOpnd, ISA_TYPE_UW);
4836 
4837         CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, ISA_TYPE_D, (uint16_t)offset, 0);
4838         psProgram->PackAndCopyVariable(temp, oLocalMaskOpnd);
4839         oMaskOpnd = temp;
4840     }
4841 
4842     if (ccTuple->HasNonHomogeneousElements())
4843     {
4844         IGC_ASSERT_MESSAGE(ccTuple->GetRoot(), "in other words, there is a 'supremum' element");
4845         IGC_ASSERT(llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::isa<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()));
4846         if (llvm::dyn_cast<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::dyn_cast<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()))
4847         {
4848             //Take left reserved offset from 'root' of the group, not from this instruction.
4849             offset = m_CE->GetLeftReservedOffset(ccTuple->GetRoot(), m_currShader->m_SIMDSize);
4850         }
4851     }
4852 
4853     IGC_ASSERT(dummyValPtr);
4854 
4855     offset += payloadToCCTupleRelativeOffset *
4856         m_CE->GetSingleElementWidth(m_currShader->m_SIMDSize, m_DL, dummyValPtr);
4857 
4858 
4859     SmallPtrSet<Value*, 8> touchedValuesSet;
4860     IGC_ASSERT(numOperands == 4 || numOperands == 8);
4861     for (uint index = 0; index < numOperands; index++)
4862     {
4863         Value* val = m_CE->GetPayloadElementToValueMapping(inst, index);
4864         IGC_ASSERT_MESSAGE(nullptr != val, "Val cannot be NULL");
4865         VISA_Type type = m_currShader->GetType(val->getType());
4866 
4867         if (touchedValuesSet.count(val)) {
4868             src[index] = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)(offset), 0);
4869             m_encoder->Copy(src[index], GetSymbol(val));
4870             m_encoder->Push();
4871             offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
4872             continue;
4873         }
4874         else {
4875             touchedValuesSet.insert(val);
4876         }
4877 
4878         bool needsCopy = false;
4879         if (m_CE->IsValConstOrIsolated(val)) {
4880             needsCopy = true;
4881         }
4882         else
4883         {
4884             if (m_CE->GetValueCCTupleMapping(val))
4885             {
4886                 src[index] = GetSymbol(val);
4887             }
4888             else
4889             {
4890                 //this one actually encompasses the case for !getRegRoot(val)
4891                 needsCopy = true;
4892             }
4893         }//if constant
4894 
4895         if (needsCopy)
4896         {
4897             src[index] = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)offset, 0);
4898             m_encoder->Copy(src[index], GetSymbol(val));
4899             m_encoder->Push();
4900         }
4901         offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
4902     }//for
4903 
4904     if (inst->hasDepth())
4905     {
4906         IGC_ASSERT(outputDepthOpnd == nullptr);
4907         CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, ISA_TYPE_F, (uint16_t)offset, 0);
4908         m_encoder->Copy(temp, GetSymbol(inst->getDepth()));
4909         m_encoder->Push();
4910         outputDepthOpnd = temp;
4911 
4912         IGC_ASSERT(inst->getDepth()->getType()->isFloatTy());
4913     }
4914 
4915     if (ccTuple->HasNonHomogeneousElements())
4916     {
4917         IGC_ASSERT_MESSAGE(ccTuple->GetRoot(), "in other words, there is a 'supremum' element");
4918         IGC_ASSERT(llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::isa<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()));
4919 
4920         if (llvm::RTWritIntrinsic * rtwi = llvm::dyn_cast<llvm::RTWritIntrinsic>(ccTuple->GetRoot()))
4921         {
4922             if (rtwi->hasDepth())
4923             {
4924                 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, inst->getDepth());
4925             }
4926         }
4927         else if (llvm::RTDualBlendSourceIntrinsic * dsrtwi = llvm::dyn_cast<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()))
4928         {
4929             if (dsrtwi->hasDepth())
4930             {
4931                 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, inst->getDepth());
4932             }
4933         }
4934     }
4935 
4936 
4937     //Stencil is only supported in SIMD8 mode
4938     if (inst->hasStencil())
4939     {
4940         IGC_ASSERT(m_currShader->m_Platform->supportsStencil(m_currShader->m_SIMDSize));
4941         IGC_ASSERT(vStencilOpnd == nullptr);
4942 
4943         CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, ISA_TYPE_UB, (uint16_t)offset, 0);
4944         CVariable* ubSrc = m_currShader->BitCast(GetSymbol(inst->getStencil()), ISA_TYPE_UB);
4945         if (ubSrc->IsUniform())
4946         {
4947             m_encoder->SetSrcRegion(0, 0, 1, 0);
4948         }
4949         else
4950         {
4951             m_encoder->SetSrcRegion(0, 32, 8, 4);
4952         }
4953         m_currShader->CopyVariable(temp, ubSrc, 0);
4954 
4955         vStencilOpnd = temp;
4956     }
4957     return true;
4958 }
4959 
4960 ///
4961 template <typename T>
4962 void EmitPass::prepareRenderTargetWritePayload(
4963     T* inst,
4964     DenseMap<Value*, CVariable**>& valueToVariableMap,
4965     Value* color[],
4966     uint8_t colorCnt,
4967     //output:
4968     CVariable** src,
4969     bool* isUndefined,
4970     CVariable*& varSource0Alpha,
4971     CVariable*& varMaskOpnd,
4972     CVariable*& varDepthOpnd,
4973     CVariable*& varStencilOpnd)
4974 {
4975     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4976 
4977     VISA_Type vType = ISA_TYPE_F;
4978 
4979     if (color[0]->getType()->isHalfTy())
4980     {
4981         vType = ISA_TYPE_HF;
4982     }
4983 
4984     for (uint i = 0; i < colorCnt; ++i)
4985     {
4986         if (isa<UndefValue>(color[i]))
4987         {
4988             isUndefined[i] = true;
4989         }
4990     }
4991 
4992     if (interceptRenderTargetWritePayloadCoalescing(
4993         inst,
4994         src,
4995         varSource0Alpha,
4996         varMaskOpnd,
4997         varDepthOpnd,
4998         varStencilOpnd,
4999         valueToVariableMap))
5000     {
5001         return;
5002     }
5003 
5004     for (uint i = 0; i < colorCnt; ++i)
5005     {
5006         CVariable* var = GetSymbol(color[i]);
5007 
5008         if (!isa<UndefValue>(color[i]))
5009         {
5010             if (var->IsUniform())
5011             {
5012                 //if uniform, create a move to the payload
5013                 src[i] = m_currShader->GetNewVariable(numLanes(m_currShader->m_SIMDSize), vType, EALIGN_GRF, CName::NONE);
5014                 m_encoder->Copy(src[i], var);
5015                 m_encoder->Push();
5016             }
5017             else
5018             {
5019                 src[i] = var;
5020             }
5021         }
5022     }
5023 
5024     if (RTWriteHasSource0Alpha(inst, m_moduleMD))
5025     {
5026         varSource0Alpha = GetSymbol(inst->getSource0Alpha());
5027         if (varSource0Alpha->IsUniform())
5028         {
5029             CVariable* temp = m_currShader->GetNewVariable(
5030                 numLanes(m_currShader->m_SIMDSize), vType, EALIGN_GRF, CName::NONE);
5031             m_encoder->Copy(temp, varSource0Alpha);
5032             m_encoder->Push();
5033             varSource0Alpha = temp;
5034         }
5035     }
5036 
5037     if (inst->hasMask())
5038     {
5039         varMaskOpnd = GetSymbol(inst->getOMask());
5040         //oMask has to be packed since the hardware ignores the upper half
5041         CVariable* temp = m_currShader->GetNewVariable(
5042             numLanes(m_currShader->m_SIMDSize), ISA_TYPE_D, EALIGN_GRF, CName::NONE);
5043         varMaskOpnd = psProgram->BitCast(varMaskOpnd, ISA_TYPE_UW);
5044         psProgram->PackAndCopyVariable(temp, varMaskOpnd);
5045         varMaskOpnd = temp;
5046     }
5047 
5048     if (inst->hasDepth())
5049     {
5050         varDepthOpnd = GetSymbol(inst->getDepth());
5051         if (varDepthOpnd->IsUniform())
5052         {
5053             CVariable* temp = m_currShader->GetNewVariable(
5054                 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F, EALIGN_GRF, CName::NONE);
5055             m_encoder->Copy(temp, varDepthOpnd);
5056             m_encoder->Push();
5057             varDepthOpnd = temp;
5058         }
5059     }
5060 
5061     if (inst->hasStencil())
5062     {
5063         varStencilOpnd = GetSymbol(inst->getStencil());
5064         /*4 bytes are needed for the final destination per element*/
5065         CVariable* temp = m_currShader->GetNewVariable(
5066             numLanes(m_currShader->m_SIMDSize) * 4, ISA_TYPE_UB, EALIGN_GRF, CName::NONE);
5067         CVariable* ubSrc = m_currShader->BitCast(varStencilOpnd, ISA_TYPE_UB);
5068         if (varStencilOpnd->IsUniform())
5069         {
5070             m_encoder->SetSrcRegion(0, 0, 1, 0);
5071         }
5072         else
5073         {
5074             m_encoder->SetSrcRegion(0, 32, 8, 4);
5075         }
5076         m_currShader->CopyVariable(temp, ubSrc, 0);
5077         varStencilOpnd = temp;
5078     }
5079 
5080 }
5081 
5082 // Generate a predicate based on the currently active channels.  The 'alias' is
5083 // an existing variable in the caller's context that is reused only for generating
5084 // the mask, to avoid allocating a new variable.
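// Comparing 'any' with itself for equality is true in every executing lane, so the
// resulting flag ends up with bits set exactly for the currently active channels.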
5085 
5086 void EmitPass::emitPredicateFromChannelIP(CVariable* dst, CVariable* alias)
5087 {
5088     CVariable* any;
5089 
5090     if (alias)
5091     {
5092         any = m_currShader->GetNewAlias(alias, ISA_TYPE_UD, 0, 1);
5093     }
5094     else
5095     {
5096         any = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, CName::NONE);
5097     }
5098 
5099     m_encoder->SetSrcRegion(0, 0, 1, 0);
5100     m_encoder->SetSrcRegion(1, 0, 1, 0);
5101     m_encoder->Cmp(EPREDICATE_EQ, dst, any, any);
5102     m_encoder->Push();
5103 }
5104 
5105 void EmitPass::emitRenderTargetWrite(llvm::RTWritIntrinsic* inst, bool fromRet)
5106 {
5107     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
5108     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
5109 
5110     bool lastRenderTarget = psProgram->IsLastRTWrite(inst);
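    // EOT is sent only with the last render-target write, and only once per thread:
    // for dual-instance shaders it is deferred to the second half.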
5111     bool EOT = lastRenderTarget && (m_encoder->IsSecondHalf() || m_currShader->m_numberInstance == 1);
5112     bool isNullRT = false;
5113     int RTIndex = inst->getRTIndexImm();
5114     bool oMask = inst->hasMask();
5115     bool outputDepth = inst->hasDepth();
5116     bool outputStencil = inst->hasStencil();
5117     bool perSample = inst->perSample();
5118     Value* vSrc0Alpha = inst->getSource0Alpha();
5119     Value* vMask = inst->getOMask();
5120     Value* pMask = inst->getPMask();
5121     Value* vDepth = inst->getDepth();
5122     Value* vStencil = inst->getStencil();
5123     Value* vSample = inst->getSampleIndex();
5124     Value* vColor[4] = { inst->getRed(), inst->getGreen(), inst->getBlue(), inst->getAlpha() };
5125 
5126     if (outputDepth)
5127     {
5128         psProgram->OutputDepth();
5129     }
5130     if (outputStencil)
5131     {
5132         psProgram->OutputStencil();
5133     }
5134     if (oMask)
5135     {
5136         psProgram->OutputMask();
5137     }
5138     uint bindingTableIndex = 0;
5139     if (RTIndex != -1)
5140     {
5141         bindingTableIndex = m_currShader->m_pBtiLayout->GetRenderTargetIndex(RTIndex);
5142     }
5143     else
5144     {
5145         if (!psProgram->IsLastPhase())
5146         {
5147             return;
5148         }
5149         bindingTableIndex = m_currShader->m_pBtiLayout->GetNullSurfaceIdx();
5150 
5151         isNullRT = true;
5152     }
5153 
5154     bool directIdx = inst->isImmRTIndex();
5155     m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, RENDER_TARGET, RTIndex, bindingTableIndex);
5156 
5157     if (EOT)
5158     {
5159         IGC_ASSERT(psProgram->m_hasEOT == false);
5160         psProgram->m_hasEOT = true;
5161     }
5162 
5163     //Following variables will receive output from a call
5164     CVariable* src[4] = { nullptr, nullptr, nullptr, nullptr };
5165     bool isUndefined[4] = { false, false, false, false };
5166     CVariable* source0Alpha = nullptr;
5167     CVariable* oMaskOpnd = nullptr;
5168     CVariable* outputDepthOpnd = nullptr;
5169     CVariable* stencilOpnd = nullptr;
5170     CVariable* pMaskOpnd = nullptr;
5171 
5172     DenseMap<Value*, CVariable**> valueToVariableMap;
5173     if (!isa<UndefValue>(vSrc0Alpha)) {
5174         valueToVariableMap[vSrc0Alpha] = &source0Alpha;
5175     }
5176     if (oMask) {
5177         valueToVariableMap[vMask] = &oMaskOpnd;
5178     }
5179     if (outputDepth) {
5180         valueToVariableMap[vDepth] = &outputDepthOpnd;
5181     }
5182     if (outputStencil) {
5183         valueToVariableMap[vStencil] = &stencilOpnd;
5184     }
5185 
5186     valueToVariableMap[vColor[0]] = &src[0];
5187     valueToVariableMap[vColor[1]] = &src[1];
5188     valueToVariableMap[vColor[2]] = &src[2];
5189     valueToVariableMap[vColor[3]] = &src[3];
5190 
5191     prepareRenderTargetWritePayload(
5192         //in:
5193         inst,
5194         valueToVariableMap,
5195         vColor,
5196         4,
5197         //out:
5198         src,
5199         isUndefined,
5200         source0Alpha,
5201         oMaskOpnd,
5202         outputDepthOpnd,
5203         stencilOpnd);
5204 
5205     CVariable* cpsCounter = nullptr;
5206     if (psProgram->GetPhase() == PSPHASE_PIXEL)
5207     {
5208         cpsCounter = psProgram->GetCurrentPhaseCounter();
5209     }
5210 
5211     bool coarseMode = false;
5212     if (psProgram->GetPhase() == PSPHASE_COARSE)
5213     {
5214         coarseMode = true;
5215     }
5216 
5217     CVariable* bti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_D);
5218 
5219     CVariable* sampleIndex = nullptr;
5220     if (m_currShader->m_Platform->supportHeaderRTW() && perSample)
5221     {
5222         sampleIndex = GetSymbol(vSample);
5223         if (!sampleIndex->IsUniform())
5224         {
5225             sampleIndex = UniformCopy(sampleIndex);
5226 
5227         }
5228     }
5229 
5230     if (psProgram->HasDiscard())
5231     {
5232         ConstantInt* cv = dyn_cast<ConstantInt>(pMask);
5233         if (!cv || cv->getZExtValue() == 0)
5234         {
5235             pMaskOpnd = GetSymbol(pMask);
5236         }
5237     }
5238 
5239 
5240     if (pMaskOpnd)
5241     {
5242         m_encoder->SetPredicate(pMaskOpnd);
5243     }
5244 
5245     bool isHeaderMaskFromCe0 =
5246         !isa<ReturnInst>(inst->getParent()->getTerminator()) &&
5247         pMaskOpnd == nullptr;
5248 
5249     CVariable* rtIndexOpnd;
5250     if (RTIndex < 0 || (m_moduleMD->psInfo.BlendStateDisabledMask & BIT(RTIndex)))
5251     {
5252         // if blending is disabled no need to set the RTIndex in the header
5253         rtIndexOpnd = m_currShader->ImmToVariable(0, ISA_TYPE_D);
5254     }
5255     else
5256     {
5257         if (psProgram->IsPerSample())
5258         {
5259             rtIndexOpnd = GetSymbol(inst->getBlendStateIndex());
5260             IGC_ASSERT(rtIndexOpnd->IsUniform());
5261         }
5262         else
5263         {
5264             rtIndexOpnd = m_currShader->ImmToVariable(RTIndex, ISA_TYPE_D);
5265         }
5266     }
5267 
5268     m_encoder->RenderTargetWrite(
5269         src,
5270         isUndefined,
5271         lastRenderTarget,
5272         isNullRT,
5273         perSample,
5274         coarseMode,
5275         isHeaderMaskFromCe0,
5276         bti,
5277         rtIndexOpnd,
5278         source0Alpha,
5279         oMaskOpnd,
5280         outputDepthOpnd,
5281         stencilOpnd,
5282         cpsCounter /*cpscounter*/,
5283         sampleIndex,
5284         psProgram->GetR1());
5285     m_encoder->Push();
5286 }
5287 
5288 void EmitPass::emitSimdLaneId(llvm::Instruction* inst)
5289 {
5290     m_currShader->GetSimdOffsetBase(m_destination);
5291 }
5292 
5293 void EmitPass::emitPatchInstanceId(llvm::Instruction* inst)
5294 {
5295     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::HULL_SHADER);
5296     CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
5297 
5298     // Set barrier-encountered to true so we can program the instance count accordingly
5299     hsProgram->SetBarrierEncountered();
5300 
5301     /*
5302     **     R0.2       23:17      Instance Number. A patch-relative instance number between 0 and InstanceCount-1. BDW, SKL.
5303     **     -----------------
5304     **     R0.2       22:16      Instance Number. A patch-relative instance number between 0 and InstanceCount-1. CNL+.
5305     */
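    // For example, on a platform where the field occupies bits 22:16,
    // getHullShaderThreadInstanceIdBitFieldPosition() would return 16; shifting R0.2
    // right by that amount and masking with 0x7f yields the instance number.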
5306     unsigned int instanceIdStartBit = m_currShader->m_Platform->getHullShaderThreadInstanceIdBitFieldPosition();
5307     CVariable* mask7bit = m_currShader->ImmToVariable(0x7f, ISA_TYPE_UD);
5308     m_encoder->SetSrcRegion(0, 0, 1, 0);
5309     m_encoder->SetSrcSubReg(0, 2);
5310     m_encoder->Shr(m_destination, hsProgram->GetR0(), m_currShader->ImmToVariable(instanceIdStartBit, ISA_TYPE_UD));
5311     m_encoder->SetSrcSubReg(0, 0);
5312     m_encoder->And(m_destination, m_destination, mask7bit);
5313     m_encoder->Push();
5314 }
5315 
5316 void EmitPass::emitSimdSize(llvm::Instruction* inst)
5317 {
5318     //CVariable* simdSize = m_currShader->ImmToVariable(numLanes(m_SimdMode), ISA_TYPE_UD);
5319     //m_encoder->Cast(m_destination, simdSize);
5320     //m_encoder->Push();
5321 }
5322 
5323 /// Emits VISA instructions for SIMD_SHUFFLE.
5324 void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
5325 {
5326     CVariable* data = GetSymbol(inst->getOperand(0));
5327     CVariable* simdChannel = GetSymbol(inst->getOperand(1));
5328 
5329     const bool isSimd32 = (m_currShader->m_dispatchSize == SIMDMode::SIMD32);
5330 
5331     if (data->IsUniform())
5332     {
5333         m_encoder->Copy(m_destination, data);
5334         m_encoder->Push();
5335         if (isSimd32 && !m_destination->IsUniform())
5336         {
5337             m_encoder->SetSecondHalf(true);
5338             m_encoder->Copy(m_destination, data);
5339             m_encoder->Push();
5340             m_encoder->SetSecondHalf(false);
5341         }
5342     }
5343     else if (simdChannel->IsImmediate())
5344     {
5345         uint dataIndex = int_cast<uint>(simdChannel->GetImmediateValue());
5346         // prevent out of bound access
5347         dataIndex = dataIndex % numLanes(m_currShader->m_dispatchSize);
5348         if (isSimd32)
5349         {
5350             const bool isSrcInSecondHalf = dataIndex >= 16;
5351             dataIndex = dataIndex % numLanes(m_encoder->GetSimdSize());
5352 
5353             if (m_destination->IsUniform())
5354             {
5355                 m_encoder->SetSecondHalf(isSrcInSecondHalf);
5356                 m_encoder->SetSrcRegion(0, 0, 1, 0);
5357                 m_encoder->SetSrcSubReg(0, dataIndex);
5358                 m_encoder->Copy(m_destination, data);
5359                 m_encoder->Push();
5360                 m_encoder->SetSecondHalf(false);
5361             }
5362             else
5363             {
5364                 // Use an intermediate uniform variable
5365                 CVariable* uniformTemp = m_currShader->GetNewVariable(
5366                     1,
5367                     data->GetType(),
5368                     m_encoder->GetCISADataTypeAlignment(data->GetType()),
5369                     true, // isUniform
5370                     "ShuffleTmp");
5371 
5372                 // Copy from source to the uniform temp...
5373                 m_encoder->SetSecondHalf(isSrcInSecondHalf);
5374                 m_encoder->SetSrcRegion(0, 0, 1, 0);
5375                 m_encoder->SetSrcSubReg(0, dataIndex);
5376                 m_encoder->SetNoMask();
5377                 m_encoder->Copy(uniformTemp, data);
5378                 m_encoder->Push();
5379                 m_encoder->SetSecondHalf(false);
5380 
5381                 // ...and broadcast.
5382                 m_encoder->Copy(m_destination, uniformTemp);
5383                 m_encoder->Push();
5384                 m_encoder->SetSecondHalf(true);
5385                 m_encoder->Copy(m_destination, uniformTemp);
                     m_encoder->Push();
5386                 m_encoder->SetSecondHalf(false);
5387 
5388             }
5389         }
5390         else
5391         {
5392             m_encoder->SetSrcRegion(0, 0, 1, 0);
5393             m_encoder->SetSrcSubReg(0, dataIndex);
5394             m_encoder->Copy(m_destination, data);
5395             m_encoder->Push();
5396         }
5397     }
5398     else
5399     {
5400         // Emits the instructions below when simdChannel isn't an immediate.
5401         //shl (16) r8.0<1>:ud r6.0<0;1,0>:d 0x2:uw {Align1, H1, NoMask}
5402         //add (16) a0.0<1>:uw r8.0<16;8,2>:uw 0x80:uw {Align1, H1, NoMask}
5403         //mov (16) r10.0<1>:d r[a0.0, 0]<1,0>:d {Align1, H1}
5404         // For SIMD32:
5405         //    shl(M1, 32) V465(0, 0)<1> V464(0, 0)<16; 8, 2> 0x2:uw                           /// $592
5406         //    mov(M1, 32) V466(0, 0)<1> V70(0, 0)<1; 1, 0>                                    /// $593
5407         //    addr_add(M1, 16) A0(0)<1> &V466 + 0 V465(0, 0)<1; 1, 0>                          /// $594
5408         //    mov(M1, 16) V463(0, 0)<1> r[A0(0), 0]<1, 0> : f                                  /// $595
5409         //    addr_add(M5, 16) A0(0)<1> &V466 + 0 V465(0, 16)<1; 1, 0>                         /// $596
5410         //    mov(M5, 16) V463(1, 0)<1> r[A0(0), 0]<1, 0> : f                                  /// $597
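        // In short (a sketch of the scheme used below): convert each lane's
        // channel index into a byte offset, load the offsets into the address
        // register A0, then gather the selected lanes with register-indirect
        // moves r[A0]; for SIMD32 this is done once per 16-lane half.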
5411 
5412         bool channelUniform = simdChannel->IsUniform();
5413 
5414         IGC_ASSERT_MESSAGE(m_encoder->GetCISADataTypeSize(simdChannel->GetType()) == 4,
5415             "simdChannel size of simdShuffle should be 4 bytes!");
5416 
5417         // Choose the shift factor.
5418         int shtAmt = 0;
5419         switch (m_encoder->GetCISADataTypeSize(m_destination->GetType()))
5420         {
5421         case 1:  shtAmt = 0; break;
5422         case 2:  shtAmt = 1; break;
5423         case 4:  shtAmt = 2; break;
5424         case 8:  shtAmt = 3; break;
5425         default: IGC_ASSERT_MESSAGE(0, "Unexpected data type size.");
5426         }
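        // e.g. for a 4-byte (DW) destination shtAmt == 2, so the byte offset
        // computed below is simdChannel << 2.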
5427 
5428         CVariable* simdChannelUW = m_currShader->BitCast(simdChannel, ISA_TYPE_UW);
5429         CVariable* pSrcElm = m_currShader->GetNewVariable(
5430             simdChannel->GetNumberElement(),
5431             ISA_TYPE_UW,
5432             EALIGN_GRF,
5433             channelUniform,
5434             simdChannel->GetNumberInstance(),
5435             "ShuffleTmp");
5436         if (!channelUniform)
5437         {
5438             m_encoder->SetSrcRegion(0, 16, 8, 2);
5439         }
5440         m_encoder->Shl(pSrcElm, simdChannelUW,
5441             m_currShader->ImmToVariable(shtAmt, ISA_TYPE_UW));
5442         m_encoder->Push();
5443 
5444         CVariable* src = data;
5445 
5446         if (isSimd32)
5447         {
5448             CVariable* contiguousData = nullptr;
5449             CVariable* upperHalfOfContiguousData = nullptr;
5450 
5451             const uint16_t numElements = data->GetNumberElement();
5452             const VISA_Type dataType = data->GetType();
5453 
5454             IGC_ASSERT(numElements == 16);
5455             IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5456 
5457             // Create a 32 element variable and copy both instances of data into it.
5458             contiguousData = m_currShader->GetNewVariable(
5459                 numElements * 2,
5460                 dataType,
5461                 data->GetAlign(),
5462                 false, // isUniform
5463                 1,
5464                 "ShuffleTmp"); // numberInstance
5465 
5466             upperHalfOfContiguousData = m_currShader->GetNewAlias(
5467                 contiguousData,
5468                 dataType,
5469                 numElements * m_encoder->GetCISADataTypeSize(dataType),
5470                 numElements);
5471 
5472             IGC_ASSERT(contiguousData);
5473             IGC_ASSERT(upperHalfOfContiguousData);
5474 
5475             m_encoder->SetSecondHalf(false);
5476             m_encoder->Copy(contiguousData, data);
5477             m_encoder->Push();
5478 
5479             m_encoder->SetSecondHalf(true);
5480             m_encoder->Copy(upperHalfOfContiguousData, data);
5481             m_encoder->Push();
5482 
5483             if (!channelUniform)
5484             {
5485                 // also calculate the second half of address
5486                 m_encoder->SetSrcRegion(0, 16, 8, 2);
5487                 m_encoder->Shl(pSrcElm, simdChannelUW,
5488                     m_currShader->ImmToVariable(shtAmt, ISA_TYPE_UW));
5489                 m_encoder->Push();
5490             }
5491 
5492             m_encoder->SetSecondHalf(false);
5493 
5494             src = contiguousData;
5495         }
5496 
5497         uint16_t addrSize = channelUniform ? 1 :
5498             (m_SimdMode == SIMDMode::SIMD32 ? numLanes(SIMDMode::SIMD16) : numLanes(m_SimdMode));
5499 
5500         // VectorUniform for shuffle is true, as all SIMD lanes
5501         // take the same data as lane 0.
5502         CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
5503             addrSize,
5504             m_destination->GetType(),
5505             channelUniform,
5506             true,
5507             m_destination->getName());
5508 
5509         m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5510         m_encoder->Push();
5511 
5512         m_encoder->Copy(m_destination, pDstArrElm);
5513         m_encoder->Push();
5514 
5515         if (isSimd32)
5516         {
5517             m_encoder->SetSecondHalf(true);
5518             m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5519             m_encoder->Push();
5520             m_encoder->Copy(m_destination, pDstArrElm);
5521             m_encoder->Push();
5522             m_encoder->SetSecondHalf(false);
5523         }
5524     }
5525 }
5526 
5527 void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
5528 {
5529     CVariable* pCurrentData = GetSymbol(inst->getOperand(0));
5530     CVariable* pNextData = GetSymbol(inst->getOperand(1));
5531     CVariable* pDelta = m_currShader->GetSymbol(inst->getOperand(2));
5532 
5533     // temp size is the sum of src0 and src1
5534     uint16_t nbElements = numLanes(m_SimdMode) * 2;
5535 
5536     // Join current and Next Data
5537     CVariable* pCombinedData = m_currShader->GetNewVariable(
5538         nbElements,
5539         m_destination->GetType(),
5540         m_destination->GetAlign(),
5541         "ShuffleTmp");
5542 
5543     auto CopyData = [this](CVariable* pDestinationData, CVariable* pSourceData, uint32_t offset)
5544     {
5545         for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5546         {
5547             IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5548             uint32_t currentOffset = offset + numLanes(m_encoder->GetSimdSize()) * i;
5549             bool isSecondHalf = i == 1;
5550 
5551             if (isSecondHalf)
5552             {
5553                 m_encoder->SetSecondHalf(true);
5554             }
5555 
5556             m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5557             m_encoder->SetDstSubReg(currentOffset);
5558             m_encoder->SetNoMask();
5559             m_encoder->Copy(pDestinationData, pSourceData);
5560             m_encoder->Push();
5561 
5562             if (isSecondHalf)
5563             {
5564                 m_encoder->SetSecondHalf(false);
5565             }
5566         }
5567     };
5568 
5569     CopyData(pCombinedData, pCurrentData, 0);
5570     CopyData(pCombinedData, pNextData, numLanes(m_encoder->GetSimdSize()) * m_currShader->m_numberInstance);
5571 
5572     // Emit mov with direct addressing when delta is a compile-time constant.
5573     const bool useDirectAddressing = pDelta->IsImmediate()
5574         && m_currShader->m_Platform->GetPlatformFamily() != IGFX_GEN8_CORE;
5575 
5576     auto nativeExecSize = numLanes(m_currShader->m_Platform->getMinDispatchMode());
5577     auto width = numLanes(m_SimdMode);
5578     if (useDirectAddressing && nativeExecSize * 2 >= width)
5579     {
5580         const uint dataIndex = pDelta->GetImmediateValue() % nbElements;
5581         int tripCount = width <= nativeExecSize ? 1 : 2;
5582         for (int i = 0; i < tripCount; ++i)
5583         {
5584             m_encoder->SetSimdSize(m_currShader->m_Platform->getMinDispatchMode());
5585             m_encoder->SetSrcRegion(0, 1, 1, 0);
5586             m_encoder->SetSrcSubReg(0, dataIndex + nativeExecSize * i);
5587             m_encoder->SetDstSubReg(nativeExecSize * i);
5588             m_encoder->Copy(m_destination, pCombinedData);
5589             m_encoder->Push();
5590         }
5591         return;
5592     }
5593 
5594     // Emits the instructions below:
5595     // mov (8) r12.0<1>:w 0x76543210:v {Align1, Q1, NoMask}
5596     // mov (8) r38.0<1>:ud r12.0<8;8,1>:w {Align1, Q1, NoMask}
5597     // add (8) r39.0<1>:ud r38.0<8;8,1>:ud 0x8:uw {Align1, Q1, NoMask}
5598     // add (16) r40.0<1>:ud r14.0<8;8,1>:d r38.0<8;8,1>:ud {Align1, H1, NoMask}
5599     // shl (16) r42.0<1>:ud r40.0<8;8,1>:ud 0x2:uw {Align1, H1, NoMask}
5600     // add (16) a0.0<1>:uw r42.0<16;8,2>:uw 0x440:uw {Align1, H1, NoMask}
5601     // mov (16) r49.0<1>:d r[a0.0, 0]<1,0>:d {Align1, H1}
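    // The overall scheme (sketch): build per-lane indices laneId + delta,
    // convert them to byte offsets into the combined current+next buffer
    // (which holds 2 * SIMD lanes, so the indices stay in range for
    // delta <= SIMD width), and gather via register-indirect addressing.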
5602 
5603     CVariable* pLaneId = m_currShader->GetNewVariable(
5604         numLanes(m_SimdMode),
5605         ISA_TYPE_UD,
5606         EALIGN_GRF,
5607         "LaneId");
5608 
5609     m_encoder->SetSimdSize(SIMDMode::SIMD8);
5610     m_encoder->SetNoMask();
5611     CVariable* imm0 = m_currShader->ImmToVariable(0x76543210, ISA_TYPE_V);
5612     m_encoder->Cast(pLaneId, imm0);
5613     m_encoder->Push();
5614 
5615     if (m_SimdMode == SIMDMode::SIMD16 || m_SimdMode == SIMDMode::SIMD32)
5616     {
5617         m_encoder->SetDstSubVar(0);
5618         m_encoder->SetDstSubReg(8);
5619         m_encoder->SetSimdSize(SIMDMode::SIMD8);
5620         m_encoder->SetNoMask();
5621         CVariable* imm1 = m_currShader->ImmToVariable(0x8, ISA_TYPE_UD);
5622         m_encoder->Add(pLaneId, pLaneId, imm1);
5623         m_encoder->Push();
5624     }
5625 
5626     if (m_SimdMode == SIMDMode::SIMD32)
5627     {
5628         m_encoder->SetSimdSize(SIMDMode::SIMD16);
5629         m_encoder->SetDstSubReg(16);
5630         m_encoder->SetNoMask();
5631         CVariable* imm1 = m_currShader->ImmToVariable(0x10, ISA_TYPE_UD);
5632         m_encoder->Add(pLaneId, pLaneId, imm1);
5633         m_encoder->Push();
5634     }
5635 
5636     CVariable* pShuffleIdx = m_currShader->GetNewVariable(
5637         numLanes(m_SimdMode),
5638         ISA_TYPE_UD,
5639         EALIGN_GRF,
5640         "ShuffleIdx");
5641 
5642     for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5643     {
5644         IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5645         uint32_t offset = numLanes(m_encoder->GetSimdSize()) * i;
5646         bool isSecondHalf = i == 1;
5647 
5648         if (isSecondHalf)
5649         {
5650             m_encoder->SetSecondHalf(true);
5651         }
5652 
5653         CVariable* pCurrentLaneId = m_currShader->GetNewAlias(
5654             pLaneId,
5655             pLaneId->GetType(),
5656             offset * m_encoder->GetCISADataTypeSize(pLaneId->GetType()),
5657             numLanes(m_encoder->GetSimdSize()));
5658 
5659         m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5660         m_encoder->SetDstSubReg(offset);
5661         m_encoder->SetNoMask();
5662         m_encoder->Add(pShuffleIdx, pCurrentLaneId, pDelta);
5663         m_encoder->Push();
5664 
5665         if (isSecondHalf)
5666         {
5667             m_encoder->SetSecondHalf(false);
5668         }
5669     }
5670 
5671     CVariable* pByteOffset = m_currShader->GetNewVariable(
5672         numLanes(m_SimdMode),
5673         ISA_TYPE_UD,
5674         EALIGN_GRF,
5675         "ByteOffset");
5676 
5677     uint32_t shift = m_destination->GetElemSize() / 2;
5678 
5679     for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5680     {
5681         uint32_t offset = numLanes(m_encoder->GetSimdSize()) * i;
5682 
5683         CVariable* pCurrentShuffleIdx = m_currShader->GetNewAlias(
5684             pShuffleIdx,
5685             pShuffleIdx->GetType(),
5686             offset * m_encoder->GetCISADataTypeSize(pShuffleIdx->GetType()),
5687             numLanes(m_encoder->GetSimdSize()));
5688 
5689         m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5690         m_encoder->SetDstSubReg(offset);
5691         m_encoder->SetNoMask();
5692         m_encoder->Shl(pByteOffset, pCurrentShuffleIdx, m_currShader->ImmToVariable(shift, ISA_TYPE_UD));
5693         m_encoder->Push();
5694     }
5695 
5696 
5697     uint16_t addrSize = m_SimdMode == SIMDMode::SIMD32 ? numLanes(SIMDMode::SIMD16) : numLanes(m_SimdMode);
5698 
5699     CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
5700         addrSize,
5701         m_destination->GetType(),
5702         false,
5703         false,
5704         m_destination->getName());
5705 
5706     for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5707     {
5708         IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5709         uint32_t offset = numLanes(m_encoder->GetSimdSize()) * i;
5710         bool isSecondHalf = i == 1;
5711 
5712         CVariable* pCurrentByteOffset = m_currShader->GetNewAlias(
5713             pByteOffset,
5714             pByteOffset->GetType(),
5715             offset * m_encoder->GetCISADataTypeSize(pByteOffset->GetType()),
5716             numLanes(m_encoder->GetSimdSize()));
5717 
5718         m_encoder->SetNoMask();
5719         m_encoder->SetSrcRegion(1, 16, 8, 2);
5720         m_encoder->AddrAdd(pDstArrElm, pCombinedData, m_currShader->BitCast(pCurrentByteOffset, ISA_TYPE_UW));
5721         m_encoder->Push();
5722 
5723         if (isSecondHalf)
5724         {
5725             m_encoder->SetSecondHalf(true);
5726         }
5727 
5728         m_encoder->Copy(m_destination, pDstArrElm);
5729         m_encoder->Push();
5730 
5731         if (isSecondHalf)
5732         {
5733             m_encoder->SetSecondHalf(false);
5734         }
5735     }
5736 }
5737 
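// Returns the largest supported block-message size (in bytes) that fits both
// bytesRemaining and maxSize. For illustration, 192 remaining bytes with a
// 128-byte cap are covered by two messages of 128 and 64 bytes.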
5738 static uint32_t getBlockMsgSize(uint32_t bytesRemaining, uint32_t maxSize)
5739 {
5740     uint32_t size = 0;
5741     if (bytesRemaining >= 256)
5742     {
5743         size = 256;
5744     }
5745     else if (bytesRemaining >= 128)
5746     {
5747         size = 128;
5748     }
5749     else if (bytesRemaining >= 64)
5750     {
5751         size = 64;
5752     }
5753     else if (bytesRemaining >= 32)
5754     {
5755         size = 32;
5756     }
5757     else
5758     {
5759         size = 16;
5760     }
5761     return std::min(size, maxSize);
5762 }
5763 
5764 
5765 void EmitPass::emitSimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal)
5766 {
5767     emitLegacySimdBlockWrite(inst, ptrVal);
5768 
5769 }
5770 
5771 void EmitPass::emitSimdBlockRead(llvm::Instruction* inst, llvm::Value* ptrVal)
5772 {
5773     emitLegacySimdBlockRead(inst, ptrVal);
5774 }
5775 
5776 void EmitPass::emitLegacySimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal)
5777 {
5778     Value* llPtr = inst->getOperand(0);
5779     Value* dataPtr = inst->getOperand(1);
5780 
5781     PointerType* ptrType = cast<PointerType>(llPtr->getType());
5782     ResourceDescriptor resource = GetResourceVariable(llPtr);
5783 
5784     CVariable* src = nullptr;
5785     if (ptrVal)
5786     {
5787         src = GetSymbol(ptrVal);
5788         src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
5789     }
5790     else
5791     {
5792         src = GetSymbol(llPtr);
5793     }
5794 
5795     CVariable* data = GetSymbol(dataPtr);
5796     bool useA64 = isA64Ptr(ptrType, m_currShader->GetContext());
5797 
5798     Type* Ty = dataPtr->getType();
5799     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
5800     uint32_t nbElements = 0;
5801     nbElements = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
5802 
5803     uint32_t typeSizeInBytes = Ty->getScalarSizeInBits() / 8;
5804     uint32_t totalBytes = nbElements * typeSizeInBytes * numLanes(m_SimdMode);
5805 
5806     bool isSeparated = m_SimdMode == SIMDMode::SIMD32 &&
5807         m_encoder->GetSimdSize() == SIMDMode::SIMD16;
5808 
5809     // With multiple instances, the data layout differs from the one expected by block write instructions.
5810     // The expected layout:
5811     //  |0th component of data from thread 0-15 |0th component of data from thread 16-31|
5812     //  |1st component of data from thread 0-15 |1st component of data from thread 16-31|
5813     // The current layout:
5814     //  |0th component of data from thread 0-15 |1st component of data from thread 0-15 |
5815     //  |0th component of data from thread 16-31|1st component of data from thread 16-31|
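    // e.g. with nbElements == 2 the copies below place the four SIMD16 chunks
    // at destination slots 0..3 as: comp0/lanes0-15, comp0/lanes16-31,
    // comp1/lanes0-15, comp1/lanes16-31 (offset factor nbElements * elementIndex + i).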
5816     if (isSeparated)
5817     {
5818         IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5819         const uint32_t numVectorElementsPerSimd = numLanes(m_encoder->GetSimdSize());
5820         CVariable* copiedData = m_currShader->GetNewVariable(
5821             data->GetNumberElement() * data->GetNumberInstance(),
5822             data->GetType(),
5823             data->GetAlign(),
5824             "");
5825 
5826         for (uint32_t i = 0; i < 2; i++)
5827         {
5828             if (i == 1)
5829             {
5830                 m_encoder->SetSecondHalf(true);
5831             }
5832 
5833             for (uint32_t elementIndex = 0; elementIndex < nbElements; elementIndex++)
5834             {
5835                 // Offsets can be deduced from the upper comment.
5836                 CVariable* destinationAlias = m_currShader->GetNewAlias(
5837                     copiedData,
5838                     copiedData->GetType(),
5839                     numVectorElementsPerSimd * (nbElements * elementIndex + i) * m_encoder->GetCISADataTypeSize(copiedData->GetType()),
5840                     numVectorElementsPerSimd);
5841                 CVariable* sourceAlias = data;
5842                 if (!data->IsUniform())
5843                 {
5844                     sourceAlias = m_currShader->GetNewAlias(
5845                         data,
5846                         data->GetType(),
5847                         numVectorElementsPerSimd * elementIndex * m_encoder->GetCISADataTypeSize(data->GetType()),
5848                         numVectorElementsPerSimd);
5849                 }
5850 
5851                 m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5852                 m_encoder->SetNoMask();
5853                 m_encoder->Copy(destinationAlias, sourceAlias);
5854                 m_encoder->Push();
5855             }
5856         }
5857 
5858         m_encoder->SetSecondHalf(false);
5859         data = copiedData;
5860     }
5861     else
5862     {
5863         // Special case for uniform data: block writes expect non-uniform data, so broadcast it.
5864         data = BroadcastIfUniform(data);
5865     }
5866 
5867 
5868     // Special case for SIMD8 char block write, in which the total byte count is 8.
5869     // (In all other cases, the total byte count is a multiple of 16, i.e. one OWORD.)
5870     if (totalBytes == 8)
5871     {
5872         // Use Byte scattered write. If address is aligned at least QW,
5873         // we should use QW-aligned QW write!
5874         //    Byte scattered write:  use (blksizeInBits, nblk) = (8, 4) and two lanes
5875         //    QW write :             use (blksizeInBits, nblk) = (64, 1) [todo]
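        // Sketch of the (8, 4) case used here: two active lanes, each writing
        // 4 byte-blocks; the 0x40 packed-vector immediate below gives lane
        // offsets {0, 4}, so lane 0 covers bytes 0-3 and lane 1 bytes 4-7.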
5876         bool useQW = false;
5877         uint32_t blkBits = useQW ? 64 : 8;
5878         uint32_t nBlks = useQW ? 1 : 4;
5879 
5880         uint16_t activelanes = useQW ? 1 : 2;
5881         // lanesToSIMDMode(activelanes);
5882         SIMDMode simdmode = useQW ? SIMDMode::SIMD1 : SIMDMode::SIMD2;
5883 
5884         CVariable* eOffset = src;
5885         eOffset = ReAlignUniformVariable(src, m_currShader->getGRFAlignment());
5886         CVariable* ScatterOff = eOffset;
5887         if (activelanes > 1)
5888         {
5889             IGC_ASSERT_MESSAGE(!useQW, "Only one lane is active when using QW!");
5890 
5891             ScatterOff = m_currShader->GetNewVariable(
5892                 activelanes, eOffset->GetType(), eOffset->GetAlign(), true, "ScatterOff");
5893 
5894             CVariable* immVar = m_currShader->ImmToVariable(0x40, ISA_TYPE_UV);
5895             if (useA64 && m_currShader->m_Platform->hasNoInt64AddInst()) {
5896                 emitAddPair(ScatterOff, eOffset, immVar);
5897             }
5898             else {
5899                 m_encoder->SetNoMask();
5900                 m_encoder->SetUniformSIMDSize(simdmode);
5901                 m_encoder->SetSrcRegion(0, 0, 1, 0);
5902                 m_encoder->Add(ScatterOff, eOffset, immVar);
5903                 m_encoder->Push();
5904             }
5905         }
5906 
5907         m_encoder->SetNoMask();
5908         m_encoder->SetUniformSIMDSize(simdmode);
5909         if (useA64)
5910         {
5911             emitScatterA64(data, ScatterOff, blkBits, nBlks, true);
5912         }
5913         else
5914         {
5915             m_encoder->ByteScatter(data, resource, ScatterOff, blkBits, nBlks);
5916         }
5917         m_encoder->Push();
5918 
5919         return;
5920     }
5921 
5922     if (useA64)
5923     {
5924         uint32_t bytesRemaining = totalBytes;
5925         uint32_t srcOffset = 0;
5926         uint32_t bytesToRead = 0;
5927 
5928         // Emits instructions generating one or more A64 OWORD block write instructions
5929         // The amount of data we need to write is n * Component Size OWORDs.
5930         // We can write 8, 4, or 2 OWORDs at a time. We can also write 1 OWORD,
5931         // but since this is a SIMD opcode and we're compiling SIMD8 or SIMD16,
5932         // we don't expect to see a 1 OWORD write.
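        // For illustration, SIMD16 with a 4-component DW payload gives
        // totalBytes == 256, which the loop below may split into e.g.
        // 128 + 128 bytes depending on the platform's maximum block size.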
5933 
5934         m_encoder->SetSimdSize(SIMDMode::SIMD1);
5935         m_encoder->SetNoMask();
5936         m_encoder->SetSrcRegion(0, 0, 1, 0);
5937 
5938         CVariable* pTempVar = m_currShader->GetNewVariable(
5939             numLanes(SIMDMode::SIMD1),
5940             ISA_TYPE_UQ,
5941             EALIGN_QWORD, true, CName::NONE);
5942 
5943         m_encoder->Copy(pTempVar, m_currShader->BitCast(src, ISA_TYPE_UQ));
5944         m_encoder->Push();
5945 
5946         while (bytesRemaining)
5947         {
5948             bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(false));
5949             bytesRemaining -= bytesToRead;
5950             m_encoder->OWStoreA64(data, pTempVar, bytesToRead, srcOffset);
5951 
5952             srcOffset = srcOffset + bytesToRead;
5953             m_encoder->Push();
5954 
5955             if (bytesRemaining)
5956             {
5957                 if (m_currShader->m_Platform->hasNoInt64AddInst()) {
5958                     CVariable* ImmVar = m_currShader->ImmToVariable(bytesToRead, ISA_TYPE_UD);
5959                     emitAddPair(pTempVar, pTempVar, ImmVar);
5960                 }
5961                 else {
5962                     m_encoder->SetSimdSize(SIMDMode::SIMD1);
5963                     m_encoder->SetNoMask();
5964                     m_encoder->SetSrcRegion(0, 0, 1, 0);
5965                     m_encoder->Add(pTempVar, pTempVar, m_currShader->ImmToVariable((bytesToRead), ISA_TYPE_UQ));
5966                     m_encoder->Push();
5967                 }
5968             }
5969         }
5970     }
5971     else
5972     {
5973         uint32_t bytesRemaining = totalBytes;
5974 
5975         // Emits instructions generating one or more OWORD block write instructions
5976         // The amount of data we need to write is n * Component Size OWORDs.
5977         // We can write 8, 4, or 2 OWORDs at a time. We can also write 1 OWORD,
5978         // but since this is a SIMD opcode and we're compiling SIMD8 or SIMD16,
5979         // we don't expect to see a 1 OWORD write.
5980 
5981         // shr   (1) r64.2<1>:ud r60.0<0; 1, 0>:ud 0x4:uw{ Align1, H1, NoMask }
5982         // mov  (16) r65.0<1>:ud r54.0<8; 8, 1>:ud{ Align1, NoMask, Compacted }
5983         // and   (1) r64.5<1>:ud r0.5<0; 1, 0>:ud 0x3ff:ud{ Align1, NoMask }
5984         // send (16) null<1>:uw r64 0xa 0x60a03ff:ud{ Align1, NoMask } oword block write
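        // The OWORD block write takes an OWORD-aligned offset in OWORD units,
        // hence src is shifted right by 4 below and later advanced by
        // bytesToRead / 16 OWORDs between messages.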
5985 
5986         CVariable* src0shifted = m_currShader->GetNewVariable(
5987             numLanes(SIMDMode::SIMD1),
5988             ISA_TYPE_UD,
5989             EALIGN_DWORD,
5990             "Src0Shifted");
5991 
5992         m_encoder->SetSimdSize(SIMDMode::SIMD1);
5993         m_encoder->SetNoMask();
5994         m_encoder->SetSrcRegion(0, 0, 1, 0);
5995         m_encoder->Shr(src0shifted, src, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
5996         m_encoder->Push();
5997 
5998         uint32_t srcOffset = 0;
5999         uint32_t bytesToRead = 0;
6000         while (bytesRemaining)
6001         {
6002             bool isToSLM = ptrType->getPointerAddressSpace() == ADDRESS_SPACE_LOCAL;
6003             bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(isToSLM));
6004             bytesRemaining -= bytesToRead;
6005 
6006             m_encoder->OWStore(data, resource.m_surfaceType, resource.m_resource, src0shifted, bytesToRead, srcOffset);
6007 
6008             srcOffset = srcOffset + bytesToRead;
6009             m_encoder->Push();
6010 
6011             if (bytesRemaining)
6012             {
6013                 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6014                 m_encoder->SetNoMask();
6015                 m_encoder->SetSrcRegion(0, 0, 1, 0);
6016                 m_encoder->Add(src0shifted, src0shifted, m_currShader->ImmToVariable((bytesToRead / 16), ISA_TYPE_UD)); // (bytesToRead / 16) is units of OWORDS
6017                 m_encoder->Push();
6018             }
6019         }
6020     }
6021 }
6022 
6023 void EmitPass::emitLegacySimdBlockRead(llvm::Instruction* inst, llvm::Value* ptrVal)
6024 {
6025     Value* llPtr = inst->getOperand(0);
6026     PointerType* ptrType = cast<PointerType>(llPtr->getType());
6027     ResourceDescriptor resource = GetResourceVariable(llPtr);
6028 
6029     CVariable* src = nullptr;
6030     if (ptrVal)
6031     {
6032         src = GetSymbol(ptrVal);
6033         src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
6034     }
6035     else
6036     {
6037         src = GetSymbol(llPtr);
6038     }
6039 
6040     // If it is SLM, use OW-aligned OW address. The byte address (default)
6041     // must be right-shifted by 4 bits to be OW address!
6042     bool isToSLM = (ptrType->getPointerAddressSpace() == ADDRESS_SPACE_LOCAL);
6043     bool useA64 = isA64Ptr(ptrType, m_currShader->GetContext());
6044 
6045     Type* Ty = inst->getType();
6046     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
6047     uint32_t nbElements = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
6048 
6049     uint32_t typeSizeInBytes = Ty->getScalarSizeInBits() / 8;
6050     uint32_t totalBytes = nbElements * typeSizeInBytes * numLanes(m_SimdMode);
6051 
6052 
6053     bool needsTempDst = m_SimdMode == SIMDMode::SIMD32 &&
6054         m_encoder->GetSimdSize() == SIMDMode::SIMD16;
6055     CVariable* dest = needsTempDst ?
6056         m_currShader->GetNewVariable(
6057             m_destination->GetNumberElement() * m_destination->GetNumberInstance(),
6058             m_destination->GetType(),
6059             m_destination->GetAlign(),
6060             "") :
6061         m_destination;
6062 
6063     // Special case for SIMD8 char block read, in which the total byte count is 8.
6064     // (In all other cases, the total byte count is a multiple of 16, i.e. one OWORD.)
6065     if (totalBytes == 8)
6066     {
6067         // Use Byte scattered read. If address is aligned at least QW,
6068         // we should use QW-aligned QW read!
6069         //    Byte Scattered read :  use (blksizeInBits, nblk) = (8, 4) and two lanes
6070         //    QW read :              use (blksizeInBits, nblk) = (64, 1) [todo]
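        // Same addressing scheme as the byte-scattered path in
        // emitLegacySimdBlockWrite above: two lanes, 4 byte-blocks each,
        // lane offsets {0, 4} from the packed 0x40 immediate.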
6071         bool useQW = false;
6072         uint32_t blkBits = useQW ? 64 : 8;
6073         uint32_t nBlks = useQW ? 1 : 4;
6074         CVariable* gatherDst = dest;
6075 
6076         uint16_t activelanes = useQW ? 1 : 2;
6077         // lanesToSIMDMode(activelanes);
6078         SIMDMode simdmode = useQW ? SIMDMode::SIMD1 : SIMDMode::SIMD2;
6079 
6080         CVariable* eOffset = src;
6081         eOffset = ReAlignUniformVariable(src, m_currShader->getGRFAlignment());
6082         CVariable* gatherOff = eOffset;
6083         if (activelanes > 1)
6084         {
6085             IGC_ASSERT_MESSAGE(!useQW, "Only one lane is active when using QW!");
6086 
6087             gatherOff = m_currShader->GetNewVariable(
6088                 activelanes, eOffset->GetType(), eOffset->GetAlign(), true, "GatherOff");
6089 
6090             CVariable* immVar = m_currShader->ImmToVariable(0x40, ISA_TYPE_UV);
6091             if (useA64 && m_currShader->m_Platform->hasNoInt64AddInst()) {
6092                 emitAddPair(gatherOff, eOffset, immVar);
6093             }
6094             else {
6095                 m_encoder->SetNoMask();
6096                 m_encoder->SetUniformSIMDSize(simdmode);
6097                 m_encoder->SetSrcRegion(0, 0, 1, 0);
6098                 m_encoder->Add(gatherOff, eOffset, immVar);
6099                 m_encoder->Push();
6100             }
6101         }
6102 
6103         m_encoder->SetNoMask();
6104         m_encoder->SetUniformSIMDSize(simdmode);
6105         if (useA64)
6106         {
6107             emitGatherA64(inst, gatherDst, gatherOff, blkBits, nBlks, true);
6108         }
6109         else
6110         {
6111             m_encoder->SetNoMask();
6112             m_encoder->SetUniformSIMDSize(simdmode);
6113             m_encoder->ByteGather(gatherDst, resource, gatherOff, blkBits, nBlks);
6114         }
6115         m_encoder->Push();
6116 
6117         return;
6118     }
6119 
6120     if (useA64)
6121     {
6122         IGC_ASSERT_MESSAGE(!isToSLM, "SLM's ptr size should be 32!");
6123 
6124         uint32_t dstOffset = 0;
6125         uint32_t bytesRemaining = totalBytes;
6126         uint32_t bytesToRead = 0;
6127 
6128         // Emits instructions generating one or more A64 OWORD block read instructions
6129         m_encoder->SetSimdSize(SIMDMode::SIMD1);
6130         m_encoder->SetNoMask();
6131         m_encoder->SetSrcRegion(0, 0, 1, 0);
6132 
6133         CVariable* pTempVar = m_currShader->GetNewVariable(
6134             numLanes(SIMDMode::SIMD1),
6135             ISA_TYPE_UQ,
6136             EALIGN_QWORD, true,
6137             CName::NONE);
6138 
6139         m_encoder->Copy(pTempVar, src);
6140         m_encoder->Push();
6141 
6142         while (bytesRemaining)
6143         {
6144             bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(false));
6145             bytesRemaining -= bytesToRead;
6146             m_encoder->OWLoadA64(dest, pTempVar, bytesToRead, dstOffset);
6147             m_encoder->Push();
6148             dstOffset += bytesToRead;
6149 
6150             if (bytesRemaining)
6151             {
6152                 if (m_currShader->m_Platform->hasNoInt64AddInst()) {
6153                     CVariable* ImmVar = m_currShader->ImmToVariable(bytesToRead, ISA_TYPE_UD);
6154                     emitAddPair(pTempVar, pTempVar, ImmVar);
6155                 }
6156                 else {
6157                     m_encoder->SetSimdSize(SIMDMode::SIMD1);
6158                     m_encoder->SetNoMask();
6159                     m_encoder->SetSrcRegion(0, 0, 1, 0);
6160                     m_encoder->Add(pTempVar, pTempVar, m_currShader->ImmToVariable(bytesToRead, ISA_TYPE_UQ));
6161                     m_encoder->Push();
6162                 }
6163             }
6164         }
6165     }
6166     else
6167     {
6168         // Emits the instructions below, generating one or more OWORD block read instructions:
6169         // mov (1)   r20.0<1>:ud r5.1<0;1,0>:ud {Align1, Q1, NoMask, Compacted}
6170         // and (1)   r21.5<1>:ud r0.5<0;1,0>:ud 0x3ff:ud {Align1, NoMask}
6171         // mov (1)   r21.2<1>:ud r20.0<0;1,0>:ud {Align1, NoMask, Compacted}
6172         // send (16) r12.0<1>:w  r21 0xa 0x24844ff:ud{Align1, NoMask}// unaligned oword block read
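        // Note: for the non-SLM path the raw byte address in src is used
        // directly for the first (unaligned) OWORD block read; only SLM uses
        // the OWORD-shifted pTempVar, and follow-up messages advance pTempVar.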
6173 
6174         m_encoder->SetSimdSize(SIMDMode::SIMD1);
6175         m_encoder->SetNoMask();
6176         m_encoder->SetSrcRegion(0, 0, 1, 0);
6177 
6178         CVariable* pTempVar = m_currShader->GetNewVariable(
6179             numLanes(SIMDMode::SIMD1),
6180             ISA_TYPE_UD,
6181             EALIGN_DWORD,
6182             CName::NONE);
6183 
6184         if (isToSLM)
6185         {
6186             // It is OW-aligned OW address
6187             m_encoder->Shr(pTempVar, src, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
6188         }
6189 
6190         m_encoder->Push();
6191 
6192         uint32_t dstOffset = 0;
6193         uint32_t bytesToRead = 0;
6194         uint32_t bytesRemaining = totalBytes;
6195         bool isFirstIter = true;
6196         while (bytesRemaining)
6197         {
6198 
6199             bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(isToSLM));
6200             bytesRemaining -= bytesToRead;
6201 
6202             bool useSrc = isFirstIter && !isToSLM;
6203             m_encoder->OWLoad(dest, resource, useSrc ? src : pTempVar, isToSLM, bytesToRead, dstOffset);
6204             m_encoder->Push();
6205             dstOffset += bytesToRead;
6206 
6207             if (bytesRemaining)
6208             {
6209                 uint32_t offset = (isToSLM ? bytesToRead / 16 : bytesToRead);
6210                 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6211                 m_encoder->SetNoMask();
6212                 m_encoder->SetSrcRegion(0, 0, 1, 0);
6213                 m_encoder->Add(pTempVar, useSrc ? src : pTempVar, m_currShader->ImmToVariable(offset, ISA_TYPE_UD));
6214                 m_encoder->Push();
6215             }
6216             isFirstIter = false;
6217         }
6218     }
6219 
6220     // With multiple instances, the layout produced by the block read differs from the expected per-instance layout.
6221     // The expected layout:
6222     //  |0th component of data from thread 0-15 |1st component of data from thread 0-15 |
6223     //  |0th component of data from thread 16-31|1st component of data from thread 16-31|
6224     // The current layout:
6225     //  |0th component of data from thread 0-15 |0th component of data from thread 16-31|
6226     //  |1st component of data from thread 0-15 |1st component of data from thread 16-31|
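    // i.e. the inverse of the per-component shuffle performed in
    // emitLegacySimdBlockWrite above (source slot nbElements * elementIndex + i).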
6227     if (needsTempDst)
6228     {
6229         IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
6230         const uint32_t numVectorElementsPerSimd = numLanes(m_encoder->GetSimdSize());
6231 
6232         for (uint32_t i = 0; i < 2; i++)
6233         {
6234             if (i == 1)
6235             {
6236                 m_encoder->SetSecondHalf(true);
6237             }
6238 
6239             for (uint32_t elementIndex = 0; elementIndex < nbElements; elementIndex++)
6240             {
6241                 // Offsets can be deduced from the upper comment.
6242                 CVariable* destinationAlias = m_currShader->GetNewAlias(
6243                     m_destination,
6244                     m_destination->GetType(),
6245                     numVectorElementsPerSimd * elementIndex * m_encoder->GetCISADataTypeSize(m_destination->GetType()),
6246                     numVectorElementsPerSimd);
6247                 CVariable* sourceAlias = m_currShader->GetNewAlias(
6248                     dest,
6249                     dest->GetType(),
6250                     numVectorElementsPerSimd * (nbElements * elementIndex + i) * m_encoder->GetCISADataTypeSize(dest->GetType()),
6251                     numVectorElementsPerSimd);
6252 
6253                 m_encoder->SetSimdSize(m_encoder->GetSimdSize());
6254                 m_encoder->SetNoMask();
6255                 m_encoder->Copy(destinationAlias, sourceAlias);
6256                 m_encoder->Push();
6257             }
6258         }
6259 
6260         m_encoder->SetSecondHalf(false);
6261     }
6262 }
6263 
6264 
6265 void EmitPass::emitMediaBlockIO(const llvm::GenIntrinsicInst* inst, bool isRead)
6266 {
6267     uint ImgArgIndex = (uint)GetImmediateVal(inst->getOperand(0));
6268     uint isImageTypeUAV = (uint)GetImmediateVal(inst->getOperand(3));
6269 
6270     uint32_t BTI = isImageTypeUAV ?
6271         m_currShader->m_pBtiLayout->GetUavIndex(ImgArgIndex) :
6272         m_currShader->m_pBtiLayout->GetTextureIndex(ImgArgIndex);
6273 
6274     bool directIdx = (llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0))) ? true : false;
6275     m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, isImageTypeUAV ? UAV : RESOURCE, ImgArgIndex, BTI);
6276 
6277     CVariable* pImgBTI = m_currShader->ImmToVariable(BTI, ISA_TYPE_UD);
6278 
6279     // width and height must be supplied as compile time constants.
6280     uint blockWidth = (uint)cast<ConstantInt>(inst->getOperand(4))->getZExtValue();
6281     uint blockHeight = (uint)cast<ConstantInt>(inst->getOperand(5))->getZExtValue();
6282 
6283     auto* pFunc = inst->getCalledFunction();
6284     auto* pDataType = isRead ? pFunc->getReturnType() : inst->getOperand(6)->getType();
6285 
6286     uint typeSize = isa<VectorType>(pDataType) ?
6287         (uint)m_DL->getTypeSizeInBits(cast<VectorType>(pDataType)->getElementType()) / 8 :
6288         (uint)m_DL->getTypeSizeInBits(pDataType) / 8;
6289 
6290     uint widthInBytes = blockWidth * typeSize;
6291 
6292     CVariable* pXOffset = GetSymbol(inst->getOperand(1));
6293     CVariable* pYOffset = GetSymbol(inst->getOperand(2));
6294 
6295     CVariable* pDst = nullptr;
6296 
6297     auto* pData = isRead ? m_destination : BroadcastIfUniform(GetSymbol(inst->getOperand(6)));
6298 
6299     // For SIMD32, we need to rearrange the data from both halves
6300     // into a contiguous block to treat it as one SIMD32 write and
6301     // we need to split a read back into its two instances after
6302     // doing the read.
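    // The per-instance width here is 16 lanes, so the second half of the
    // merged block (pSecondHalf below) starts 16 elements into pDst.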
6303     bool mergeBlock = (m_SimdMode == SIMDMode::SIMD32);
6304     uint16_t numElements = pData->GetNumberElement();
6305     VISA_Type dataType = pData->GetType();
6306 
6307     if (mergeBlock)
6308     {
6309         // Make a block twice the size to hold data for both halves
6310         pDst = m_currShader->GetNewVariable(numElements * 2,
6311             dataType, pData->GetAlign(), false, 1, CName::NONE);
6312     }
6313     else
6314     {
6315         pDst = pData;
6316     }
6317 
6318     auto BlockCopy = [&](
6319         CVariable* pDst1,
6320         CVariable* pSrc1,
6321         CVariable* pDst2,
6322         CVariable* pSrc2,
6323         uint srcStride,
6324         uint dstStride)
6325     {
6326         auto VecCopy = [&](CVariable* pDst, CVariable* pSrc, uint nElts)
6327         {
6328             for (uint32_t i = 0; i < nElts; ++i)
6329             {
6330                 m_encoder->SetSrcSubReg(0, srcStride * 16 * i);
6331                 m_encoder->SetDstSubReg(dstStride * 16 * i);
6332                 m_encoder->Copy(pDst, pSrc);
6333                 m_encoder->Push();
6334             }
6335         };
6336 
6337         uint nElts = isa<VectorType>(pDataType) ?
6338             (uint)cast<IGCLLVM::FixedVectorType>(pDataType)->getNumElements() :
6339             1;
6340 
6341         // Now, do the copies.
6342         bool isSecondHalf = m_encoder->IsSecondHalf();
6343 
6344         m_encoder->SetSecondHalf(false);
6345         VecCopy(pDst1, pSrc1, nElts);
6346 
6347         m_encoder->SetSecondHalf(true);
6348         VecCopy(pDst2, pSrc2, nElts);
6349 
6350         m_encoder->SetSecondHalf(isSecondHalf);
6351     };
6352 
6353     CVariable* pSecondHalf = m_currShader->GetNewAlias(pDst, dataType,
6354         16 * m_encoder->GetCISADataTypeSize(dataType), numElements);
6355 
6356     if (!isRead && mergeBlock)
6357     {
6358         BlockCopy(pDst, pData, pSecondHalf, pData, 1, 2);
6359     }
6360 
6361     {
6362         m_encoder->MediaBlockMessage(
6363             isRead ? ISA_Opcode::ISA_MEDIA_LD : ISA_Opcode::ISA_MEDIA_ST,
6364             pDst,
6365             ESURFACE_NORMAL,
6366             pImgBTI,
6367             pXOffset,
6368             pYOffset,
6369             0,
6370             (unsigned char)widthInBytes,
6371             (unsigned char)blockHeight,
6372             0);
6373     }
6374 
6375     if (isRead && mergeBlock)
6376     {
6377         BlockCopy(m_destination, pDst, m_destination, pSecondHalf, 2, 1);
6378     }
6379 }
6380 
6381 void EmitPass::emitMediaBlockRectangleRead(llvm::Instruction* inst)
6382 {
6383     int SrcImgBTI = int_cast<int>(GetImmediateVal(inst->getOperand(0)));
6384     int isImageTypeUAV = int_cast<int>(GetImmediateVal(inst->getOperand(3)));
6385 
6386     CVariable* xOffset = GetSymbol(inst->getOperand(1));
6387     CVariable* yOffset = GetSymbol(inst->getOperand(2));
6388 
6389     uint32_t bindingTableIndex = isImageTypeUAV ?
6390         m_currShader->m_pBtiLayout->GetUavIndex(SrcImgBTI) :
6391         m_currShader->m_pBtiLayout->GetTextureIndex(SrcImgBTI);
6392 
6393     bool directIdx = (llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0))) ? true : false;
6394     m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, isImageTypeUAV ? UAV : RESOURCE, SrcImgBTI, bindingTableIndex);
6395 
6396     CVariable* srcbti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_UD);
6397 
6398     CVariable* pDst = GetSymbol(inst->getOperand(6));
6399 
6400     // width and height must be supplied as compile time constants.
6401     uint64_t blockWidth = cast<ConstantInt>(inst->getOperand(4))->getZExtValue();
6402     uint64_t blockHeight = cast<ConstantInt>(inst->getOperand(5))->getZExtValue();
6403 
6404     IGC_ASSERT(blockWidth * blockHeight == pDst->GetSize());
6405 
6406     {
6407         m_encoder->MediaBlockMessage(
6408             ISA_Opcode::ISA_MEDIA_LD,
6409             pDst,
6410             ESURFACE_NORMAL,
6411             srcbti,
6412             xOffset,
6413             yOffset,
6414             0,
6415             (unsigned char)blockWidth,
6416             (unsigned char)blockHeight,
6417             0);
6418     }
6419 
6420     m_encoder->Push();
6421 }
6422 
6423 void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
6424 {
6425     uint32_t nbElements = 1;
6426     if (inst->getType()->isVectorTy())
6427     {
6428         nbElements = (uint32_t)cast<IGCLLVM::FixedVectorType>(inst->getType())->getNumElements();
6429     }
6430     IGC_ASSERT_MESSAGE(nbElements <= 8, "Invalid vector size");
6431 
6432     int SrcImgBTI = int_cast<int>(GetImmediateVal(inst->getOperand(0)));
6433     int isImageTypeUAV = int_cast<int>(GetImmediateVal(inst->getOperand(3)));
6434 
6435     Value* xOffset = inst->getOperand(1);
6436     Value* yOffset = inst->getOperand(2);
6437 
6438     uint32_t typeSizeInBytes = inst->getType()->getScalarType()->getScalarSizeInBits() / 8;
6439     uint32_t totalWidth = typeSizeInBytes * numLanes(m_SimdMode);
6440 
6441     uint32_t   pass = 0;
6442     uint32_t   numPasses = 0;
6443     uint32_t   bindingTableIndex = 0;
6444 
6445     uint32_t dstSubReg = 0;
6446     uint32_t blockWidth = 0;
6447     uint32_t blockHeight = nbElements;
6448 
6449     if (isImageTypeUAV)
6450     {
6451         bindingTableIndex = m_currShader->m_pBtiLayout->GetUavIndex(SrcImgBTI);
6452     }
6453     else // image type is Resource
6454     {
6455         bindingTableIndex = m_currShader->m_pBtiLayout->GetTextureIndex(SrcImgBTI);
6456     }
6457 
6458     m_currShader->SetBindingTableEntryCountAndBitmap(true, isImageTypeUAV ? UAV : RESOURCE, SrcImgBTI, bindingTableIndex);
6459 
6460     CVariable* srcbti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_UD);
6461     uint32_t maxWidth = 32;
6462 
6463     if (totalWidth < maxWidth)
6464     {
6465         numPasses = 1;
6466         blockWidth = totalWidth;
6467     }
6468     else
6469     {
6470         IGC_ASSERT(maxWidth);
6471         IGC_ASSERT_MESSAGE(totalWidth % maxWidth == 0, "Total width must be divisible by 32!");
6472         numPasses = totalWidth / maxWidth;
6473         blockWidth = maxWidth;
6474     }
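    // e.g. SIMD16 with DW elements gives totalWidth == 64 bytes, i.e. two
    // 32-byte-wide passes; SIMD8 with DW elements fits in a single pass.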
6475 
6476 
6477     CVariable* pTempVar0 = nullptr;
6478     CVariable* pTempVar = nullptr;
6479 
6480     uint32_t blockRegSize = 0;
6481 
6482     // The following variable declaration is SIMD8-based and uses UD, so blockRegSize is the total number of registers required.
6483     auto simdMode = lanesToSIMDMode(blockWidth / typeSizeInBytes);
6484     blockRegSize = numPasses * blockHeight * numLanes(simdMode);
6485 
6486     CVariable* pTempDest = m_currShader->GetNewVariable(
6487         blockRegSize,
6488         m_destination->GetType(),
6489         m_currShader->getGRFAlignment(),
6490         CName::NONE);
6491 
6492     CVariable* xVar = GetSymbol(xOffset);
6493     CVariable* yVar = GetSymbol(yOffset);
6494 
6495     // Emits a MEDIA_BLOCK_READ instruction.
6496     // Considering block width as x-axis and block height as y axis:
6497     // Pass 0 reads from (xOffset,yOffset) to (xOffset+31, yOffset+blockheight)
6498     // Pass 1 reads from (xOffset+32, yOffset) to (xOffset+63, yOffset+blockheight)
6499     // Instructions generated:
6500     // mov(1) r36.1<1>:d r16.0<0; 1, 0>:d{ Align1, NoMask }
6501     // mov(1) r36.2<1>:ud 0x3001f:ud{ Align1, NoMask }
6502     // mov(1) r36.0<1>:ud r15.0<0; 1, 0>:ud{ Align1, NoMask, Compacted }
6503     // send(8) r28.0<1>:ud r36 0xc 0x2490000:ud{ Align1, NoMask } // media block read
6504     // add(1) r36.0<1>:ud r15.0<0; 1, 0>:ud 0x20:uw{ Align1, NoMask }
6505     // mov(1) r36.1<1>:d r13.1<0; 1, 0>:d{ Align1, NoMask }
6506     // send(8) r32.0<1>:ud r36 0xc 0x2490000:ud{ Align1, NoMask } // media block read
6507     //      -----------------
6508     //      |       |       |
6509     //      |       |       |
6510     //      -----------------
6511     //      ---------  r28 output
6512     //      |       |
6513     //      |       |
6514     //      ---------  r32
6515     //      |       |
6516     //      |       |
6517     //      ---------
6518     //  32 or 64 bytes at most, that's the reason simd8 is used.
6519 
6520     int scale = blockWidth / getGRFSize();
6521 
6522     for (pass = 0; pass < numPasses; pass++)
6523     {
6524         m_encoder->SetSimdSize(SIMDMode::SIMD1);
6525         m_encoder->SetNoMask();
6526         m_encoder->SetSrcRegion(0, 0, 1, 0);
6527 
6528         if (pass == 0)
6529         {
6530             pTempVar0 = m_currShader->GetNewVariable(
6531                 numLanes(m_SimdMode),
6532                 ISA_TYPE_UD,
6533                 EALIGN_DWORD,
6534                 CName::NONE);
6535 
6536             m_encoder->Copy(pTempVar0, xVar);
6537         }
6538         else
6539         {
6540             m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD));
6541             uint32_t subOffset = blockWidth * blockHeight;
6542             subOffset /= getGRFSize();
6543             dstSubReg = dstSubReg + subOffset;
6544         }
6545         m_encoder->Push();
6546 
6547         m_encoder->SetSimdSize(SIMDMode::SIMD1);
6548         m_encoder->SetNoMask();
6549         m_encoder->SetSrcRegion(0, 0, 1, 0);
6550 
6551         pTempVar = m_currShader->GetNewVariable(
6552             numLanes(m_SimdMode),
6553             ISA_TYPE_UD,
6554             EALIGN_DWORD,
6555             CName::NONE);
6556 
6557         m_encoder->Copy(pTempVar, yVar);
6558         m_encoder->Push();
6559 
6560         m_encoder->SetDstSubVar(dstSubReg);
6561 
6562         CVariable* dstVar = numPasses == 1 ? m_destination : pTempDest;
6563 
6564         {
6565             m_encoder->MediaBlockMessage(
6566                 ISA_Opcode::ISA_MEDIA_LD,
6567                 dstVar,
6568                 ESURFACE_NORMAL,
6569                 srcbti,
6570                 pTempVar0,
6571                 pTempVar,
6572                 0,
6573                 (unsigned char)blockWidth,
6574                 (unsigned char)blockHeight,
6575                 0);
6576         }
6577         m_encoder->Push();
6578     }
6579 
6580     if (numPasses > 1)
6581     {
6582         dstSubReg = 0;
6583 
6584         uint32_t srcSubReg = 0;
6585 
6586         // Join data obtained from pass 0 and pass 1 to make
6587         // xOffset contiguous from 0 to 63 bytes (making SIMD 16)
6588         // mov (8) r20.0<1>:ud r28.0<8;8,1>:ud {Align1, Q1}
6589         // mov (8) r21.0<1>:ud r32.0<8;8,1>:ud {Align1, Q2}
6590         // mov (8) r22.0<1>:ud r29.0<8;8,1>:ud {Align1, Q1}
6591         // mov (8) r23.0<1>:ud r33.0<8;8,1>:ud {Align1, Q2}
6592         // mov (8) r24.0<1>:ud r30.0<8;8,1>:ud {Align1, Q1}
6593         // mov (8) r25.0<1>:ud r34.0<8;8,1>:ud {Align1, Q2}
6594         // mov (8) r26.0<1>:ud r31.0<8;8,1>:ud {Align1, Q1}
6595         // mov (8) r27.0<1>:ud r35.0<8;8,1>:ud {Align1, Q2}
6596 
6597 
6598         //For 64 bytes GRF, 32 bytes will be extended to
6599         //.....
6600         //  A0....A1
6601         //  B0....B1
6602         //  C0....C1
6603         //  D0....D1
6604         //  E0....E1
6605         //  F0....F1
6606         //  G0....G1
6607         //  H0....H1
6608         //
6609         //  r20....A0....B0........r30....A1....B1
6610         //  r21....C0....D0........r31....C1....D1
6611         //  r22....E0....F0........r32....E1....F1
6612         //  r23....G0....H0........r33....G1....H1
6613         //
6614         //  r40<--r20,....r30
6615         //  r41<--r20.8,r30.8
6616         //  r42<--r21,....r31
6617         //  r43<--r21.8,r31.8
6618         //  r44<--r22,....r32
6619         //  r45<--r22.8,r32.8
6620         //  r46<--r23,....r33
6621         //  r47<--r23.8,r33.8
6622         //
6623         //mov (8) r40.0<1>:ud       r20.0<8;8,1>:ud {Align1, Q1}
6624         //mov (8) r40.8<1>:ud       r30.0<8;8,1>:ud {Align1, Q1}
6625         //mov (8) r41<1>:ud         r20.8<8;8,1>:ud {Align1, Q1}
6626         //mov (8) r41.8<1>:ud       r30.8<8;8,1>:ud {Align1, Q1}
6627 
6628         for (uint32_t i = 0; i < blockHeight; i++) //Height
6629         {
6630             uint32_t dstSubRegOffset = 0;
6631             uint32_t srcSubRegOffset = 0;
6632 
6633             for (uint32_t pass = 0; pass < numPasses; pass++) //Width
6634             {
6635                 m_encoder->SetSimdSize(simdMode);
6636                 m_encoder->SetNoMask();
6637 
6638                 srcSubReg = ((i + blockHeight * pass) * blockWidth) / getGRFSize();
6639                 srcSubRegOffset = (i * blockWidth) % getGRFSize();
6640 
6641                 m_encoder->SetSrcSubVar(0, srcSubReg);
6642                 m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
6643 
6644                 m_encoder->SetDstSubVar(dstSubReg);
6645                 m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
6646 
6647                 dstSubRegOffset = ((pass + 1) * blockWidth) % getGRFSize();
6648                 if (dstSubRegOffset == 0)
6649                 {
6650                     dstSubReg += (scale > 0 ? scale : 1);
6651                 }
6652 
6653                 m_encoder->Copy(m_destination, pTempDest);
6654                 m_encoder->Push();
6655             }
6656         }
6657     }
6658 }
6659 
6660 void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst)
6661 {
6662     int SrcImgBTI = int_cast<int>(GetImmediateVal(inst->getOperand(0)));
6663     int isImageTypeUAV = int_cast<int>(GetImmediateVal(inst->getOperand(3)));
6664 
6665     Value* xOffset = inst->getOperand(1);
6666     Value* yOffset = inst->getOperand(2);
6667     Value* dataPtr = inst->getOperand(4);
6668 
6669     uint32_t nbElements = 1;
6670     if (dataPtr->getType()->isVectorTy())
6671     {
6672         nbElements = (uint32_t)cast<IGCLLVM::FixedVectorType>(dataPtr->getType())->getNumElements();
6673     }
6674     IGC_ASSERT_MESSAGE(nbElements <= 8, "Invalid vector size");
6675 
6676     CVariable* data = GetSymbol(dataPtr);
6677     data = BroadcastIfUniform(data);
6678 
6679     uint32_t typeSizeInBytes = dataPtr->getType()->getScalarType()->getScalarSizeInBits() / 8;
6680     uint32_t totalWidth = typeSizeInBytes * numLanes(m_SimdMode);
6681 
6682     uint32_t   pass = 0;
6683     uint32_t   numPasses = 0;
6684 
6685     uint32_t blockWidth = 0;
6686     uint32_t blockHeight = nbElements;
6687     uint32_t bindingTableIndex = 0;
6688 
6689     if (isImageTypeUAV)
6690     {
6691         bindingTableIndex = m_currShader->m_pBtiLayout->GetUavIndex(SrcImgBTI);
6692     }
6693     else // image type is Resource
6694     {
6695         bindingTableIndex = m_currShader->m_pBtiLayout->GetTextureIndex(SrcImgBTI);
6696     }
6697 
6698     m_currShader->SetBindingTableEntryCountAndBitmap(true, isImageTypeUAV ? UAV : RESOURCE, SrcImgBTI, bindingTableIndex);
6699 
6700     CVariable* srcbti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_UD);
6701     uint32_t maxWidth = 32;
6702 
6703     if (totalWidth < maxWidth)
6704     {
6705         numPasses = 1;
6706         blockWidth = totalWidth;
6707     }
6708     else
6709     {
6710         IGC_ASSERT(maxWidth);
6711         IGC_ASSERT_MESSAGE(totalWidth % maxWidth == 0, "Total width must be divisible by 32!");
6712         numPasses = totalWidth / maxWidth;
6713         blockWidth = maxWidth;
6714     }
6715 
6716 
6717     CVariable* pTempVar0 = nullptr;
6718     CVariable* pTempVar = nullptr;
6719 
6720     uint32_t dstSubReg = 0;
6721 
6722     int scale = blockWidth / getGRFSize();
6723     auto simdMode = lanesToSIMDMode(blockWidth / typeSizeInBytes);
6724     for (pass = 0; pass < numPasses; pass++)
6725     {
6726         uint32_t srcSubVar = pass * blockWidth / getGRFSize();
6727         uint32_t dstSubVar = 0;
6728         uint32_t srcSubRegOffset = (pass * blockWidth) % getGRFSize();
6729         uint32_t dstSubRegOffset = 0;
6730 
6731         CVariable* tempdst = nullptr;
6732         tempdst = m_currShader->GetNewVariable(
6733             nbElements * numLanes(simdMode),
6734             data->GetType(),
6735             m_currShader->getGRFAlignment(),
6736             CName::NONE);
6737 
6738         // Split the data.
6739         // mov (8) r22.0<1>:d r14.0<8;8,1>:d {Align1, Q1, Compacted}
6740         // mov (8) r23.0<1>:d r16.0<8;8,1>:d {Align1, Q1, Compacted}
6741         // mov (8) r24.0<1>:d r18.0<8;8,1>:d {Align1, Q1, Compacted}
6742         // mov (8) r25.0<1>:d r20.0<8;8,1>:d {Align1, Q1, Compacted}
6743 
6744         //FOR 64 bytes GRF:
6745         //    A0....A1....A2....A3........r60....r60.8....r61....r61.8
6746         //    B0....B1....B2....B3........r62....r62.8....r63....r63.8
6747         //    C0....C1....C2....C3........r64....r64.8....r65....r65.8
6748         //    D0....D1....D2....D3........r66....r66.8....r67....r67.8
6749         //    E0....E1....E2....E3........r68....r68.8....r69....r69.8
6750         //    F0....F1....F2....F3........r70....r70.8....r71....r71.8
6751         //    G0....G1....G2....G3........r72....r72.8....r73....r73.8
6752         //    H0....H1....H2....H3........r74....r74.8....r75....r75.8
6753         //
6754         // block 0
6755         // mov (8) r20.0<1>:d r60.0<8;8,1>:d {Align1, Q1, Compacted}
6756         // mov (8) r20.8<1>:d r62.0<8;8,1>:d {Align1, Q1, Compacted}
6757         // mov (8) r21.0<1>:d r64.0<8;8,1>:d {Align1, Q1, Compacted}
6758         // mov (8) r21.8<1>:d r66.0<8;8,1>:d {Align1, Q1, Compacted}
6759         // ...
6760         //block 1
6761         // mov (8) r30.0<1>:d r60.8<8;8,1>:d {Align1, Q1, Compacted}
6762         // mov (8) r30.8<1>:d r62.8<8;8,1>:d {Align1, Q1, Compacted}
6763         // mov (8) r31.0<1>:d r64.8<8;8,1>:d {Align1, Q1, Compacted}
6764         // mov (8) r31.8<1>:d r66.8<8;8,1>:d {Align1, Q1, Compacted}
6765         //...
6766 
6767         if (numPasses > 1)
6768         {
6769             for (uint i = 0; i < nbElements; ++i)
6770             {
6771                 m_encoder->SetSimdSize(simdMode);
6772                 m_encoder->SetNoMask();
6773 
6774                 //Src
6775                 m_encoder->SetSrcSubVar(0, srcSubVar);
6776                 m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
6777                 //Dst
6778                 m_encoder->SetDstSubVar(dstSubVar);
6779                 m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
6780                 //Strides for dst and src
6781                 dstSubRegOffset = ((i + 1) * blockWidth) % getGRFSize();
6782                 if (dstSubRegOffset == 0)
6783                 {
6784                     dstSubVar += scale > 0 ? scale : 1;
6785                 }
6786                 srcSubVar = srcSubVar + (numPasses * blockWidth / getGRFSize());
6787 
6788                 m_encoder->Copy(tempdst, data);
6789                 m_encoder->Push();
6790             }
6791         }
6792         else
6793         {
6794             tempdst = data;
6795         }
6796         // Emits a MEDIA_BLOCK_WRITE instruction.
6797         // Considering block width as the x-axis and block height as the y-axis:
6798         // Pass 0 writes from (xOffset, yOffset) to (xOffset+31, yOffset+blockHeight-1)
6799         // Pass 1 writes from (xOffset+32, yOffset) to (xOffset+63, yOffset+blockHeight-1)
6800         // mov (8) r28.0<1>:ud r0.0<8;8,1>:ud {Align1, NoMask, Compacted}
6801         // mov (1) r28.2<1>:ud 0x3001f:ud {Align1, NoMask}
6802         // mov (1) r28.0<1>:ud r6.0<0;1,0>:d {Align1, NoMask}
6803         // mov (1) r28.1<1>:ud r7.0<0;1,0>:d {Align1, NoMask}
6804         // mov (16) r29.0<1>:ud r22.0<8;8,1>:ud {Align1, NoMask, Compacted}
6805         // mov (16) r31.0<1>:ud r24.0<8;8,1>:ud {Align1, NoMask, Compacted}
6806         // send (8) null<1>:ud r28 0xc 0xa0a8002:ud{Align1, NoMask} // media block write
6807         if (pass == 0)
6808         {
6809             CVariable* xVar = GetSymbol(xOffset);
6810             CVariable* yVar = GetSymbol(yOffset);
6811             m_encoder->SetSimdSize(SIMDMode::SIMD1);
6812             m_encoder->SetNoMask();
6813             m_encoder->SetSrcRegion(0, 0, 1, 0);
6814 
6815             pTempVar0 = m_currShader->GetNewVariable(
6816                 numLanes(m_SimdMode),
6817                 ISA_TYPE_D,
6818                 EALIGN_DWORD,
6819                 CName::NONE);
6820 
6821             m_encoder->Cast(pTempVar0, xVar);
6822             m_encoder->Push();
6823             m_encoder->SetSimdSize(SIMDMode::SIMD1);
6824             m_encoder->SetNoMask();
6825             m_encoder->SetSrcRegion(0, 0, 1, 0);
6826 
6827             pTempVar = m_currShader->GetNewVariable(
6828                 numLanes(m_SimdMode),
6829                 ISA_TYPE_D,
6830                 EALIGN_DWORD,
6831                 CName::NONE);
6832 
6833             m_encoder->Cast(pTempVar, yVar);
6834             m_encoder->Push();
6835         }
6836         else
6837         {
6838             m_encoder->SetSimdSize(SIMDMode::SIMD1);
6839             m_encoder->SetNoMask();
6840             m_encoder->SetSrcRegion(0, 0, 1, 0);
6841             m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD));
6842             m_encoder->Push();
6843             dstSubReg = dstSubReg + scale * blockHeight;
6844         }
6845 
6846         m_encoder->SetDstSubVar(dstSubReg);
6847 
6848         {
6849             m_encoder->MediaBlockMessage(
6850                 ISA_Opcode::ISA_MEDIA_ST,
6851                 tempdst, ESURFACE_NORMAL,
6852                 srcbti,
6853                 pTempVar0,
6854                 pTempVar,
6855                 0,
6856                 (unsigned char)blockWidth,
6857                 (unsigned char)blockHeight,
6858                 0);
6859         }
6860         m_encoder->Push();
6861     }
6862 }
6863 
6864 void EmitPass::emitDualBlendRT(llvm::RTDualBlendSourceIntrinsic* inst, bool fromRet)
6865 {
6866     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
6867     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
6868 
6869     uint RTIndex = inst->getRTIndexImm();
6870     bool oMask = inst->hasMask();
6871     bool outputDepth = inst->hasDepth();
6872     bool outputStencil = inst->hasStencil();
6873     Value* vMask = inst->getOMask();
6874     bool perSample = inst->perSample();
6875     Value* vSample = inst->getSampleIndex();
6876 
6877     uint bindingTableIndex = m_currShader->m_pBtiLayout->GetRenderTargetIndex(RTIndex);
6878     bool directIdx = (llvm::dyn_cast<llvm::ConstantInt>(inst->getRTIndex())) ? true : false;
6879     m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, RENDER_TARGET, RTIndex, bindingTableIndex);
6880 
6881     CVariable* pMaskOpnd = nullptr;
6882 
6883     if (psProgram->HasDiscard())
6884     {
6885         ConstantInt* cv = dyn_cast<ConstantInt>(inst->getPMask());
6886         if (!cv || cv->getZExtValue() == 0)
6887         {
6888             pMaskOpnd = GetSymbol(inst->getPMask());
6889         }
6890     }
6891 
6892     bool isHF = false;
6893     uint messageLength = 8;
6894 
6895     if (inst->getRed0()->getType()->isHalfTy())
6896     {
6897         isHF = true;
6898         messageLength = 4;
6899     }
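         // Payload size: 8 GRFs for the eight full-precision colors (r0,g0,b0,a0,r1,g1,b1,a1),
         // or 4 GRFs when the colors are half-float and the src0/src1 channels are interleaved;
         // depth, stencil, oMask and the optional header are added on top below.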
6900 
6901     uint responseLength = 0;
6902     if (outputDepth)
6903     {
6904         messageLength += 1;
6905     }
6906     if (outputStencil)
6907     {
6908         messageLength += 1;
6909     }
6910     if (oMask)
6911     {
6912         messageLength += 1;
6913     }
6914     // Need a header in case we write per sample
6915     bool needHeader = perSample;
6916     if (needHeader)
6917     {
6918         messageLength += 2;
6919     }
6920     int nbMessage = m_SimdMode == SIMDMode::SIMD8 ? 1 : 2;
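         // The dual-source RT write message is SIMD8 only, so a SIMD16 shader sends two
         // messages, one per half (Q1/Q2), using the LOW/HIGH dual-source message types.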
6921     for (int i = 0; i < nbMessage; i++)
6922     {
6923         uint payloadOffset = 0;
6924         bool lastRenderTarget = psProgram->IsLastRTWrite(inst);
6925 
6926         bool EOT = lastRenderTarget &&
6927             i == nbMessage - 1 &&
6928             (m_encoder->IsSecondHalf() || m_currShader->m_numberInstance == 1);
6929 
6930         if (EOT)
6931         {
6932             IGC_ASSERT(psProgram->m_hasEOT == false);
6933             psProgram->m_hasEOT = true;
6934         }
6935 
6936         CVariable* payload =
6937             m_currShader->GetNewVariable(
6938                 messageLength * (getGRFSize() >> 2),
6939                 ISA_TYPE_D, EALIGN_GRF, CName::NONE);
6940 
6941         if (needHeader)
6942         {
6943             m_encoder->SetNoMask();
6944             m_encoder->SetSimdSize(SIMDMode::SIMD8);
6945             m_encoder->Copy(payload, psProgram->GetR0());
6946             m_encoder->Push();
6947 
6948             m_encoder->SetDstSubVar(1);
6949             m_encoder->SetNoMask();
6950             m_encoder->SetSimdSize(SIMDMode::SIMD8);
6951             m_encoder->Copy(payload, psProgram->GetR1());
6952             m_encoder->Push();
6953             if (perSample)
6954             {
6955                 CVariable* sampleIndex = GetSymbol(vSample);
6956                 if (!sampleIndex->IsUniform())
6957                 {
6958                     sampleIndex = UniformCopy(sampleIndex);
6959 
6960                 }
6961                 CVariable* sampleIndexShifted = m_currShader->GetNewVariable(sampleIndex);
6962                 m_encoder->Shl(sampleIndexShifted, sampleIndex, m_currShader->ImmToVariable(6, ISA_TYPE_D));
6963 
6964                 m_encoder->SetNoMask();
6965                 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6966                 m_encoder->SetSrcRegion(0, 0, 1, 0);
6967                 m_encoder->Or(payload, payload, sampleIndexShifted);
6968                 m_encoder->Push();
6969             }
6970 
6971             CVariable* pixelEnable = m_currShader->GetNewAlias(
6972                 payload, ISA_TYPE_UW, getGRFSize() + 14 * 2, 1);
6973 
6974             if (pMaskOpnd)
6975             {
6976                 m_encoder->SetNoMask();
6977                 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6978                 m_encoder->Copy(pixelEnable, pMaskOpnd);
6979                 m_encoder->Push();
6980             }
6981             else
6982                 if (!isa<ReturnInst>(inst->getParent()->getTerminator()))
6983                 {
6984                     m_encoder->SetNoMask();
6985                     m_encoder->SetSimdSize(SIMDMode::SIMD1);
6986                     m_encoder->Cast(pixelEnable, GetExecutionMask());
6987                     m_encoder->Push();
6988                 }
6989 
6990             payloadOffset += 2;
6991         }
6992 
6993         if (oMask)
6994         {
6995             //oMask has to be packed since the hardware ignores the upper half
6996             CVariable* src = GetSymbol(vMask);
6997             CVariable* payloadUW = psProgram->BitCast(payload, ISA_TYPE_UW);
6998             src = psProgram->BitCast(src, ISA_TYPE_UW);
6999             m_encoder->SetSimdSize(SIMDMode::SIMD8);
7000             m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7001             m_encoder->SetSrcSubVar(0, i);
7002             if (src->IsUniform())
7003             {
7004                 m_encoder->SetSrcRegion(0, 0, 1, 0);
7005             }
7006             else
7007             {
7008                 m_encoder->SetSrcRegion(0, 2, 1, 0);
7009             }
7010             m_encoder->SetDstSubVar(payloadOffset++);
7011             m_encoder->SetDstSubReg(i * 8);
7012             m_encoder->Copy(payloadUW, src);
7013             m_encoder->Push();
7014         }
7015 
7016         CVariable* srcPayload = payload;
7017 
7018         if (isHF)
7019         {
7020             srcPayload = m_currShader->GetNewAlias(payload, ISA_TYPE_HF, 0, 4 * getGRFSize());
7021         }
7022 
7023         Value* colors[] = {
7024             inst->getRed0(), inst->getGreen0(), inst->getBlue0(), inst->getAlpha0(),
7025             inst->getRed1(), inst->getGreen1(), inst->getBlue1(), inst->getAlpha1()
7026         };
7027 
7028         for (uint srcIdx = 0; srcIdx < 8; srcIdx++)
7029         {
7030             m_encoder->SetSimdSize(SIMDMode::SIMD8);
7031             m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7032             CVariable* src = GetSymbol(colors[srcIdx]);
7033 
7034             if (!src->IsUniform())
7035             {
7036                 m_encoder->SetSrcSubReg(0, i * 8);
7037             }
7038 
7039             if (isHF)
7040             {
7041                 // half message has src0 and src1 interleaved
7042                 if (srcIdx / 4 != 0)
7043                 {
7044                     m_encoder->SetDstSubReg(8);
7045                 }
7046                 m_encoder->SetDstSubVar(payloadOffset + (srcIdx % 4));
7047             }
7048             else
7049             {
7050                 m_encoder->SetDstSubVar(payloadOffset + srcIdx);
7051             }
7052 
7053             m_encoder->Copy(srcPayload, src);
7054             m_encoder->Push();
7055         }
7056         payloadOffset += isHF ? 4 : 8;
7057 
7058         if (outputDepth)
7059         {
7060             m_encoder->SetSimdSize(SIMDMode::SIMD8);
7061             m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7062             CVariable* src = GetSymbol(inst->getDepth());
7063             if (!src->IsUniform())
7064             {
7065                 m_encoder->SetSrcSubVar(0, i);
7066             }
7067             m_encoder->SetDstSubVar(payloadOffset++);
7068             m_encoder->Copy(payload, src);
7069             m_encoder->Push();
7070         }
7071 
7072         if (outputStencil)
7073         {
7074             m_encoder->SetSimdSize(SIMDMode::SIMD8);
7075             m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7076             CVariable* src = GetSymbol(inst->getStencil());
7077             CVariable* ubSrc = m_currShader->BitCast(src, ISA_TYPE_UB);
7078             m_encoder->SetSrcRegion(0, 32, 8, 4);
7079             if (!ubSrc->IsUniform())
7080             {
7081                 m_encoder->SetSrcSubVar(0, i);
7082             }
7083             m_encoder->SetDstSubVar(payloadOffset++);
7084             m_encoder->Copy(payload, ubSrc);
7085             m_encoder->Push();
7086         }
7087 
7088         EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL messageType =
7089             (i == 0)
7090             ? EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_LOW :
7091             EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_HIGH;
7092 
7093         uint Desc = PixelDataPort(
7094             isHF,
7095             messageLength,
7096             responseLength,
7097             needHeader,
7098             psProgram->GetPhase() == PSPHASE_COARSE,
7099             perSample,
7100             lastRenderTarget,
7101             m_encoder->IsSecondHalf(),
7102             messageType,
7103             bindingTableIndex);
7104 
7105 
7106         // TODO create a function to encode extended message
7107         CVariable* exDesc =
7108             psProgram->ImmToVariable(EU_MESSAGE_TARGET_DATA_PORT_WRITE | (EOT ? 1 << 5 : 0), ISA_TYPE_UD);
7109         CVariable* messDesc = psProgram->ImmToVariable(Desc, ISA_TYPE_UD);
7110         if (psProgram->GetPhase() == PSPHASE_PIXEL)
7111         {
7112             CVariable* temp = psProgram->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
7113             m_encoder->Or(temp, psProgram->GetCurrentPhaseCounter(), exDesc);
7114             m_encoder->Push();
7115             exDesc = temp;
7116         }
7117 
7118         //sendc
7119         if (pMaskOpnd)
7120             m_encoder->SetPredicate(pMaskOpnd);
7121         m_encoder->SetSimdSize(SIMDMode::SIMD8);
7122         m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7123         m_encoder->SendC(NULL, payload, EU_MESSAGE_TARGET_DATA_PORT_WRITE, exDesc, messDesc);
7124         m_encoder->Push();
7125     }
7126 }
7127 
7128 // Common emitter for URBRead and URBReadOutput, used also in associated pattern match pass.
7129 // The offsets are calculated in the caller.
7130 void EmitPass::emitURBReadCommon(llvm::GenIntrinsicInst* inst, const QuadEltUnit globalOffset,
7131     llvm::Value* const perSlotOffset)
7132 {
7133     TODO("Have VISA define the URBRead interface instead of using a raw send");
7134 
7135 
7136     auto GetURBInputHandle = [&]()->CVariable*
7137     {
7138         CVariable* urbInputHandle = nullptr;
7139         switch (inst->getIntrinsicID())
7140         {
7141         case GenISAIntrinsic::GenISA_URBRead:
7142         {
7143             CVariable* const pVertexIndex = GetSymbol(inst->getOperand(0));
7144             urbInputHandle = m_currShader->GetURBInputHandle(pVertexIndex);
7145             // Mark input to be pulled.
7146             m_currShader->isInputsPulled = true;
7147             break;
7148         }
7149         case GenISAIntrinsic::GenISA_URBReadOutput:
7150         {
7151             urbInputHandle = m_currShader->GetURBOutputHandle();
7152             break;
7153         }
7154         default:
7155             IGC_ASSERT(0);
7156         }
7157         IGC_ASSERT(urbInputHandle);
7158         return urbInputHandle;
7159     };
7160 
7161     const EltUnit payloadSize(perSlotOffset ? 2 : 1);
7162     const Unit<Element> messageLength = payloadSize;
7163     CVariable* const payload = m_currShader->GetNewVariable(payloadSize.Count() * numLanes(SIMDMode::SIMD8),
7164         ISA_TYPE_UD, EALIGN_GRF, "URBPayload");
7165     IGC_ASSERT(numLanes(SIMDMode::SIMD8));
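         // Payload layout for the URB read: GRF 0 carries the URB handles; when a per-slot
         // offset is present, a second GRF (dst sub-var 1) carries the per-lane offsets.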
7166 
7167 
7168     Unit<Element> responseLength(m_destination->GetNumberElement() / numLanes(SIMDMode::SIMD8));
7169     {
7170         // Get the register with URBHandles and update certain per-opcode data.
7171         CVariable* urbInputHandle = GetURBInputHandle();
7172         m_encoder->Copy(payload, urbInputHandle);
7173         m_encoder->Push();
7174 
7175         if (perSlotOffset)
7176         {
7177             m_encoder->SetDstSubVar(1);
7178             CVariable* offset = m_currShader->GetSymbol(perSlotOffset);
7179             m_encoder->Copy(payload, offset);
7180             m_encoder->Push();
7181         }
7182 
7183         constexpr bool eot = false;
7184         constexpr bool channelMaskPresent = false;
7185         const uint desc = UrbMessage(
7186             messageLength.Count(),
7187             responseLength.Count(),
7188             eot,
7189             perSlotOffset != nullptr,
7190             channelMaskPresent,
7191             globalOffset.Count(),
7192             EU_URB_OPCODE_SIMD8_READ);
7193 
7194         constexpr uint exDesc = EU_MESSAGE_TARGET_URB;
7195         CVariable* const pMessDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
7196         m_encoder->Send(m_destination, payload, exDesc, pMessDesc);
7197         m_encoder->Push();
7198     }
7199 
7200 }
7201 
7202 // Emitter for URBRead and URBReadOutput.
7203 void EmitPass::emitURBRead(llvm::GenIntrinsicInst* inst)
7204 {
7205     llvm::Value* offset = nullptr;
7206     switch (inst->getIntrinsicID())
7207     {
7208     case GenISAIntrinsic::GenISA_URBRead:
7209         offset = inst->getOperand(1);
7210         break;
7211     case GenISAIntrinsic::GenISA_URBReadOutput:
7212         offset = inst->getOperand(0);
7213         break;
7214     default:
7215         IGC_ASSERT(0);
7216     }
7217     IGC_ASSERT_MESSAGE(!isa<ConstantInt>(offset), "Constant offsets are expected to be handled elsewhere.");
7218     emitURBReadCommon(inst, QuadEltUnit(0), offset);
7219 }
7220 
7221 void EmitPass::emitURBWrite(llvm::GenIntrinsicInst* inst)
7222 {
7223     // input: GenISA_URBWrite(%offset, %mask, %data0, ..., %data7)
7224     CVariable* offset = m_currShader->GetSymbol(inst->getOperand(0));
7225     CVariable* channelMask = m_currShader->GetSymbol(inst->getOperand(1));
7226     CVariable* URBHandle = m_currShader->GetURBOutputHandle();
7227 
7228     {
7229         // If the offset or the channel mask is not an immediate value, the per-slot offsets
7230         // and/or the channel mask must carry data in all channels. However, if the variable is
7231         // uniform, uniform analysis reduces it to a scalar, so it must be broadcast back to SIMD form.
7232         if (!channelMask->IsImmediate())
7233         {
7234             channelMask = BroadcastIfUniform(channelMask);
7235         }
7236 
7237         if (!offset->IsImmediate())
7238         {
7239             offset = BroadcastIfUniform(offset);
7240         }
7241     }
7242 
7243     {
7244         CVariable* payload = nullptr;
7245         int payloadElementOffset = 0;
7246         {
7247             payload = m_CE->PrepareExplicitPayload(
7248                 m_currShader,
7249                 m_encoder,
7250                 m_SimdMode,
7251                 m_DL,
7252                 inst,
7253                 payloadElementOffset);
7254         }
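             // PrepareExplicitPayload packs the %data operands into one contiguous payload
             // variable and reports, via payloadElementOffset, where the data starts in it.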
7255 
7256         m_encoder->URBWrite(payload, payloadElementOffset, offset, URBHandle, channelMask);
7257         m_encoder->Push();
7258     }
7259 }
7260 
7261 void EmitPass::interceptSamplePayloadCoalescing(
7262     llvm::SampleIntrinsic* inst,
7263     uint numPart,
7264     //out:
7265     SmallVector<CVariable*, 4> & payload,
7266     bool& payloadCovered)
7267 {
7268     m_CE->SetCurrentPart(inst, numPart);
7269 
7270     const uint numPayloadOperands = m_CE->GetNumPayloadElements(inst);
7271     CoalescingEngine::CCTuple* ccTuple = nullptr;
7272     int payloadToCCTupleRelativeOffset = 0;
7273     Value* representativeValPtr = nullptr;
7274 
7275     ccTuple = m_CE->IsAnyValueCoalescedInCCTuple(inst,
7276         numPayloadOperands,
7277         //out:
7278         payloadToCCTupleRelativeOffset,
7279         representativeValPtr
7280    );
7281 
7282     payloadCovered = m_CE->IsPayloadCovered(inst,
7283         ccTuple,
7284         numPayloadOperands,
7285         payloadToCCTupleRelativeOffset);
7286 
7287     if (payloadToCCTupleRelativeOffset < 0)
7288     {
7289         payloadCovered = false;
7290     }
7291 
7292     // If the payload is not fully covered by a coalesced tuple, bail out and let the caller
7293     // build it the regular way. Otherwise there is no rolling back: all the conditions for
7294     // preparing a coalesced sample payload are satisfied, so proceed with preparing one.
7295     if (!payloadCovered)
7296     {
7297         return;
7298     }
7299     else
7300     {
7301         IGC_ASSERT(ccTuple);
7302         CVariable* rootPayloadVar = m_currShader->LazyCreateCCTupleBackingVariable(ccTuple);
7303 
7304         SmallPtrSet<Value*, 8> touchedValuesSet;
7305 
7306         IGC_ASSERT(representativeValPtr);
7307         IGC_ASSERT(payloadToCCTupleRelativeOffset >= 0);
7308         int byteOffset = payloadToCCTupleRelativeOffset *
7309             m_CE->GetSingleElementWidth(m_currShader->m_SIMDSize, m_DL, representativeValPtr);
7310 
7311         if (ccTuple->HasNonHomogeneousElements())
7312         {
7313             byteOffset += m_CE->GetLeftReservedOffset(ccTuple->GetRoot(), m_currShader->m_SIMDSize);
7314         }
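             // byteOffset now points at this instruction's first payload element inside the
             // CCTuple backing variable; each element handled below advances it by its own size.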
7315 
7316         for (uint index = 0; index < numPayloadOperands; index++)
7317         {
7318             CVariable* src = nullptr;
7319 
7320             Value* val = m_CE->GetPayloadElementToValueMapping(inst, index);
7321             VISA_Type type = m_currShader->GetType(val->getType());
7322 
7323             bool needsAlias = false;
7324             if (touchedValuesSet.count(val))
7325             {
7326                 //We have a copy of an element used at least twice in a payload.
7327                 src = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)byteOffset, 0);
7328                 if (inst->IsDerivative())
7329                 {
7330                     m_encoder->SetNoMask();
7331                 }
7332                 m_encoder->Copy(src, GetSymbol(val));
7333                 m_encoder->Push();
7334 
7335                 byteOffset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
7336 
7337                 IGC_ASSERT(src);
7338                 payload.push_back(src);
7339                 continue;
7340 
7341             }
7342             else
7343             {
7344                 touchedValuesSet.insert(val);
7345             }
7346 
7347             if (m_CE->IsValConstOrIsolated(val))
7348             {
7349                 needsAlias = true;
7350             }
7351             else
7352             {
7353                 if (m_CE->GetValueCCTupleMapping(val))
7354                 {
7355                     src = GetSymbol(val);
7356                 }
7357                 else
7358                 {
7359                     //this one actually encompasses the case for !getRegRoot(val)
7360                     needsAlias = true;
7361                 }
7362             } //if constant
7363 
7364             if (needsAlias)
7365             {
7366                 src = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)byteOffset, 0);
7367                 //TODO:WARNING: workaround
7368                 if (inst->IsDerivative() /*&& GetSymbol(val)->IsUniform()*/)
7369                 {
7370                     m_encoder->SetNoMask();
7371                 }
7372                 m_encoder->Copy(src, GetSymbol(val));
7373                 m_encoder->Push();
7374             }
7375             IGC_ASSERT(src);
7376             payload.push_back(src);
7377 
7378             byteOffset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
7379 
7380 
7381         }
7382 
7383     }
7384 
7385 }
7386 
7387 
7388 ResourceDescriptor EmitPass::GetSampleResourceHelper(SampleIntrinsic* inst)
7389 {
7390     llvm::Value* texOp = inst->getTextureValue();
7391     ResourceDescriptor resource = GetResourceVariable(texOp);
7392     return resource;
7393 }
7394 
7395 void EmitPass::emitSampleInstruction(SampleIntrinsic* inst)
7396 {
7397     EOPCODE opCode = GetOpCode(inst);
7398 
7399     ResourceDescriptor resource = GetSampleResourceHelper(inst);
7400 
7401 
7402     //Get sampler index in the array of operands
7403     llvm::Value* samplerOp = inst->getSamplerValue();
7404     SamplerDescriptor sampler = GetSamplerVariable(samplerOp);
7405 
7406     const uint numOperands = inst->getNumOperands();
7407     // offset
7408     CVariable* immOffset = m_currShader->ImmToVariable(0, ISA_TYPE_UW);
7409     if (!inst->IsLODInst())
7410     {
7411         uint offsetSourceIndex = numOperands - 4;
7412         immOffset = ComputeSampleIntOffset(inst, offsetSourceIndex);
7413     }
7414 
7415     const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
7416     IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
7417 
7418     bool derivativeSample = inst->IsDerivative();
7419 
7420     bool cpsEnable = derivativeSample &&
7421         m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER &&
7422         static_cast<CPixelShader*>(m_currShader)->GetPhase() == PSPHASE_COARSE;
7423 
7424     SmallVector<CVariable*, 4>  payload;
7425 
7426     bool doIntercept = true;
7427     // Skip sample_d* instructions in SIMD16 and SIMD32.
7428     if ((m_currShader->m_SIMDSize > SIMDMode::SIMD8 &&
7429         (opCode == llvm_sample_dptr ||
7430             opCode == llvm_sample_dcptr)))
7431     {
7432         doIntercept = false;
7433     }
7434     uint numSources = 0;
7435     const uint numParts = m_CE->GetNumSplitParts(inst);
7436     for (uint part = 0; part < numParts; part++)
7437     {
7438         bool payloadCovered = false;
7439         m_CE->SetCurrentPart(inst, part);
7440         const unsigned int numPartSources = m_CE->GetNumPayloadElements(inst);
7441         numSources += numPartSources;
7442         if (doIntercept)
7443         {
7444             interceptSamplePayloadCoalescing(inst, part, payload, payloadCovered);
7445         }
7446 
7447         if (!payloadCovered)
7448         {
7449             m_CE->SetCurrentPart(inst, part);
7450 
7451             //create send payload for numSources
7452             for (uint i = 0; i < numPartSources; i++)
7453             {
7454                 Value* v = m_CE->GetPayloadElementToValueMapping(inst, i);
7455                 CVariable* src = GetSymbol(v);
7456                 if (src->IsUniform())
7457                 {
7458                     CVariable* srcReg = m_currShader->GetNewVariable(
7459                         numLanes(m_currShader->m_SIMDSize), src->GetType(), EALIGN_GRF, CName::NONE);
7460                     if (derivativeSample)
7461                     {
7462                         m_encoder->SetNoMask();
7463                     }
7464                     m_encoder->Copy(srcReg, src);
7465                     m_encoder->Push();
7466                     src = srcReg;
7467                 }
7468                 payload.push_back(src);
7469             }
7470         }
7471     }
7472 
7473     // the responses to the sample + killpix and feedback messages have an extra register that contains a mask.
7474     bool hasMaskResponse = writeMask.isSet(4);
7475 
7476     CVariable* dst = m_destination;
7477     // When the sampler output is a 16-bit float, the hardware does not pack the output in SIMD8 mode.
7478     // Hence the movs below to handle this layout in SIMD8 mode.
7479     bool simd8HFRet = isHalfGRFReturn(m_destination, m_SimdMode);
7480 
7481     if (simd8HFRet)
7482     {
7483         dst = m_currShader->GetNewVariable(
7484             m_destination->GetNumberElement() * 2, ISA_TYPE_HF, EALIGN_GRF, false, CName::NONE);
7485     }
7486     uint label = 0;
7487     CVariable* flag = nullptr;
7488     bool zeroLOD = m_currShader->m_Platform->supportSampleAndLd_lz() && inst->ZeroLOD() &&
7489                    !m_currShader->m_Platform->WaDisableSampleLz();
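         // Use the *_lz (zero-LOD) variant only when the platform supports it, the LOD argument
         // is known to be zero, and the WaDisableSampleLz workaround is not active.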
7490     bool needLoop = ResourceLoopHeader(resource, sampler, flag, label);
7491 
7492     if (m_currShader->m_Platform->getWATable().Wa_22011157800 && !IGC_IS_FLAG_DISABLED(DiableWaSamplerNoMask))
7493     {
7494         m_encoder->SetNoMask();
7495     }
7496     else
7497     {
7498         m_encoder->SetPredicate(flag);
7499     }
7500     m_encoder->Sample(
7501         opCode,
7502         writeMask.getEM(),
7503         immOffset,
7504         resource,
7505         sampler,
7506         numSources,
7507         dst,
7508         payload,
7509         zeroLOD,
7510         cpsEnable,
7511         hasMaskResponse,
7512         needLoop);
7513     m_encoder->Push();
7514 
7515     if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
7516     {
7517         CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
7518         m_encoder->Cast(m_currShader->GetNULL(), tempdest);
7519         m_encoder->Push();
7520         m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
7521         m_encoder->Push();
7522     }
7523     ResourceLoopBackEdge(needLoop, flag, label);
7524 
7525     {
7526         if (simd8HFRet)
7527         {
7528             PackSIMD8HFRet(dst);
7529         }
7530 
7531         if (hasMaskResponse)
7532         {
7533             CVariable* flag = m_currShader->GetNewVariable(
7534                 numLanes(m_currShader->m_dispatchSize), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
7535             uint subvar = numLanes(m_currShader->m_SIMDSize) * 4 / (getGRFSize() >> 2);
7536             m_encoder->SetSrcSubVar(0, subvar);
7537             m_encoder->SetSrcRegion(0, 0, 1, 0);
7538             CVariable* newdestination = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
7539             m_encoder->SetP(flag, newdestination);
7540             m_encoder->Push();
7541 
7542             // Use integer types for select in case driver uses alt mode
7543             // (0xFFFFFFFF is a NaN value, so the result is always 0).
7544             VISA_Type dstIntType = GetUnsignedIntegerType(m_destination->GetType());
7545             CVariable* pred = m_currShader->ImmToVariable(0xFFFFFFFF, dstIntType);
7546             CVariable* zero = m_currShader->ImmToVariable(0x0, dstIntType);
7547             CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, dstIntType, 0, m_destination->GetNumberElement());
7548             m_encoder->SetDstSubVar(subvar);
7549             m_encoder->Select(flag, dstAlias, pred, zero);
7550             m_encoder->Push();
7551         }
7552     }
7553 }
7554 
7555 // Initialize global discard mask as ~dmask.
7556 void EmitPass::emitInitDiscardMask(llvm::GenIntrinsicInst* inst)
7557 {
7558     if (m_encoder->IsSecondHalf())
7559         return;
7560 
7561     // (W) not (1|M0) f0.0:uw sr0.2<0;1,0>:ud
7562     CVariable* t = m_currShader->GetNewVariable(
7563         numLanes(m_currShader->m_dispatchSize),
7564         ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
7565     m_encoder->SetNoMask();
7566     m_encoder->SetSrcSubReg(0, 2);
7567     m_encoder->SetP(t, m_currShader->GetSR0());
7568     m_encoder->Push();
7569 
7570     m_encoder->SetNoMask();
7571     m_encoder->SetSimdSize(m_currShader->m_dispatchSize);
7572     m_encoder->GenericAlu(EOPCODE_NOT, m_destination, t, nullptr);
7573     m_encoder->Push();
7574 
7575     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
7576     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
7577     psProgram->SetDiscardPixelMask(m_destination);
7578 }
7579 
7580 // update global discard mask with discard condition
7581 void EmitPass::emitUpdateDiscardMask(llvm::GenIntrinsicInst* inst)
7582 {
7583     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
7584     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
7585     CVariable* discardMask;
7586     CVariable* maskp;
7587 
7588     discardMask = GetSymbol(inst->getArgOperand(0));
7589     IGC_ASSERT(discardMask == psProgram->GetDiscardPixelMask());
7590 
7591     if (ConstantInt * ci = dyn_cast<ConstantInt>(inst->getArgOperand(1)))
7592     {
7593         if (ci->getZExtValue() == 1)
7594         {
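                 // Unconditional discard: comparing a scratch variable with itself sets every
                 // lane of the discard mask to 1.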
7595             CVariable* dummyVar = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, CName::NONE);
7596             m_encoder->Cmp(EPREDICATE_EQ, discardMask, dummyVar, dummyVar);
7597             m_encoder->Push();
7598         }
7599         else
7600         {
7601             return;
7602         }
7603     }
7604     else
7605     {
7606         maskp = GetSymbol(inst->getArgOperand(1));
7607         m_encoder->Or(discardMask, discardMask, maskp);
7608         m_encoder->Push();
7609     }
7610 }
7611 
7612 // get live pixel mask for RTWrite from global discard mask
7613 void EmitPass::emitGetPixelMask(llvm::GenIntrinsicInst* inst)
7614 {
7615     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
7616     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
7617     CVariable* globalMask;
7618 
7619     globalMask = GetSymbol(inst->getArgOperand(0));
7620     IGC_ASSERT(globalMask == psProgram->GetDiscardPixelMask());
7621 
7622     CVariable* dst = m_destination;
7623     m_encoder->SetNoMask();
7624     m_encoder->GenericAlu(EOPCODE_NOT, dst, globalMask, nullptr);
7625     m_encoder->Push();
7626 }
7627 
7628 void EmitPass::emitDiscard(llvm::Instruction* inst)
7629 {
7630     IGC_ASSERT_MESSAGE(0, "No codegen for discard intrinsic");
7631 }
7632 
7633 void EmitPass::emitInfoInstruction(InfoIntrinsic* inst)
7634 {
7635     EOPCODE opCode = GetOpCode(inst);
7636     llvm::Value* texOp = inst->getOperand(0);
7637 
7638     ResourceDescriptor resource = GetResourceVariable(texOp);
7639 
7640 
7641     CVariable* lod = nullptr;
7642     if (opCode != llvm_sampleinfoptr)
7643     {
7644         lod = GetSymbol(inst->getOperand(1));
7645     }
7646     if (lod && lod->IsUniform())
7647     {
7648         auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
7649         CVariable* srcReg = m_currShader->GetNewVariable(
7650             m_destination->IsUniform() ? numLanes(uniformSIMDMode) : numLanes(m_currShader->m_SIMDSize),
7651             ISA_TYPE_F,
7652             EALIGN_GRF,
7653             m_destination->IsUniform(),
7654             lod->getName());
7655         m_encoder->SetUniformSIMDSize(uniformSIMDMode);
7656         m_encoder->Copy(srcReg, lod);
7657         m_encoder->Push();
7658         lod = srcReg;
7659     }
7660 
7661     CVariable* tempDest = m_destination;
7662     if (m_destination->IsUniform())
7663     {
7664         auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
7665         tempDest = m_currShader->GetNewVariable(
7666             m_destination->GetNumberElement() * numLanes(uniformSIMDMode),
7667             ISA_TYPE_UD, EALIGN_GRF, true, m_destination->getName());
7668         m_encoder->SetUniformSIMDSize(uniformSIMDMode);
7669     }
7670 
7671     uint label = 0;
7672     CVariable* flag = nullptr;
7673     bool needLoop = ResourceLoopHeader(resource, flag, label);
7674     m_encoder->SetPredicate(flag);
7675 
7676     const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
7677     IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
7678 
7679     m_encoder->Info(opCode, writeMask.getEM(), resource, lod, tempDest);
7680     m_encoder->Push();
7681 
7682     ResourceLoopBackEdge(needLoop, flag, label);
7683 
7684     if (tempDest != m_destination)
7685     {
7686         unsigned int writemask = 0;
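             // Derive a channel mask from which components the users actually extract;
             // any non-extractelement use conservatively forces all four channels.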
7687         for (auto I = inst->user_begin(), E = inst->user_end(); I != E; ++I)
7688         {
7689             if (llvm::ExtractElementInst * extract = llvm::dyn_cast<llvm::ExtractElementInst>(*I))
7690             {
7691                 if (llvm::ConstantInt * index = llvm::dyn_cast<ConstantInt>(extract->getIndexOperand()))
7692                 {
7693                     writemask |= BIT(static_cast<uint>(index->getZExtValue()));
7694                     continue;
7695                 }
7696             }
7697             writemask = 0xF;
7698             break;
7699         }
7700         for (uint i = 0; i < 4; i++)
7701         {
7702             if (BIT(i) & writemask)
7703             {
7704                 m_encoder->SetSrcSubVar(0, i);
7705                 m_encoder->SetDstSubReg(i);
7706                 m_encoder->Copy(m_destination, tempDest);
7707                 m_encoder->Push();
7708             }
7709         }
7710     }
7711 }
7712 
7713 void EmitPass::emitSurfaceInfo(GenIntrinsicInst* inst)
7714 {
7715     ResourceDescriptor resource = GetResourceVariable(inst->getOperand(0));
7716     ForceDMask(false);
7717 
7718     DATA_PORT_TARGET_CACHE targetCache = DATA_PORT_TARGET_CONSTANT_CACHE;
7719     EU_MESSAGE_TARGET messageTarget = EU_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_READ_ONLY;
7720     if (m_currShader->m_Platform->supportSamplerCacheResinfo())
7721     {
7722         targetCache = DATA_PORT_TARGET_SAMPLER_CACHE;
7723         messageTarget = EU_MESSAGE_TARGET_DATA_PORT_READ;
7724     }
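         // By default the surface-info read goes through the constant cache; platforms that
         // support resinfo through the sampler use the sampler cache and the read data port.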
7725 
7726     uint messageSpecificControl = DataPortRead(
7727         1,
7728         2,
7729         false,
7730         EU_DATA_PORT_READ_MESSAGE_TYPE_SURFACE_INFO_READ,
7731         0,
7732         false,
7733         targetCache,
7734         resource.m_surfaceType == ESURFACE_BINDLESS ? BINDLESS_BTI : (uint)resource.m_resource->GetImmediateValue());
7735 
7736     CVariable* pMessDesc = m_currShader->ImmToVariable(messageSpecificControl, ISA_TYPE_D);
7737 
7738     CVariable* exDesc =
7739         m_currShader->ImmToVariable(messageTarget, ISA_TYPE_D);
7740     if (resource.m_surfaceType == ESURFACE_BINDLESS)
7741     {
7742         CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
7743         m_encoder->Add(temp, resource.m_resource, exDesc);
7744         m_encoder->Push();
7745         exDesc = temp;
7746     }
7747     uint label = 0;
7748     CVariable* flag = nullptr;
7749     bool needLoop = ResourceLoopHeader(resource, flag, label);
7750     CVariable* payload = m_currShader->GetNewVariable(8, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
7751 
7752     m_encoder->SetSimdSize(SIMDMode::SIMD8);
7753     m_encoder->SetNoMask();
7754     m_encoder->Copy(payload, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
7755     m_encoder->Push();
7756 
7757     m_encoder->SetUniformSIMDSize(SIMDMode::SIMD8);
7758     m_encoder->SetNoMask();
7759     m_encoder->Send(m_destination, payload,
7760         messageTarget, exDesc, pMessDesc);
7761     m_encoder->Push();
7762 
7763     IGC_ASSERT(m_destination->IsUniform());
7764     ResourceLoopBackEdge(needLoop, flag, label);
7765     ResetVMask(false);
7766 }
7767 
7768 void EmitPass::emitFeedbackEnable()
7769 {
7770     // if feedback is enabled we always return all 4 channels
7771     CVariable* flag = m_currShader->GetNewVariable(
7772         numLanes(m_currShader->m_dispatchSize), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
7773     uint typeSize = CEncoder::GetCISADataTypeSize(m_destination->GetType());
7774     uint subvar = (numLanes(m_currShader->m_SIMDSize) * typeSize * 4) / getGRFSize();
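         // The feedback mask is returned right after the four color channels, so 'subvar'
         // indexes the sub-variable just past four channels' worth of SIMD data.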
7775 
7776     m_encoder->SetSrcSubVar(0, subvar);
7777     m_encoder->SetSrcRegion(0, 0, 1, 0);
7778     CVariable* newdestination = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
7779     m_encoder->SetP(flag, newdestination);
7780     m_encoder->Push();
7781 
7782     CVariable* pred = m_currShader->ImmToVariable(0xFFFFFFFF, m_destination->GetType());
7783     CVariable* zero = m_currShader->ImmToVariable(0x0, m_destination->GetType());
7784     m_encoder->SetDstSubVar(subvar);
7785     m_encoder->Select(flag, m_destination, pred, zero);
7786     m_encoder->Push();
7787 }
7788 
7789 void EmitPass::emitGather4Instruction(SamplerGatherIntrinsic* inst)
7790 {
7791     EOPCODE opCode = GetOpCode(inst);
7792     uint numOperands = inst->getNumOperands();
7793 
7794     // Subtract the offsets, resource and sampler sources to get the number of
7795     // texture coordinates, the source channel select and the index of the texture source.
7796     uint numSources = numOperands - 7;
7797 
7798     Value* textureValue = inst->getTextureValue();
7799     ResourceDescriptor resource = GetResourceVariable(textureValue);
7800 
7801     SamplerDescriptor sampler;
7802     Value* samplerValue = inst->getSamplerValue();
7803 
7804     sampler = GetSamplerVariable(samplerValue);
7805 
7806     //Check for valid number of sources from the end of the list
7807     for (uint i = (numSources - 1); i >= 1; i--)
7808     {
7809         CVariable* validSrc = GetSymbol(inst->getOperand(i));
7810         if (validSrc->IsImmediate() &&
7811             validSrc->GetImmediateValue() == 0)
7812         {
7813             numSources--;
7814         }
7815         else
7816         {
7817             break;
7818         }
7819     }
7820 
7821     // offset
7822     uint offsetSourceIndex = numOperands - 5;
7823     CVariable* offset = ComputeSampleIntOffset(inst, offsetSourceIndex);
7824 
7825     uint channelIndx = numOperands - 2;
7826     uint channel = int_cast<uint>(GetImmediateVal(inst->getOperand(channelIndx)));
7827     SmallVector<CVariable*, 4> payload;
7828 
7829 
7830     //create send payload for numSources
7831     for (uint i = 0; i < numSources; i++)
7832     {
7833         CVariable* src = GetSymbol(inst->getOperand(i));
7834         if (src->IsUniform())
7835         {
7836             CVariable* srcReg = m_currShader->GetNewVariable(
7837                 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F,
7838                 m_currShader->getGRFAlignment(),
7839                 src->getName());
7840             m_encoder->Copy(srcReg, src);
7841             m_encoder->Push();
7842             src = srcReg;
7843         }
7844         payload.push_back(src);
7845     }
7846 
7847     CVariable* dst = m_destination;
7848     // When the sampler output is a 16-bit float, the hardware does not pack the output in SIMD8 mode.
7849     // Hence the movs below to handle this layout in SIMD8 mode.
7850     bool simd8HFRet = isHalfGRFReturn(m_destination, m_SimdMode);
7851     if (simd8HFRet)
7852     {
7853         dst = m_currShader->GetNewVariable(
7854             m_destination->GetNumberElement() * 2, ISA_TYPE_HF, EALIGN_GRF, false, CName::NONE);
7855     }
7856 
7857     bool feedbackEnable = (m_destination->GetNumberElement() / numLanes(m_currShader->m_SIMDSize) == 5) ? true : false;
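         // A destination five channels wide (instead of four) means the sampler feedback
         // register was requested along with the gather4 result.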
7858     uint label = 0;
7859     CVariable* flag = nullptr;
7860     bool needLoop = ResourceLoopHeader(resource, sampler, flag, label);
7861     m_encoder->SetPredicate(flag);
7862     m_encoder->Gather4Inst(
7863         opCode,
7864         offset,
7865         resource,
7866         sampler,
7867         numSources,
7868         dst,
7869         payload,
7870         channel,
7871         feedbackEnable);
7872     m_encoder->Push();
7873     if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
7874     {
7875         CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
7876         m_encoder->Cast(m_currShader->GetNULL(), tempdest);
7877         m_encoder->Push();
7878         m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
7879         m_encoder->Push();
7880     }
7881     ResourceLoopBackEdge(needLoop, flag, label);
7882 
7883     {
7884         if (simd8HFRet)
7885         {
7886             PackSIMD8HFRet(dst);
7887         }
7888 
7889         if (feedbackEnable)
7890         {
7891             emitFeedbackEnable();
7892         }
7893     }
7894 }
7895 
7896 void EmitPass::emitLdmsInstruction(llvm::Instruction* inst)
7897 {
7898     uint numOperands = inst->getNumOperands();
7899     EOPCODE opCode = GetOpCode(inst);
7900     // Subtract the offsets, texture resource and LOD operands to get
7901     // the number of texture coordinates and the index of the texture source.
7902     uint numSources = numOperands - 5;
7903     uint textureArgIdx = numOperands - 5;
7904 
7905     for (uint i = numSources - 1; i > 0; i--)
7906     {
7907         CVariable* validSrc = GetSymbol(inst->getOperand(i));
7908         if (!(validSrc->IsImmediate() && validSrc->GetImmediateValue() == 0))
7909         {
7910             break;
7911         }
7912         numSources--;
7913     }
7914 
7915     // Figure out the write mask from the size of the destination we want to write
7916     const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
7917     IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
7918 
7919     Value* texOperand = inst->getOperand(textureArgIdx);
7920     ResourceDescriptor resource = GetResourceVariable(texOperand);
7921 
7922     uint offsetSourceIndex = numOperands - 4;
7923     CVariable* offset = ComputeSampleIntOffset(inst, offsetSourceIndex);
7924 
7925     SmallVector<CVariable*, 4> payload;
7926 
7927     //create send payload for numSources
7928     for (uint i = 0; i < numSources; i++)
7929     {
7930         CVariable* src = GetSymbol(inst->getOperand(i));
7931         src = BroadcastIfUniform(src);
7932         IGC_ASSERT(src->GetAliasOffset() % getGRFSize() == 0);
7933         payload.push_back(src);
7934     }
7935 
7936     CVariable* dst = m_destination;
7937     // When the sampler output is a 16-bit float, the hardware does not pack the output in SIMD8 mode.
7938     // Hence the movs below to handle this layout in SIMD8 mode.
7939     bool simd8HFRet = isHalfGRFReturn(m_destination, m_SimdMode);
7940     if (simd8HFRet)
7941     {
7942         dst = m_currShader->GetNewVariable(
7943             m_destination->GetNumberElement() * 2, ISA_TYPE_HF, EALIGN_GRF, false, CName::NONE);
7944     }
7945 
7946     bool feedbackEnable = writeMask.isSet(4);
7947     uint label = 0;
7948     CVariable* flag = nullptr;
7949     bool needLoop = ResourceLoopHeader(resource, flag, label);
7950     m_encoder->SetPredicate(flag);
7951     m_encoder->LoadMS(opCode, writeMask.getEM(), offset, resource, numSources, dst, payload, feedbackEnable);
7952     m_encoder->Push();
7953     if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
7954     {
7955         CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
7956         m_encoder->Cast(m_currShader->GetNULL(), tempdest);
7957         m_encoder->Push();
7958         m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
7959         m_encoder->Push();
7960     }
7961     ResourceLoopBackEdge(needLoop, flag, label);
7962 
7963     if (simd8HFRet)
7964     {
7965         PackSIMD8HFRet(dst);
7966     }
7967 
7968     if (feedbackEnable)
7969     {
7970         emitFeedbackEnable();
7971     }
7972 }
7973 
7974 void EmitPass::emitCSSGV(GenIntrinsicInst* inst)
7975 {
7976     CComputeShader* csProgram = static_cast<CComputeShader*>(m_currShader);
7977     SGVUsage usage =
7978         static_cast<SGVUsage>(llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue());
7979     CVariable* pThreadIdInGroup = nullptr;
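         // Thread group IDs live in R0 of the thread payload: r0.1 and r0.6 hold the X and Y
         // group IDs (swapped when the dispatch walks along Y), and r0.7 holds Z.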
7980     switch (usage)
7981     {
7982     case THREAD_GROUP_ID_X:
7983     {
7984         m_encoder->SetSrcRegion(0, 0, 1, 0);
7985         if (csProgram->GetDispatchAlongY())
7986         {
7987             m_encoder->SetSrcSubReg(0, 6);
7988         }
7989         else
7990         {
7991             m_encoder->SetSrcSubReg(0, 1);
7992         }
7993         m_encoder->Copy(m_destination, csProgram->GetR0());
7994         m_encoder->Push();
7995         break;
7996     }
7997     case THREAD_GROUP_ID_Y:
7998     {
7999         m_encoder->SetSrcRegion(0, 0, 1, 0);
8000         if (csProgram->GetDispatchAlongY())
8001         {
8002             m_encoder->SetSrcSubReg(0, 1);
8003         }
8004         else
8005         {
8006             m_encoder->SetSrcSubReg(0, 6);
8007         }
8008         m_encoder->Copy(m_destination, csProgram->GetR0());
8009         m_encoder->Push();
8010         break;
8011     }
8012     case THREAD_GROUP_ID_Z:
8013     {
8014         m_encoder->SetSrcRegion(0, 0, 1, 0);
8015         m_encoder->SetSrcSubReg(0, 7);
8016         m_encoder->Copy(m_destination, csProgram->GetR0());
8017         m_encoder->Push();
8018         break;
8019     }
8020     case THREAD_ID_IN_GROUP_X:
8021     {
8022         IGC_ASSERT_MESSAGE(inst->getType() == Type::getInt16Ty(inst->getContext()), "only 16bit ThreadID is supported now.");
8023         pThreadIdInGroup = csProgram->CreateThreadIDsinGroup(THREAD_ID_IN_GROUP_X);
8024         m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
8025         break;
8026     }
8027     case THREAD_ID_IN_GROUP_Y:
8028     {
8029         IGC_ASSERT_MESSAGE(inst->getType() == Type::getInt16Ty(inst->getContext()), "only 16bit ThreadID is supported now.");
8030         pThreadIdInGroup = csProgram->CreateThreadIDsinGroup(THREAD_ID_IN_GROUP_Y);
8031         m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
8032         break;
8033     }
8034     case THREAD_ID_IN_GROUP_Z:
8035     {
8036         IGC_ASSERT_MESSAGE(inst->getType() == Type::getInt16Ty(inst->getContext()), "only 16bit ThreadID is supported now.");
8037         pThreadIdInGroup = csProgram->CreateThreadIDsinGroup(THREAD_ID_IN_GROUP_Z);
8038         m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
8039         break;
8040     }
8041     default:
8042         break;
8043     }
8044 }
8045 
8046 // Store Coarse Pixel (Actual) size in the destination variable
8047 void EmitPass::getCoarsePixelSize(CVariable* destination, const uint component, bool isCodePatchCandidate)
8048 {
8049     IGC_ASSERT(component < 2);
8050 
8051     CPixelShader* const psProgram = static_cast<CPixelShader*>(m_currShader);
8052     CVariable* r;
8053     bool isR1Lo = false;
8054     // Coarse pixel sizes are in R1 for both simd32 halves.
8055     {
8056         r = psProgram->GetPhase() == PSPHASE_PIXEL ? psProgram->GetCoarseR1() : psProgram->GetR1();
8057         isR1Lo = true;
8058     }
8059     r = m_currShader->GetVarHalf(r, 0);
8060     CVariable* const coarsePixelSize = m_currShader->BitCast(r, ISA_TYPE_UB);
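         // The coarse pixel sizes are byte-sized fields at the start of R1: byte 0 holds the
         // X size and byte 1 the Y size, selected through 'subReg' below.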
8061     if (isR1Lo && isCodePatchCandidate)
8062     {
8063         psProgram->AppendR1Lo(coarsePixelSize);
8064     }
8065     m_encoder->SetSrcRegion(0, 0, 1, 0);
8066     uint subReg;
8067     {
8068         subReg = (component == 0) ? 0 : 1;
8069     }
8070     m_encoder->SetSrcSubReg(0, subReg);
8071     if (isCodePatchCandidate)
8072     {
8073         m_encoder->SetPayloadSectionAsPrimary();
8074     }
8075     m_encoder->Cast(destination, coarsePixelSize);
8076     m_encoder->Push();
8077     if (isCodePatchCandidate)
8078     {
8079         m_encoder->SetPayloadSectionAsSecondary();
8080     }
8081 }
8082 
8083 void EmitPass::emitPSSGV(GenIntrinsicInst* inst)
8084 {
8085     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
8086     CVariable* dst = m_destination;
8087     const SGVUsage usage = (SGVUsage)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
8088 
8089     // Helper lambda to copy SGV data from thread payload when no data
8090     // processing/calculation is needed.
8091     auto CopySGV = [this, &psProgram, usage](
8092         CVariable* dst, CVariable* src)->void
8093     {
8094         IGC_ASSERT(usage == POSITION_Z ||
8095             usage == POSITION_W ||
8096             usage == INPUT_COVERAGE_MASK);
8097         {
8098             m_encoder->Copy(dst, src);
8099             m_encoder->Push();
8100         }
8101     };
8102 
8103     switch (usage)
8104     {
8105     case POSITION_Z:
8106     {
8107         if (psProgram->GetPhase() == PSPHASE_PIXEL || psProgram->GetPhase() == PSPHASE_COARSE)
8108         {
8109             // source depth:
8110             //      src_z = (x - xstart)*z_cx + (y - ystart)*z_cy + z_c0
8111             CVariable* delta = psProgram->GetZWDelta();
8112             CVariable* floatR1 = nullptr;
8113             {
8114                 floatR1 = psProgram->BitCast(psProgram->GetR1(), ISA_TYPE_F);
8115                 if (m_encoder->IsCodePatchCandidate())
8116                 {
8117                     psProgram->AppendR1Lo(floatR1);
8118                 }
8119             }
8120 
8121             // Returns (x - xstart) or (y - ystart) in float.
8122             auto getPixelPositionDelta = [this, psProgram, delta, floatR1](const uint component)->CVariable*
8123             {
8124                 IGC_ASSERT(component < 2);
8125                 CVariable* uintPixelPosition =
8126                     m_currShader->GetNewVariable(
8127                         numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW, EALIGN_GRF, CName::NONE);
8128                 getPixelPosition(uintPixelPosition, component, m_encoder->IsCodePatchCandidate());
8129 
8130                 CVariable* floatPixelPosition =
8131                     m_currShader->GetNewVariable(
8132                         numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F, EALIGN_GRF, CName::NONE);
8133                 if (m_encoder->IsCodePatchCandidate())
8134                 {
8135                     m_encoder->SetPayloadSectionAsPrimary();
8136                 }
8137                 m_encoder->Cast(floatPixelPosition, uintPixelPosition);
8138                 m_encoder->Push();
8139                 if (m_encoder->IsCodePatchCandidate())
8140                 {
8141                     m_encoder->SetPayloadSectionAsSecondary();
8142                 }
8143 
8144                 // Pixel location is center in all APIs that use CPS.
8145                 {
8146                     CVariable* pixelCenter = m_currShader->ImmToVariable(0x3f000000, ISA_TYPE_F, m_encoder->IsCodePatchCandidate()); // 0.5f
8147                     if (psProgram->GetPhase() == PSPHASE_COARSE)
8148                     {
8149                         CVariable* coarsePixelSize = m_currShader->GetNewVariable(
8150                             numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F, EALIGN_GRF, CName::NONE);
8151                         getCoarsePixelSize(coarsePixelSize, component, m_encoder->IsCodePatchCandidate());
8152                         if (m_encoder->IsCodePatchCandidate())
8153                         {
8154                             m_encoder->SetPayloadSectionAsPrimary();
8155                             m_currShader->AddPatchTempSetup(coarsePixelSize);
8156                         }
8157                         m_encoder->Mul(coarsePixelSize, coarsePixelSize, pixelCenter);
8158                         m_encoder->Push();
8159                         if (m_encoder->IsCodePatchCandidate())
8160                         {
8161                             m_encoder->SetPayloadSectionAsSecondary();
8162                         }
8163                         pixelCenter = coarsePixelSize;
8164                     }
8165                     if (m_encoder->IsCodePatchCandidate())
8166                     {
8167                         m_encoder->SetPayloadSectionAsPrimary();
8168                         m_currShader->AddPatchTempSetup(floatPixelPosition);
8169                     }
8170                     m_encoder->Add(floatPixelPosition, floatPixelPosition, pixelCenter);
8171                     m_encoder->Push();
8172                     if (m_encoder->IsCodePatchCandidate())
8173                     {
8174                         m_encoder->SetPayloadSectionAsSecondary();
8175                     }
8176                 }
8177 
8178                 CVariable* floatPixelPositionDelta = floatPixelPosition; // Reuse the same variable for the final delta
8179 
8180                 m_encoder->SetSrcRegion(1, 0, 1, 0);
8181                 CVariable* startCoordinate = floatR1;
8182                 uint topLeftVertexStartSubReg = (component == 0 ? 1 : 6); // R1.1 for XStart and R1.6 for YStart
8183 
8184                 {
8185                     if (psProgram->m_Platform->hasStartCoordinatesDeliveredWithDeltas())
8186                     {
8187                         startCoordinate = delta;
8188                         topLeftVertexStartSubReg = (component == 0 ? 2 : 6);
8189                     }
8190                     m_encoder->SetSrcSubReg(1, topLeftVertexStartSubReg);
8191                     m_encoder->SetSrcModifier(1, EMOD_NEG);
8192                     if (m_encoder->IsCodePatchCandidate())
8193                     {
8194                         m_encoder->SetPayloadSectionAsPrimary();
8195                     }
8196                     m_encoder->Add(floatPixelPositionDelta, floatPixelPosition, startCoordinate);
8197                     m_encoder->Push();
8198                     if (m_encoder->IsCodePatchCandidate())
8199                     {
8200                         m_encoder->SetPayloadSectionAsSecondary();
8201                     }
8202                 }
8203                 return floatPixelPositionDelta;
8204             };
8205             const uint componentX = 0;
8206             const uint componentY = 1;
8207             // (x - xstart)
8208             CVariable* floatPixelPositionDeltaX = getPixelPositionDelta(componentX);
8209             // (y - ystart)
8210             CVariable* floatPixelPositionDeltaY = getPixelPositionDelta(componentY);
8211 
8212             // (y - ystart)*z_cy + z_c0
8213             {
8214                 {
8215                     m_encoder->SetSrcRegion(1, 0, 1, 0);
8216                     m_encoder->SetSrcRegion(2, 0, 1, 0);
8217                 }
8218                 m_encoder->SetSrcSubReg(1, 0);
8219                 m_encoder->SetSrcSubReg(2, 3);
8220                 ContextSwitchPayloadSection();
8221                 m_encoder->Mad(floatPixelPositionDeltaY, floatPixelPositionDeltaY, delta, delta);
8222                 m_encoder->Push();
8223             }
8224             // (x - xstart)*z_cx + (y - ystart)*z_cy + z_c0
8225             {
8226                 {
8227                     m_encoder->SetSrcRegion(1, 0, 1, 0);
8228                 }
8229                 m_encoder->SetSrcSubReg(1, 1);
8230                 m_encoder->Mad(m_destination, floatPixelPositionDeltaX, delta, floatPixelPositionDeltaY);
8231                 m_encoder->Push();
8232             }
8233             ContextSwitchShaderBody();
8234         }
8235         else
8236         {
8237             CopySGV(dst, psProgram->GetPositionZ());
8238         }
8239         break;
8240     }
8241     case POSITION_W:
8242     {
8243         CopySGV(dst, psProgram->GetPositionW());
8244         break;
8245     }
8246     case POSITION_X_OFFSET:
8247     case POSITION_Y_OFFSET:
8248     {
8249         // Get the payload register that contains PSXYPositionOffset.
8250         CVariable* pPositionXYOffset = psProgram->GetPositionXYOffset();
8251         // Access the correct subregion for the interleaved X/Y values, as defined by the spec.
8252         m_encoder->SetSrcRegion(0, 16, 8, 2);
8253         m_encoder->SetSrcSubReg(0, usage == POSITION_X_OFFSET ? 0 : 1);
8254 
8255         // U4.4 encoding: the upper 4 bits hold the integer part and the lower 4 bits the fractional part.
8256         // Extract the integer part by ANDing with 0xF0 and shifting right by 4 bits.
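             // For example, a payload byte of 0x2A decodes to 2 + 10/16 = 2.625.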
8257         CVariable* intVal_B = m_currShader->GetNewVariable(
8258             numLanes(m_currShader->m_SIMDSize),
8259             ISA_TYPE_B,
8260             EALIGN_GRF,
8261             CName::NONE);
8262         m_encoder->And(intVal_B, pPositionXYOffset, m_currShader->ImmToVariable(0xf0, ISA_TYPE_B));
8263         m_encoder->Push();
8264 
8265         CVariable* intVal_F = m_currShader->GetNewVariable(
8266             numLanes(m_currShader->m_SIMDSize),
8267             ISA_TYPE_F,
8268             EALIGN_GRF,
8269             CName::NONE);
8270         m_encoder->Shr(intVal_B, intVal_B, m_currShader->ImmToVariable(0x04, ISA_TYPE_B));
8271         m_encoder->Cast(intVal_F, intVal_B);
8272         m_encoder->Push();
8273 
8274         // Extract the fractional part by ANDing with 0x0F and dividing by 16.
8275         CVariable* deciVal_B = m_currShader->GetNewVariable(
8276             numLanes(m_currShader->m_SIMDSize),
8277             ISA_TYPE_B,
8278             EALIGN_GRF,
8279             CName::NONE);
8280 
8281         m_encoder->SetSrcRegion(0, 16, 8, 2);
8282         m_encoder->SetSrcSubReg(0, usage == POSITION_X_OFFSET ? 0 : 1);
8283         m_encoder->And(deciVal_B, pPositionXYOffset, m_currShader->ImmToVariable(0x0f, ISA_TYPE_B));
8284         m_encoder->Push();
8285 
8286         CVariable* deciVal_F = m_currShader->GetNewVariable(
8287             numLanes(m_currShader->m_SIMDSize),
8288             ISA_TYPE_F,
8289             EALIGN_GRF,
8290             CName::NONE);
8291         m_encoder->Cast(deciVal_F, deciVal_B);
8292         // Divide the 4-bit fractional value by 16, implemented as the cheaper multiply by 0.0625.
8293         CVariable* temp = m_currShader->GetNewVariable(
8294             numLanes(m_currShader->m_SIMDSize),
8295             ISA_TYPE_F,
8296             EALIGN_GRF,
8297             CName::NONE);
8298         m_encoder->Mul(temp, deciVal_F, m_currShader->ImmToVariable(0x3d800000, ISA_TYPE_F));
8299         m_encoder->Push();
8300 
8301         // Add the fractional and integer parts to compute the PSXYOffset value.
8302         m_encoder->Add(dst, intVal_F, temp);
8303         m_encoder->Push();
8304     }
8305     break;
8306     case RENDER_TARGET_ARRAY_INDEX:
8307     case VIEWPORT_INDEX:
8308     case VFACE:
8309     {
8310         // VFACE in the shader's payload is a single bit: 0/1 for front/back facing, respectively.
8311         // Since it is the sign bit of the R1.2:w value in the payload, that value can simply be
8312         // converted to float and stored in dst: float(VFACE) >= 0 (< 0) means front (back) facing.
8313         {
8314             unsigned int numTri = 1;
8315             SIMDMode simdSize = psProgram->m_SIMDSize;
8316             CVariable* reg = psProgram->GetR0();
8317             unsigned int subReg = 0;
8318             if (usage == VFACE)
8319             {
8320                 for (unsigned int i = 0; i < numTri; i++)
8321                 {
8322                     CVariable* src = m_currShader->BitCast(reg, ISA_TYPE_W);
8323                     m_encoder->SetSrcSubReg(0, 2 * (subReg + i * 5));
8324                     m_encoder->SetSrcRegion(0, 0, 1, 0);
8325                     m_encoder->SetSimdSize(simdSize);
8326                     m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
8327                     m_encoder->SetDstSubVar(i);
8328                     if (m_encoder->IsCodePatchCandidate())
8329                     {
8330                         psProgram->AppendR1Lo(src);
8331                     }
8332                     ContextSwitchPayloadSection(i == 0);
8333                     m_encoder->Cast(dst, src);
8334                     m_encoder->Push();
8335                     ContextSwitchShaderBody(i == numTri);
8336                 }
8337             }
8338             else if (usage == RENDER_TARGET_ARRAY_INDEX)
8339             {
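                     // The payload dword is shifted right by 16 and masked to 11 bits below,
                     // i.e. the render target array index occupies bits [26:16].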
8340                 dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8341                 CVariable* temp = m_currShader->GetNewVariable(dst);
8342                 for (unsigned int i = 0; i < numTri; i++)
8343                 {
8344                     m_encoder->SetSrcRegion(0, 0, 1, 0);
8345                     m_encoder->SetSrcSubReg(0, subReg + i * 5);
8346                     m_encoder->SetSimdSize(simdSize);
8347                     m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
8348                     m_encoder->SetDstSubVar(i);
8349                     m_encoder->Shr(temp, reg, m_currShader->ImmToVariable(16, ISA_TYPE_UD));
8350                     m_encoder->Push();
8351                 }
8352                 m_encoder->And(dst, temp, m_currShader->ImmToVariable(BITMASK(11), ISA_TYPE_UD));
8353                 m_encoder->Push();
8354             }
8355             else if (usage == VIEWPORT_INDEX)
8356             {
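                     // The payload dword is shifted right by 27 and masked to 4 bits below,
                     // i.e. the viewport index occupies bits [30:27].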
8357                 dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8358                 CVariable* temp = m_currShader->GetNewVariable(dst);
8359                 for (unsigned int i = 0; i < numTri; i++)
8360                 {
8361                     m_encoder->SetSrcRegion(0, 0, 1, 0);
8362                     m_encoder->SetSrcSubReg(0, subReg + i * 5);
8363                     m_encoder->SetSimdSize(simdSize);
8364                     m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
8365                     m_encoder->SetDstSubVar(i);
8366                     m_encoder->Shr(temp, reg, m_currShader->ImmToVariable(27, ISA_TYPE_UD));
8367                     m_encoder->Push();
8368                 }
8369                 m_encoder->And(dst, temp, m_currShader->ImmToVariable(BITMASK(4), ISA_TYPE_UD));
8370                 m_encoder->Push();
8371             }
8372         }
8373     }
8374     break;
8375     case SAMPLEINDEX:
8376     {
8377         // The sample index is stored as one half-byte per subspan. We shift each lane
8378         // right by a different amount to extract the correct value for its subspan:
8379         // shr (8) r9.0<1>:uw   r1.0<0;1,0>:uw   0x0c080400:uv    { Align1, NoMask, Q1 }
8380         // and (16) r9.0<1>:ud   r9.0<1;4,0>:ud   0x0000000f:uw       { Align1, Q1 }
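             // With the UD bitcast and the <1;4,0> region used in the And below, lanes 0..3 read
             // (r1.0 >> 0) & 0xF, lanes 4..7 read (r1.0 >> 4) & 0xF, and so on, so the four lanes
             // of each subspan all receive that subspan's sample index.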
8381         CVariable* shiftPos = m_currShader->ImmToVariable(0x0C080400, ISA_TYPE_UV);
8382         CVariable* temp = nullptr;
8383         {
8384             CVariable* r1 = m_currShader->BitCast(psProgram->GetR1(), ISA_TYPE_UW);
8385             temp = m_currShader->GetNewVariable(8, ISA_TYPE_UW, EALIGN_GRF,
8386                                                 "SampleIndexExtracted");
8387             m_encoder->SetSrcRegion(0, 0, 1, 0);
8388             m_encoder->SetSimdSize(SIMDMode::SIMD8);
8389             m_encoder->SetNoMask();
8390             m_encoder->Shr(temp, r1, shiftPos);
8391             m_encoder->Push();
8392         }
8393 
8394         CVariable* andMask = m_currShader->ImmToVariable(0x0000000F, ISA_TYPE_UD);
8395         dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8396         temp = m_currShader->BitCast(temp, ISA_TYPE_UD);
8397         m_encoder->SetSrcRegion(0, 1, 4, 0);
8398         m_encoder->And(dst, temp, andMask);
8399         m_encoder->Push();
8400     }
8401     break;
8402     case INPUT_COVERAGE_MASK:
8403     {
8404         CVariable* pInputCoverageMask = psProgram->GetInputCoverageMask();
8405         CopySGV(dst, pInputCoverageMask);
8406     }
8407     break;
8408     case ACTUAL_COARSE_SIZE_X:
8409     case ACTUAL_COARSE_SIZE_Y:
8410     {
8411         getCoarsePixelSize(m_destination, (usage == ACTUAL_COARSE_SIZE_X ? 0 : 1));
8412     }
8413     break;
8414     case REQUESTED_COARSE_SIZE_X:
8415     case REQUESTED_COARSE_SIZE_Y:
8416     {
8417         CVariable* requestedSize = (usage == REQUESTED_COARSE_SIZE_X) ?
8418             psProgram->GetCPSRequestedSizeX() : psProgram->GetCPSRequestedSizeY();
8419         m_encoder->SetSrcRegion(0, 1, 4, 0);
8420         m_encoder->Cast(m_destination, requestedSize);
8421         m_encoder->Push();
8422     }
8423     break;
8424     case MSAA_RATE:
8425     {
8426         dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8427         m_encoder->SetSrcRegion(0, 0, 1, 0);
8428         CVariable* r;
8429         {
8430             m_encoder->SetSrcSubReg(0, 1);
8431             r = psProgram->GetR1();
8432         }
8433         m_encoder->And(
8434             dst,
8435             m_currShader->BitCast(r, ISA_TYPE_UW),
8436             m_currShader->ImmToVariable(BITMASK(4), ISA_TYPE_UW));
8437         m_encoder->Push();
8438     }
8439     break;
8440 
8441     default:
8442         IGC_ASSERT(0);
8443         break;
8444     }
8445 
8446     psProgram->DeclareSGV(usage);
8447 }
8448 
8449 void EmitPass::emitDSSGV(llvm::GenIntrinsicInst* pInst)
8450 {
8451     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::DOMAIN_SHADER);
8452     CDomainShader* dsProgram = static_cast<CDomainShader*>(m_currShader);
8453     SGVUsage usage = static_cast<SGVUsage>(llvm::dyn_cast<llvm::ConstantInt>(pInst->getOperand(0))->getZExtValue());
8454     if (PRIMITIVEID == usage)
8455     {
8456         if (dsProgram->m_ShaderDispatchMode == ShaderDispatchMode::DUAL_PATCH)
8457         {
8458             m_encoder->SetSrcRegion(0, 4, 4, 0);
8459             m_encoder->SetSrcSubReg(0, 0);
8460             m_encoder->Copy(m_destination, dsProgram->GetPrimitiveID());
8461             m_encoder->Push();
8462         }
8463         else
8464         {
8465             m_encoder->SetSrcRegion(0, 0, 1, 0);
8466             m_encoder->SetSrcSubReg(0, 1);
8467             m_encoder->Copy(m_destination, dsProgram->GetR0());
8468             m_encoder->Push();
8469         }
8470     }
8471 }
8472 
8473 void EmitPass::emitHSSGV(llvm::GenIntrinsicInst* pInst)
8474 {
8475     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::HULL_SHADER);
8476     CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
8477     SGVUsage usage = static_cast<SGVUsage>(llvm::dyn_cast<llvm::ConstantInt>(pInst->getOperand(0))->getZExtValue());
8478     if (PRIMITIVEID == usage)
8479     {
8480         if (hsProgram->GetShaderDispatchMode() == SINGLE_PATCH_DISPATCH_MODE)
8481         {
8482             m_encoder->SetSrcRegion(0, 0, 1, 0);
8483             m_encoder->SetSrcSubReg(0, 1);
8484             m_encoder->Copy(m_destination, hsProgram->GetR0());
8485             m_encoder->Push();
8486         }
8487         else
8488         {
8489             // eight patch dispatch mode
8490             m_encoder->Copy(m_destination, hsProgram->GetR2());
8491             m_encoder->Push();
8492         }
8493     }
8494     else
8495     {
8496         IGC_ASSERT_MESSAGE(0, "Hull Shader SGV not supported");
8497     }
8498 }
8499 
8500 
8501 // Store integer pixel position in the destination variable.
8502 // Only X and Y components are handled here!
8503 void EmitPass::getPixelPosition(CVariable* destination, const uint component, bool isCodePatchCandidate)
8504 {
8505     IGC_ASSERT(component < 2);
8506     IGC_ASSERT(nullptr != destination);
8507     IGC_ASSERT(nullptr != m_encoder);
8508     IGC_ASSERT(m_encoder->IsIntegerType(destination->GetType()));
8509 
8510     const bool getX = (component == 0);
8511 
8512     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
8513     CVariable* imm = m_currShader->ImmToVariable(
8514         getX ? 0x10101010 : 0x11001100, ISA_TYPE_V, isCodePatchCandidate);
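         // Interpreted as packed 4-bit vector immediates, these give the per-lane pixel offsets within
         // a 2x2 subspan: X offsets 0,1,0,1,... and Y offsets 0,0,1,1,... They are added to the
         // per-subspan origin below (after being scaled by the coarse pixel size in the coarse phase).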
8515     CVariable* pixelSize = nullptr;
8516     if (psProgram->GetPhase() == PSPHASE_COARSE)
8517     {
8518         // Coarse pixel sizes are in R1 for both simd32 halves.
8519         CVariable* r;
8520         bool isR1Lo = false;
8521         {
8522             r = m_currShader->GetVarHalf(psProgram->GetR1(), 0);
8523             isR1Lo = true;
8524         }
8525         CVariable* CPSize = m_currShader->BitCast(r, ISA_TYPE_UB);
8526         if (isR1Lo && isCodePatchCandidate)
8527         {
8528             psProgram->AppendR1Lo(CPSize);
8529         }
8530         pixelSize =
8531             m_currShader->GetNewVariable(
8532                 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW, EALIGN_GRF, CName::NONE);
8533         m_encoder->SetSrcRegion(0, 0, 1, 0);
8534         uint subReg;
8535         {
8536             subReg = getX ? 0 : 1;
8537         }
8538         m_encoder->SetSrcSubReg(0, subReg);
8539         if (isCodePatchCandidate)
8540         {
8541             m_encoder->SetPayloadSectionAsPrimary();
8542             m_currShader->AddPatchTempSetup(pixelSize);
8543         }
8544         m_encoder->Mul(pixelSize, CPSize, imm);
8545         m_encoder->Push();
8546         if (isCodePatchCandidate)
8547         {
8548             m_encoder->SetPayloadSectionAsSecondary();
8549         }
8550     }
8551     else
8552     {
8553         pixelSize = imm;
8554     }
8555 
8556     {
8557         CVariable* position = m_currShader->BitCast(psProgram->GetR1(), ISA_TYPE_UW);
8558         // Subreg 4 holds position_x and subreg 5 holds position_y.
8559         m_encoder->SetSrcSubReg(0, getX ? 4 : 5);
8560         m_encoder->SetSrcRegion(0, 2, 4, 0);
8561         if (isCodePatchCandidate)
8562         {
8563             m_encoder->SetPayloadSectionAsPrimary();
8564             psProgram->AppendR1Lo(position);
8565             m_currShader->AddPatchTempSetup(destination);
8566         }
8567         m_encoder->Add(destination, position, pixelSize);
8568         m_encoder->Push();
8569         if (isCodePatchCandidate)
8570         {
8571             m_encoder->SetPayloadSectionAsSecondary();
8572         }
8573     }
8574 }
8575 
8576 
8577 void EmitPass::emitPixelPosition(llvm::GenIntrinsicInst* inst)
8578 {
8579     const GenISAIntrinsic::ID IID = inst->getIntrinsicID();
8580     const uint component = IID == GenISAIntrinsic::GenISA_PixelPositionX ? 0 : 1;
8581     getPixelPosition(m_destination, component);
8582 }
8583 
8584 void EmitPass::emitSGV(SGVIntrinsic* inst)
8585 {
8586     switch (m_currShader->GetShaderType())
8587     {
8588     case ShaderType::PIXEL_SHADER:
8589         emitPSSGV(inst);
8590         break;
8591     case ShaderType::COMPUTE_SHADER:
8592         emitCSSGV(inst);
8593         break;
8594     case ShaderType::DOMAIN_SHADER:
8595         emitDSSGV(inst);
8596         break;
8597     case ShaderType::HULL_SHADER:
8598         emitHSSGV(inst);
8599         break;
8600     case ShaderType::GEOMETRY_SHADER:
8601         emitGS_SGV(inst);
8602         break;
8603     default:
8604         IGC_ASSERT_MESSAGE(0, "This shader should not have SGV");
8605         break;
8606     }
8607 }
8608 
8609 void EmitPass::emitAluNoModifier(llvm::GenIntrinsicInst* inst)
8610 {
8611     CVariable* pSrc0 = GetSymbol(inst->getOperand(0));
8612     CVariable* pSrc1;
8613     CVariable* pSrc2;
8614     CVariable* dst;
8615 
8616     switch (inst->getIntrinsicID())
8617     {
8618     case GenISAIntrinsic::GenISA_bfi:
8619     {
8620         pSrc1 = GetSymbol(inst->getOperand(1));
8621         pSrc2 = GetSymbol(inst->getOperand(2));
8622         CVariable* pSrc3 = GetSymbol(inst->getOperand(3));
8623         m_encoder->Bfi(m_destination, pSrc0, pSrc1, pSrc2, pSrc3);
8624     }
8625     break;
8626     case GenISAIntrinsic::GenISA_ibfe:
8627         pSrc1 = GetSymbol(inst->getOperand(1));
8628         pSrc2 = GetSymbol(inst->getOperand(2));
8629         m_encoder->Bfe(m_destination, pSrc0, pSrc1, pSrc2);
8630         break;
8631     case GenISAIntrinsic::GenISA_ubfe:
8632         pSrc1 = GetSymbol(inst->getOperand(1));
8633         pSrc2 = GetSymbol(inst->getOperand(2));
8634         pSrc0 = m_currShader->BitCast(pSrc0, ISA_TYPE_UD);
8635         pSrc1 = m_currShader->BitCast(pSrc1, ISA_TYPE_UD);
8636         pSrc2 = m_currShader->BitCast(pSrc2, ISA_TYPE_UD);
8637         dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8638         m_encoder->Bfe(dst, pSrc0, pSrc1, pSrc2);
8639         break;
8640     case GenISAIntrinsic::GenISA_firstbitLo:
8641         dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8642         m_encoder->Fbl(dst, pSrc0);
8643         break;
8644     case GenISAIntrinsic::GenISA_firstbitHi:
8645         pSrc0 = m_currShader->BitCast(pSrc0, ISA_TYPE_UD);
8646         dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8647         m_encoder->Fbh(dst, pSrc0);
8648         break;
8649     case GenISAIntrinsic::GenISA_firstbitShi:
8650         dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8651         m_encoder->Fbh(dst, pSrc0);
8652         break;
8653     default:
8654         break;
8655     }
8656     m_encoder->Push();
8657 }
8658 
8659 void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
8660 {
8661     switch (inst->getIntrinsicID())
8662     {
8663     case GenISAIntrinsic::GenISA_OUTPUT:
8664         emitOutput(inst);
8665         break;
8666     case GenISAIntrinsic::GenISA_RTWrite:
8667         emitRenderTargetWrite(cast<RTWritIntrinsic>(inst), false);
8668         break;
8669     case GenISAIntrinsic::GenISA_RTDualBlendSource:
8670         emitDualBlendRT(cast<RTDualBlendSourceIntrinsic>(inst), false);
8671         break;
8672     case GenISAIntrinsic::GenISA_simdLaneId:
8673         emitSimdLaneId(inst);
8674         break;
8675     case GenISAIntrinsic::GenISA_patchInstanceId:
8676         emitPatchInstanceId(inst);
8677         break;
8678     case GenISAIntrinsic::GenISA_simdSize:
8679         emitSimdSize(inst);
8680         break;
8681     case GenISAIntrinsic::GenISA_simdShuffleDown:
8682         emitSimdShuffleDown(inst);
8683         break;
8684     case GenISAIntrinsic::GenISA_simdBlockRead:
8685         emitSimdBlockRead(inst);
8686         break;
8687     case GenISAIntrinsic::GenISA_simdBlockReadBindless:
8688         emitSimdBlockRead(inst, inst->getOperand(1));
8689         break;
8690     case GenISAIntrinsic::GenISA_simdBlockWrite:
8691         emitSimdBlockWrite(inst);
8692         break;
8693     case GenISAIntrinsic::GenISA_simdBlockWriteBindless:
8694         emitSimdBlockWrite(inst, inst->getOperand(2));
8695         break;
8696     case GenISAIntrinsic::GenISA_MediaBlockRead:
8697         emitMediaBlockIO(inst, true);
8698         break;
8699     case GenISAIntrinsic::GenISA_MediaBlockWrite:
8700         emitMediaBlockIO(inst, false);
8701         break;
8702     case GenISAIntrinsic::GenISA_MediaBlockRectangleRead:
8703         emitMediaBlockRectangleRead(inst);
8704         break;
8705     case GenISAIntrinsic::GenISA_simdMediaBlockRead:
8706         emitSimdMediaBlockRead(inst);
8707         break;
8708     case GenISAIntrinsic::GenISA_simdMediaBlockWrite:
8709         emitSimdMediaBlockWrite(inst);
8710         break;
8711     case GenISAIntrinsic::GenISA_frc:
8712         emitFrc(inst);
8713         break;
8714     case GenISAIntrinsic::GenISA_RenderTargetRead:
8715     case GenISAIntrinsic::GenISA_RenderTargetReadSampleFreq:
8716         emitRenderTargetRead(inst);
8717         break;
8718     case GenISAIntrinsic::GenISA_URBWrite:
8719         emitURBWrite(inst);
8720         break;
8721     case GenISAIntrinsic::GenISA_URBRead:
8722     case GenISAIntrinsic::GenISA_URBReadOutput:
8723         emitURBRead(inst);
8724         break;
8725     case GenISAIntrinsic::GenISA_cycleCounter:
8726         emitcycleCounter(inst);
8727         break;
8728     case GenISAIntrinsic::GenISA_SetDebugReg:
8729         emitSetDebugReg(inst);
8730         break;
8731     case GenISAIntrinsic::GenISA_vmeSendIME:
8732         emitVMESendIME(inst);
8733         break;
8734     case GenISAIntrinsic::GenISA_vmeSendIME2:
8735         emitVMESendIME2(inst);
8736         break;
8737     case GenISAIntrinsic::GenISA_vmeSendFBR:
8738         emitVMESendFBR(inst);
8739         break;
8740     case GenISAIntrinsic::GenISA_vmeSendFBR2:
8741         emitVMESendFBR2(inst);
8742         break;
8743     case GenISAIntrinsic::GenISA_vmeSendSIC2:
8744         emitVMESendSIC2(inst);
8745         break;
8746     case GenISAIntrinsic::GenISA_vmeSendSIC:
8747         emitVMESendSIC(inst);
8748         break;
8749     case GenISAIntrinsic::GenISA_vaErode:
8750     case GenISAIntrinsic::GenISA_vaDilate:
8751     case GenISAIntrinsic::GenISA_vaMinMax:
8752         emitVideoAnalyticSLM(inst, 1);
8753         break;
8754     case GenISAIntrinsic::GenISA_vaMinMaxFilter:
8755         emitVideoAnalyticSLM(inst, 8);
8756         break;
8757     case GenISAIntrinsic::GenISA_vaConvolve:
8758     case GenISAIntrinsic::GenISA_vaCentroid:
8759         emitVideoAnalyticSLM(inst, 4);
8760         break;
8761     case GenISAIntrinsic::GenISA_vaConvolveGRF_16x1:
8762     case GenISAIntrinsic::GenISA_vaConvolveGRF_16x4:
8763         emitVideoAnalyticGRF(inst, 1);
8764         break;
8765     case GenISAIntrinsic::GenISA_vaBoolSum:
8766     case GenISAIntrinsic::GenISA_vaBoolCentroid:
8767         emitVideoAnalyticSLM(inst, 2);
8768         break;
8769     case GenISAIntrinsic::GenISA_createMessagePhasesNoInit:
8770     case GenISAIntrinsic::GenISA_createMessagePhasesNoInitV:
8771         break;
8772     case GenISAIntrinsic::GenISA_createMessagePhases:
8773     case GenISAIntrinsic::GenISA_createMessagePhasesV:
8774         emitCreateMessagePhases(inst);
8775         break;
8776     case GenISAIntrinsic::GenISA_getMessagePhaseX:
8777     case GenISAIntrinsic::GenISA_getMessagePhaseXV:
8778         emitGetMessagePhaseX(inst);
8779         break;
8780     case GenISAIntrinsic::GenISA_simdGetMessagePhase:
8781     case GenISAIntrinsic::GenISA_simdGetMessagePhaseV:
8782         emitSimdGetMessagePhase(inst);
8783         break;
8784     case GenISAIntrinsic::GenISA_broadcastMessagePhase:
8785     case GenISAIntrinsic::GenISA_broadcastMessagePhaseV:
8786         emitBroadcastMessagePhase(inst);
8787         return;
8788     case GenISAIntrinsic::GenISA_simdSetMessagePhase:
8789     case GenISAIntrinsic::GenISA_simdSetMessagePhaseV:
8790         emitSimdSetMessagePhase(inst);
8791         break;
8792     case GenISAIntrinsic::GenISA_simdMediaRegionCopy:
8793         emitSimdMediaRegionCopy(inst);
8794         break;
8795     case GenISAIntrinsic::GenISA_extractMVAndSAD:
8796         emitExtractMVAndSAD(inst);
8797         break;
8798     case GenISAIntrinsic::GenISA_cmpSADs:
8799         emitCmpSADs(inst);
8800         break;
8801     case GenISAIntrinsic::GenISA_setMessagePhaseX_legacy:
8802         emitSetMessagePhaseX_legacy(inst);
8803         break;
8804     case GenISAIntrinsic::GenISA_setMessagePhase_legacy:
8805         emitSetMessagePhase_legacy(inst);
8806         break;
8807     case GenISAIntrinsic::GenISA_setMessagePhaseX:
8808     case GenISAIntrinsic::GenISA_setMessagePhaseXV:
8809         emitSetMessagePhaseX(inst);
8810         break;
8811     case GenISAIntrinsic::GenISA_getMessagePhase:
8812     case GenISAIntrinsic::GenISA_getMessagePhaseV:
8813         emitGetMessagePhase(inst);
8814         break;
8815     case GenISAIntrinsic::GenISA_setMessagePhase:
8816     case GenISAIntrinsic::GenISA_setMessagePhaseV:
8817         emitSetMessagePhase(inst);
8818         break;
8819     case GenISAIntrinsic::GenISA_DCL_ShaderInputVec:
8820     case GenISAIntrinsic::GenISA_DCL_inputVec:
8821         emitInput(inst);
8822         break;
8823     case GenISAIntrinsic::GenISA_PullSampleIndexBarys:
8824     case GenISAIntrinsic::GenISA_PullSnappedBarys:
8825     case GenISAIntrinsic::GenISA_PullCentroidBarys:
8826         emitEvalAttribute(inst);
8827         break;
8828     case GenISAIntrinsic::GenISA_Interpolate:
8829         emitInterpolate(inst);
8830         break;
8831     case GenISAIntrinsic::GenISA_Interpolate2:
8832         emitInterpolate2(inst);
8833         break;
8834     case GenISAIntrinsic::GenISA_Interpolant:
8835         emitInterpolant(inst);
8836         break;
8837     case GenISAIntrinsic::GenISA_DCL_DSCntrlPtInputVec:
8838         emitInput(inst);
8839         break;
8840     case GenISAIntrinsic::GenISA_ldptr:
8841         emitLdInstruction(inst);
8842         break;
8843     case GenISAIntrinsic::GenISA_sampleptr:
8844     case GenISAIntrinsic::GenISA_sampleBptr:
8845     case GenISAIntrinsic::GenISA_sampleCptr:
8846     case GenISAIntrinsic::GenISA_sampleDptr:
8847     case GenISAIntrinsic::GenISA_sampleDCptr:
8848     case GenISAIntrinsic::GenISA_sampleLptr:
8849     case GenISAIntrinsic::GenISA_sampleLCptr:
8850     case GenISAIntrinsic::GenISA_sampleBCptr:
8851     case GenISAIntrinsic::GenISA_lodptr:
8852     case GenISAIntrinsic::GenISA_sampleKillPix:
8853         emitSampleInstruction(cast<SampleIntrinsic>(inst));
8854         break;
8855     case GenISAIntrinsic::GenISA_discard:
8856         emitDiscard(inst);
8857         break;
8858     case GenISAIntrinsic::GenISA_resinfoptr:
8859     case GenISAIntrinsic::GenISA_sampleinfoptr:
8860         emitInfoInstruction(cast<InfoIntrinsic>(inst));
8861         break;
8862     case GenISAIntrinsic::GenISA_gather4ptr:
8863     case GenISAIntrinsic::GenISA_gather4Cptr:
8864     case GenISAIntrinsic::GenISA_gather4POptr:
8865     case GenISAIntrinsic::GenISA_gather4POCptr:
8866         emitGather4Instruction(cast<SamplerGatherIntrinsic>(inst));
8867         break;
8868     case GenISAIntrinsic::GenISA_ldmcsptr:
8869     case GenISAIntrinsic::GenISA_ldmsptr:
8870     case GenISAIntrinsic::GenISA_ldmsptr16bit:
8871         emitLdmsInstruction(inst);
8872         break;
8873     case GenISAIntrinsic::GenISA_DCL_SystemValue:
8874         emitSGV(cast<SGVIntrinsic>(inst));
8875         break;
8876     case GenISAIntrinsic::GenISA_PixelPositionX:
8877     case GenISAIntrinsic::GenISA_PixelPositionY:
8878         emitPixelPosition(inst);
8879         break;
8880     case GenISAIntrinsic::GenISA_DCL_GSsystemValue:
8881         emitGS_SGV(cast<SGVIntrinsic>(inst));
8882         break;
8883     case GenISAIntrinsic::GenISA_SampleOffsetX:
8884     case GenISAIntrinsic::GenISA_SampleOffsetY:
8885         emitSampleOffset(inst);
8886         break;
8887     case GenISAIntrinsic::GenISA_typedread:
8888         emitTypedRead(inst);
8889         break;
8890     case GenISAIntrinsic::GenISA_typedwrite:
8891         emitTypedWrite(inst);
8892         break;
8893     case GenISAIntrinsic::GenISA_threadgroupbarrier:
8894     case GenISAIntrinsic::GenISA_threadgroupbarrier_signal:
8895     case GenISAIntrinsic::GenISA_threadgroupbarrier_wait:
8896         emitThreadGroupBarrier(inst);
8897         break;
8898     case GenISAIntrinsic::GenISA_memoryfence:
8899         emitMemoryFence(inst);
8900         break;
8901     case GenISAIntrinsic::GenISA_flushsampler:
8902         emitFlushSamplerCache();
8903         break;
8904     case GenISAIntrinsic::GenISA_typedmemoryfence:
8905         emitTypedMemoryFence(inst);
8906         break;
8907     case GenISAIntrinsic::GenISA_assume_uniform:
8908         // nothing to do
8909         break;
8910     case GenISAIntrinsic::GenISA_intatomicraw:
8911     case GenISAIntrinsic::GenISA_floatatomicraw:
8912     case GenISAIntrinsic::GenISA_intatomicrawA64:
8913     case GenISAIntrinsic::GenISA_floatatomicrawA64:
8914     case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
8915     case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
8916     case GenISAIntrinsic::GenISA_icmpxchgatomicrawA64:
8917     case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
8918         emitAtomicRaw(inst);
8919         break;
8920     case GenISAIntrinsic::GenISA_intatomictyped:
8921     case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
8922         emitAtomicTyped(inst);
8923         break;
8924     case GenISAIntrinsic::GenISA_atomiccounterinc:
8925     case GenISAIntrinsic::GenISA_atomiccounterpredec:
8926         emitAtomicCounter(inst);
8927         break;
8928     case GenISAIntrinsic::GenISA_bfi:
8929     case GenISAIntrinsic::GenISA_ubfe:
8930     case GenISAIntrinsic::GenISA_ibfe:
8931     case GenISAIntrinsic::GenISA_firstbitLo:
8932     case GenISAIntrinsic::GenISA_firstbitHi:
8933     case GenISAIntrinsic::GenISA_firstbitShi:
8934         emitAluNoModifier(inst);
8935         break;
8936     case GenISAIntrinsic::GenISA_OutputTessFactors:
8937         emitHSTessFactors(inst);
8938         break;
8939     case GenISAIntrinsic::GenISA_f32tof16_rtz:
8940         emitf32tof16_rtz(inst);
8941         break;
8942     case GenISAIntrinsic::GenISA_ftoi_rtn:
8943     case GenISAIntrinsic::GenISA_ftoi_rtp:
8944     case GenISAIntrinsic::GenISA_ftoi_rte:
8945     case GenISAIntrinsic::GenISA_ftoui_rtn:
8946     case GenISAIntrinsic::GenISA_ftoui_rtp:
8947     case GenISAIntrinsic::GenISA_ftoui_rte:
8948         emitftoi(inst);
8949         break;
8950     case GenISAIntrinsic::GenISA_itof_rtn:
8951     case GenISAIntrinsic::GenISA_itof_rtp:
8952     case GenISAIntrinsic::GenISA_itof_rtz:
8953     case GenISAIntrinsic::GenISA_uitof_rtn:
8954     case GenISAIntrinsic::GenISA_uitof_rtp:
8955     case GenISAIntrinsic::GenISA_uitof_rtz:
8956     case GenISAIntrinsic::GenISA_ftof_rte:
8957     case GenISAIntrinsic::GenISA_ftof_rtn:
8958     case GenISAIntrinsic::GenISA_ftof_rtp:
8959     case GenISAIntrinsic::GenISA_ftof_rtz:
8960         emitfitof(inst);
8961         break;
8962     case GenISAIntrinsic::GenISA_ftobf:
8963     case GenISAIntrinsic::GenISA_bftof:
8964     case GenISAIntrinsic::GenISA_2fto2bf:
8965         emitfcvt(inst);
8966         break;
8967     case GenISAIntrinsic::GenISA_uavSerializeAll:
8968     case GenISAIntrinsic::GenISA_uavSerializeOnResID:
8969         emitUAVSerialize();
8970         break;
8971     case GenISAIntrinsic::GenISA_globalSync:
8972         emitMemoryFence();
8973         break;
8974     case GenISAIntrinsic::GenISA_PHASE_OUTPUT:
8975     case GenISAIntrinsic::GenISA_PHASE_OUTPUTVEC:
8976         emitPhaseOutput(inst);
8977         break;
8978     case GenISAIntrinsic::GenISA_PHASE_INPUT:
8979     case GenISAIntrinsic::GenISA_PHASE_INPUTVEC:
8980         emitPhaseInput(inst);
8981         break;
8982     case GenISAIntrinsic::GenISA_ldrawvector_indexed:
8983     case GenISAIntrinsic::GenISA_ldraw_indexed:
8984         emitLoadRawIndexed(
8985             cast<LdRawIntrinsic>(inst),
8986             cast<LdRawIntrinsic>(inst)->getOffsetValue(),
8987             nullptr);
8988         break;
8989     case GenISAIntrinsic::GenISA_storerawvector_indexed:
8990     case GenISAIntrinsic::GenISA_storeraw_indexed:
8991         emitStoreRawIndexed(
8992             cast<StoreRawIntrinsic>(inst),
8993             cast<StoreRawIntrinsic>(inst)->getOffsetValue(),
8994             nullptr);
8995         break;
8996     case GenISAIntrinsic::GenISA_GetBufferPtr:
8997         emitGetBufferPtr(inst);
8998         break;
8999     case GenISAIntrinsic::GenISA_readsurfaceinfoptr:
9000         emitSurfaceInfo(inst);
9001         break;
9002     case GenISAIntrinsic::GenISA_mov_identity:
9003     {
9004         // Use Or instead of a Copy, as VISA will remove redundant movs.
9005         auto Var = GetSymbol(inst->getOperand(0));
9006         CVariable* Zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
9007         m_encoder->Or(Var, Var, Zero);
9008         m_encoder->Push();
9009         break;
9010     }
9011     case GenISAIntrinsic::GenISA_source_value:
9012     {
9013         m_encoder->Copy(m_currShader->GetNULL(), GetSymbol(inst->getOperand(0)));
9014         m_encoder->Push();
9015         break;
9016     }
9017     case GenISAIntrinsic::GenISA_movcr:
9018     {
9019         m_encoder->SetSrcSubReg(0, static_cast<uint16_t>(GetImmediateVal(inst->getOperand(0))));
9020         m_encoder->Copy(m_destination, m_currShader->GetCR0());
9021         m_encoder->Push();
9022         break;
9023     }
9024     case GenISAIntrinsic::GenISA_hw_thread_id:
9025     case GenISAIntrinsic::GenISA_hw_thread_id_alloca:
9026     {
9027         m_encoder->Copy(m_destination, m_currShader->GetHWTID());
9028         m_encoder->Push();
9029         break;
9030     }
9031     case GenISAIntrinsic::GenISA_slice_id:
9032     {
9033         if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE ||
9034             m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN9_CORE)
9035             emitStateRegID(14, 15);
9036         else if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN12_CORE ||
9037             m_currShader->m_Platform->GetPlatformFamily() == IGFX_XE_HP_CORE)
9038             emitStateRegID(11, 13);
9039         else
9040             emitStateRegID(12, 14);
9041         break;
9042     }
9043     case GenISAIntrinsic::GenISA_subslice_id:
9044     {
9045         if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE ||
9046             m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN9_CORE)
9047             emitStateRegID(12, 13);
9048         else
9049             emitStateRegID(8, 8);
9050         break;
9051     }
9052     case GenISAIntrinsic::GenISA_dual_subslice_id:
9053     {
9054         if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN11_CORE ||
9055             m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN11LP_CORE ||
9056             m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN12LP_CORE)
9057             emitStateRegID(9, 11);
9058         else if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN12_CORE ||
9059                  m_currShader->m_Platform->GetPlatformFamily() == IGFX_XE_HP_CORE)
9060             emitStateRegID(9, 10);
9061         else
9062             IGC_ASSERT_MESSAGE(0, "No support for Dual Subslice in current platform");
9063         break;
9064     }
9065     case GenISAIntrinsic::GenISA_eu_id:
9066     {
9067         if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE ||
9068             m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN9_CORE)
9069             emitStateRegID(8, 11);
9070         else
9071             emitStateRegID(4, 7);
9072         break;
9073     }
9074     case GenISAIntrinsic::GenISA_getSR0:
9075     {
9076         m_encoder->SetSrcSubReg(0, static_cast<uint16_t>(GetImmediateVal(inst->getOperand(0))));
9077         m_encoder->Copy(m_destination, m_currShader->GetSR0());
9078         m_encoder->Push();
9079         break;
9080     }
9081     case GenISAIntrinsic::GenISA_getSR0_0:
9082     {
9083         m_encoder->SetSrcSubReg(0, 0);
9084         m_encoder->Copy(m_destination, m_currShader->GetSR0());
9085         m_encoder->Push();
9086         break;
9087     }
9088     case GenISAIntrinsic::GenISA_eu_thread_id:
9089         emitStateRegID(0, 2);
9090         break;
9091     case GenISAIntrinsic::GenISA_eu_thread_pause:
9092         emitThreadPause(inst);
9093         break;
9094     case GenISAIntrinsic::GenISA_pair_to_ptr:
9095         emitPairToPtr(inst);
9096         break;
9097     case GenISAIntrinsic::GenISA_StackAlloca:
9098         emitStackAlloca(inst);
9099         break;
9100     case GenISAIntrinsic::GenISA_VLAStackAlloca:
9101         emitVLAStackAlloca(inst);
9102         break;
9103     case GenISAIntrinsic::GenISA_WaveBallot:
9104         emitWaveBallot(inst);
9105         break;
9106     case GenISAIntrinsic::GenISA_WaveInverseBallot:
9107         emitWaveInverseBallot(inst);
9108         break;
9109     case GenISAIntrinsic::GenISA_WaveShuffleIndex:
9110         emitSimdShuffle(inst);
9111         break;
9112     case GenISAIntrinsic::GenISA_WavePrefix:
9113         emitWavePrefix(cast<WavePrefixIntrinsic>(inst));
9114         break;
9115     case GenISAIntrinsic::GenISA_QuadPrefix:
9116         emitQuadPrefix(cast<QuadPrefixIntrinsic>(inst));
9117         break;
9118     case GenISAIntrinsic::GenISA_WaveAll:
9119         emitWaveAll(inst);
9120         break;
9121     case GenISAIntrinsic::GenISA_WaveClustered:
9122         emitWaveClustered(inst);
9123         break;
9124     case GenISAIntrinsic::GenISA_InitDiscardMask:
9125         emitInitDiscardMask(inst);
9126         break;
9127     case GenISAIntrinsic::GenISA_UpdateDiscardMask:
9128         emitUpdateDiscardMask(inst);
9129         break;
9130     case GenISAIntrinsic::GenISA_GetPixelMask:
9131         emitGetPixelMask(inst);
9132         break;
9133     case GenISAIntrinsic::GenISA_dp4a_ss:
9134     case GenISAIntrinsic::GenISA_dp4a_uu:
9135     case GenISAIntrinsic::GenISA_dp4a_su:
9136     case GenISAIntrinsic::GenISA_dp4a_us:
9137         emitDP4A(inst);
9138         break;
9139     case GenISAIntrinsic::GenISA_evaluateSampler:
9140         // nothing to do
9141         break;
9142     case GenISAIntrinsic::GenISA_wavebarrier:
9143         // nothing to do
9144         break;
9145     case GenISAIntrinsic::GenISA_mul_rtz:
9146     case GenISAIntrinsic::GenISA_fma_rtz:
9147     case GenISAIntrinsic::GenISA_add_rtz:
9148         emitFPOrtz(inst);
9149         break;
9150     case GenISAIntrinsic::GenISA_fma_rtp:
9151         emitFMArtp(inst);
9152         break;
9153     case GenISAIntrinsic::GenISA_fma_rtn:
9154         emitFMArtn(inst);
9155         break;
9156     case GenISAIntrinsic::GenISA_CatchAllDebugLine:
9157         emitDebugPlaceholder(inst);
9158         break;
9159     case GenISAIntrinsic::GenISA_getR0:
9160     case GenISAIntrinsic::GenISA_getPayloadHeader:
9161     case GenISAIntrinsic::GenISA_getWorkDim:
9162     case GenISAIntrinsic::GenISA_getNumWorkGroups:
9163     case GenISAIntrinsic::GenISA_getLocalSize:
9164     case GenISAIntrinsic::GenISA_getGlobalSize:
9165     case GenISAIntrinsic::GenISA_getEnqueuedLocalSize:
9166     case GenISAIntrinsic::GenISA_getLocalID_X:
9167     case GenISAIntrinsic::GenISA_getLocalID_Y:
9168     case GenISAIntrinsic::GenISA_getLocalID_Z:
9169     case GenISAIntrinsic::GenISA_getPrivateBase:
9170     case GenISAIntrinsic::GenISA_getPrintfBuffer:
9171     case GenISAIntrinsic::GenISA_getStageInGridOrigin:
9172     case GenISAIntrinsic::GenISA_getStageInGridSize:
9173     case GenISAIntrinsic::GenISA_getSyncBuffer:
9174         emitImplicitArgIntrinsic(inst);
9175         break;
9176     case GenISAIntrinsic::GenISA_dummyInst:
9177         emitDummyInst(inst);
9178         break;
9179     case GenISAIntrinsic::GenISA_vectorUniform:
9180         break;  // pseudo instruction, do nothing
9181     case GenISAIntrinsic::GenISA_staticConstantPatchValue:
9182         emitStaticConstantPatchValue(cast<StaticConstantPatchIntrinsic>(inst));
             break;
9183     case GenISAIntrinsic::GenISA_SetImplicitBufferPtr:
9184         emitStoreImplBufferPtr(inst);
9185         break;
9186     case GenISAIntrinsic::GenISA_SetLocalIdBufferPtr:
9187         emitStoreLocalIdBufferPtr(inst);
9188         break;
9189     case GenISAIntrinsic::GenISA_GetImplicitBufferPtr:
9190         emitLoadImplBufferPtr(inst);
9191         break;
9192     case GenISAIntrinsic::GenISA_GetLocalIdBufferPtr:
9193         emitLoadLocalIdBufferPtr(inst);
9194         break;
9195     default:
9196         // We assume that certain Gen intrinsics are always pattern-matched away,
9197         // so we do not handle them in VISA emission.
9198         // Let us know if you see one of those intrinsics hit this assertion.
9199         inst->print(IGC::Debug::ods());
9200         IGC_ASSERT_MESSAGE(0, "unknown intrinsic");
9201         break;
9202     }
9203 }
9204 
9205 void EmitPass::EmitIntrinsicMessage(llvm::IntrinsicInst* inst)
9206 {
9207     switch (inst->getIntrinsicID())
9208     {
9209     case Intrinsic::lifetime_start:
9210     case Intrinsic::lifetime_end:
9211     case Intrinsic::fabs:
9212     case Intrinsic::trap:
9213         // do nothing
9214         break;
9215     case Intrinsic::stacksave:
9216         // If stack is not initialized (no SP), we can assume there's no VLA.
9217         // We can ignore llvm.stacksave and llvm.stackrestore intrinsics
9218         if (m_currShader->hasSP())
9219             emitLLVMStackSave(inst);
9220         break;
9221 
9222     case Intrinsic::stackrestore:
9223         // If stack is not initialized (no SP), we can assume there's no VLA.
9224         // We can ignore llvm.stacksave and llvm.stackrestore intrinsics
9225         if (m_currShader->hasSP())
9226             emitLLVMStackRestore(inst);
9227         break;
9228 
9229     case Intrinsic::bswap:
9230         emitLLVMbswap(inst);
9231         break;
9232 
9233     case Intrinsic::sqrt:
9234         emitSqrt(inst);
9235         break;
9236 
9237     default:
9238         inst->print(IGC::Debug::ods());
9239         IGC_ASSERT_MESSAGE(0, "unknown intrinsic");
9240         break;
9241     }
9242 }
9243 
9244 bool EmitPass::validateInlineAsmConstraints(llvm::CallInst* inst, SmallVector<StringRef, 8> & constraints)
9245 {
9246     IGC_ASSERT(inst->isInlineAsm());
9247     InlineAsm* IA = cast<InlineAsm>(IGCLLVM::getCalledValue(inst));
9248     StringRef constraintStr(IA->getConstraintString());
9249     if (constraintStr.empty()) return true;
9250 
9251     // Lambda for checking constraint types.
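         // Supported constraints: "=rw" (GRF output), "rw" (GRF input), "rw.u" (uniform input),
         // "i" (immediate input), or a decimal number that ties an input to the matching output operand.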
9252     auto CheckConstraintTypes = [this](StringRef str, CVariable* cv = nullptr)->bool
9253     {
9254         unsigned matchVal;
9255         if (str.equals("=rw"))
9256         {
9257             return true;
9258         }
9259         else if (str.equals("rw"))
9260         {
9261             return true;
9262         }
9263         else if (str.getAsInteger(10, matchVal) == 0)
9264         {
9265             // Also allows matching input reg to output reg
9266             return true;
9267         }
9268         else if (str.equals("i"))
9269         {
9270             return cv && cv->IsImmediate();
9271         }
9272         else if (str.equals("rw.u"))
9273         {
9274             return cv && cv->IsUniform();
9275         }
9276         else
9277         {
9278             IGC_ASSERT_MESSAGE(0, "Unsupported constraint type!");
9279             return false;
9280         }
9281     };
9282 
9283     // Get a list of constraint tokens
9284     constraintStr.split(constraints, ',');
9285 
9286     bool success = true;
9287 
9288     unsigned index = 0;
9289 
9290     // Check the output constraint tokens
9291     for (; index < constraints.size(); index++)
9292     {
9293         StringRef &str = constraints[index];
9294         if (str.startswith("="))
9295         {
9296             success &= CheckConstraintTypes(str);
9297         }
9298         else
9299         {
9300             break;
9301         }
9302     }
9303     if (success)
9304     {
9305         // Check the input constraint tokens
9306         for (unsigned i = 0; i < inst->getNumArgOperands(); i++, index++)
9307         {
9308             CVariable* cv = GetSymbol(inst->getArgOperand(i));
9309             success &= CheckConstraintTypes(constraints[index], cv);
9310         }
9311     }
9312     return success;
9313 }
9314 
9315 // Parse the inlined asm string to generate VISA operands
9316 // Example: "mul (M1, 16) $0(0, 0)<1> $1(0, 0)<1;1,0> $2(0, 0)<1;1,0>", "=r,r,r"(float %6, float %7)
9317 void EmitPass::EmitInlineAsm(llvm::CallInst* inst)
9318 {
9319     std::stringstream& str = m_encoder->GetVISABuilder()->GetAsmTextStream();
9320     InlineAsm* IA = cast<InlineAsm>(IGCLLVM::getCalledValue(inst));
9321     string asmStr = IA->getAsmString();
9322     smallvector<CVariable*, 8> opnds;
9323     SmallVector<StringRef, 8> constraints;
9324 
9325     if (asmStr.empty())
9326         return;
9327 
9328     if (!validateInlineAsmConstraints(inst, constraints))
9329     {
9330         IGC_ASSERT_MESSAGE(0, "Constraints for inline assembly cannot be validated");
9331         return;
9332     }
9333 
9334     if (inst->getType()->isStructTy())
9335     {
9336         // Handle multiple outputs
9337         unsigned numOutputs = inst->getType()->getStructNumElements();
9338         std::vector<CVariable*> outputs(numOutputs);
9339         for (auto& var : outputs) var = nullptr;
9340 
9341         for (auto user : inst->users())
9342         {
9343             ExtractValueInst* ex = dyn_cast<ExtractValueInst>(user);
9344             IGC_ASSERT_MESSAGE(nullptr != ex, "Invalid user of inline asm call");
9345             unsigned id = *ex->idx_begin();
9346             IGC_ASSERT(id < numOutputs);
9347             IGC_ASSERT(outputs[id] == nullptr);
9348             outputs[id] = GetSymbol(ex);
9349         }
9350         for (auto var : outputs) opnds.push_back(var);
9351     }
9352     else if (m_destination)
9353     {
9354         opnds.push_back(m_destination);
9355     }
9356     for (unsigned i = 0; i < inst->getNumArgOperands(); i++)
9357     {
9358         CVariable* cv = GetSymbol(inst->getArgOperand(i));
9359         opnds.push_back(cv);
9360     }
9361 
9362     IGC_ASSERT(opnds.size() == constraints.size());
9363 
9364     // Check for read/write registers
9365     if (!inst->getType()->isVoidTy())
9366     {
9367         for (unsigned i = 0; i < constraints.size(); i++)
9368         {
9369             unsigned destID;
9370             if (constraints[i].getAsInteger(10, destID) == 0)
9371             {
9372                 // If input is linked to output reg, move the input value into the output
9373                 CVariable* cv = opnds[i];
9374                 CVariable* dest = opnds[destID];
9375                 if (cv && dest && cv != dest)
9376                 {
9377                     if (inst->getType()->isVectorTy())
9378                     {
9379                         emitVectorCopy(dest, cv, int_cast<unsigned>(dyn_cast<IGCLLVM::FixedVectorType>(inst->getType())->getNumElements()));
9380                     }
9381                     else
9382                     {
9383                         m_encoder->Copy(dest, cv);
9384                         m_encoder->Push();
9385                     }
9386                 }
9387             }
9388         }
9389     }
9390 
9391     for (unsigned i = 0; i < opnds.size(); i++)
9392     {
9393         CVariable* opVar = opnds[i];
9394         StringRef constraint = constraints[i];
9395 
9396         // All uniform variables must be broadcast if the 'rw' constraint was specified
9397         if (opVar && opVar->IsUniform() && constraint.equals("rw"))
9398         {
9399             opnds[i] = BroadcastIfUniform(opVar);
9400         }
9401         // Special handling: if LLVM replaces a variable with an immediate, we need to insert an extra move
9402         else if (opVar && opVar->IsImmediate() && !constraint.equals("i"))
9403         {
9404             CVariable* tempMov = m_currShader->GetNewVariable(
9405                 1, opVar->GetType(), EALIGN_GRF, true, opVar->getName());
9406             m_encoder->Copy(tempMov, opVar);
9407             m_encoder->Push();
9408             opnds[i] = tempMov;
9409         }
9410     }
9411 
9412     // Replace all instances of ${:uid} with a label string unique to this asm block.
9413     // Clang translates the '%=' format string to '${:uid}' in LLVM IR.
9414     // This option is useful when creating local labels and referring to them multiple times
9415     // in a single template that generates multiple assembler instructions.
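     // For example, an asm template that uses "label_${:uid}" in two places has both
     // occurrences rewritten with the same label string unique to this call site.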
9416     {
9417         string hashStr = m_encoder->GetUniqueInlineAsmLabel();
9418         string uniqueIDStr = "${:uid}";
9419         size_t pos = 0;
9420         while (pos < asmStr.size())
9421         {
9422             size_t varPos = asmStr.find(uniqueIDStr, pos);
9423             if (varPos == string::npos)
9424                 break;
9425             asmStr.replace(varPos, uniqueIDStr.size(), hashStr);
9426             pos = varPos + hashStr.size();
9427         }
9428     }
9429 
9430     str << endl << "/// Inlined ASM" << endl;
9431     // Look for variables to replace with the VISA variable
9432     size_t startPos = 0;
9433     while (startPos < asmStr.size())
9434     {
9435         size_t varPos = asmStr.find('$', startPos);
9436         if (varPos == string::npos)
9437             break;
9438 
9439         // Find the operand number
9440         const char* idStart = &(asmStr[varPos + 1]);
9441         const char* idEnd = idStart;
9442         while (*idEnd >= '0' && *idEnd <= '9')
9443             ++idEnd;
9444 
9445         unsigned val = 0;
9446         if (StringRef(idStart, idEnd - idStart).getAsInteger(10, val))
9447         {
9448             IGC_ASSERT_MESSAGE(0, "Invalid operand format");
9449             return;
9450         }
9451         if (val >= opnds.size())
9452         {
9453             IGC_ASSERT_MESSAGE(0, "Invalid operand index");
9454             return;
9455         }
9456         string varName = opnds[val] ? m_encoder->GetVariableName(opnds[val]) : "null";
9457         asmStr.replace(varPos, (idEnd - idStart + 1), varName);
9458 
9459         startPos = varPos + varName.size();
9460     }
9461 
9462     str << asmStr;
9463     if (asmStr.back() != '\n') str << endl;
9464     str << "/// End Inlined ASM" << endl << endl;
9465 }
9466 
9467 CVariable* EmitPass::Mul(CVariable* Src0, CVariable* Src1, const CVariable* DstPrototype)
9468 {
9469     bool IsSrc0Imm = Src0->IsImmediate();
9470     bool IsSrc1Imm = Src1->IsImmediate();
9471     if (IsSrc0Imm && IsSrc1Imm) {
9472         uint64_t Prod = Src0->GetImmediateValue() * Src1->GetImmediateValue();
9473         return m_currShader->ImmToVariable(Prod, DstPrototype->GetType());
9474     }
9475     if (IsSrc0Imm && !IsSrc1Imm) {
9476         std::swap(Src0, Src1);
9477     }
9478     if (IsSrc1Imm) {
9479         APInt Imm(m_DL->getPointerSizeInBits(), Src1->GetImmediateValue());
9480         if (Imm == 0) {
9481             return Src1;
9482         }
9483         if (Imm == 1) {
9484             return Src0;
9485         }
9486         if (Imm.isPowerOf2()) {
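                 // Strength-reduce multiplication by a power of two into a left shift,
                 // e.g. x * 8 becomes x << 3.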
9487             unsigned Amt = Imm.logBase2();
9488             CVariable* VarAmt = m_currShader->ImmToVariable(Amt, ISA_TYPE_UD);
9489             CVariable* Dst = m_currShader->GetNewVariable(DstPrototype);
9490             m_encoder->Shl(Dst, Src0, VarAmt);
9491             m_encoder->Push();
9492             return Dst;
9493         }
9494     }
9495 
9496     CVariable* Dst = m_currShader->GetNewVariable(DstPrototype);
9497     VISA_Type srcType = Src0->GetType();
9498 
9499     // Only i64 muls need special handling, otherwise go back to standard flow
9500     if (srcType != ISA_TYPE_Q && srcType != ISA_TYPE_UQ)
9501     {
9502         m_encoder->Mul(Dst, Src0, Src1);
9503         m_encoder->Push();
9504     }
9505     else {
9506         CVariable* src[] = { Src0, Src1 };
9507         Mul64(Dst, src, m_currShader->m_SIMDSize);
9508     }
9509     return Dst;
9510 }
9511 
9512 CVariable* EmitPass::Add(CVariable* Src0, CVariable* Src1, const CVariable* DstPrototype)
9513 {
9514     bool IsSrc0Imm = Src0->IsImmediate();
9515     bool IsSrc1Imm = Src1->IsImmediate();
9516     if (IsSrc1Imm && !Src1->GetImmediateValue()) {
9517         return Src0;
9518     }
9519     if (IsSrc0Imm && !Src0->GetImmediateValue()) {
9520         return Src1;
9521     }
9522     if (IsSrc0Imm && IsSrc1Imm) {
9523         uint64_t Sum = Src0->GetImmediateValue() + Src1->GetImmediateValue();
9524         return m_currShader->ImmToVariable(Sum, DstPrototype->GetType());
9525     }
9526     CVariable* Dst = m_currShader->GetNewVariable(DstPrototype);
9527     m_encoder->Add(Dst, Src0, Src1);
9528     m_encoder->Push();
9529     return Dst;
9530 }
9531 
9532 // Insert lifetime start right before instruction I if it is a candidate.
9533 void EmitPass::emitLifetimeStart(CVariable* Var, BasicBlock* BB, Instruction* I, bool ForAllInstance)
9534 {
9535     if (m_pCtx->getVectorCoalescingControl() == 0 || Var == nullptr) {
9536         return;
9537     }
9538 
9539     // m_LifetimeAt1stDefOfBB uses dessa root of aliasee as its key
9540     Value* ARV = m_VRA->getAliasRootValue(I);
9541     ARV = m_VRA->getRootValue(ARV);
9542 
9543     auto II = m_VRA->m_LifetimeAt1stDefOfBB.find(ARV);
9544     if (II != m_VRA->m_LifetimeAt1stDefOfBB.end())
9545     {
9546         // Insert lifetime start on the root value
9547         // Note that lifetime is a kind of info directive,
9548         // thus no m_encoder->Push() is needed.
9549         CVariable* RootVar = GetSymbol(ARV);
9550         if (ForAllInstance)
9551         {
9552             for (uint instance = 0; instance < RootVar->GetNumberInstance(); instance++)
9553             {
9554                 m_encoder->SetSecondHalf(instance == 0 ? false : true);
9555                 m_encoder->Lifetime(LIFETIME_START, RootVar);
9556             }
9557         }
9558         else {
9559             // Current instance, set already in the calling context.
9560             m_encoder->Lifetime(LIFETIME_START, RootVar);
9561         }
9562 
9563         // Once inserted, remove it from the map to
9564         // prevent it from being inserted again.
9565         m_VRA->m_LifetimeAt1stDefOfBB.erase(II);
9566     }
9567 }
9568 
9569 void EmitPass::emitGEP(llvm::Instruction* I)
9570 {
9571     GetElementPtrInst& GEP = cast<GetElementPtrInst>(*I);
9572     unsigned AddrSpace = I->getType()->getPointerAddressSpace();
9573     VISA_Type PtrTy =
9574         m_currShader->GetContext()->getRegisterPointerSizeInBits(AddrSpace) == 64 ? ISA_TYPE_UQ : ISA_TYPE_UD;
9575 
9576     // First compute the offset from the base to benefit from constant folding,
9577     // and then add to the base (which is less likely to be a constant).
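    // Illustrative sketch (not actual emitted code): for
    //   %p = getelementptr i32, i32* %base, i32 %i
    // the loop below computes vOffset = %i * 4, and the final address is
    // %base + vOffset.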
9578 
9579     // vOffset is the value of the advancing offset in the loop below
9580     // Use the pre-allocated variable for storage
9581     CVariable* vOffset = m_destination;
9582     // vN is the current offset at the beginning of each iteration in the loop below
9583     CVariable* vN = m_currShader->ImmToVariable(0, PtrTy);
9584     // Note that the pointer operand may be a vector of pointers. Take the scalar
9585     // element which holds a pointer.
9586     Type* Ty = GEP.getPointerOperand()->getType()->getScalarType();
9587 
9588     // Prototype temporary used for cloning from
9589     CVariable* vTmp = m_currShader->GetNewVariable(
9590         numLanes(m_currShader->m_SIMDSize),
9591         PtrTy,
9592         m_currShader->getGRFAlignment(),
9593         m_destination->IsUniform(),
9594         CName::NONE);
9595 
9596     gep_type_iterator GTI = gep_type_begin(GEP);
9597     for (auto OI = GEP.op_begin() + 1, E = GEP.op_end(); OI != E; ++OI, ++GTI) {
9598         Value* Idx = *OI;
9599         // Offset of element contributed by current index being visited
9600         CVariable* vElemOffset;
9601         if (StructType * StTy = GTI.getStructTypeOrNull()) {
9602             // GEP indices into structs are always constant i32's
9603             unsigned Field = int_cast<unsigned>(cast<Constant>(Idx)->getUniqueInteger().getZExtValue());
9604             uint64_t Offset = 0;
9605             if (Field) {
9606                 Offset = m_DL->getStructLayout(StTy)->getElementOffset(Field);
9607             }
9608             vElemOffset = m_currShader->ImmToVariable(Offset, ISA_TYPE_UD);
9609             Ty = StTy->getElementType(Field);
9610         }
9611         else {
9612             Ty = GTI.getIndexedType();
9613             // vElemOffset = vIdx * vElemSize
9614             CVariable* vElemSize = m_currShader->ImmToVariable(m_DL->getTypeAllocSize(Ty), PtrTy);
9615             CVariable* vIdx = GetSymbol(Idx);
9616             // The Mul does a push and takes care of constant folding
9617             vElemOffset = Mul(vIdx, vElemSize, vTmp);
9618         }
9619         // vOffset = vN + vElemOffset
9620         vOffset = Add(vElemOffset, vN, vTmp); // The Add does a m_encoder->push
9621         vN = vOffset; // After eating an index operand, advance the current offset
9622     }
9623 
9624     CVariable* vBasePtr = GetSymbol(GEP.getPointerOperand());
9625     // GEP = vBasePtr + vOffset
9626     vTmp = Add(vBasePtr, vOffset, vTmp);  // The Add does a m_encoder->push
9627     // Copy the result
9628     if (CEncoder::GetCISADataTypeSize(vTmp->GetType()) <
9629         CEncoder::GetCISADataTypeSize(m_destination->GetType()))
9630     {
9631         // If both offset and the base are immediates, we may end up with an offset of a smaller
9632         // type than the destination, due to immediate creation optimizations in the Add.
9633         m_encoder->Cast(m_destination, vTmp);
9634     }
9635     else
9636     {
9637         m_encoder->Copy(m_destination, vTmp);
9638     }
9639     m_encoder->Push();
9640 }
9641 
9642 void EmitPass::emitIntToPtr(llvm::IntToPtrInst* I2P)
9643 {
9644     CVariable* src = GetSymbol(I2P->getOperand(0));
9645     CVariable* IntVar = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
9646     m_encoder->Cast(m_destination, IntVar);
9647     m_encoder->Push();
9648 }
9649 
9650 void EmitPass::emitBitCast(llvm::BitCastInst* btCst)
9651 {
9652     Type* srcType = btCst->getOperand(0)->getType();
9653     Type* dstType = btCst->getType();
9654     unsigned int numSrcElement = srcType->isVectorTy() ? (unsigned)cast<IGCLLVM::FixedVectorType>(srcType)->getNumElements() : 1;
9655     unsigned int numDstElement = dstType->isVectorTy() ? (unsigned)cast<IGCLLVM::FixedVectorType>(dstType)->getNumElements() : 1;
9656 
9657     if (srcType->isPointerTy())
9658     {
9659         IGC_ASSERT_MESSAGE(dstType->isPointerTy(), "Expected both src and dst have pointer type.");
9660     }
9661 
9662     if (btCst->getOperand(0)->getType()->isVectorTy() ||
9663         btCst->getType()->isVectorTy())
9664     {
9665         emitVectorBitCast(btCst);
9666         return;
9667     }
9668 
9669     CVariable* src = GetSymbol(btCst->getOperand(0));
9670     CVariable* dst = m_destination;
9671     IGC_ASSERT(nullptr != src);
9672     IGC_ASSERT(nullptr != dst);
9673     IGC_ASSERT_MESSAGE(numSrcElement == 1, "vector to vector bitcast not supported");
9674     IGC_ASSERT_MESSAGE(numDstElement == 1, "vector to vector bitcast not supported");
9675 
9676     src = m_currShader->BitCast(src, dst->GetType());
9677     m_encoder->Copy(dst, src);
9678     m_encoder->Push();
9679 }
9680 
9681 void EmitPass::emitPtrToInt(llvm::PtrToIntInst* P2I)
9682 {
9683     CVariable* dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
9684     CVariable* PtrVar = GetSymbol(P2I->getOperand(0));
9685     m_encoder->Cast(dst, PtrVar);
9686     m_encoder->Push();
9687 }
9688 
9689 void EmitPass::emitAddrSpaceToGenericCast(llvm::AddrSpaceCastInst* addrSpaceCast, CVariable* srcV, unsigned tag)
9690 {
9691     if (m_pCtx->m_hasEmu64BitInsts && m_currShader->m_Platform->hasNoFullI64Support())
9692     {
9693         if (m_currShader->GetContext()->getRegisterPointerSizeInBits(addrSpaceCast->getSrcAddressSpace()) == 32)
9694         {
9695             // Add tag to high part
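            // In the pair{lo, hi} representation, bits [61:63] of the full
            // 64-bit pointer live in bits [29:31] of the high dword, hence
            // the tag << 29 below.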
9696             CVariable* dstAlias = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
9697             // Low:
9698             m_encoder->SetDstRegion(2);
9699             m_encoder->Copy(dstAlias, srcV);
9700             m_encoder->Push();
9701             // High:
9702             m_encoder->SetDstSubReg(1);
9703             m_encoder->SetDstRegion(2);
9704             m_encoder->Copy(dstAlias, m_currShader->ImmToVariable(tag << 29, ISA_TYPE_UD));
9705             m_encoder->Push();
9706         }
9707         else
9708         {
9709             // Src
9710             CVariable* srcAlias = m_currShader->GetNewAlias(srcV, ISA_TYPE_UD, 0, 0);
9711             CVariable* srcLow = m_currShader->GetNewVariable(
9712                 numLanes(m_currShader->m_SIMDSize),
9713                 ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9714                 CName(srcV->getName(), "Lo"));
9715             CVariable* srcHigh = m_currShader->GetNewVariable(
9716                 numLanes(m_currShader->m_SIMDSize),
9717                 ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9718                 CName(srcV->getName(), "Hi"));
9719 
9720             // Split Src into {Low, High}
9721             // Low:
9722             m_encoder->SetSrcSubReg(0, 0);
9723             m_encoder->SetSrcRegion(0, 2, 1, 0);
9724             m_encoder->Copy(srcLow, srcAlias);
9725             m_encoder->Push();
9726             // High:
9727             m_encoder->SetSrcSubReg(0, 1);
9728             m_encoder->SetSrcRegion(0, 2, 1, 0);
9729             m_encoder->Copy(srcHigh, srcAlias);
9730             m_encoder->Push();
9731 
9732             // Add tag to high part
9733             m_encoder->Or(srcHigh, srcHigh, m_currShader->ImmToVariable(tag << 29, ISA_TYPE_UD));
9734             m_encoder->Push();
9735 
9736             // Copy result to Dst
9737             CVariable* dstAlias = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
9738             // Low:
9739             m_encoder->SetDstRegion(2);
9740             m_encoder->Copy(dstAlias, srcLow);
9741             m_encoder->Push();
9742             // High:
9743             m_encoder->SetDstSubReg(1);
9744             m_encoder->SetDstRegion(2);
9745             m_encoder->Copy(dstAlias, srcHigh);
9746             m_encoder->Push();
9747         }
9748     }
9749     else
9750     {
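        // Native 64-bit path: OR the tag directly into bits [61:63] of the
        // pointer (tag << 61).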
9751         CVariable* pTempVar = m_currShader->GetNewVariable(
9752             numLanes(m_currShader->m_SIMDSize),
9753             ISA_TYPE_UQ, m_currShader->getGRFAlignment(),
9754             m_destination->IsUniform(), CName::NONE);
9755         m_encoder->Or(pTempVar, srcV, m_currShader->ImmToVariable(static_cast<uint64_t>(tag) << 61, ISA_TYPE_UQ));
9756         m_encoder->Cast(m_destination, pTempVar);
9757         m_encoder->Push();
9758     }
9759 }
9760 
9761 void EmitPass::emitAddrSpaceCast(llvm::AddrSpaceCastInst* addrSpaceCast)
9762 {
9763     // Tags are used to determine the address space of generic pointers
9764     // cast from private, local or global pointers.
9765     // Bits [60:63] are used for this purpose; bit 60 is reserved for future use.
9766     // The address space tag in bits [61:63] can be:
9767     // 001: private
9768     // 010: local
9769     // 000/111: global
9770 
9771     // On platforms that don't support 64-bit operations, 64-bit pointers are emulated
9772     // with a pair{i32, i32}. So tags on generic pointers are added/removed using:
9773     // - 64-bit Or/And operations directly on platforms with 64-bit operation support.
9774     // - 32-bit Or/And operations on the second element of the pair on platforms with no
9775     //   64-bit operation support.
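    // For example, tagging a private pointer 0x00007ffc12345678 (tag 001 in
    // bits [61:63]) yields the generic pointer 0x20007ffc12345678.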
9776 
9777     CVariable* srcV = GetSymbol(addrSpaceCast->getOperand(0));
9778 
9779     if ((m_pCtx->allocatePrivateAsGlobalBuffer() || m_pCtx->hasNoPrivateToGenericCast()) &&
9780         m_pCtx->hasNoLocalToGenericCast())
9781     {
9782         // If forcing global memory allocation and there are no generic pointers to local AS,
9783         // there is no need to tag generic pointers.
9784         m_encoder->Cast(m_destination, srcV);
9785         m_encoder->Push();
9786         return;
9787     }
9788 
9789     if (srcV->IsImmediate() && srcV->GetImmediateValue() == 0x0)
9790     {
9791         // If casting from null, don't do tagging
9792         m_encoder->Cast(m_destination, srcV);
9793         m_encoder->Push();
9794         return;
9795     }
9796 
9797     unsigned sourceAddrSpace = addrSpaceCast->getSrcAddressSpace();
9798     unsigned destAddrSpace = addrSpaceCast->getDestAddressSpace();
9799 
9800     if (destAddrSpace == ADDRESS_SPACE_GENERIC)
9801     {
9802         // Address space cast is in the form of {private, local, global} -> generic
9803         // A tag is added according to the address space of the source
9804 
9805         MDNode* genericMD = addrSpaceCast->getMetadata("generic.arith");
9806         if (genericMD)
9807         {
9808             m_encoder->Cast(m_destination, srcV);
9809             m_encoder->Push();
9810             return;
9811         }
9812 
9813         if (sourceAddrSpace == ADDRESS_SPACE_PRIVATE && !m_pCtx->allocatePrivateAsGlobalBuffer())
9814         {
9815             emitAddrSpaceToGenericCast(addrSpaceCast, srcV, 1);
9816         }
9817         else if (sourceAddrSpace == ADDRESS_SPACE_LOCAL)
9818         {
9819             emitAddrSpaceToGenericCast(addrSpaceCast, srcV, 2);
9820         }
9821         else // ADDRESS_SPACE_GLOBAL
9822         {
9823             m_encoder->Cast(m_destination, srcV);
9824             m_encoder->Push();
9825         }
9826     }
9827     else if (sourceAddrSpace == ADDRESS_SPACE_GENERIC &&
9828         (destAddrSpace == ADDRESS_SPACE_PRIVATE || destAddrSpace == ADDRESS_SPACE_LOCAL))
9829     {
9830         // Address space cast is in the form of generic -> {private, local}
9831         // The tag is removed according to the address space of the destination.
9832 
9833         // The initial address could be in canonical form, which means bit 47 is replicated
9834         // to the upper bits. As bits [60:63] are already spoiled, we need to restore the
9835         // address to canonical form. This is done by merging bits [56:59], which we
9836         // assume are in canonical form, into bits [60:63].
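        // For example, the tagged generic pointer 0x20007ffc12345678 is
        // restored to 0x00007ffc12345678 by (x << 4) followed by an
        // arithmetic shift right by 4.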
9837 
9838         if (m_pCtx->m_hasEmu64BitInsts && m_currShader->m_Platform->hasNoFullI64Support())
9839         {
9840             if (m_currShader->GetContext()->getRegisterPointerSizeInBits(destAddrSpace) == 32)
9841             {
9842                 // Src
9843                 CVariable* srcAlias = m_currShader->GetNewAlias(srcV, ISA_TYPE_UD, 0, 0);
9844                 CVariable* srcLow = m_currShader->GetNewVariable(
9845                     numLanes(m_currShader->m_SIMDSize),
9846                     ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9847                     CName(srcV->getName(), "Lo"));
9848 
9849                 // Get low part of srcV
9850                 m_encoder->SetSrcSubReg(0, 0);
9851                 m_encoder->SetSrcRegion(0, 2, 1, 0);
9852                 m_encoder->Copy(srcLow, srcAlias);
9853                 m_encoder->Push();
9854 
9855                 // Copy result to Dst
9856                 m_encoder->Cast(m_destination, srcLow);
9857                 m_encoder->Push();
9858             }
9859             else
9860             {
9861                 // Src
9862                 CVariable* srcAlias = m_currShader->GetNewAlias(srcV, ISA_TYPE_UD, 0, 0);
9863                 CVariable* srcLow = m_currShader->GetNewVariable(
9864                     numLanes(m_currShader->m_SIMDSize),
9865                     ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9866                     CName(srcV->getName(), "Lo"));
9867                 CVariable* srcHigh = m_currShader->GetNewVariable(
9868                     numLanes(m_currShader->m_SIMDSize),
9869                     ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9870                     CName(srcV->getName(), "Hi"));
9871                 CVariable* tempVar = m_currShader->GetNewVariable(
9872                     numLanes(m_currShader->m_SIMDSize),
9873                     ISA_TYPE_D, EALIGN_GRF, m_destination->IsUniform(),
9874                     CName::NONE);
9875 
9876                 // Split Src into {Low, High}
9877                 // Low:
9878                 m_encoder->SetSrcSubReg(0, 0);
9879                 m_encoder->SetSrcRegion(0, 2, 1, 0);
9880                 m_encoder->Copy(srcLow, srcAlias);
9881                 m_encoder->Push();
9882                 // High:
9883                 m_encoder->SetSrcSubReg(0, 1);
9884                 m_encoder->SetSrcRegion(0, 2, 1, 0);
9885                 m_encoder->Copy(srcHigh, srcAlias);
9886                 m_encoder->Push();
9887 
9888                 // Clear tag in the high part and restore address canonical form
9889                 m_encoder->Shl(tempVar, srcHigh, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9890                 m_encoder->IShr(srcHigh, tempVar, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9891                 m_encoder->Push();
9892 
9893                 // Copy to Dst
9894                 CVariable* dstAlias = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
9895                 // Low:
9896                 m_encoder->SetDstRegion(2);
9897                 m_encoder->Copy(dstAlias, srcLow);
9898                 m_encoder->Push();
9899                 // High:
9900                 m_encoder->SetDstSubReg(1);
9901                 m_encoder->SetDstRegion(2);
9902                 m_encoder->Copy(dstAlias, srcHigh);
9903                 m_encoder->Push();
9904             }
9905         }
9906         else
9907         {
9908             CVariable* pTempVar = m_currShader->GetNewVariable(
9909                 numLanes(m_currShader->m_SIMDSize),
9910                 ISA_TYPE_Q, m_currShader->getGRFAlignment(),
9911                 m_destination->IsUniform(), CName::NONE);
9912             // Clear tag in the high part and restore address canonical form
9913             m_encoder->Shl(pTempVar, srcV, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9914             m_encoder->IShr(pTempVar, pTempVar, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9915             m_encoder->Cast(m_destination, pTempVar);
9916             m_encoder->Push();
9917         }
9918     }
9919     else // ADDRESS_SPACE_GLOBAL
9920     {
9921         m_encoder->Cast(m_destination, srcV);
9922         m_encoder->Push();
9923     }
9924 }
9925 
9926 void EmitPass::emitExtract(llvm::Instruction* inst)
9927 {
9928     IGC_ASSERT(llvm::isa<llvm::ExtractElementInst>(inst));
9929     llvm::ExtractElementInst* Extract = llvm::cast<llvm::ExtractElementInst>(inst);
9930     llvm::Value* vecOperand = Extract->getVectorOperand();
9931     auto vectorBCI = dyn_cast<BitCastInst>(vecOperand);
9932     CVariable* vector = m_currShader->GetSymbol(vecOperand, true);
9933 
9934     if (llvm::ConstantInt * pConstElem = llvm::dyn_cast<llvm::ConstantInt>(Extract->getIndexOperand()))
9935     {
9936         uint element = m_currShader->AdjustExtractIndex(vecOperand, int_cast<uint16_t>(pConstElem->getZExtValue()));
9937         // Do not use allocated type to compute the offsets; otherwise the computed
9938         // offsets may be out-of-bound. The alignment information of the base
9939         // element type should not impact the offset.
9940         uint eltBytes = GetScalarTypeSizeInRegister(Extract->getType());
9941         IGC_ASSERT_MESSAGE(eltBytes, "illegal ExtractElement instruction");
9942 
9943         if (m_currShader->CanTreatAsAlias(Extract))
9944         {
9945             if (vectorBCI && m_currShader->getCVarForVectorBCI(vectorBCI, element))
9946             {
9947                 // Do nothing, as we can reuse the symbol from the vector bitcast.
9948                 return;
9949             }
9950             uint offset = 0;
9951             if (m_currShader->GetIsUniform(inst->getOperand(0)))
9952             {
9953                 offset = element * eltBytes;
9954             }
9955             else
9956             {
9957                 offset = vector->getOffsetMultiplier() * element * numLanes(m_currShader->m_SIMDSize) * eltBytes;
9958             }
9959             // the symbol table should have coalesced those two values;
9960             // TODO: clean up when we get generic coalescing
9961             IGC_ASSERT(vector == m_destination->GetAlias() || vector->GetAlias() == m_destination->GetAlias());
9962             IGC_ASSERT(m_destination->GetAliasOffset() == (offset + vector->GetAliasOffset()));
9963         }
9964         else
9965         {
9966             if (vectorBCI)
9967             {
9968                 if (auto var = m_currShader->getCVarForVectorBCI(vectorBCI, element))
9969                 {
9970                     // use the separate CVar for each index instead
9971                     m_encoder->Copy(m_destination, var);
9972                     m_encoder->Push();
9973                     return;
9974                 }
9975             }
9976 
9977             if (m_currShader->GetIsUniform(inst->getOperand(0)))
9978             {
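                // For example, with 32-byte GRFs, element 10 of a uniform
                // <16 x float> vector sits at byte offset 40: sub-variable 1,
                // sub-register 2.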
9979                 uint offset = element * eltBytes;
9980                 m_encoder->SetSrcSubVar(0, (offset / getGRFSize()));
9981                 m_encoder->SetSrcSubReg(0, ((offset % getGRFSize()) / eltBytes));
9982             }
9983             else
9984             {
9985                 uint offset = vector->getOffsetMultiplier() * element * numLanes(m_currShader->m_SIMDSize) * eltBytes;
9986                 uint subvar = offset / getGRFSize();
9987                 m_encoder->SetSrcSubVar(0, subvar);
9988                 m_encoder->SetSrcSubReg(0, ((offset % getGRFSize()) / eltBytes));
9989             }
9990             m_encoder->Copy(m_destination, vector);
9991             m_encoder->Push();
9992         }
9993     }
9994     else
9995     {
9996         // We got an index which is not a value known at compile-time.
9997         llvm::Value* pIndex = Extract->getIndexOperand();
9998         llvm::Type* pVecType = vecOperand->getType();
9999 
10000         // When the index type is i32, it is better to create a uw alias since
10001         // the following address computation will be in uw.
10002         CVariable* pIndexVar = GetSymbol(pIndex);
10003         IGC_ASSERT(pIndex->getType()->getPrimitiveSizeInBits() <= 64);
10004 
10005         bool DoAliasing = pIndex->getType()->getPrimitiveSizeInBits() >= 32;
10006         if (DoAliasing)
10007         {
10008             pIndexVar = m_currShader->BitCast(pIndexVar, ISA_TYPE_UW);
10009         }
10010 
10011         // size of vector entry
10012         const uint vectorEntrySimdWidth = vector->IsUniform() ?
10013             1 : numLanes(m_currShader->m_SIMDSize);
10014 
10015         const uint vecTypeSize = GetScalarTypeSizeInRegister(pVecType);
10016 
10017         const uint offset = vectorEntrySimdWidth * vecTypeSize;
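        // For example, for a non-uniform vector of dwords in SIMD16 each
        // element occupies 16 lanes * 4 bytes = 64 bytes, so lane index i
        // maps to byte offset i * 64.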
10018 
10019         CVariable* pOffset1 = m_currShader->ImmToVariable(offset, ISA_TYPE_UW);
10020 
10021         // offset2 is the offset within the array expressed in bytes (index*element size in bytes)
10022         CVariable* pOffset2 = m_currShader->GetNewVariable(
10023             pIndexVar->IsUniform() ? 1 : numLanes(m_currShader->m_SIMDSize),
10024             ISA_TYPE_UW,
10025             pIndexVar->IsUniform() ? EALIGN_WORD : EALIGN_HWORD,
10026             pIndexVar->IsUniform(),
10027             CName::NONE);
10028 
10029         // We bitcast the address as uw so it is an "unpacked" uw
10030         if (!pIndexVar->IsUniform() && DoAliasing)
10031         {
10032             m_encoder->SetSrcRegion(0, 2, 1, 0);
10033         }
10034 
10035         m_encoder->Mul(pOffset2, pIndexVar, pOffset1);
10036         m_encoder->Push();
10037 
10038         // If pIndexVar is non-uniform, we will need to use VxH addressing.
10039         // If both pIndexVar and the vector are non-uniform, we also need to add
10040         // per-element offsets to the contents of the address register.
10041         CVariable* pOffset3 = nullptr;
10042         if (!pIndexVar->IsUniform() && !vector->IsUniform())
10043         {
10044             pOffset3 = m_currShader->GetNewVariable(
10045                 numLanes(m_currShader->m_SIMDSize),
10046                 ISA_TYPE_UW,
10047                 EALIGN_HWORD,
10048                 false,
10049                 CName::NONE);
10050             CVariable* OffsetVar = getOrCreatePerLaneOffsetVariable(vecTypeSize);
10051             m_encoder->Add(pOffset3, pOffset2, OffsetVar);
10052             m_encoder->Push();
10053         }
10054         else
10055         {
10056             // no need to add per-lane offsets
10057             pOffset3 = pOffset2;
10058         }
10059 
10060         {
10061             // address variable represents register a0
10062             CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
10063                 pIndexVar->IsUniform() ? 1 : numLanes(m_currShader->m_SIMDSize),
10064                 m_destination->GetType(),
10065                 pIndexVar->IsUniform(),
10066                 vector->IsUniform(),
10067                 m_destination->getName());
10068 
10069             // we add offsets to the base, which is the beginning of the vector variable
10070             m_encoder->AddrAdd(pDstArrElm, vector, pOffset3);
10071             m_encoder->Push();
10072 
10073             // finally, we move the indirectly addressed values to the destination register
10074             m_encoder->Copy(m_destination, pDstArrElm);
10075             m_encoder->Push();
10076         }
10077     }
10078 }
10079 
10080 void EmitPass::emitUAVSerialize()
10081 {
10082     m_encoder->Wait();
10083     m_encoder->Push();
10084 }
10085 
10086 
10087 void EmitPass::emitLoadRawIndexed(
10088     LdRawIntrinsic * inst, Value * varOffset, ConstantInt * immOffset)
10089 {
10090     Value* bufPtrv = inst->getResourceValue();
10091 
10092     ResourceDescriptor resource = GetResourceVariable(bufPtrv);
10093     m_currShader->isMessageTargetDataCacheDataPort = true;
10094     IGC_ASSERT(immOffset == nullptr);
10095     emitLoad3DInner(inst, resource, varOffset);
10096 }
10097 
10098 void EmitPass::emitLoad3DInner(LdRawIntrinsic* inst, ResourceDescriptor& resource, Value* elem_idxv)
10099 {
10100     IGC::e_predefSurface predDefSurface = resource.m_surfaceType;
10101     CVariable* gOffset = m_currShader->ImmToVariable(0x0, ISA_TYPE_UD);
10102 
10103     CVariable* src_offset = GetSymbol(elem_idxv);
10104 
10105     // We still collect the buffer type here to work around alignment problems with different messages.
10106     BufferType bufType = GetBufferType(inst->getOperand(0)->getType()->getPointerAddressSpace());
10107 
10108     // generate oword_load if it is uniform
10109     // otherwise, generate gather/gather4
10110     if (m_currShader->GetIsUniform(inst))
10111     {
10112         IGC_ASSERT_MESSAGE(predDefSurface != ESURFACE_STATELESS, "scratch cannot be uniform");
10113         Type* loadType = inst->getType();
10114         uint numElement = loadType->isVectorTy() ? (uint)cast<IGCLLVM::FixedVectorType>(loadType)->getNumElements() : 1;
10115         if (predDefSurface == ESURFACE_SLM)
10116         {
10117             IGC_ASSERT(numElement <= 4);
10118             uint numLane = (numElement == 3) ? 4 : numElement;
10119             // There is no oword-block read for SLM; also, we expect to load at most 4 dwords.
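            // 0x0C840 is a packed UV immediate holding the byte offsets
            // {0, 4, 8, 12}; adding it to the base address yields one gather
            // address per dword.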
10120             CVariable* imm = m_currShader->ImmToVariable(0x0C840, ISA_TYPE_UV);
10121             CVariable* srcTmp = m_currShader->GetNewVariable(
10122                 (uint16_t)numLane, ISA_TYPE_UD, m_currShader->getGRFAlignment(), true,
10123                 CName(src_offset->getName(), "Broadcast"));
10124             m_encoder->SetNoMask();
10125             m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numLane));
10126             m_encoder->Add(srcTmp, src_offset, imm);
10127             m_encoder->Push();
10128             CVariable* dstTmp = m_destination;
10129             if (numElement != numLane)
10130             {
10131                 dstTmp = m_currShader->GetNewVariable(
10132                     (uint16_t)numLane, ISA_TYPE_D, m_currShader->getGRFAlignment(), true,
10133                     CName::NONE);
10134             }
10135             m_encoder->SetNoMask();
10136             m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numLane));
10137             m_encoder->ByteGather(dstTmp, resource, srcTmp, 8, 4);
10138             m_encoder->Push();
10139 
10140             // generate an extract-element due to dst-size difference when numElement == 3
10141             // \todo, we should canonicalize <floatx3> to <floatx4> before code-gen to avoid this
10142             if (dstTmp != m_destination)
10143             {
10144                 for (uint i = 0; i < numElement; i++)
10145                 {
10146                     m_encoder->SetSrcSubReg(0, i);
10147                     m_encoder->SetDstSubReg(i);
10148                     m_encoder->SetSrcRegion(0, 0, 1, 0);
10149                     m_encoder->Copy(m_destination, dstTmp);
10150                     m_encoder->Push();
10151                 }
10152             }
10153         }
10154         else if (predDefSurface == ESURFACE_SCRATCH && m_currShader->m_Platform->hasScratchSurface() && inst->getAlignment() >= 4)
10155         {
10156             IGC_ASSERT(numElement <= 8);
10157             CVariable* tmpAddress = nullptr;
10158             if (numElement > 1)
10159             {
10160                 tmpAddress = m_currShader->GetNewVariable(numElement, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
10161                 m_encoder->SetNoMask();
10162                 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numElement));
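                // 0x76543210 is a packed V immediate with the lane indices
                // {0..7}; shifting left by 2 scales them to the dword byte
                // offsets {0, 4, 8, ..., 28}.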
10163                 m_encoder->Shl(tmpAddress, m_currShader->ImmToVariable(0x76543210, ISA_TYPE_V), m_currShader->ImmToVariable(2, ISA_TYPE_D));
10164                 m_encoder->Push();
10165                 m_encoder->SetNoMask();
10166                 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numElement));
10167                 m_encoder->Add(tmpAddress, tmpAddress, src_offset);
10168                 m_encoder->Push();
10169             }
10170             else
10171             {
10172                 tmpAddress = m_currShader->GetNewVariable(numElement, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
10173                 m_encoder->SetNoMask();
10174                 m_encoder->SetUniformSIMDSize(SIMDMode::SIMD1);
10175                 m_encoder->Copy(tmpAddress, src_offset);
10176                 m_encoder->Push();
10177             }
10178 
10179             bool needsTempDest = numElement < 4;
10180             CVariable* destination = m_destination;
10181             if (needsTempDest)
10182             {
10183                 uint elemSize = m_destination->GetElemSize();
10184                 destination = m_currShader->GetNewVariable(
10185                     numElement * SIZE_DWORD / elemSize, m_destination->GetType(),
10186                     EALIGN_GRF, m_destination->IsUniform(), CName::NONE);
10187             }
10188 
10189             m_encoder->SetNoMask();
10190             m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numElement));
10191             m_encoder->Gather4Scaled(destination, resource, tmpAddress);
10192             m_encoder->Push();
10193             if (needsTempDest)
10194             {
10195                 // generate an extract-element
10196                 for (uint i = 0; i < numElement; i++)
10197                 {
10198                     m_encoder->SetSrcSubReg(0, i);
10199                     m_encoder->SetDstSubReg(i);
10200                     m_encoder->SetSrcRegion(0, 0, 1, 0);
10201                     m_encoder->Copy(m_destination, destination);
10202                     m_encoder->Push();
10203                 }
10204             }
10205         }
10206         else
10207         {
10208             bool owordAligned = false;
10209             // need to clear lower two-bits for unaligned
10210             CVariable* visaOffset = nullptr;
10211             if (bufType == CONSTANT_BUFFER)
10212             {
10213                 visaOffset = src_offset;
10214             }
10215             else if (src_offset->IsImmediate())
10216             {
10217                 // clear lower-two-bits
10218                 visaOffset = m_currShader->ImmToVariable(src_offset->GetImmediateValue() & 0xfffffffc, ISA_TYPE_UD);
10219             }
10220             else
10221             {
10222                 // clear lower-two-bits
10223                 CVariable* masklast2bits = m_currShader->ImmToVariable(0xfffffffc, ISA_TYPE_UD);
10224                 visaOffset = m_currShader->GetNewVariable(
10225                     src_offset->GetNumberElement(),
10226                     ISA_TYPE_UD,
10227                     src_offset->GetAlign(),
10228                     src_offset->IsUniform(),
10229                     src_offset->getName());
10230                 m_encoder->And(visaOffset, m_currShader->BitCast(src_offset, ISA_TYPE_UD), masklast2bits);
10231                 m_encoder->Push();
10232             }
10233             if (numElement >= 4)
10234             {
10235                 m_encoder->OWLoad(m_destination, resource, visaOffset, owordAligned, m_destination->GetSize());
10236                 m_encoder->Push();
10237             }
10238             else
10239             {
10240                 IGC_ASSERT(GetPrimitiveTypeSizeInRegisterInBits(loadType) < SIZE_DWORD * 8 * 4);
10241                 uint elemSize = m_destination->GetElemSize();
10242 
10243                 if (elemSize > 0)
10244                 {
10245                     unsigned int alignment = inst->getAlignment();
10246                     if (alignment < SIZE_DWORD && !(src_offset->IsImmediate() && src_offset->GetImmediateValue() % SIZE_DWORD == 0))
10247                     {
10248                         IGC_ASSERT(alignment == 1 || alignment == 2);
10249                         IGC_ASSERT(src_offset->IsUniform());
10250                         uint numElements = m_destination->GetSize() / alignment;
10251                         VISA_Type realType = alignment == 1 ? ISA_TYPE_UB : ISA_TYPE_UW;
10252                         CVariable* tmp = m_currShader->GetNewVariable(
10253                             numElements * (SIZE_DWORD / alignment), realType, EALIGN_GRF, true, CName::NONE);
10254                         if (numElements > 1)
10255                         {
10256                             IGC_ASSERT(numElements <= 8);
10257                             CVariable* offsetVector = m_currShader->GetNewVariable(numElements, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
10258                             m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10259                             m_encoder->SetNoMask();
10260                             m_encoder->Add(offsetVector, src_offset, m_currShader->ImmToVariable(alignment * 0x76543210, ISA_TYPE_UV));
10261                             m_encoder->Push();
10262                             src_offset = offsetVector;
10263                         }
10264                         else if (src_offset->IsImmediate() || src_offset->GetAlign() != EALIGN_GRF)
10265                         {
10266                             IGC_ASSERT(numElements == 1);
10267                             CVariable* tmpSrcOffset = m_currShader->GetNewVariable(numElements, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
10268                             m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10269                             m_encoder->SetNoMask();
10270                             m_encoder->Cast(tmpSrcOffset, src_offset);
10271                             m_encoder->Push();
10272                             src_offset = tmpSrcOffset;
10273                         }
10274                         m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10275                         m_encoder->SetNoMask();
10276                         m_encoder->ByteGather(tmp, resource, src_offset, 8, alignment);
10277                         m_encoder->Push();
10278                         CVariable* dstWordAlias = m_currShader->GetNewAlias(m_destination, realType, 0, 0, false);
10279                         m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10280                         m_encoder->SetNoMask();
10281                         m_encoder->SetSrcRegion(0, SIZE_DWORD / alignment, 1, 0);
10282                         m_encoder->Copy(dstWordAlias, tmp);
10283                         m_encoder->Push();
10284                     }
10285                     else
10286                     {
10287                         CVariable* tmp = m_currShader->GetNewVariable(
10288                             4 * SIZE_DWORD / elemSize, m_destination->GetType(), EALIGN_GRF, m_destination->IsUniform(), CName::NONE);
10289                         m_encoder->OWLoad(tmp, resource, visaOffset, owordAligned, tmp->GetSize());
10290                         m_encoder->Push();
10291                         // generate an extract-element
10292                         for (uint i = 0; i < numElement; i++)
10293                         {
10294                             m_encoder->SetSrcSubReg(0, i);
10295                             m_encoder->SetDstSubReg(i);
10296                             m_encoder->SetSrcRegion(0, 0, 1, 0);
10297                             m_encoder->Copy(m_destination, tmp);
10298                             m_encoder->Push();
10299                         }
10300                     }
10301                 }
10302             }
10303         }
10304     }
10305     else
10306     {
10307         uint label = 0;
10308         CVariable* flag = nullptr;
10309         bool needLoop = ResourceLoopHeader(resource, flag, label);
10310         uint sizeInBits = GetPrimitiveTypeSizeInRegisterInBits(inst->getType());
10311         IGC_ASSERT_MESSAGE((sizeInBits == 8) || (sizeInBits == 16) || (sizeInBits == 32) || (sizeInBits == 64) || (sizeInBits == 96) || (sizeInBits == 128),
10312             "load type must be 1/2/4/8/12/16 bytes long");
10313         IGC::CVariable* visaOffset = BroadcastIfUniform(src_offset);
10314         unsigned int alignment = inst->getAlignment();
10315         if (sizeInBits == 32 && resource.m_surfaceType == ESURFACE_STATELESS &&
10316             m_currShader->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages)
10317         {
10318             // DWORD gather
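            // The workaround uses a dword-scaled gather, so convert the byte
            // offset into a dword index by shifting right by 2.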
10319             CVariable* shiftedPtr = m_currShader->GetNewVariable(visaOffset);
10320             m_encoder->Shr(shiftedPtr, visaOffset, m_currShader->ImmToVariable(2, ISA_TYPE_UD));
10321             m_encoder->Push();
10322             visaOffset = shiftedPtr;
10323             m_encoder->SetPredicate(flag);
10324             m_encoder->Gather(m_destination, resource.m_resource, visaOffset, gOffset, resource.m_surfaceType, 4);
10325             m_encoder->Push();
10326         }
10327         else if (sizeInBits == 32 && (bufType == CONSTANT_BUFFER || resource.m_surfaceType == ESURFACE_STATELESS || alignment < 4))
10328         {
10329             // UAV and resource accesses cannot be switched to this path due to alignment issues encountered in some tests.
10330             uint elementSize = 8;
10331             uint numElems = 4;
10332             m_encoder->SetPredicate(flag);
10333             m_encoder->ByteGather(m_destination, resource, visaOffset, elementSize, numElems);
10334             m_encoder->Push();
10335         }
10336         else if (sizeInBits >= 32)
10337         {
10338             // constant-buffer cannot go this way due to driver surface-state setting to RGBA-F32
10339             if (bufType == CONSTANT_BUFFER || bufType == BINDLESS_CONSTANT_BUFFER)
10340             {
10341                 IGC_ASSERT(!UsesTypedConstantBuffer(m_currShader->GetContext(), bufType));
10342             }
10343 
10344             m_encoder->SetPredicate(flag);
10345             m_encoder->Gather4ScaledNd(m_destination, resource, visaOffset, sizeInBits / 32);
10346             m_encoder->Push();
10347         }
10348         else if (sizeInBits == 8 || sizeInBits == 16)
10349         {
10350             uint elementSize = 8;
10351             uint numElems = sizeInBits / 8;
10352             uint hStride = 32 / sizeInBits;
10353             uint16_t vStride = numLanes(m_currShader->m_SIMDSize);
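            // Byte/word gathers return one dword per lane; the strided cast
            // below extracts the low 8/16 bits of each dword into the
            // destination.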
10354             CVariable* gatherDest = m_currShader->GetNewVariable(vStride, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
10355             m_encoder->SetPredicate(flag);
10356             m_encoder->ByteGather(gatherDest, resource, visaOffset, elementSize, numElems);
10357             m_encoder->Push();
10358 
10359             gatherDest = m_currShader->GetNewAlias(gatherDest, m_destination->GetType(), 0, 0);
10360             m_encoder->SetSrcRegion(0, vStride, vStride / hStride, hStride);
10361             m_encoder->Cast(m_destination, gatherDest);
10362             m_encoder->Push();
10363         }
10364         ResourceLoopBackEdge(needLoop, flag, label);
10365     }
10366 }
10367 
10368 void EmitPass::emitLoad(LoadInst* inst, Value* offset, ConstantInt* immOffset)
10369 {
10370     emitVectorLoad(inst, offset, immOffset);
10371 }
10372 
10373 void EmitPass::EmitNoModifier(llvm::Instruction* inst)
10374 {
10375     // This is a single instruction pattern emitter
10376     // Check if this inst has been turned into noop due to alias.
10377     // If so, no code shall be emitted for this instruction.
10378     if (m_currShader->HasBecomeNoop(inst))
10379     {
10380         return;
10381     }
10382 
10383     if (IGC_IS_FLAG_ENABLED(EnableDeSSAAlias) &&
10384         m_deSSA && m_deSSA->isNoopAliaser(inst))
10385     {
10386         return;
10387     }
10388 
10389     switch (inst->getOpcode())
10390     {
10391     case Instruction::Ret:
10392         emitReturn(cast<ReturnInst>(inst));
10393         break;
10394     case Instruction::Call:
10395         if (GenIntrinsicInst * I = dyn_cast<GenIntrinsicInst>(inst))
10396         {
10397             EmitGenIntrinsicMessage(I);
10398         }
10399         else if (IntrinsicInst * I = dyn_cast<IntrinsicInst>(inst))
10400         {
10401             EmitIntrinsicMessage(I);
10402         }
10403         else if (cast<CallInst>(inst)->isInlineAsm())
10404         {
10405             EmitInlineAsm(cast<CallInst>(inst));
10406         }
10407         else
10408         {
10409             emitCall(cast<CallInst>(inst));
10410         }
10411         break;
10412     case Instruction::Store:
10413         emitStore(cast<StoreInst>(inst),
10414             cast<StoreInst>(inst)->getPointerOperand(),
10415             nullptr);
10416         break;
10417     case Instruction::Load:
10418         emitLoad(
10419             cast<LoadInst>(inst),
10420             cast<LoadInst>(inst)->getPointerOperand(),
10421             nullptr);
10422         break;
10423     case Instruction::GetElementPtr:
10424         emitGEP(cast<GetElementPtrInst>(inst));
10425         break;
10426     case Instruction::BitCast:
10427         emitBitCast(cast<BitCastInst>(inst));
10428         break;
10429     case Instruction::PtrToInt:
10430         emitPtrToInt(cast<PtrToIntInst>(inst));
10431         break;
10432     case Instruction::IntToPtr:
10433         emitIntToPtr(cast<IntToPtrInst>(inst));
10434         break;
10435     case Instruction::AddrSpaceCast:
10436         emitAddrSpaceCast(cast<AddrSpaceCastInst>(inst));
10437         break;
10438     case Instruction::InsertElement:
10439         emitInsert(cast<InsertElementInst>(inst));
10440         break;
10441     case Instruction::ExtractElement:
10442         emitExtract(cast<ExtractElementInst>(inst));
10443         break;
10444     case Instruction::Unreachable:
10445         break;
10446     default:
10447         IGC_ASSERT_MESSAGE(0, "need to add code gen support for this instruction");
10448     }
10449 }
10450 
10451 void EmitPass::emitPairToPtr(GenIntrinsicInst* GII) {
10452     CVariable* Lo = GetSymbol(GII->getOperand(0));
10453     CVariable* Hi = GetSymbol(GII->getOperand(1));
10454 
10455     unsigned AS = GII->getType()->getPointerAddressSpace();
10456     if (m_currShader->GetContext()->getRegisterPointerSizeInBits(AS) == 32) {
10457         CVariable* Tmp = m_currShader->BitCast(Lo, GetUnsignedType(Lo->GetType()));
10458         m_encoder->Cast(m_destination, Tmp);
10459         m_encoder->Push();
10460         return;
10461     }
10462 
10463     IGC_ASSERT_MESSAGE(m_currShader->GetContext()->getRegisterPointerSizeInBits(AS) == 64,
10464         "Pointer size should be either 32 or 64!");
10465 
10466     CVariable* Dst32 = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
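    // View the 64-bit destination as dwords and write Lo/Hi with a stride of
    // 2, so each lane's pair lands in consecutive dwords: {Lo0, Hi0, Lo1, ...}.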
10467     // Lo
10468     m_encoder->SetDstRegion(2);
10469     m_encoder->Copy(Dst32, Lo);
10470     m_encoder->Push();
10471     // Hi
10472     m_encoder->SetDstRegion(2);
10473     m_encoder->SetDstSubReg(1);
10474     m_encoder->Copy(Dst32, Hi);
10475     m_encoder->Push();
10476 }
10477 
10478 void EmitPass::emitLLVMStackSave(llvm::IntrinsicInst* inst) {
10479     // save current SP
10480     CVariable* pSP = m_currShader->GetSP();
10481     m_encoder->Copy(m_destination, pSP);
10482     m_encoder->Push();
10483 }
10484 
10485 void EmitPass::emitLLVMStackRestore(llvm::IntrinsicInst* inst) {
10486     // restore the SP to arg(0)
10487     CVariable* pSP = m_currShader->GetSP();
10488     CVariable* savedSP = m_currShader->GetSymbol(inst->getOperand(0));
10489     // stacksave and stackrestore are forced to be uniform in WIAnalysis.
10490     // Here we still set a scalar region just in case.
10491     m_encoder->SetSrcRegion(0, 0, 1, 0);
10492     m_encoder->Copy(pSP, savedSP);
10493     m_encoder->Push();
10494 }
10495 
10496 void EmitPass::emitVLAStackAlloca(llvm::GenIntrinsicInst* intrinsic)
10497 {
10498     CVariable* pSP = m_currShader->GetSP();
10499     CVariable* lane_off = m_currShader->GetSymbol(intrinsic->getOperand(0));
10500     // m_destination = curr_SP + lane_offset
10501     emitAddPointer(m_destination, pSP, lane_off);
10502     m_encoder->Push();
10503 
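    // Bump SP only once: either there is a single instance, or the update is
    // done when emitting the second half.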
10504     if (m_currShader->m_numberInstance == 1 || m_encoder->IsSecondHalf()) {
10505         // SP = SP + vla_size * simdWidth
10506         CVariable* vla_size = m_currShader->GetSymbol(intrinsic->getOperand(1));
10507         // vla_size must be uniform; if it is not, the <0;1,0> region takes only lane 0.
10508         m_encoder->SetSrcRegion(0, 0, 1, 0);
10509         m_encoder->Mul(vla_size, vla_size,
10510             m_currShader->ImmToVariable(numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW));
10511         m_encoder->Push();
10512 
10513         m_encoder->SetSrcRegion(1, 0, 1, 0);
10514         emitAddPointer(pSP, pSP, vla_size);
10515         m_encoder->Push();
10516     }
10517 }
10518 
10519 void EmitPass::emitStackAlloca(GenIntrinsicInst* GII)
10520 {
10521     // Static private mem access is done through the FP
10522     CVariable* pFP = m_currShader->GetFP();
10523     if IGC_IS_FLAG_ENABLED(EnableWriteOldFPToStack)
10524     {
10525         // If we have written the previous FP to the current frame's start, the start of
10526         // private memory will be offset by 16 bytes
10527         CVariable* tempFP = m_currShader->GetNewVariable(pFP);
10528         emitAddPointer(tempFP, pFP, m_currShader->ImmToVariable(getFPOffset(), ISA_TYPE_UD));
10529         pFP = tempFP;
10530     }
10531     CVariable* pOffset = m_currShader->GetSymbol(GII->getOperand(0));
10532     emitAddPointer(m_destination, pFP, pOffset);
10533 }
10534 
10535 void EmitPass::emitCall(llvm::CallInst* inst)
10536 {
10537     llvm::Function* F = inst->getCalledFunction();
10538     if (!F || F->hasFnAttribute("referenced-indirectly") || (m_FGA && m_FGA->useStackCall(F)))
10539     {
10540         emitStackCall(inst);
10541         return;
10542     }
10543 
10544     IGC_ASSERT_MESSAGE(!F->empty(), "unexpanded builtin?");
10545 
10546     unsigned i = 0;
10547     for (auto& Arg : F->args())
10548     {
10549         // Skip unused arguments if any.
10550         if (Arg.use_empty())
10551         {
10552             ++i;
10553             continue;
10554         }
10555 
10556         CVariable* Dst = m_currShader->getOrCreateArgumentSymbol(&Arg, true);
10557         CVariable* Src = GetSymbol(inst->getArgOperand(i++));
10558 
10559         // When both symbols are the same, this argument passing has been
10560         // lifted to use a global vISA variable; just skip the copy.
10561         if (Dst != Src)
10562         {
10563             emitCopyAll(Dst, Src, Arg.getType());
10564         }
10565     }
10566     m_currFuncHasSubroutine = true;
10567     m_encoder->SubroutineCall(nullptr, F);
10568     m_encoder->Push();
10569 
10570     // Emit the return value if used.
10571     if (!inst->use_empty())
10572     {
10573         CVariable* Dst = GetSymbol(inst);
10574         CVariable* Src = m_currShader->getOrCreateReturnSymbol(F);
10575         emitCopyAll(Dst, Src, inst->getType());
10576     }
10577 }
10578 
10579 void EmitPass::emitReturn(llvm::ReturnInst* inst)
10580 {
10581     llvm::Function* F = inst->getParent()->getParent();
10582     MetaDataUtils* pMdUtils = m_currShader->GetMetaDataUtils();
10583 
10584     // return from a function (not a kernel)
10585     if (!isEntryFunc(pMdUtils, F))
10586     {
10587         if (m_FGA && m_FGA->useStackCall(F))
10588         {
10589             emitStackFuncExit(inst);
10590             return;
10591         }
10592 
10593         llvm::Type* RetTy = F->getReturnType();
10594         if (!RetTy->isVoidTy())
10595         {
10596             CVariable* Dst = m_currShader->getOrCreateReturnSymbol(F);
10597             CVariable* Src = GetSymbol(inst->getReturnValue());
10598             emitCopyAll(Dst, Src, RetTy);
10599         }
10600 
10601         m_encoder->SubroutineRet(nullptr, F);
10602         m_encoder->Push();
10603         return;
10604     }
10605 
10606     if (m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
10607     {
10608         CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
10609         unsigned nRTWrites = int_cast<unsigned>(psProgram->rtWriteList.size());
10610 
10611         for (unsigned i = 0; i < nRTWrites; i++)
10612         {
10613             GenIntrinsicInst* inst;
10614             bool isSecondHalf;
10615 
10616             inst = cast<GenIntrinsicInst>(psProgram->rtWriteList[i].first);
10617             isSecondHalf = psProgram->rtWriteList[i].second;
10618             m_encoder->SetSecondHalf(isSecondHalf);
10619 
10620             switch (inst->getIntrinsicID())
10621             {
10622             case GenISAIntrinsic::GenISA_RTWrite:
10623                 emitRenderTargetWrite(cast<RTWritIntrinsic>(inst), true);
10624                 break;
10625             case GenISAIntrinsic::GenISA_RTDualBlendSource:
10626                 emitDualBlendRT(cast<RTDualBlendSourceIntrinsic>(inst), true);
10627                 break;
10628             default:
10629                 IGC_ASSERT_MESSAGE(0, "unknown intrinsic");
10630                 break;
10631             }
10632         }
10633         // restore encoder's second half flag.
10634         if (psProgram->m_numberInstance == 2)
10635         {
10636             m_encoder->SetSecondHalf(false);
10637         }
10638 
10639         // check to make sure we will have EOT
10640         IGC_ASSERT(psProgram->m_hasEOT || psProgram->GetPhase() != PSPHASE_LEGACY);
10641     }
10642 
10643     m_currShader->AddEpilogue(inst);
10644 }
10645 
10646 /// Initializes the kernel for stack call by initializing the SP and FP
10647 void EmitPass::InitializeKernelStack(Function* pKernel)
10648 {
10649     m_currShader->InitializeStackVariables();
10650     auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
10651     auto pModuleMetadata = pCtx->getModuleMetaData();
10652 
10653     CVariable* pStackBufferBase = m_currShader->GetPrivateBase();
10654 
10655     CVariable* pHWTID = m_currShader->GetHWTID();
10656 
10657     CVariable* pSize = nullptr;
10658 
10659     uint32_t MaxPrivateSize = pModuleMetadata->FuncMD[pKernel].privateMemoryPerWI;
10660     FunctionGroup* FG = m_FGA ? m_FGA->getGroup(pKernel) : nullptr;
10661     if (FG)
10662     {
10663         // Get the max PrivateMem used in the FG, which is set by
10664         // PrivateMemoryResolution.cpp after analyzing the call depth
10665         MaxPrivateSize = FG->getMaxPrivateMemOnStack();
10666 
10667         // If there are indirect calls or recursions, we no longer
10668         // know the call depth, so just add 4KB and hope we don't overflow.
10669         if (FG->hasIndirectCall() || FG->hasRecursion())
10670             MaxPrivateSize += (4 * 1024);
10671         // Add another 1KB for VLA
10672         if (FG->hasVariableLengthAlloca())
10673             MaxPrivateSize += 1024;
10674     }
10675 
10676     if (IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
10677     {
10678         // Experimental: Patch private memory size
10679         std::string patchName = "INTEL_PATCH_PRIVATE_MEMORY_SIZE";
10680         pSize = m_currShader->GetNewVariable(1, ISA_TYPE_UD, CVariable::getAlignment(getGRFSize()), true, CName(patchName));
10681         m_encoder->AddVISASymbol(patchName, pSize);
10682     }
10683     else
10684     {
10685         // hard-code per-workitem private-memory size to max size
10686         pSize = m_currShader->ImmToVariable(MaxPrivateSize * numLanes(m_currShader->m_dispatchSize), ISA_TYPE_UD);
10687     }
10688 
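    // Each hardware thread gets its own slice of the stack buffer:
    //   threadOffset = HWTID * per-thread private size.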
10689     CVariable* pThreadOffset = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, CName::NONE);
10690     m_encoder->Mul(pThreadOffset, pHWTID, pSize);
10691     m_encoder->Push();
10692 
10693     unsigned totalAllocaSize = 0;
10694 
10695     // reserve space for alloca
10696     auto funcMDItr = pModuleMetadata->FuncMD.find(pKernel);
10697     if (funcMDItr != pModuleMetadata->FuncMD.end() && funcMDItr->second.privateMemoryPerWI != 0)
10698     {
10699         totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
10700     }
10701 
10702     if (IGC_IS_FLAG_DISABLED(EnableRuntimeFuncAttributePatching))
10703     {
10704         // If we don't return per-function private memory size,
10705         // modify private-memory size to a large setting.
10706         // This will be reported through patch-tokens as per-kernel requirement.
10707         pModuleMetadata->FuncMD[pKernel].privateMemoryPerWI = MaxPrivateSize;
10708     }
10709 
10710     // Initialize SP to per-thread kernel stack base
10711     CVariable* pSP = m_currShader->GetSP();
10712     emitAddPointer(pSP, pStackBufferBase, pThreadOffset);
10713 
10714     // Push a new stack frame
10715     emitPushFrameToStack(totalAllocaSize);
10716 
10717     // Set the total alloca size for the entry function
10718     m_encoder->SetFunctionAllocaStackSize(pKernel, totalAllocaSize);
10719 }
10720 
10721 // Either do a block load or store to the stack-pointer given a vector of function arguments
10722 uint EmitPass::emitStackArgumentLoadOrStore(std::vector<CVariable*>& Args, bool isWrite)
10723 {
10724     uint32_t offsetS = 0;
10725     SmallVector<std::tuple<CVariable*, uint32_t, uint32_t, uint32_t>, 8> dataBlks;
10726     for (auto Arg : Args)
10727     {
10728         // stack offset is always oword-aligned
10729         offsetS = int_cast<unsigned>(llvm::alignTo(offsetS, SIZE_OWORD));
10730 
10731         // calculate block sizes for each arg
10732         int32_t RmnBytes = Arg->GetSize();
10733         uint32_t ArgOffset = 0;
10734         do
10735         {
10736             uint32_t BlkSize = 0;
10737             {
10738                 BlkSize = getBlockMsgSize(RmnBytes, m_currShader->m_Platform->getMaxBlockMsgSize(false));
10739             }
10740             dataBlks.push_back(std::make_tuple(Arg, offsetS, BlkSize, ArgOffset));
10741 
10742             offsetS += BlkSize;
10743             ArgOffset += BlkSize;
10744             RmnBytes -= BlkSize;
10745         } while (RmnBytes > 0);
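        // A sketch of the splitting, assuming getBlockMsgSize returns the
        // largest legal block not exceeding the remaining bytes: a 48-byte
        // argument with a 32-byte max block is split into 32 + 16 bytes.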
10746     }
10747 
10748     if (offsetS > 0)
10749     {
10750         // Get current SP
10751         CVariable* pSP = m_currShader->GetSP();
10752         if (isWrite)
10753         {
10754             // If storing to stack, first push SP by total store bytes
10755             CVariable* pPushSize = m_currShader->ImmToVariable(offsetS, ISA_TYPE_UD);
10756             emitAddPointer(pSP, pSP, pPushSize);
10757         }
10758 
10759         // Load or store each OWORD block to stack
10760         for (auto& I : dataBlks)
10761         {
10762             CVariable* Arg = std::get<0>(I);
10763             uint32_t StackOffset = std::get<1>(I);
10764             uint32_t BlkSize = std::get<2>(I);
10765             uint32_t ArgOffset = std::get<3>(I);
10766             // spOffset is a negative offset from SP
10767             int32_t spOffset = StackOffset - offsetS;
10768 
10769             if (isWrite)  // Write args to stack
10770             {
10771                 {
10772                     // SP offset for each block
10773                     CVariable* pTempSP = m_currShader->GetNewVariable(pSP);
10774                     emitAddPointer(pTempSP, pSP, m_currShader->ImmToVariable(spOffset, ISA_TYPE_D));
10775 
10776                     m_encoder->OWStoreA64(Arg, pTempSP, BlkSize, ArgOffset);
10777                     m_encoder->Push();
10778                 }
10779             }
10780             else  // Read args from stack
10781             {
10782                 CVariable* LdDst = Arg;
10783                 if (Arg->GetType() == ISA_TYPE_BOOL)
10784                 {
10785                     LdDst = m_currShader->GetNewVariable(numLanes(m_currShader->m_dispatchSize), ISA_TYPE_W, EALIGN_HWORD, false, 1, CName::NONE);
10786                 }
10787 
10788                 int RmnBytes = LdDst->GetSize() - ArgOffset;
10789                 bool needRmCopy = BlkSize == SIZE_OWORD && RmnBytes > 0 && RmnBytes < SIZE_OWORD;
10790                 {
10791                     // SP offset for each block
10792                     CVariable* pTempSP = m_currShader->GetNewVariable(pSP);
10793                     emitAddPointer(pTempSP, pSP, m_currShader->ImmToVariable(spOffset, ISA_TYPE_D));
10794 
10795                     if (!needRmCopy)
10796                     {
10797                         m_encoder->OWLoadA64(LdDst, pTempSP, BlkSize, ArgOffset);
10798                         m_encoder->Push();
10799                     }
10800                     else
10801                     {
10802                         // Reading less than one oword, read one oword, then copy
10803                         uint ldDstElemSize = LdDst->GetElemSize();
10804                         if (ldDstElemSize > 0)
10805                         {
10806                             CVariable* pTempDst = m_currShader->GetNewVariable(SIZE_OWORD / ldDstElemSize, LdDst->GetType(), m_currShader->getGRFAlignment(), true, 1, CName::NONE);
10807                             m_encoder->OWLoadA64(pTempDst, pTempSP, SIZE_OWORD);
10808                             m_encoder->Push();
10809                             emitVectorCopy(LdDst, pTempDst, RmnBytes / ldDstElemSize, ArgOffset, 0);
10810                         }
10811                     }
10812                 }
10813                 if (LdDst != Arg)
10814                 {
10815                     // only happens to bool
10816                     IGC_ASSERT(Arg->GetType() == ISA_TYPE_BOOL);
10817                     m_encoder->Cmp(EPREDICATE_NE, Arg, LdDst, m_currShader->ImmToVariable(0, LdDst->GetType()));
10818                 }
10819             }
10820         }
10821     }
10822     return offsetS;
10823 }
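// Illustrative sketch (comments only): how one argument's bytes are split into
// oword-aligned block messages by the loop above. 'blockSizeFor' stands in for
// getBlockMsgSize() and is hypothetical; sizes are examples.
//
//   uint32_t offsetS = 0;
//   for (uint32_t argBytes : { 20u, 64u })                     // two example args
//   {
//       offsetS = llvm::alignTo(offsetS, SIZE_OWORD);          // 16-byte alignment
//       int32_t remaining = argBytes;
//       uint32_t argOffset = 0;
//       while (remaining > 0)
//       {
//           uint32_t blk = blockSizeFor(remaining, maxBlockMsgSize); // an oword multiple
//           recordBlock(offsetS, blk, argOffset);               // -> OWStoreA64 / OWLoadA64
//           offsetS += blk; argOffset += blk; remaining -= blk;
//       }
//   }
//   // On the store path SP is first bumped by the total, and each block is then
//   // written at a negative offset (StackOffset - offsetS) from the new SP.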
10824 
10825 void EmitPass::emitStackCall(llvm::CallInst* inst)
10826 {
10827     llvm::Function* F = inst->getCalledFunction();
10828 
10829     bool isIndirectFCall = !F || F->hasFnAttribute("referenced-indirectly");
10830     bool isInvokeSIMDTarget = F && F->hasFnAttribute("invoke_simd_target");
10831     CVariable* ArgBlkVar = m_currShader->GetARGV();
10832     uint32_t offsetA = 0;  // visa argument offset
10833     uint32_t offsetS = 0;  // visa stack offset
10834     std::vector<CVariable*> argsOnStack;
10835     SmallVector<std::tuple<CVariable*, Type*, uint32_t>, 8> argsOnRegister;
10836 
10837     for (uint32_t i = 0; i < inst->getNumArgOperands(); i++)
10838     {
10839         Value* operand = inst->getArgOperand(i);
10840         CVariable* Src = GetSymbol(operand);
10841         Type* argType = operand->getType();
10842 
10843         if (!isIndirectFCall)
10844         {
10845             // Skip unused arguments if any for direct call
10846             auto argIter = F->arg_begin();
10847             std::advance(argIter, i);
10848             if (argIter->use_empty()) continue;
10849         }
10850 
10851         if (Src->GetType() == ISA_TYPE_BOOL)
10852         {
10853             // bool args are treated as a vector of WORDs
10854             uint nElts = numLanes(m_currShader->m_dispatchSize);
10855             CVariable* ReplaceArg = m_currShader->GetNewVariable(
10856                 nElts,
10857                 ISA_TYPE_W,
10858                 EALIGN_HWORD, false, 1,
10859                 CName::NONE);
10860             CVariable* one = m_currShader->ImmToVariable(1, ISA_TYPE_W);
10861             CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_W);
10862             m_encoder->Select(Src, ReplaceArg, one, zero);
10863 
10864             argType = IntegerType::getInt16Ty(inst->getContext());
10865             Src = ReplaceArg;
10866         }
10867 
10868         // adjust offset for alignment
10869         uint align = getGRFSize();
10870         offsetA = int_cast<unsigned>(llvm::alignTo(offsetA, align));
10871         // check if an argument can be written to ARGV based upon offset + arg-size
10872         unsigned argSize = Src->GetSize();
10873         if (Src->IsUniform())
10874         {
10875             argSize = Src->GetSize() * numLanes(m_currShader->m_dispatchSize);
10876         }
10877         bool overflow = ((offsetA + argSize) > ArgBlkVar->GetSize());
10878         if (!overflow)
10879         {
10880             argsOnRegister.push_back(std::make_tuple(Src, argType, offsetA));
10881             offsetA += argSize;
10882         }
10883         else
10884         {
10885             // Vectorize, then push to stack
10886             if (Src->IsUniform())
10887             {
10888                 uint16_t nElts = (uint16_t)m_currShader->GetNumElts(argType, false);
10889                 CVariable* SrcVec = m_currShader->GetNewVariable(nElts, Src->GetType(), m_currShader->getGRFAlignment(), false, Src->getName());
10890                 emitCopyAll(SrcVec, Src, argType);
10891                 Src = SrcVec;
10892             }
10893             argsOnStack.push_back(Src);
10894         }
10895     }
10896     // Write all arguments that do not fit in the ARG registers to the stack
10897     offsetS = emitStackArgumentLoadOrStore(argsOnStack, true);
10898 
10899     uint retSize = 0;
10900     if (!inst->use_empty())
10901     {
10902         CVariable* Dst = GetSymbol(inst);
10903         if (Dst->GetType() == ISA_TYPE_BOOL)
10904         {
10905             retSize = numLanes(m_currShader->m_dispatchSize) * SIZE_WORD;
10906         }
10907         else
10908         {
10909             retSize = Dst->GetSize();
10910         }
10911         CVariable* Src = m_currShader->GetRETV();
10912         IGC_ASSERT_MESSAGE(retSize <= Src->GetSize(), "No support for return on stack!");
10913     }
10914 
10915     unsigned char argSizeInGRF = (offsetA + getGRFSize() - 1) / getGRFSize();
10916     unsigned char retSizeInGRF = (retSize + getGRFSize() - 1) / getGRFSize();
10917 
10918     // lambda to copy arguments to the arg register block
10919     auto CopyArgBlkVariables = [&](void)->void
10920     {
10921         for (auto& I : argsOnRegister)
10922         {
10923             CVariable * Src = std::get<0>(I);
10924             Type* argType = std::get<1>(I);
10925             uint32_t offset = std::get<2>(I);
10926 
10927             uint16_t nElts = (uint16_t)m_currShader->GetNumElts(argType, false);
10928             CVariable* Dst = m_currShader->GetNewAlias(ArgBlkVar, m_currShader->GetType(argType), offset, nElts, false);
10929             emitCopyAll(Dst, Src, argType);
10930         }
10931     };
10932 
10933     // lambda to read the return value
10934     auto CopyReturnValue = [this](CallInst* inst)->void
10935     {
10936         // No need to copy if there are no uses
10937         if (inst->use_empty())
10938             return;
10939 
10940         CVariable* Dst = GetSymbol(inst);
10941         CVariable* Src = m_currShader->GetRETV();
10942         if (Dst->GetType() == ISA_TYPE_BOOL)
10943         {
10944             CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
10945             m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
10946         }
10947         else
10948         {
10949             IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
10950             if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
10951             {
10952                 Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
10953             }
10954             emitCopyAll(Dst, Src, inst->getType());
10955         }
10956     };
10957 
10958     CVariable* funcAddr = GetSymbol(IGCLLVM::getCalledValue(inst));
10959     if (!isIndirectFCall || isInvokeSIMDTarget)
10960     {
10961         CopyArgBlkVariables();
10962         m_encoder->StackCall(nullptr, F, argSizeInGRF, retSizeInGRF);
10963         m_encoder->Push();
10964         CopyReturnValue(inst);
10965     }
10966     else
10967     {
10968         if (funcAddr->IsUniform() || IGC_IS_FLAG_ENABLED(AssumeUniformIndirectCall))
10969         {
10970             CopyArgBlkVariables();
10971             funcAddr = TruncatePointer(funcAddr);
10972             m_encoder->IndirectStackCall(nullptr, funcAddr, argSizeInGRF, retSizeInGRF);
10973             m_encoder->Push();
10974             CopyReturnValue(inst);
10975         }
10976         else
10977         {
10978             // If the call target is not uniform, loop and make one uniform call per unique target address
10979             // First get the execution mask for active lanes
10980             CVariable* eMask = GetExecutionMask();
10981             // Create a label for the loop
10982             uint label = m_encoder->GetNewLabelID("non_unif_call_body");
10983             m_encoder->Label(label);
10984             m_encoder->Push();
10985 
10986             // Get the first active lane's function address
10987             CVariable* offset = nullptr;
10988             funcAddr = TruncatePointer(funcAddr);
10989             CVariable* uniformAddr = UniformCopy(funcAddr, offset, eMask);
10990             // Set the predicate to true for all lanes with the same address
10991             CVariable* callPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
10992             m_encoder->Cmp(EPREDICATE_EQ, callPred, uniformAddr, funcAddr);
10993             m_encoder->Push();
10994 
10995             uint callLabel = m_encoder->GetNewLabelID("non_unif_call_end");
10996             m_encoder->SetInversePredicate(true);
10997             m_encoder->Jump(callPred, callLabel);
10998             m_encoder->Push();
10999 
11000             // Copy args to ArgBlk on each iteration of the loop, such that arg registers
11001             // won't be corrupted by previous iterations.
11002             CopyArgBlkVariables();
11003 
11004             // Indirect call for all lanes set by the flag
11005             m_encoder->IndirectStackCall(nullptr, uniformAddr, argSizeInGRF, retSizeInGRF);
11006             m_encoder->Copy(eMask, eMask);
11007             m_encoder->Push();
11008 
11009             // For non-uniform call, copy the ret inside this loop so that it'll honor the loop mask
11010             CopyReturnValue(inst);
11011 
11012             // Label for lanes that skipped the call
11013             m_encoder->Label(callLabel);
11014             m_encoder->Push();
11015 
11016             // Unset the bits in execution mask for lanes that were called
11017             CVariable* callMask = m_currShader->GetNewVariable(1, eMask->GetType(), eMask->GetAlign(), true, CName::NONE);
11018             CVariable* loopPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
11019             m_encoder->Cast(callMask, callPred);
11020             m_encoder->Not(callMask, callMask);
11021             m_encoder->And(eMask, eMask, callMask);
11022             m_encoder->Push();
11023             m_encoder->SetP(loopPred, eMask);
11024             m_encoder->Push();
11025 
11026             // Loop while there are bits still left in the mask
11027             m_encoder->Jump(loopPred, label);
11028             m_encoder->Push();
11029         }
11030     }
11031 
11032     if (offsetS > 0)
11033     {
11034         // Set the max stack size pushed in the parent function for this call's args
11035         m_encoder->SetFunctionMaxArgumentStackSize(inst->getParent()->getParent(), offsetS);
11036 
11037         //  pop stack pointer after the call
11038         CVariable* pSP = m_currShader->GetSP();
11039         CVariable* pPopSize = m_currShader->ImmToVariable((uint64_t)(~offsetS + 1), ISA_TYPE_D);
11040         emitAddPointer(pSP, pSP, pPopSize);
11041     }
11042 }
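// Illustrative sketch (comments only) of the non-uniform indirect-call loop emitted
// above, written as scalar pseudo-C++ over a lane mask. Helper names are hypothetical.
//
//   uint32_t eMask = activeLaneMask();                 // GetExecutionMask()
//   while (eMask != 0)
//   {
//       uint32_t lane    = firstSetBit(eMask);         // UniformCopy via fbl
//       uint64_t target  = perLaneAddr[lane];          // first active lane's address
//       uint32_t pred    = lanesEqual(perLaneAddr, target) & eMask;
//       copyArgsToArgBlk();                            // redone every iteration
//       indirectStackCall(target, pred);               // one uniform call per unique target
//       copyReturnValue(pred);                         // inside the loop, honoring the mask
//       eMask &= ~pred;                                // retire the lanes just serviced
//   }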
11043 
11044 static inline bool isFuncSRetArg(Argument* arg)
11045 {
11046     Function * F = arg->getParent();
11047     return (arg == F->arg_begin() &&
11048         arg != F->arg_end() &&
11049         arg->hasStructRetAttr() &&
11050         F->getReturnType()->isVoidTy());
11051 }
11052 
11053 void EmitPass::emitStackFuncEntry(Function* F)
11054 {
11055     m_encoder->SetDispatchSimdSize();
11056     m_currShader->InitializeStackVariables();
11057 
11058     if (F->hasFnAttribute("referenced-indirectly"))
11059     {
11060         m_encoder->SetExternFunctionFlag();
11061     }
11062 
11063     CVariable* ArgBlkVar = m_currShader->GetARGV();
11064     uint32_t offsetA = 0;  // visa argument offset
11065     uint32_t offsetS = 0;  // visa stack offset
11066     std::vector<CVariable*> argsOnStack;
11067     for (auto& Arg : F->args())
11068     {
11069         if (!F->hasFnAttribute("referenced-indirectly"))
11070         {
11071             // Skip unused arguments if any for direct call
11072             if (Arg.use_empty()) continue;
11073         }
11074 
11075         // adjust offset for alignment
11076         CVariable* Dst = m_currShader->getOrCreateArgumentSymbol(&Arg, false, true);
11077         uint align = getGRFSize();
11078         offsetA = int_cast<unsigned>(llvm::alignTo(offsetA, align));
11079         uint argSize = Dst->GetSize();
11080         if (Dst->GetType() == ISA_TYPE_BOOL)
11081         {
11082             argSize = numLanes(m_currShader->m_dispatchSize) * SIZE_WORD;
11083         }
11084         // check if an argument can be written to ARGV based upon offset + arg-size
11085         bool overflow = ((offsetA + argSize) > ArgBlkVar->GetSize());
11086         if (!overflow)
11087         {
11088             if (!Arg.use_empty())
11089             {
11090                 CVariable* Src = ArgBlkVar;
11091                 if (Dst->GetType() == ISA_TYPE_BOOL)
11092                 {
11093                     Src = m_currShader->GetNewAlias(ArgBlkVar, ISA_TYPE_W, (uint16_t)offsetA, numLanes(m_currShader->m_dispatchSize), false);
11094                     m_encoder->Cmp(EPREDICATE_NE, Dst, Src, m_currShader->ImmToVariable(0, ISA_TYPE_W));
11095                 }
11096                 else if (m_FGA->isLeafFunc(F))
11097                 {
11098                     // Directly map the dst register to an alias of ArgBlkVar, and update symbol mapping for future uses
11099                     Dst = m_currShader->GetNewAlias(ArgBlkVar, Dst->GetType(), (uint16_t)offsetA, Dst->GetNumberElement(), Dst->IsUniform());
11100                     m_currShader->UpdateSymbolMap(&Arg, Dst);
11101                 }
11102                 else
11103                 {
11104                     // For calls not guaranteed to preserve the ARG register, we copy it first to a temp
11105                     if (Src->GetType() != Dst->GetType() || offsetA != 0 || Src->IsUniform() != Dst->IsUniform())
11106                     {
11107                         Src = m_currShader->GetNewAlias(ArgBlkVar, Dst->GetType(), (uint16_t)offsetA, Dst->GetNumberElement(), Dst->IsUniform());
11108                     }
11109                     emitCopyAll(Dst, Src, Arg.getType());
11110                 }
11111             }
11112             offsetA += argSize;
11113         }
11114         else
11115         {
11116             argsOnStack.push_back(Dst);
11117         }
11118 
11119         // Get the symbol for arg0 if it has the "sret" attribute and save it.
11120         if (isFuncSRetArg(&Arg)) m_currShader->SaveSRet(Dst);
11121     }
11122     m_encoder->SetStackFunctionArgSize((offsetA + getGRFSize() - 1) / getGRFSize());
11123 
11124     // Read all stack-pushed args back into registers
11125     offsetS = emitStackArgumentLoadOrStore(argsOnStack, false);
11126 
11127     unsigned totalAllocaSize = 0;
11128 
11129     // reserve space for all the alloca in the function subgroup
11130     auto funcMDItr = m_currShader->m_ModuleMetadata->FuncMD.find(F);
11131     if (funcMDItr != m_currShader->m_ModuleMetadata->FuncMD.end() && funcMDItr->second.privateMemoryPerWI != 0)
11132     {
11133         totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
11134     }
11135 
11136     // save FP before allocation
11137     m_currShader->SaveStackState();
11138 
11139     // Push a new stack frame
11140     emitPushFrameToStack(totalAllocaSize);
11141 
11142     // Set the per-function private mem size
11143     m_encoder->SetFunctionAllocaStackSize(F, totalAllocaSize);
11144 }
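// Illustrative sketch (comments only): the argument-layout agreement used by the
// caller (emitStackCall) and the callee entry above. Sizes and helpers are hypothetical.
//
//   uint32_t offsetA = 0;
//   for (auto& a : args)
//   {
//       offsetA = llvm::alignTo(offsetA, grfSize);     // each arg starts GRF-aligned
//       uint32_t sz = byteSize(a);                     // bools: simdLanes * sizeof(WORD)
//       if (offsetA + sz <= argvSize) { placeInArgBlk(a, offsetA); offsetA += sz; }
//       else                          { spillToStack(a); }  // read back via OWLoadA64
//   }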
11145 
11146 void EmitPass::emitStackFuncExit(llvm::ReturnInst* inst)
11147 {
11148     // restore SP and FP
11149     m_currShader->RestoreStackState();
11150 
11151     llvm::Function* F = inst->getParent()->getParent();
11152     llvm::Type* RetTy = F->getReturnType();
11153     CVariable* Dst = m_currShader->GetRETV();
11154     if (!RetTy->isVoidTy())
11155     {
11156         unsigned RetSize = 0;
11157         unsigned nLanes = numLanes(m_currShader->m_dispatchSize);
11158         CVariable* Src = GetSymbol(inst->getReturnValue());
11159 
11160         if (Src->GetType() == ISA_TYPE_BOOL)
11161         {
11162             CVariable* one = m_currShader->ImmToVariable(1, ISA_TYPE_W);
11163             CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_W);
11164             CVariable* DstAlias = m_currShader->GetNewAlias(Dst, ISA_TYPE_W, 0, nLanes, false);
11165             m_encoder->Select(Src, DstAlias, one, zero);
11166             RetSize = nLanes * SIZE_WORD;
11167         }
11168         else
11169         {
11170             bool isSrcUniform = Src->IsUniform();
11171             RetSize = isSrcUniform ? nLanes * Src->GetSize() : Src->GetSize();
11172             IGC_ASSERT_MESSAGE(RetSize <= Dst->GetSize(), "No support for return on stack!");
11173 
11174             if (Dst->GetType() != Src->GetType() || Dst->IsUniform() != Src->IsUniform())
11175             {
11176                 unsigned elements = isSrcUniform ? Src->GetNumberElement() * nLanes : Src->GetNumberElement();
11177                 Dst = m_currShader->GetNewAlias(Dst, Src->GetType(), 0, elements, false);
11178             }
11179             emitCopyAll(Dst, Src, RetTy);
11180         }
11181         m_encoder->SetStackFunctionRetSize((RetSize + getGRFSize() - 1) / getGRFSize());
11182     }
11183     else
11184     {
11185         // Based on other architectures' ABIs, the sret argument is guaranteed to be written to the return register upon function exit.
11186         // vISA ABI states that the return and argument registers start at the same location. If the function is non-void, %retVal
11187         // starts at r26. Otherwise, %arg0 will start at r26.
11188         // Here we write the saved arg0 value back into arg0. Since arg0 has the "sret" attribute, the function is guaranteed to be void,
11189         // thus writing to %arg0 is the same as writing to %retval.
11190         // We still set the retSize to 0 to match the LLVM IR function signature, so we avoid writing to vISA's return reg directly.
11191         // Note: For leaf functions, we don't need to copy since we are guaranteed that %arg0 will not be overwritten.
11192         CVariable* sretPtr = m_currShader->GetAndResetSRet();
11193         if (sretPtr && isFuncSRetArg(F->arg_begin()) && !m_FGA->isLeafFunc(F))
11194         {
11195             // If the sret value is saved, copy it back into arg0
11196             CVariable* ArgBlk = m_currShader->GetARGV();
11197             CVariable* Arg0 = m_currShader->GetNewAlias(ArgBlk, sretPtr->GetType(), 0, sretPtr->GetNumberElement(), sretPtr->IsUniform());
11198             m_encoder->Copy(Arg0, sretPtr);
11199             m_encoder->Push();
11200         }
11201         m_encoder->SetStackFunctionRetSize(0);
11202     }
11203     // emit return
11204     m_encoder->StackRet(nullptr);
11205     m_encoder->Push();
11206 }
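// Illustrative sketch (comments only): the sret convention handled above. A function
// returning a struct by value is lowered so the return storage becomes a hidden first
// argument (hypothetical IR):
//
//   ; struct S foo();   becomes
//   define void @foo(%struct.S* sret %out) { ... }
//
// Because %retval and %arg0 start at the same vISA register, copying the saved sret
// pointer back into %arg0 on exit makes it visible to the caller as if it were returned.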
11207 
11208 void EmitPass::emitSymbolRelocation(Function& F)
11209 {
11210     Module* pModule = F.getParent();
11211 
11212     SmallSet<Function*, 16> funcAddrSymbols;
11213     SmallSet<GlobalVariable*, 16> globalAddrSymbols;
11214 
11215     ModuleMetaData* moduleMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
11216 
11217     std::function<void(Value*)> usedValues =
11218         [&usedValues, &funcAddrSymbols, &globalAddrSymbols, moduleMD, pModule]
11219     (Value* v)
11220     {
11221         if (Function* pFunc = dyn_cast<Function>(v))
11222         {
11223             if (pModule == pFunc->getParent() &&
11224                 pFunc->hasFnAttribute("referenced-indirectly"))
11225                 funcAddrSymbols.insert(pFunc);
11226         }
11227         else if (GlobalVariable* pGlobal = dyn_cast<GlobalVariable>(v))
11228         {
11229             if (pModule == pGlobal->getParent() &&
11230                 moduleMD->inlineProgramScopeOffsets.count(pGlobal) > 0)
11231                 globalAddrSymbols.insert(pGlobal);
11232         }
11233         else if (Constant* C = dyn_cast<Constant>(v))
11234         {
11235             for (auto it = C->value_op_begin(), end = C->value_op_end(); it != end; it++)
11236                 usedValues(*it);
11237         }
11238     };
11239 
11240     for (auto&& BB : F)
11241     {
11242         for (auto& I : BB)
11243         {
11244             for (auto it = I.value_op_begin(), end = I.value_op_end(); it != end; it++)
11245                 usedValues(*it);
11246         }
11247     }
11248 
11249     for (auto pFunc : funcAddrSymbols)
11250     {
11251         m_currShader->CreateFunctionSymbol(pFunc);
11252     }
11253 
11254     for (auto pGlobal : globalAddrSymbols)
11255     {
11256         m_currShader->CreateGlobalSymbol(pGlobal);
11257     }
11258 }
11259 
11260 void EmitPass::emitStoreRawIndexed(
11261     StoreRawIntrinsic* inst, Value* varOffset, ConstantInt* immOffset)
11262 {
11263     Value* pBufPtr = inst->getResourceValue();
11264     Value* pValToStore = inst->getStoreValue();
11265 
11266     m_currShader->isMessageTargetDataCacheDataPort = true;
11267 
11268     IGC_ASSERT(immOffset == nullptr);
11269     emitStore3DInner(pValToStore, pBufPtr, varOffset);
11270 }
11271 
11272 void EmitPass::emitStore3D(StoreInst* inst, Value* elmIdxV)
11273 {
11274     // Only support for scratch space added currently during emitStore
11275     Value* pllValToStore = inst->getValueOperand();
11276     Value* pllDstPtr = inst->getPointerOperand();
11277 
11278 
11279     emitStore3DInner(pllValToStore, pllDstPtr, elmIdxV);
11280 }
11281 
11282 void EmitPass::emitStore3DInner(Value* pllValToStore, Value* pllDstPtr, Value* pllElmIdx)
11283 {
11284     IGC_ASSERT(pllDstPtr != nullptr);
11285 
11286     bool isPrivateMem = pllDstPtr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_PRIVATE;
11287     if (!isPrivateMem)
11288     {
11289         ForceDMask(false);
11290     }
11291 
11292     ResourceDescriptor resource = GetResourceVariable(pllDstPtr);
11293 
11294     uint sizeInBits = GetPrimitiveTypeSizeInRegisterInBits(pllValToStore->getType());
11295 
11296     IGC_ASSERT_MESSAGE((sizeInBits == 8) || (sizeInBits == 16) || (sizeInBits == 32) || (sizeInBits == 64) || (sizeInBits == 96) || (sizeInBits == 128),
11297         "Stored type must be 1/2/4/8/12/16 bytes long");
11298 
11299     CVariable* storedVal = GetSymbol(pllValToStore);
11300 
11301     IGC_ASSERT(pllElmIdx);
11302     CVariable* ptr = GetSymbol(pllElmIdx);
11303 
11304     IGC_ASSERT(pllDstPtr->getType()->isPointerTy());
11305     if (!IGC::isA64Ptr(cast<PointerType>(pllDstPtr->getType()), m_currShader->GetContext()))
11306     {
11307         ptr = TruncatePointer(ptr);
11308     }
11309 
11310     CVariable* gOffset = m_currShader->ImmToVariable(0x0, ISA_TYPE_UD);
11311 
11312     // The stored value and the ptr must be laid out in GRFs as SIMDSize DWORDs.
11313     // If they are not already in this form, bring them to it:
11314     // broadcast the value and extend it (it doesn't matter whether the extension
11315     // is sext, zext, or any other kind).
11316 
11317     CVariable* storedValOriginal = storedVal;
11318     CVariable* ptrOriginal = ptr;
11319 
11320     storedVal = BroadcastIfUniform(storedVal);
11321     ptr = BroadcastIfUniform(ptr);
11322 
11323     uint label = 0;
11324     CVariable* flag = nullptr;
11325     bool needLoop = ResourceLoopHeader(resource, flag, label);
11326     if (sizeInBits == 32)
11327     {
11328         if (resource.m_surfaceType == ESURFACE_STATELESS &&
11329             m_currShader->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages)
11330         {
11331             // DWORD scatter
11332             CVariable* shiftedPtr = m_currShader->GetNewVariable(ptr);
11333             m_encoder->Shr(shiftedPtr, ptr, m_currShader->ImmToVariable(2, ISA_TYPE_UD));
11334             m_encoder->Push();
11335             ptr = shiftedPtr;
11336             setPredicateForDiscard(flag);
11337             m_encoder->Scatter(
11338                 storedVal,
11339                 resource.m_resource,
11340                 ptr,
11341                 gOffset,
11342                 resource.m_surfaceType,
11343                 4);
11344             m_encoder->Push();
11345         }
11346         else
11347         {
11348             if (m_currShader->m_Platform->emulateByteScraterMsgForSS() &&
11349                 (ESURFACE_SCRATCH == resource.m_surfaceType))
11350             {
11351                 setPredicateForDiscard(flag);
11352                 bool isUniformInst = (ptrOriginal->IsUniform() && storedValOriginal->IsUniform());
11353                 ptrOriginal = (isUniformInst ? ReAlignUniformVariable(ptrOriginal, EALIGN_GRF) : ptr);
11354                 storedValOriginal = (isUniformInst ? ReAlignUniformVariable(storedValOriginal, EALIGN_GRF) : storedVal);
11355                 m_encoder->Scatter4Scaled(storedValOriginal, resource, ptrOriginal);
11356             }
11357             else
11358             {
11359                 // using byte scatter
11360                 uint elementSize = 8;
11361                 uint numElems = 4;
11362                 setPredicateForDiscard(flag);
11363                 m_encoder->ByteScatter(
11364                     storedVal,
11365                     resource,
11366                     ptr,
11367                     elementSize,
11368                     numElems);
11369             }
11370             m_encoder->Push();
11371         }
11372     }
11373     else if (sizeInBits == 16 || sizeInBits == 8)
11374     {
11375         // using byte scatter
11376         uint elementSize = 8;
11377         uint numElems = sizeInBits / 8;
11378         VISA_Type elementType = (sizeInBits == 8) ? ISA_TYPE_UB : ISA_TYPE_UW;
11379         CVariable* val = m_currShader->GetNewVariable(
11380             numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
11381         storedVal = m_currShader->GetNewAlias(storedVal, elementType, 0, 0);
11382         m_encoder->Cast(val, storedVal);
11383         setPredicateForDiscard(flag);
11384         m_encoder->ByteScatter(
11385             val,
11386             resource,
11387             ptr,
11388             elementSize,
11389             numElems);
11390         m_encoder->Push();
11391     }
11392     else  // (sizeInBits > 32)
11393     {
11394         setPredicateForDiscard(flag);
11395         m_encoder->Scatter4Scaled(storedVal, resource, ptr);
11396         m_encoder->Push();
11397     }
11398     ResourceLoopBackEdge(needLoop, flag, label);
11399     if (!isPrivateMem)
11400     {
11401         ResetVMask(false);
11402     }
11403 }
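// Illustrative sketch (comments only): how the store message is selected above from
// the stored size in bits. This summarizes the branches; the flags are hypothetical
// names for the platform checks.
//
//   enum class StoreMsg { ByteScatter, DwordScatter, Scatter4Scaled };
//   StoreMsg pick(uint bits, bool noA32ByteScatteredStatelessWA, bool scratchByteScatterWA)
//   {
//       if (bits == 8 || bits == 16) return StoreMsg::ByteScatter;       // via a UD temp
//       if (bits == 32) return noA32ByteScatteredStatelessWA ? StoreMsg::DwordScatter
//                            : (scratchByteScatterWA ? StoreMsg::Scatter4Scaled
//                                                    : StoreMsg::ByteScatter);
//       return StoreMsg::Scatter4Scaled;                                 // 64/96/128 bits
//   }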
11404 
11405 void EmitPass::emitStore(StoreInst* inst, Value* varOffset, ConstantInt* immOffset)
11406 {
11407     emitVectorStore(inst, varOffset, immOffset);
11408 }
11409 
11410 CVariable* EmitPass::GetSymbol(llvm::Value* v) const
11411 {
11412     return m_currShader->GetSymbol(v);
11413 }
11414 
11415 void EmitPass::CountStatelessIndirectAccess(llvm::Value* pointer, ResourceDescriptor resource)
11416 {
11417     instrMap.clear();
11418     IGC_ASSERT_MESSAGE(isa<PointerType>(pointer->getType()), "Value should be a pointer");
11419     if (resource.m_surfaceType == ESURFACE_STATELESS && IsIndirectAccess(pointer))
11420     {
11421         m_currShader->IncIndirectStatelessCount();
11422     }
11423 }
11424 
11425 bool EmitPass::IsIndirectAccess(llvm::Value* pointer)
11426 {
11427     Instruction* inst = dyn_cast<Instruction>(pointer);
11428     if (inst == nullptr)
11429     {
11430         return false;
11431     }
11432 
11433     // We cache the instructions so that when we meet an instruction
11434     // again we know it has already been checked.
11435     if (instrMap.count(inst))
11436     {
11437         return instrMap.lookup(inst);
11438     }
11439 
11440     bool isIndirect = false;
11441     instrMap.try_emplace(inst, isIndirect);
11442 
11443     if (LoadInst* loadInst = dyn_cast<LoadInst>(inst))
11444     {
11445         isIndirect = true;
11446     }
11447     else if (CallInst* callInstr = dyn_cast<CallInst>(inst))
11448     {
11449         // If the call instruction isn't a GenISA intrinsic, assume the access is indirect,
11450         // since intrinsics are typically just simple arithmetic.
11451         GenIntrinsicInst* pIntrinsic = dyn_cast<GenIntrinsicInst>(callInstr);
11452         if (pIntrinsic == nullptr)
11453         {
11454             isIndirect = true;
11455         }
11456     }
11457 
11458     if (!isIndirect)
11459     {
11460         for (unsigned int i = 0; i < inst->getNumOperands(); i++)
11461         {
11462             if (IsIndirectAccess(inst->getOperand(i)))
11463             {
11464                 isIndirect = true;
11465                 break;
11466             }
11467         }
11468     }
11469     instrMap[inst] = isIndirect; // overwrite the placeholder value cached by try_emplace above
11470     return isIndirect;
11471 }
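// Illustrative sketch (comments only): the memoized operand walk performed above,
// reduced to its core shape. 'isNonIntrinsicCall' is a hypothetical stand-in for the
// GenIntrinsicInst check; 'Cache' mirrors instrMap.
//
//   bool isIndirect(Instruction* I, DenseMap<Instruction*, bool>& Cache)
//   {
//       auto It = Cache.find(I);
//       if (It != Cache.end()) return It->second;
//       Cache[I] = false;                              // mark visited; breaks cycles
//       bool result = isa<LoadInst>(I) || isNonIntrinsicCall(I);
//       for (Value* Op : I->operands())
//           if (!result)
//               if (auto* OpI = dyn_cast<Instruction>(Op))
//                   result = isIndirect(OpI, Cache);
//       return Cache[I] = result;
//   }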
11472 
11473 void EmitPass::emitInsert(llvm::Instruction* inst)
11474 {
11475     auto IEI = llvm::cast<llvm::InsertElementInst>(inst);
11476     // Skip emit scalar copy if this `insertelement` could be aliased.
11477     if (m_currShader->CanTreatScalarSourceAsAlias(IEI))
11478         return;
11479 
11480     llvm::Type* eTy = inst->getOperand(1)->getType();
11481     // Do not use allocated type to compute the offsets; otherwise the computed
11482     // offsets may be out-of-bound. The alignment information of the base
11483     // element type should not impact the offset.
11484     uint32_t eBytes = GetScalarTypeSizeInRegister(eTy);
11485     IGC_ASSERT_MESSAGE(eBytes, "illegal InsertElementInst instruction");
11486 
11487     llvm::Value* pVec = inst->getOperand(0);
11488     CVariable* pInstVar = GetSymbol(inst);
11489     CVariable* pVecVar = nullptr;
11490     llvm::Type* pVecType = inst->getType();
11491     if (!isa<UndefValue>(pVec))
11492     {
11493         if (isa<ConstantVector>(pVec))
11494         {
11495             auto CV = cast<ConstantVector>(pVec);
11496             pInstVar = m_currShader->GetConstant(CV, pInstVar);
11497         }
11498         else
11499         {
11500             pVecVar = GetSymbol(pVec);
11501             if (pVecVar != pInstVar)
11502             {
11503                 emitVectorCopy(pInstVar, pVecVar, int_cast<unsigned>(dyn_cast<IGCLLVM::FixedVectorType>(pVecType)->getNumElements()));
11504             }
11505         }
11506     }
11507 
11508     if (llvm::ConstantInt * pConstElem = llvm::dyn_cast<llvm::ConstantInt>(IEI->getOperand(2)))
11509     {
11510         CVariable* pElm = GetSymbol(inst->getOperand(1));
11511 
11512         uint element = int_cast<uint>(pConstElem->getZExtValue());
11513         uint eStartBytes;
11514         if (m_currShader->GetIsUniform(inst) && m_currShader->GetIsUniform(pVec))
11515         {
11516             eStartBytes = eBytes * element;
11517         }
11518         else
11519         {
11520             eStartBytes = numLanes(m_currShader->m_SIMDSize) * eBytes * element;
11521         }
11522 
11523         uint subVar = (eStartBytes / getGRFSize());
11524         uint subReg = (eStartBytes % getGRFSize()) / eBytes; // unit of element(eTy)
11525         m_encoder->SetDstSubVar(subVar);
11526         m_encoder->SetDstSubReg(subReg);
11527         m_encoder->Copy(m_destination, pElm);
11528         m_encoder->Push();
11529     }
11530     else
11531     {
11532         // the index is not a compile-time constant, we need to use runtime indirect addressing
11533         llvm::Value* pElement = inst->getOperand(1);       // element to insert
11534         llvm::Value* pIndex = inst->getOperand(2);         // index to insert at
11535         CVariable* pIndexVar = m_currShader->BitCast(GetSymbol(pIndex), ISA_TYPE_UW);
11536         CVariable* pElemVar = GetSymbol(pElement);
11537 
11538         // size of vector entry
11539         const uint vectorEntrySimdWidth = pInstVar->IsUniform() ?
11540             1 : numLanes(m_currShader->m_SIMDSize);
11541 
11542         const uint vecTypeSize =
11543             GetPrimitiveTypeSizeInRegister(cast<VectorType>(pVecType)->getElementType());
11544 
11545         const uint offset = vectorEntrySimdWidth * vecTypeSize;
11546 
11547         CVariable* pOffset1 = m_currShader->ImmToVariable(offset, ISA_TYPE_UW);
11548 
11549         // offset2 = index * sizeof(vector entry)  <-- offset within the vector counted in bytes
11550         CVariable* pOffset2 = m_currShader->GetNewVariable(
11551             pIndexVar->IsUniform() ? 1 : numLanes(m_currShader->m_SIMDSize),
11552             ISA_TYPE_UW,
11553             EALIGN_WORD,
11554             pIndexVar->IsUniform(), CName::NONE);
11555 
11556         if (!pIndexVar->IsUniform())
11557         {
11558             m_encoder->SetSrcRegion(0, 16, 8, 2);
11559         }
11560         m_encoder->Mul(pOffset2, pIndexVar, pOffset1);
11561         m_encoder->Push();
11562 
11563         // a0 = addressof(vector variable) + offset2 <-- address of element to insert at
11564         if (pIndexVar->IsUniform())
11565         {
11566             CVariable* pDstArrElm =
11567                 m_currShader->GetNewAddressVariable(
11568                     1,
11569                     m_destination->GetType(),
11570                     true,
11571                     pInstVar->IsUniform(),
11572                     m_destination->getName());
11573             m_encoder->AddrAdd(pDstArrElm, m_destination, pOffset2);
11574             m_encoder->Push();
11575             m_encoder->Copy(pDstArrElm, pElemVar);
11576             m_encoder->Push();
11577         }
11578         else
11579         {
11580             // Lower the execution size to avoid complaints about indirect addressing across more than two GRFs.
11581             // One example is below:
11582             //(W)     mov (1|M0)              f1.1<1>:uw    0x100:uw
11583             //(f1.1)  mov(16|M0)              r[a0.8]<1>:f  r63.0 < 0; 1, 0 >:f
11584             //will be changed to
11585             //(W)     mov (1|M0)              f1.1<1>:uw    0x100:uw
11586             //(f1.1)  mov(8|M8)              r[a0.8+0x20]<1>:f  r63.0 < 0; 1, 0 >:f
11587             // To avoid such complaints, we limit execSizeNew * datatypeSize to the memory size implied by getMinDispatchMode().
11588             // In the above example, if getMinDispatchMode() == 8, then execSizeNew should be 8
11589             // because 8 * SIZE_DWORD = getMinDispatchMode() * SIZE_DWORD.
11590             // But if the data type is 64-bit, execSizeNew should be 4
11591             // because 4 * SIZE_QWORD = getMinDispatchMode() * SIZE_DWORD.
11592             // Changing to SIMD1 would need more work and might cause extra overhead as well.
11593             // For the indirect address, the emask offset must be adjusted correspondingly.
11594             SIMDMode simdMode = std::min(m_currShader->m_SIMDSize, SIMDMode::SIMD16);
11595             SIMDMode minDispatchMode = m_currShader->m_Platform->getMinDispatchMode();
11596             SIMDMode execSizeNew = minDispatchMode;
11597             bool bWAMultiGRF = false;
11598             if (!pInstVar->IsUniform() && m_currShader->m_Platform->enableMultiGRFAccessWA())
11599             {
11600                 uint32_t dataTypeSize = GetScalarTypeSizeInRegisterInBits(pElement->getType());
11601                 uint32_t memSizeToUse = numLanes(simdMode) * dataTypeSize / 8;
11602                 uint32_t memSizeMinDisp = numLanes(minDispatchMode) * SIZE_DWORD;
11603                 bWAMultiGRF = (memSizeToUse > memSizeMinDisp);
11604                 if (bWAMultiGRF)
11605                 {
11606                     execSizeNew = lanesToSIMDMode(memSizeMinDisp * 8 / dataTypeSize);
11607                     uint32_t lanesNew = numLanes(execSizeNew);
11608                     int cnt = memSizeToUse / memSizeMinDisp;
11609                     for (int i=1; i<cnt; i++)
11610                     {
11611                         CVariable* pOffset1_2ndHalf = m_currShader->ImmToVariable(memSizeMinDisp * i, ISA_TYPE_UW);
11612                         uint32_t laneIdx = lanesNew * i;
11613                         CVariable* pOffset2_2ndHalf = m_currShader->GetNewAlias(pOffset2, ISA_TYPE_UW, laneIdx * SIZE_WORD, 0);
11614                         m_encoder->SetSrcRegion(0, lanesNew, lanesNew, 1);
11615                         m_encoder->SetSimdSize(execSizeNew);
11616                         m_encoder->SetMask((laneIdx / 8) % 2 ? EMASK_Q2 : EMASK_Q1);
11617                         m_encoder->SetSecondNibble((laneIdx / 4) % 2 ? true : false);
11618                         m_encoder->Add(pOffset2_2ndHalf, pOffset2_2ndHalf, pOffset1_2ndHalf);
11619                         m_encoder->Push();
11620                     }
11621                     m_encoder->SetSecondNibble(false);
11622                 }
11623             }
11624 
11625             int loopCount = (m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_currShader->m_numberInstance == 1) ? 2 : 1;
11626             for (int i = 0; i < loopCount; ++i)
11627             {
11628                 CVariable* dst = m_destination;
11629                 if (i == 1)
11630                 {
11631                     // explicitly set second half as we are manually splitting
11632                     m_encoder->SetSecondHalf(true);
11633                     m_encoder->SetSrcSubReg(1, 16);
11634                     dst = m_currShader->GetNewAlias(dst, dst->GetType(), 16 * dst->GetElemSize(), 0);
11635                 }
11636                 CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
11637                     numLanes(simdMode),
11638                     m_destination->GetType(),
11639                     false,
11640                     pInstVar->IsUniform(),
11641                     m_destination->getName());
11642 
11643                 m_encoder->SetSimdSize(simdMode);
11644                 m_encoder->AddrAdd(pDstArrElm, dst, pOffset2);
11645                 m_encoder->Push();
11646 
11647                 // Handle the case when the index is non-uniform - we need to lookup a different value
11648                 // for each simd lane.
11649                 // Since HW doesn't support scattered GRF writes, we need to simulate
11650                 // scattered write by a sequence of instructions, each one writing to a single simd-lane.
11651                 for (uint lane = 0; lane < numLanes(simdMode); ++lane)
11652                 {
11653                     uint position = lane + i * 16;
11654                     // writes to a uniform vector are done with no-mask and no predicate
11655                     if (!pInstVar->IsUniform())
11656                     {
11657                         CVariable* immMask = m_currShader->ImmToVariable(1ULL << lane, ISA_TYPE_UD);
11658                         CVariable* dstPred = m_currShader->GetNewVariable(
11659                             numLanes(m_SimdMode),
11660                             ISA_TYPE_BOOL,
11661                             EALIGN_BYTE,
11662                             CName::NONE);
11663 
11664                         m_encoder->SetSimdSize(simdMode);
11665                         m_encoder->SetP(dstPred, immMask);
11666                         m_encoder->Push();
11667                         m_encoder->SetPredicate(dstPred);
11668                     }
11669                     if (!pElemVar->IsUniform())
11670                     {
11671                         m_encoder->SetSrcSubReg(0, position);
11672                     }
11673                     m_encoder->SetSrcRegion(0, 0, 1, 0);
11674                     m_encoder->SetDstSubReg(lane);
11675                     if (bWAMultiGRF)
11676                     {
11677                         m_encoder->SetMask((lane / 8) % 2 ? EMASK_Q2 : EMASK_Q1);
11678                         if (execSizeNew == SIMDMode::SIMD4)
11679                         {
11680                             m_encoder->SetSecondNibble((lane / 4) % 2 ? true : false);
11681                         }
11682                         m_encoder->SetSimdSize(execSizeNew);
11683                     }
11684                     else if (pInstVar->IsUniform())
11685                     {
11686                         m_encoder->SetSimdSize(SIMDMode::SIMD1);
11687                         m_encoder->SetNoMask();
11688                     }
11689                     else
11690                     {
11691                         m_encoder->SetSimdSize(simdMode);
11692                     }
11693                     m_encoder->Copy(pDstArrElm, pElemVar);
11694                     m_encoder->Push();
11695                     m_encoder->SetSecondNibble(false);
11696                 }
11697             }
11698         }
11699     }
11700 }
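// Illustrative sketch (comments only): the address arithmetic used above when the
// insertelement index is not a compile-time constant. Names are hypothetical.
//
//   // byte stride of one vector element in the register layout:
//   //   uniform destination vector:      elemBytes
//   //   non-uniform destination vector:  simdLanes * elemBytes
//   uint32_t stride  = dstIsUniform ? elemBytes : simdLanes * elemBytes;
//   uint32_t offset2 = index * stride;            // per-lane values if index is non-uniform
//   // a0 = address(destination vector) + offset2, then indirect moves write the element:
//   //   mov r[a0.n]<1> element   (one lane at a time when the index is non-uniform)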
11701 
11702 void EmitPass::emitBranch(llvm::BranchInst* branch, const SSource& cond, e_predMode predMode)
11703 {
11704     llvm::BasicBlock* next = m_blockCoalescing->SkipEmptyBasicBlock(branch->getParent()->getNextNode());
11705     if (branch->isConditional())
11706     {
11707         CVariable* flag = GetSrcVariable(cond);
11708         bool inversePred = cond.mod == EMOD_NOT;
11709         // if it is not a fallthrough
11710         BasicBlock* succ0 = m_blockCoalescing->FollowEmptyBlock(branch->getSuccessor(0));
11711         BasicBlock* succ1 = m_blockCoalescing->FollowEmptyBlock(branch->getSuccessor(1));
11712         uint label0 = m_pattern->GetBlockId(succ0);
11713         uint label1 = m_pattern->GetBlockId(succ1);
11714 
11715         m_encoder->SetPredicateMode(predMode);
11716         m_encoder->SetInversePredicate(inversePred);
11717 
11718         if (next == NULL || (next != succ0 && next != succ1))
11719         {
11720             // Both succ0 and succ1 are not next. Thus, need one conditional jump and
11721             // one unconditional jump. There are three cases for selecting the target
11722             // of the conditional jump:
11723             //    1. both are backward, select the one with the larger ID (closer to the branch) as target
11724             //           L0:
11725             //              ....
11726             //           L1:
11727             //              ...
11728             //           [+-flag] goto L1
11729             //           goto L0
11730             //
11731             //    2. both are forward, select the one with the larger ID (farther from the branch) as target
11732             //           [+- flag] goto L1
11733             //            goto L0
11734             //            ...
11735             //           L0:
11736             //              ......
11737             //           L1:
11738             //       (selected the same way as in case 1)
11739             //    3. one is backward and one is forward, select the backward one as target.
11740             //
11741             uint label = m_pattern->GetBlockId(branch->getParent());
11742             uint condTarget, uncondTarget;
11743             if ((label0 <= label && label1 <= label) || (label0 > label && label1 > label))
11744             {
11745                 // case 1 & 2
11746                 condTarget = (label0 < label1) ? label1 : label0;
11747                 uncondTarget = (label0 < label1) ? label0 : label1;
11748             }
11749             else
11750             {
11751                 // case 3
11752                 condTarget = (label0 <= label) ? label0 : label1;
11753                 uncondTarget = (label0 <= label) ? label1 : label0;
11754             }
11755 
11756             if (condTarget == uncondTarget)
11757             {   // sanity check. label0 == label1 (we don't expect it, but it's legal)
11758                 m_encoder->Jump(condTarget);
11759                 m_encoder->Push();
11760             }
11761             else
11762             {
11763                 if (condTarget != label0)
11764                 {
11765                     m_encoder->SetInversePredicate(!inversePred);
11766                 }
11767                 m_encoder->Jump(flag, condTarget);
11768                 m_encoder->Push();
11769 
11770                 m_encoder->Jump(uncondTarget);
11771                 m_encoder->Push();
11772             }
11773         }
11774         else if (next != succ0)
11775         {
11776             IGC_ASSERT_MESSAGE(next == succ1, "next should be succ1!");
11777 
11778             m_encoder->Jump(flag, label0);
11779             m_encoder->Push();
11780         }
11781         else
11782         {
11783             IGC_ASSERT_MESSAGE(next == succ0, "next should be succ0");
11784 
11785             m_encoder->SetInversePredicate(!inversePred);
11786             m_encoder->Jump(flag, label1);
11787             m_encoder->Push();
11788         }
11789     }
11790     else
11791     {
11792         BasicBlock* succ = m_blockCoalescing->FollowEmptyBlock(branch->getSuccessor(0));
11793         if ((next == NULL) || (next != succ))
11794         {
11795             uint label = m_pattern->GetBlockId(succ);
11796             m_encoder->Jump(label);
11797             m_encoder->Push();
11798         }
11799     }
11800 }
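// Illustrative sketch (comments only): the conditional-jump target selection used
// above when neither successor is the fallthrough block.
//
//   uint here = blockId(branch->getParent());
//   bool sameSide = (label0 <= here) == (label1 <= here);   // both backward or both forward
//   uint condTarget, uncondTarget;
//   if (sameSide) { condTarget   = std::max(label0, label1);
//                   uncondTarget = std::min(label0, label1); }
//   else          { condTarget   = (label0 <= here) ? label0 : label1;   // take the backward one
//                   uncondTarget = (label0 <= here) ? label1 : label0; }
//   // If condTarget is label1 rather than label0, the predicate is inverted before
//   // the conditional jump; an unconditional jump to uncondTarget follows.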
11801 
11802 void EmitPass::emitDiscardBranch(
11803     BranchInst* branch, const SSource& cond)
11804 {
11805     if (m_pattern->NeedVMask())
11806     {
11807         emitBranch(branch, cond, EPRED_ALL);
11808     }
11809     else
11810     {
11811         emitBranch(branch, cond, EPRED_NORMAL);
11812     }
11813 }
11814 
11815 void EmitPass::SplitSIMD(llvm::Instruction* inst, uint numSources, uint headerSize, CVariable* payload, SIMDMode mode, uint half)
11816 {
11817     for (uint i = 0; i < numSources; ++i)
11818     {
11819         const unsigned int GRFSizeBy4 = (getGRFSize() >> 2);
11820         IGC_ASSERT(GRFSizeBy4);
11821 
11822         uint subVarIdx = numLanes(mode) / GRFSizeBy4 * i + headerSize;
11823 
11824         CVariable* rawDst = payload;
11825         CVariable* src = GetSymbol(inst->getOperand(i));
11826         // The source and payload types have to match for a raw copy
11827         if (src->GetType() != payload->GetType())
11828         {
11829             rawDst = m_currShader->BitCast(payload, src->GetType());
11830         }
11831         m_encoder->SetSimdSize(mode);
11832         m_encoder->SetDstSubVar(subVarIdx);
11833         m_encoder->SetSrcSubVar(0, half);
11834         m_encoder->SetMask(half == 0 ? EMASK_Q1 : EMASK_Q2);
11835         m_encoder->Copy(rawDst, src);
11836         m_encoder->Push();
11837     }
11838 }
11839 
11840 template<size_t N>
11841 void EmitPass::JoinSIMD(CVariable* (&tempdst)[N], uint responseLength, SIMDMode mode)
11842 {
11843     auto origMode = mode == SIMDMode::SIMD8 ? SIMDMode::SIMD16 : SIMDMode::SIMD32;
11844     uint iterationCount = numLanes(m_currShader->m_SIMDSize) / numLanes(mode);
11845     for (uint half = 0; half < iterationCount; half++)
11846     {
11847         for (uint i = 0; i < responseLength; ++i)
11848         {
11849             const unsigned int GRFSizeBy4 = (getGRFSize() >> 2);
11850             IGC_ASSERT(GRFSizeBy4);
11851             m_encoder->SetSimdSize(mode);
11852             const unsigned int subVarIdx = numLanes(origMode) / GRFSizeBy4 * i;
11853             m_encoder->SetSrcSubVar(0, i);
11854             m_encoder->SetDstSubVar(subVarIdx + half);
11855             m_encoder->SetMask(half == 0 ? (mode == SIMDMode::SIMD8 ? EMASK_Q1 : EMASK_H1) :
11856                 (mode == SIMDMode::SIMD8 ? EMASK_Q2 : EMASK_H2));
11857             IGC_ASSERT(half < ARRAY_COUNT(tempdst));
11858             m_encoder->Copy(m_destination, tempdst[half]);
11859             m_encoder->Push();
11860         }
11861     }
11862 }
11863 
11864 CVariable* EmitPass::BroadcastIfUniform(CVariable* pVar, bool nomask)
11865 {
11866     IGC_ASSERT_MESSAGE(nullptr != pVar, "pVar is null");
11867     VISA_Type VarT = pVar->GetType();
11868     bool Need64BitEmu = m_currShader->m_Platform->hasNoFullI64Support() &&
11869         (VarT == ISA_TYPE_Q || VarT == ISA_TYPE_UQ);
11870     bool IsImm = pVar->IsImmediate();
11871     if (pVar->IsUniform())
11872     {
11873         uint32_t width = numLanes(m_currShader->m_SIMDSize);
11874         uint elts = IsImm ? 1 : pVar->GetNumberElement();
11875         CVariable* pBroadcast =
11876             m_currShader->GetNewVariable(elts * width, pVar->GetType(),
11877                 EALIGN_GRF, CName(pVar->getName(), "Broadcast"));
11878         CVariable* Dst = pBroadcast;
11879         CVariable* Src = pVar;
11880         CVariable* ImmLo = nullptr, * ImmHi = nullptr;
11881         unsigned Stride = 1;
11882         if (Need64BitEmu) {
11883             Dst = m_currShader->GetNewAlias(pBroadcast, ISA_TYPE_UD, 0, 0);
11884             if (IsImm) {
11885                 uint64_t Imm = pVar->GetImmediateValue();
11886                 ImmLo = m_currShader->ImmToVariable(Imm & 0xFFFFFFFFULL, ISA_TYPE_UD);
11887                 ImmHi = m_currShader->ImmToVariable(Imm >> 32, ISA_TYPE_UD);
11888             }
11889             else {
11890                 Src = m_currShader->GetNewAlias(pVar, ISA_TYPE_UD, 0, 0);
11891             }
11892             Stride = 2;
11893         }
11894 
11895         for (uint i = 0; i < elts; ++i)
11896         {
11897             if (nomask)
11898                 m_encoder->SetNoMask();
11899             m_encoder->SetSrcSubReg(0, i * Stride);
11900             if (Stride != 1) m_encoder->SetDstRegion(Stride);
11901             m_encoder->SetDstSubReg((i * Stride) * width);
11902             m_encoder->Copy(Dst, ImmLo ? ImmLo : Src);
11903             m_encoder->Push();
11904             if (Need64BitEmu) {
11905                 if (nomask)
11906                     m_encoder->SetNoMask();
11907                 m_encoder->SetSrcSubReg(0, i * Stride + 1);
11908                 if (Stride != 1) m_encoder->SetDstRegion(Stride);
11909                 m_encoder->SetDstSubReg((i * Stride) * width + 1);
11910                 m_encoder->Copy(Dst, ImmHi ? ImmHi : Src);
11911                 m_encoder->Push();
11912             }
11913         }
11914 
11915         pVar = pBroadcast;
11916     }
11917 
11918     return pVar;
11919 }
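// Illustrative sketch (comments only): broadcasting a uniform 64-bit value on a
// platform without full int64 support, as emulated above. The QWord is viewed as two
// DWords and copied with a destination stride of 2 so lo/hi halves interleave per lane.
//
//   uint64_t src = 0x1122334455667788ULL;          // hypothetical uniform value
//   uint32_t lo  = uint32_t(src), hi = uint32_t(src >> 32);
//   // conceptually, for each lane i of the broadcast destination:
//   //   dst[2*i + 0] = lo;   // first copy  (DstRegion stride 2)
//   //   dst[2*i + 1] = hi;   // second copy (DstSubReg offset +1)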
11920 
11921 // Get either the 1st or 2nd half of the execution mask based on whether IsSecondHalf() is set
11922 // Note that for SIMD32 kernels we always return UD with one half zeroed-out
11923 CVariable* EmitPass::GetHalfExecutionMask()
11924 {
11925     auto& currBlock = getCurrentBlock();
11926     if (!currBlock.m_activeMask)
11927     {
11928         bool isSecondHalf = m_encoder->IsSecondHalf();
11929         bool isSubSpanDst = m_encoder->IsSubSpanDestination();
11930         m_encoder->SetSecondHalf(false);
11931         m_encoder->SetSubSpanDestination(false);
11932         CVariable* flag = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
11933         CVariable* dummyVar = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, CName::NONE);
11934         m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11935         m_encoder->Push();
11936 
11937         if (m_currShader->m_dispatchSize > SIMDMode::SIMD16)
11938         {
11939             m_encoder->SetSecondHalf(true);
11940             m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11941             m_encoder->Push();
11942         }
11943         m_encoder->SetSecondHalf(isSecondHalf);
11944         m_encoder->SetSubSpanDestination(isSubSpanDst);
11945         currBlock.m_activeMask = flag;
11946     }
11947 
11948     VISA_Type maskType = m_currShader->m_dispatchSize > SIMDMode::SIMD16 ? ISA_TYPE_UD : ISA_TYPE_UW;
11949     CVariable* eMask = m_currShader->GetNewVariable(1, maskType, EALIGN_DWORD, true, CName::NONE);
11950     m_encoder->SetNoMask();
11951     m_encoder->Cast(eMask, currBlock.m_activeMask);
11952     m_encoder->Push();
11953 
11954     // for SIMD32, clear out the other half
11955     if (maskType == ISA_TYPE_UD)
11956     {
11957         CVariable* halfMask = m_currShader->GetNewVariable(1, maskType, EALIGN_DWORD, true, CName::NONE);
11958         m_encoder->SetNoMask();
11959         m_encoder->And(halfMask, eMask, m_currShader->ImmToVariable(m_encoder->IsSecondHalf() ? 0xFFFF0000 : 0xFFFF, ISA_TYPE_UD));
11960         m_encoder->Push();
11961         return halfMask;
11962     }
11963 
11964     return eMask;
11965 }
11966 
11967 CVariable* EmitPass::GetExecutionMask(CVariable*& vecMaskVar)
11968 {
11969     bool isSecondHalf = m_encoder->IsSecondHalf();
11970     bool isSubSpanDst = m_encoder->IsSubSpanDestination();
11971     m_encoder->SetSecondHalf(false);
11972     m_encoder->SetSubSpanDestination(false);
11973     CVariable* flag = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
11974 
11975     CVariable* dummyVar = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, CName::NONE);
11976     m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11977     m_encoder->Push();
11978 
11979     if (m_currShader->m_dispatchSize > SIMDMode::SIMD16 && m_currShader->m_SIMDSize != SIMDMode::SIMD32)
11980     {
11981         m_encoder->SetSecondHalf(true);
11982         m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11983         m_encoder->Push();
11984     }
11985     m_encoder->SetSecondHalf(isSecondHalf);
11986     m_encoder->SetSubSpanDestination(isSubSpanDst);
11987     vecMaskVar = flag;
11988 
11989     VISA_Type maskType = m_currShader->m_dispatchSize > SIMDMode::SIMD16 ? ISA_TYPE_UD : ISA_TYPE_UW;
11990     CVariable* eMask = m_currShader->GetNewVariable(1, maskType, EALIGN_DWORD, true, CName::NONE);
11991     m_encoder->SetNoMask();
11992     m_encoder->Cast(eMask, flag);
11993     m_encoder->Push();
11994     return eMask;
11995 }
11996 
11997 CVariable* EmitPass::GetExecutionMask()
11998 {
11999     CVariable* vecMask = nullptr;
12000     return GetExecutionMask(vecMask);
12001 }
12002 
12003 /// UniformCopy - Copy a non-uniform source into a uniform variable by copying
12004 /// the value from ANY active lane.
12005 
12006 CVariable* EmitPass::UniformCopy(CVariable* var)
12007 {
12008     CVariable* offset = nullptr;
12009     CVariable* eMask = nullptr;
12010     return UniformCopy(var, offset, eMask);
12011 }
12012 
12013 /// Uniform copy that allows reusing the offset ('off') calculated by a previous call.
12014 /// This allows avoiding redundant code.
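/// Rough worked example (hypothetical values): if the execution mask is
/// 0b...10100 and the source elements are DWORDs, 'fbl' returns lane 2, the
/// byte offset becomes 2 << 2 = 8, and the indirect copy below reads lane 2 of
/// the source into the uniform result.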
12015 CVariable* EmitPass::UniformCopy(CVariable* var, CVariable*& off, CVariable* eMask, bool doSub)
12016 {
12017     IGC_ASSERT_MESSAGE(!var->IsUniform(), "Expect non-uniform source!");
12018 
12019     if (eMask == nullptr)
12020     {
12021         eMask = GetExecutionMask();
12022     }
12023     if (off == nullptr)
12024     {
12025         // Get the offset of any set bit. For simplicity, use 'fbl' to find the lowest set bit.
12026         off = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
12027         if (doSub && m_encoder->IsSecondHalf())
12028         {
12029             // here our eMask is UD but we only want the upper 16 bits,
12030             // so use a UW alias to the high 16 bits instead
12031             auto uwMask = m_currShader->GetNewAlias(eMask, ISA_TYPE_UW, 2, 1);
12032             m_encoder->Fbl(off, uwMask);
12033         }
12034         else
12035         {
12036             m_encoder->Fbl(off, eMask);
12037         }
12038         m_encoder->Push();
12039 
12040         // Calculate byte offset
12041         CVariable* shAmt = nullptr;
12042         switch (var->GetElemSize()) {
12043         case 1:
12044             // No need to shift.
12045             break;
12046         case 2:
12047             shAmt = m_currShader->ImmToVariable(1, ISA_TYPE_W);
12048             break;
12049         case 4:
12050             shAmt = m_currShader->ImmToVariable(2, ISA_TYPE_W);
12051             break;
12052         case 8:
12053             shAmt = m_currShader->ImmToVariable(3, ISA_TYPE_W);
12054             break;
12055         default:
12056             IGC_ASSERT_MESSAGE(0, "Unsupported element size!");
12057             break;
12058         }
12059         if (shAmt) {
12060             m_encoder->Shl(off, off, shAmt);
12061             m_encoder->Push();
12062         }
12063         off = m_currShader->BitCast(off, ISA_TYPE_UW);
12064     }
12065     // Calculate the address of that active lane.
12066     CVariable* addr =
12067         m_currShader->GetNewAddressVariable(1, var->GetType(), true, true, var->getName());
12068 
12069     // Now, we need to jump through a few hoops for SIMD32, since the variables
12070     // representing all of the SIMD lanes may not be consecutive.
12071     uint8_t numInstances = var->GetNumberInstance();
12072 
12073     if (numInstances == 2)
12074     {
12075         uint16_t numElements = var->GetNumberElement();
12076         VISA_Type dataType = var->GetType();
12077 
12078         // Create a variable into which we'll merge both instances of the original variable,
12079         // and an alias into the upper half.
12080         CVariable* merged = m_currShader->GetNewVariable(numElements * numInstances,
12081             dataType, var->GetAlign(), false, 1, CName(var->getName(), "Merged"));
12082         CVariable* upperMerged = m_currShader->GetNewAlias(merged, dataType,
12083             numElements * m_encoder->GetCISADataTypeSize(dataType), numElements);
12084 
12085         // Now, do the copies.
12086         bool isSecondHalf = m_encoder->IsSecondHalf();
12087 
12088         m_encoder->SetSecondHalf(false);
12089         m_encoder->Copy(merged, var);
12090         m_encoder->Push();
12091 
12092         m_encoder->SetSecondHalf(true);
12093         m_encoder->Copy(upperMerged, var);
12094         m_encoder->Push();
12095 
12096         m_encoder->SetSecondHalf(false);
12097         m_encoder->AddrAdd(addr, merged, off);
12098         m_encoder->Push();
12099         m_encoder->SetSecondHalf(isSecondHalf);
12100     }
12101     else
12102     {
12103         m_encoder->AddrAdd(addr, var, off);
12104         m_encoder->Push();
12105     }
12106 
12107     // Indirect access to that active scalar register.
12108     CVariable* exVal = m_currShader->GetNewVariable(
12109         1, var->GetType(), CEncoder::GetCISADataTypeAlignment(var->GetType()), true, CName::NONE);
12110     m_encoder->Copy(exVal, addr);
12111 
12112     return exVal;
12113 }
12114 
12115 CVariable* EmitPass::ExtendVariable(CVariable* pVar, e_alignment uniformAlign) {
12116     if (pVar->GetElemSize() >= 4) {
12117         // There's no need to extend the operand. But if the variable holding
12118         // a uniform value is not aligned to a GRF, an additional copy is required
12119         // to align it for SIMD1 gather/scatter.
12120         if (!pVar->IsUniform())
12121             return pVar;
12122         if (!pVar->IsImmediate() && IsGRFAligned(pVar, EALIGN_GRF))
12123             return pVar;
12124         // Otherwise, we need to re-align the variable holding that uniform value.
12125     }
12126 
12127     VISA_Type NewType = ISA_TYPE_UD;
12128     if (pVar->GetElemSize() > 4)
12129         NewType = ISA_TYPE_UQ;
12130 
12131     // Cast to extend and/or re-align the variable.
12132     CVariable* NewVar = 0;
12133     if (pVar->IsUniform()) {
12134         NewVar = m_currShader->GetNewVariable(1, NewType, uniformAlign, true, pVar->getName());
12135     }
12136     else {
12137         NewVar = m_currShader->GetNewVariable(
12138             numLanes(m_currShader->m_SIMDSize), NewType, EALIGN_GRF, pVar->getName());
12139     }
12140 
12141     if (pVar->IsImmediate()) {
12142         pVar =
12143             m_currShader->ImmToVariable(
12144                 pVar->GetImmediateValue(),
12145                 GetUnsignedIntegerType(pVar->GetType()));
12146     }
12147     else {
12148         pVar =
12149             m_currShader->GetNewAlias(
12150                 pVar, GetUnsignedIntegerType(pVar->GetType()), 0, 0);
12151     }
12152 
12153     m_encoder->Cast(NewVar, pVar);
12154     m_encoder->Push();
12155     return NewVar;
12156 }
12157 
12158 CVariable* EmitPass::BroadcastAndExtend(CVariable* pVar)
12159 {
12160     VISA_Type varType = pVar->GetType();
12161     const int typeSize = CEncoder::GetCISADataTypeSize(varType);
12162 
12163     if (!pVar->IsUniform() && typeSize >= 4)
12164     {
12165         return pVar;
12166     }
12167 
12168     if (pVar->IsImmediate())
12169     {
12170         pVar = m_currShader->ImmToVariable(
12171             pVar->GetImmediateValue(),
12172             GetUnsignedIntegerType(pVar->GetType()));
12173     }
12174     else
12175     {
12176         pVar = m_currShader->GetNewAlias(pVar, GetUnsignedIntegerType(pVar->GetType()), 0, 0);
12177     }
12178 
12179     const VISA_Type broadcastType = typeSize == 8 ? ISA_TYPE_UQ : ISA_TYPE_UD;
12180 
12181     CVariable* pBroadcast = m_currShader->GetNewVariable(
12182         numLanes(m_currShader->m_SIMDSize),
12183         broadcastType,
12184         EALIGN_GRF,
12185         CName(pVar->getName(), "Broadcast"));
12186 
12187     m_encoder->Cast(pBroadcast, pVar);
12188     m_encoder->Push();
12189 
12190     return pBroadcast;
12191 }
12192 
12193 CVariable* EmitPass::TruncatePointer(CVariable* pVar) {
12194     // TruncatePointer is used to prepare pointers for A32 and A64
12195     // messages, and in stateful loads and stores to prepare the
12196     // offset value.
12197     // For stateless messages the pointer data type can only be 32 or 64 bits wide.
12198     // For stateful messages the offset data type can be 8, 16, 32 or 64 bits wide.
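    // Illustrative example (hypothetical value): a uniform 64-bit stateless
    // pointer holding 0x0000000100000040 is cast below to a GRF-aligned UD
    // holding 0x00000040, i.e. only the low 32 bits survive.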
12199 
12200     // 32-bit integer
12201     if (pVar->GetElemSize() == 4) {
12202         if (!pVar->IsUniform())
12203             return pVar;
12204         // For a uniform variable, we need to re-align it to a GRF to ensure it's
12205         // placed at the 1st element.
12206         if (!pVar->IsImmediate() && IsGRFAligned(pVar, EALIGN_GRF))
12207             return pVar;
12208         // Re-align the container of the pointer.
12209     }
12210 
12211     // Cast to truncate and/or re-align the variable.
12212     CVariable* NewVar = 0;
12213     if (pVar->IsUniform()) {
12214         NewVar = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName(pVar->getName(), "Trunc"));
12215     }
12216     else {
12217         NewVar = m_currShader->GetNewVariable(
12218             numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName(pVar->getName(), "Trunc"));
12219     }
12220     m_encoder->Cast(NewVar, pVar);
12221     m_encoder->Push();
12222 
12223     return NewVar;
12224 }
12225 
12226 CVariable* EmitPass::ReAlignUniformVariable(CVariable* pVar, e_alignment align) {
12227     if (!pVar->IsUniform())
12228         return pVar;
12229 
12230     if (!pVar->IsImmediate() && IsGRFAligned(pVar, EALIGN_GRF))
12231         return pVar;
12232 
12233     CVariable* NewVar = m_currShader->GetNewVariable(
12234         1, pVar->GetType(), align, true, pVar->getName());
12235 
12236     m_encoder->Cast(NewVar, pVar);
12237     m_encoder->Push();
12238 
12239     return NewVar;
12240 }
12241 
12242 CVariable* EmitPass::BroadcastAndTruncPointer(CVariable* pVar)
12243 {
12244     if (pVar->GetElemSize() == 8)
12245     {
12246         // If the pointer is 64-bit, trunc it to 32-bit.
12247         // Note that we don't care whether the pointer is uniform or not;
12248         // if it's uniform the trunc will also broadcast.
12249         CVariable* pTrunc = m_currShader->GetNewVariable(
12250             numLanes(m_currShader->m_SIMDSize),
12251             ISA_TYPE_UD,
12252             m_currShader->getGRFAlignment(),
12253             CName(pVar->getName(),"Broadcast64b"));
12254 
12255         m_encoder->Cast(pTrunc, pVar);
12256         m_encoder->Push();
12257         pVar = pTrunc;
12258     }
12259     else
12260     {
12261         pVar = BroadcastIfUniform(pVar);
12262     }
12263 
12264     return pVar;
12265 }
12266 
12267 // Method used to emit reads from GS SGV variables that are not per-vertex.
12268 // Only two cases exist: PrimitiveID, GSInstanceID
12269 void EmitPass::emitGS_SGV(SGVIntrinsic* pInst)
12270 {
12271     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::GEOMETRY_SHADER);
12272     CGeometryShader* gsProgram = static_cast<CGeometryShader*>(m_currShader);
12273     switch (pInst->getUsage())
12274     {
12275     case PRIMITIVEID:
12276     {
12277         CVariable* pPrimitiveID = gsProgram->GetPrimitiveID();
12278         m_currShader->CopyVariable(m_destination, pPrimitiveID);
12279         break;
12280     }
12281     case GS_INSTANCEID:
12282     {
12283         CVariable* pInstanceID = gsProgram->GetInstanceID();
12284         IGC_ASSERT(pInstanceID != nullptr);
12285         m_currShader->CopyVariable(m_destination, pInstanceID);
12286         break;
12287     }
12288     default:
12289         IGC_ASSERT_MESSAGE(0, "This should not happen after lowering to URB reads.");
12290     }
12291 }
12292 
12293 void EmitPass::emitSampleOffset(GenIntrinsicInst* inst)
12294 {
12295     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
12296     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
12297     CVariable* offsets = nullptr;
12298     if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_SampleOffsetX)
12299     {
12300         offsets = psProgram->GetSampleOffsetX();
12301     }
12302     else
12303     {
12304         offsets = psProgram->GetSampleOffsetY();
12305     }
12306 
12307     CVariable* pDstArrElm = nullptr;
12308 
12309     CVariable* index = GetSymbol(inst->getOperand(0));
12310 
12311     CVariable* pIndexVar = m_currShader->BitCast(index, ISA_TYPE_UW);
12312 
12313     {
12314         pDstArrElm = m_currShader->GetNewAddressVariable(
12315             numLanes(m_currShader->m_SIMDSize),
12316             offsets->GetType(),
12317             false,
12318             true,
12319             offsets->getName());
12320 
12321         if (!pIndexVar->IsUniform())
12322         {
12323             m_encoder->SetSrcRegion(1, 16, 8, 2);
12324         }
12325 
12326         m_encoder->AddrAdd(pDstArrElm, offsets, pIndexVar);
12327         m_encoder->Push();
12328     }
12329 
12330     m_encoder->Cast(m_destination, pDstArrElm);
12331     m_encoder->Push();
12332 
12333 }
12334 
12335 // Copy the identity value to dst with no mask, then src to dst with the mask. Notes:
12336 // * dst may be nullptr - in that case it will be created
12337 // * the current second-half setting is preserved
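// Rough example (hypothetical lane values): for an ADD reduction (identity 0)
// in SIMD8 with only lanes 1 and 4 enabled and src = {s0..s7}, dst ends up as
// {0, s1, 0, 0, s4, 0, 0, 0}; with 'negate' set the copied values are negated.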
12338 CVariable* EmitPass::ScanReducePrepareSrc(VISA_Type type, uint64_t identityValue, bool negate, bool secondHalf,
12339     CVariable* src, CVariable* dst, CVariable* flag)
12340 {
12341     if (!dst)
12342     {
12343         dst = m_currShader->GetNewVariable(
12344             numLanes(m_currShader->m_SIMDSize),
12345             type,
12346             EALIGN_GRF,
12347             false,
12348             src->getName());
12349     }
12350     else
12351     {
12352         IGC_ASSERT(0 < dst->GetElemSize());
12353         IGC_ASSERT(numLanes(m_currShader->m_SIMDSize) == (dst->GetSize() / dst->GetElemSize()));
12354         IGC_ASSERT(dst->GetType() == type);
12355         IGC_ASSERT(dst->GetAlign() == EALIGN_GRF);
12356         IGC_ASSERT(!dst->IsUniform());
12357     }
12358 
12359     IGC_ASSERT(nullptr != m_encoder);
12360 
12361     const bool savedSecondHalf = m_encoder->IsSecondHalf();
12362     m_encoder->SetSecondHalf(secondHalf);
12363 
12364     // Set the GRF to <identity> with no mask. This will set all the registers to <identity>
12365     CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
12366     m_encoder->SetNoMask();
12367     m_encoder->Copy(dst, pIdentityValue);
12368     m_encoder->Push();
12369 
12370     // Now copy the src with a mask so the disabled lanes still keep their <identity>
12371     if (negate)
12372     {
12373         m_encoder->SetSrcModifier(0, EMOD_NEG);
12374     }
12375     if (flag)
12376     {
12377         m_encoder->SetPredicate(flag);
12378     }
12379     m_encoder->Copy(dst, src);
12380     m_encoder->Push();
12381 
12382     m_encoder->SetSecondHalf(savedSecondHalf);
12383 
12384     return dst;
12385 }
12386 
12387 // Reduction all reduce helper: dst_lane{k} = src_lane{simd + k} OP src_lane{k}, k = 0..(simd-1)
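// For illustration (hypothetical values): with op = ADD and simd = SIMD4,
// src = {1, 2, 3, 4, 5, 6, 7, 8} produces dst = {6, 8, 10, 12}.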
12388 CVariable* EmitPass::ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, CVariable* src)
12389 {
12390     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12391         CEncoder::GetCISADataTypeSize(type) == 8);
12392     const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12393     const auto alignment = is64bitType ? IGC::EALIGN_QWORD : IGC::EALIGN_DWORD;
12394     CVariable* previousTemp = src;
12395     CVariable* temp = m_currShader->GetNewVariable(
12396         numLanes(simd),
12397         type,
12398         alignment,
12399         false,
12400         CName::NONE);
12401 
12402     if (isInt64Mul)
12403     {
12404         m_encoder->SetSimdSize(simd);
12405         m_encoder->SetNoMask();
12406         m_encoder->SetSrcSubReg(0, numLanes(simd));
12407         m_encoder->Copy(temp, previousTemp);
12408         m_encoder->Push();
12409         CVariable* pMulSrc[2] = { previousTemp, temp };
12410         Mul64(temp, pMulSrc, simd, true /*noMask*/);
12411     }
12412     else
12413     {
12414         m_encoder->SetNoMask();
12415         m_encoder->SetSimdSize(simd);
12416         m_encoder->SetSrcSubReg(1, numLanes(simd));
12417         m_encoder->GenericAlu(op, temp, previousTemp, previousTemp);
12418         m_encoder->Push();
12419     }
12420     return temp;
12421 }
12422 
12423 // Reduction all expand helper: dst_lane{0..(simd-1)} = src_lane{0} OP src_lane{1}
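// For illustration (hypothetical values): with op = ADD and src lanes starting
// with {3, 4, ...}, every dst lane becomes 7.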
12424 void EmitPass::ReductionExpandHelper(e_opcode op, VISA_Type type, CVariable* src, CVariable* dst)
12425 {
12426     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12427         CEncoder::GetCISADataTypeSize(type) == 8);
12428 
12429     if (isInt64Mul)
12430     {
12431         CVariable* tmpMulSrc[2] = {};
12432         tmpMulSrc[0] = m_currShader->GetNewAlias(src, type, 0, 1, true);
12433         tmpMulSrc[1] = m_currShader->GetNewAlias(src, type, sizeof(QWORD), 1, true);
12434         Mul64(dst, tmpMulSrc, m_currShader->m_SIMDSize, false /*noMask*/);
12435     }
12436     else
12437     {
12438         m_encoder->SetSrcSubReg(1, 1);
12439         m_encoder->SetSrcRegion(0, 0, 1, 0);
12440         m_encoder->SetSrcRegion(1, 0, 1, 0);
12441         m_encoder->GenericAlu(op, dst, src, src);
12442         m_encoder->Push();
12443     }
12444 }
12445 
12446 // Reduction clustered: rearrange src by copying src data elements from even subregisters
12447 // to adjacent subregisters of a new variable. Then do the same for odd src subregisters.
12448 // Rearranged src is a pair of the new variables.
12449 // Notes:
12450 // * numLanes refers to the number of elements of each of the new variables (same as the dst variable used for reduction)
12451 // * numInst cannot be deduced from numLanes and type
12452 // * second half setting is not preserved by this function
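// For illustration (hypothetical values): src = {a0, a1, a2, a3, a4, a5, a6, a7}
// is split into pSrc[0] = {a0, a2, a4, a6} and pSrc[1] = {a1, a3, a5, a7}.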
12453 void EmitPass::ReductionClusteredSrcHelper(CVariable* (&pSrc)[2], CVariable* src, uint16_t numLanes,
12454     VISA_Type type, uint numInst, bool secondHalf)
12455 {
12456     const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12457     const auto alignment = is64bitType ? IGC::EALIGN_QWORD : IGC::EALIGN_DWORD;
12458 
12459     pSrc[0] = m_currShader->GetNewVariable(
12460         numLanes,
12461         type,
12462         alignment,
12463         false, CName::NONE);
12464     pSrc[1] = m_currShader->GetNewVariable(pSrc[0]);
12465     IGC_ASSERT(pSrc[0]);
12466     IGC_ASSERT(pSrc[1]);
12467 
12468     CVariable* srcTmp = src;
12469     CVariable* pSrcTmp[2] = { pSrc[0], pSrc[1] };
12470 
12471     IGC_ASSERT(nullptr != m_encoder);
12472     m_encoder->SetSecondHalf(secondHalf);
12473     for (uint i = 0; i < numInst; ++i)
12474     {
12475         const e_mask mask = secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12476 
12477         for (uint j = 0; j < 2; ++j)
12478         {
12479             IGC_ASSERT(numInst);
12480             m_encoder->SetSimdSize(lanesToSIMDMode(numLanes / numInst));
12481             m_encoder->SetNoMask();
12482             m_encoder->SetMask(mask);
12483             m_encoder->SetSrcRegion(0, 2, 1, 0);
12484             m_encoder->SetSrcSubReg(0, j);
12485             m_encoder->SetSrcSubVar(0, 2 * i);
12486             m_encoder->SetDstSubVar(i);
12487             m_encoder->Copy(pSrcTmp[j], srcTmp);
12488             m_encoder->Push();
12489         }
12490     }
12491     m_encoder->SetSecondHalf(false);
12492 }
12493 
12494 // Reduction clustered reduce helper: dst_lane{k} = src_lane{2k} OP src_lane{2k+1}, k = 0..(simd-1)
12495 // For certain opcodes src must be rearranged, to move operation's arguments to the same subreg of different regs.
12496 // Notes:
12497 // * simd is SIMD mode after reduction
12498 // * second half setting is not preserved by this function
12499 // * src and dst may be the same variable
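// For illustration (hypothetical values): with op = ADD and simd = SIMD4
// (i.e. eight input lanes), src = {1, 2, 3, 4, 5, 6, 7, 8} gives dst = {3, 7, 11, 15}.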
12500 CVariable* EmitPass::ReductionClusteredReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, bool secondHalf,
12501     CVariable* src, CVariable* dst)
12502 {
12503     const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12504     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12505         CEncoder::GetCISADataTypeSize(type) == 8);
12506     const uint numInst = is64bitType && simd == (getGRFSize() > 32 ? SIMDMode::SIMD16 : SIMDMode::SIMD8) ? 2 : 1;
12507 
12508     IGC_ASSERT(simd == SIMDMode::SIMD2 || simd == SIMDMode::SIMD4 || simd == SIMDMode::SIMD8 || (simd == SIMDMode::SIMD16 && getGRFSize() > 32));
12509 
12510     // The op is performed on pairs of adjacent src data elements.
12511     // In certain cases it is mandatory or might be beneficial for performance reasons
12512     // to ensure that for each such pair the src data elements are in separate GRFs
12513     // and that their regioning patterns match.
12514     bool isRearrangementRequired = isInt64Mul;
12515     if (isRearrangementRequired)
12516     {
12517         // Rearrange src
12518         CVariable* pSrc[2] = {};
12519         ReductionClusteredSrcHelper(pSrc, src, numLanes(simd), type, numInst, secondHalf);
12520 
12521         // Perform reduction with op
12522         m_encoder->SetSecondHalf(secondHalf);
12523         if (isInt64Mul)
12524         {
12525             Mul64(dst, pSrc, simd, true /*noMask*/);
12526         }
12527         else
12528         {
12529             m_encoder->SetSimdSize(simd);
12530             m_encoder->SetNoMask();
12531             m_encoder->GenericAlu(op, dst, pSrc[0], pSrc[1]);
12532             m_encoder->Push();
12533         }
12534         m_encoder->SetSecondHalf(false);
12535     }
12536     else
12537     {
12538         m_encoder->SetSecondHalf(secondHalf);
12539         for (uint i = 0; i < numInst; ++i)
12540         {
12541             IGC_ASSERT(numInst);
12542             m_encoder->SetSimdSize(lanesToSIMDMode(numLanes(simd) / numInst));
12543             m_encoder->SetNoMask();
12544             const e_mask mask = secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12545             m_encoder->SetMask(mask);
12546             m_encoder->SetSrcRegion(0, 2, 1, 0);
12547             m_encoder->SetSrcSubVar(0, 2 * i);
12548             m_encoder->SetSrcSubReg(0, 0);
12549             m_encoder->SetSrcRegion(1, 2, 1, 0);
12550             m_encoder->SetSrcSubVar(1, 2 * i);
12551             m_encoder->SetSrcSubReg(1, 1);
12552             m_encoder->SetDstSubVar(i);
12553             m_encoder->GenericAlu(op, dst, src, src);
12554             m_encoder->Push();
12555         }
12556         m_encoder->SetSecondHalf(false);
12557     }
12558 
12559     return dst;
12560 }
12561 
12562 // Final reduction and expansion clustered expand helper: for each cluster reduce one pair of values to one value,
12563 // and broadcast it to the whole cluster.
12564 // For certain opcodes the src must be rearranged, to keep operation's arguments in the same subreg of different regs.
12565 // Notes:
12566 // * simd is shader's SIMD size
12567 // * second half setting is not preserved by this function
12568 // * src and dst may be the same variable
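// For illustration (hypothetical values): with clusterSize = 4, SIMD8 and op = ADD,
// src = {a, b, c, d, ...} yields dst = {a+b, a+b, a+b, a+b, c+d, c+d, c+d, c+d}.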
12569 void EmitPass::ReductionClusteredExpandHelper(e_opcode op, VISA_Type type, SIMDMode simd, const uint clusterSize,
12570     bool secondHalf, CVariable* src, CVariable* dst)
12571 {
12572     const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12573     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12574         CEncoder::GetCISADataTypeSize(type) == 8);
12575     const uint numInst = is64bitType && simd == (getGRFSize() > 32 ? SIMDMode::SIMD32 : SIMDMode::SIMD16) ? 2 : 1;
12576     IGC_ASSERT(clusterSize == 2 || clusterSize == 4 || clusterSize == 8 || clusterSize == 16);
12577     IGC_ASSERT_MESSAGE(clusterSize * CEncoder::GetCISADataTypeSize(type) <= int_cast<uint>(2 * getGRFSize()),
12578         "Will generate instructions that cross 2 GRFs boundary.");
12579 
12580     // For information on rearrangement see EmitPass::ReductionClusteredReduceHelper()
12581     bool isRearrangementRequired = isInt64Mul;
12582     if (isRearrangementRequired)
12583     {
12584         // Rearrange src
12585         CVariable* pSrc[2] = {};
12586         // For src, the 2-GRF boundary may be crossed only for 2-clusters in SIMD16 with 64-bit types.
12587         const uint srcNumInst = clusterSize == 2 ? numInst : 1;
12588         IGC_ASSERT(clusterSize);
12589         ReductionClusteredSrcHelper(pSrc, src, numLanes(simd) / clusterSize, type, srcNumInst, secondHalf);
12590 
12591         // Perform reduction with op
12592         CVariable* tempDst = m_currShader->GetNewVariable(dst);
12593         m_encoder->SetSecondHalf(secondHalf);
12594         IGC_ASSERT(clusterSize);
12595         const SIMDMode tmpSimd = lanesToSIMDMode(numLanes(simd) / clusterSize);
12596         if (isInt64Mul)
12597         {
12598             Mul64(tempDst, pSrc, tmpSimd, true /*noMask*/);
12599         }
12600         else
12601         {
12602             m_encoder->SetSimdSize(tmpSimd);
12603             m_encoder->SetNoMask();
12604             m_encoder->GenericAlu(op, tempDst, pSrc[0], pSrc[1]);
12605             m_encoder->Push();
12606         }
12607         m_encoder->SetSecondHalf(false);
12608 
12609         // In certain cases a 64-bit move may need to be split into two 32-bit uint moves
12610         const bool use32BitMov = false;
12611 
12612         // Broadcast to clusters
12613         // Example for a 4-clusters of QWORDs:
12614         // * with 64-bit MOVs:
12615         // mov (8|M8)               r11.0<1>:uq   r21.2<1;4,0>:uq
12616         // mov (8|M0)               r35.0<1>:uq   r21.0<1;4,0>:uq
12617         // * with 32-bit MOVs:
12618         // mov (8|M8)               r33.0<2>:ud   r21.4<2;4,0>:ud
12619         // mov (8|M8)               r33.1<2>:ud   r21.5<2;4,0>:ud
12620         // mov (8|M0)               r31.0<2>:ud   r21.0<2;4,0>:ud
12621         // mov (8|M0)               r31.1<2>:ud   r21.1<2;4,0>:ud
12622         m_encoder->SetSecondHalf(secondHalf);
12623         for (uint i = numInst; i-- != 0;)
12624         {
12625             const uint numMovPerElement = use32BitMov ? 2u : 1u;
12626             for (uint j = 0; j < numMovPerElement; ++j)
12627             {
12628                 // The outer loop is for 64-bit types in SIMD16 only (cluster size is always <= 8),
12629                 // to broadcast data to the upper half of dst, which crosses the 2-GRF boundary.
12630                 // The inner loop splits the move: one 64-bit move into a pair of 32-bit moves.
12631                 IGC_ASSERT(numInst);
12632                 uint lanes = numLanes(simd) / numInst;
12633                 IGC_ASSERT(clusterSize);
12634                 uint clustersPerInst = lanes / clusterSize;
12635                 uint srcSubReg = i * clustersPerInst * numMovPerElement + j;
12636                 const e_mask mask = simd == SIMDMode::SIMD32 ? (i == 1 ? EMASK_H2 : EMASK_H1) :
12637                                     secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12638 
12639                 m_encoder->SetSimdSize(lanesToSIMDMode(lanes));
12640                 m_encoder->SetMask(mask);
12641                 m_encoder->SetSrcRegion(0, numMovPerElement, clusterSize, 0);
12642                 m_encoder->SetSrcSubReg(0, srcSubReg);
12643                 m_encoder->SetSrcSubVar(0, 0);
12644                 m_encoder->SetDstRegion(numMovPerElement);
12645                 m_encoder->SetDstSubReg(j);
12646                 m_encoder->SetDstSubVar(2 * i);
12647 
12648                 CVariable* broadcastSrc = tempDst;
12649                 CVariable* broadcastDst = dst;
12650                 if (use32BitMov)
12651                 {
12652                     broadcastSrc = m_currShader->GetNewAlias(broadcastSrc, VISA_Type::ISA_TYPE_UD, 0, 0);
12653                     broadcastDst = m_currShader->GetNewAlias(broadcastDst, VISA_Type::ISA_TYPE_UD, 0, 0);
12654                 }
12655                 m_encoder->Copy(broadcastDst, broadcastSrc);
12656                 m_encoder->Push();
12657             }
12658         }
12659         m_encoder->SetSecondHalf(false);
12660     }
12661     else
12662     {
12663         m_encoder->SetSecondHalf(secondHalf);
12664         for (uint i = numInst; i-- > 0;)
12665         {
12666             const uint srcSubVar = i * (4 / clusterSize);
12667             const uint srcSubReg = i * (clusterSize == 8 ? 2 : 0);
12668 
12669             m_encoder->SetSimdSize(lanesToSIMDMode(numLanes(simd) / numInst));
12670             m_encoder->SetNoMask();
12671             const e_mask mask = secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12672             m_encoder->SetMask(mask);
12673             m_encoder->SetSrcRegion(0, 2, clusterSize, 0);
12674             m_encoder->SetSrcSubReg(0, srcSubReg);
12675             m_encoder->SetSrcSubVar(0, srcSubVar);
12676             m_encoder->SetSrcRegion(1, 2, clusterSize, 0);
12677             m_encoder->SetSrcSubReg(1, srcSubReg + 1);
12678             m_encoder->SetSrcSubVar(1, srcSubVar);
12679             m_encoder->SetDstSubVar(2 * i);
12680             m_encoder->GenericAlu(op, dst, src, src);
12681             m_encoder->Push();
12682         }
12683         m_encoder->SetSecondHalf(false);
12684     }
12685 }
12686 
12687 // do a reduction accumulating all the active channels, return a uniform
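// Rough shape of the emitted code (sketch, not exact): for SIMD32 the two
// 16-lane halves are combined first, then the surviving partial results are
// halved repeatedly (SIMD16/SIMD8 -> SIMD4 -> SIMD2) by ReductionReduceHelper,
// and the final pair is combined and broadcast into dst by ReductionExpandHelper.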
12688 void EmitPass::emitReductionAll(
12689     e_opcode op, uint64_t identityValue, VISA_Type type, bool negate, CVariable* src, CVariable* dst)
12690 {
12691     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12692         CEncoder::GetCISADataTypeSize(type) == 8);
12693 
12694     CVariable* srcH1 = ScanReducePrepareSrc(type, identityValue, negate, false /*secondHalf*/, src, nullptr /*dst*/);
12695     CVariable* temp = srcH1;
12696     if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
12697     {
12698         if (m_currShader->m_numberInstance == 1)
12699         {
12700             temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp);
12701         }
12702         else
12703         {
12704             CVariable* srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /*secondHalf*/, src, nullptr /*dst*/);
12705 
12706             temp = m_currShader->GetNewVariable(
12707                 numLanes(SIMDMode::SIMD16),
12708                 type,
12709                 EALIGN_GRF,
12710                 false,
12711                 CName::NONE);
12712             if (isInt64Mul)
12713             {
12714                 CVariable* tmpMulSrc[2] = { srcH1, srcH2 };
12715                 Mul64(temp, tmpMulSrc, SIMDMode::SIMD16, true /*noMask*/);
12716             }
12717             else
12718             {
12719                 m_encoder->SetNoMask();
12720                 m_encoder->SetSimdSize(SIMDMode::SIMD16);
12721                 m_encoder->GenericAlu(op, temp, srcH1, srcH2);
12722                 m_encoder->Push();
12723             }
12724         }
12725     }
12726     if (m_currShader->m_dispatchSize >= SIMDMode::SIMD16)
12727     {
12728         temp = ReductionReduceHelper(op, type, SIMDMode::SIMD8, temp);
12729     }
12730     temp = ReductionReduceHelper(op, type, SIMDMode::SIMD4, temp);
12731     temp = ReductionReduceHelper(op, type, SIMDMode::SIMD2, temp);
12732     ReductionExpandHelper(op, type, temp, dst);
12733 }
12734 
12735 // for all the active channels within each cluster do reduction and accumulate, return a non-uniform
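// Overall strategy (sketch): clusterSize == 1 degenerates to a plain copy,
// clusterSize >= dispatch size falls back to emitReductionAll, and otherwise
// each instance is reduced cluster-by-cluster down to one value pair
// (ReductionClusteredReduceHelper) and then expanded back onto every lane of
// its cluster (ReductionClusteredExpandHelper); B/UB values are temporarily
// widened to W/UW.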
12736 void EmitPass::emitReductionClustered(const e_opcode op, const uint64_t identityValue, const VISA_Type type,
12737     const bool negate, const unsigned int clusterSize, CVariable* const src, CVariable* const dst)
12738 {
12739     const bool isInt64Type = type == ISA_TYPE_Q || type == ISA_TYPE_UQ;
12740     const bool isFP64Type = type == ISA_TYPE_DF;
12741     const bool is64bitType = isInt64Type || isFP64Type;
12742     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12743         CEncoder::GetCISADataTypeSize(type) == 8);
12744 
12745     IGC_ASSERT_MESSAGE(iSTD::BitCount(clusterSize) == 1, "Cluster size must be a power of two.");
12746     IGC_ASSERT_MESSAGE(!is64bitType || CEncoder::GetCISADataTypeSize(type) == 8, "Unsupported 64-bit type.");
12747 
12748     IGC_ASSERT_MESSAGE(!isInt64Type || !m_currShader->m_Platform->hasNoFullI64Support(), "Int64 emulation is not supported.");
12749     IGC_ASSERT_MESSAGE(!isFP64Type || !m_currShader->m_Platform->hasNoFP64Inst(), "FP64 emulation is not supported.");
12750     // Src might be uniform, as its value will be broadcast during src preparation.
12751     // Dst uniformity depends on actual support in WIAnalysis; so far it is implemented for 32-clusters only.
12752     IGC_ASSERT(!dst->IsUniform() || clusterSize == 32);
12753 
12754     const unsigned int dispatchSize = numLanes(m_currShader->m_dispatchSize);
12755     const bool useReduceAll = clusterSize >= dispatchSize;
12756 
12757     if (clusterSize == 1)
12758     {
12759         IGC_ASSERT_MESSAGE(0, "Simple copy. For performance reasons handle it somehow at earlier stage.");
12760         for (uint half = 0; half < m_currShader->m_numberInstance; ++half)
12761         {
12762             const bool secondHalf = half > 0;
12763             m_encoder->SetSecondHalf(secondHalf);
12764             if (negate)
12765             {
12766                 m_encoder->SetSrcModifier(0, EMOD_NEG);
12767             }
12768             m_encoder->Copy(dst, src);
12769             m_encoder->Push();
12770             m_encoder->SetSecondHalf(false);
12771         }
12772     }
12773     else if (useReduceAll)
12774     {
12775         // TODO: consider if it is possible to detect and handle this case in frontends
12776         // and emit GenISA_WaveAll there, to enable optimizations specific to the ReduceAll intrinsic.
12777         emitReductionAll(op, identityValue, type, negate, src, dst);
12778     }
12779     else
12780     {
12781         for (uint half = 0; half < m_currShader->m_numberInstance; ++half)
12782         {
12783             const bool secondHalf = half > 0;
12784 
12785             // Use the "ReduceAll()" approach if code generated by the
12786             // "optimized path" would generate instructions that cross 2-GRF
12787             // boundary. The "optimized path" is code generated by
12788             // ReductionClusteredReduceHelper() + ReductionClusteredExpandHelper().
12789             const bool mayCross2GRFs =
12790                 clusterSize * CEncoder::GetCISADataTypeSize(type) > int_cast<uint>(2 * getGRFSize());
12791             if (mayCross2GRFs)
12792             {
12793                 CVariable* temp = ScanReducePrepareSrc(type, identityValue, negate, secondHalf, src, nullptr);
12794                 // Two halves, for each half src and dst cross 2 grf boundary - "ReduceAll" approach.
12795                 m_encoder->SetSecondHalf(secondHalf);
12796                 IGC_ASSERT(clusterSize == 16);
12797                 temp = ReductionReduceHelper(op, type, SIMDMode::SIMD8, temp);
12798                 temp = ReductionReduceHelper(op, type, SIMDMode::SIMD4, temp);
12799                 temp = ReductionReduceHelper(op, type, SIMDMode::SIMD2, temp);
12800                 ReductionExpandHelper(op, type, temp, dst);
12801                 m_encoder->SetSecondHalf(false);
12802             }
12803             else
12804             {
12805                 // For certain types it is more beneficial (e.g. due to HW restrictions) to perform clustered
12806                 // operations on values converted to another type.
12807                 VISA_Type tmpType = type;
12808                 CVariable* tmpSrc = src;
12809                 CVariable* tmpDst = dst;
12810                 uint64_t tmpIdentityValue = identityValue;
12811                 if (type == VISA_Type::ISA_TYPE_B || type == VISA_Type::ISA_TYPE_UB)
12812                 {
12813                     const bool isSigned = type == VISA_Type::ISA_TYPE_B;
12814                     tmpType = isSigned ? VISA_Type::ISA_TYPE_W : VISA_Type::ISA_TYPE_UW;
12815                     tmpSrc = m_currShader->GetNewVariable(
12816                         src->GetNumberElement(),
12817                         tmpType,
12818                         IGC::EALIGN_DWORD,
12819                         false,
12820                         src->getName());
12821                     m_encoder->SetSecondHalf(secondHalf);
12822                     m_encoder->Cast(tmpSrc, src);
12823                     m_encoder->Push();
12824                     m_encoder->SetSecondHalf(false);
12825                     tmpDst = m_currShader->GetNewVariable(
12826                         dst->GetNumberElement(),
12827                         tmpType,
12828                         IGC::EALIGN_DWORD,
12829                         false,
12830                         CName::NONE);
12831                     switch (op)
12832                     {
12833                     case EOPCODE_MAX:
12834                         tmpIdentityValue = isSigned ? std::numeric_limits<int16_t>::min() :
12835                             std::numeric_limits<uint16_t>::min();
12836                         break;
12837                     case EOPCODE_MIN:
12838                         tmpIdentityValue = isSigned ? std::numeric_limits<int16_t>::max() :
12839                             std::numeric_limits<uint16_t>::max();
12840                         break;
12841                     case EOPCODE_AND:
12842                         tmpIdentityValue = 0xFFFF;
12843                         break;
12844                     default:
12845                         break;
12846                     }
12847                 }
12848 
12849                 CVariable* temp = ScanReducePrepareSrc(tmpType, tmpIdentityValue, negate, secondHalf, tmpSrc, nullptr);
12850 
12851                 SIMDMode simd = secondHalf ? SIMDMode::SIMD16 : m_currShader->m_SIMDSize;
12852 
12853                 // Reduce with op: SIMDN -> SIMD2; that is, N/2 value pairs -> 1 value pair
12854                 for (uint32_t reducedClusterSize = clusterSize;
12855                     reducedClusterSize > 2; reducedClusterSize /= 2)
12856                 {
12857                     simd = lanesToSIMDMode(numLanes(simd) / 2);
12858                     ReductionClusteredReduceHelper(op, tmpType, simd, secondHalf, temp, temp);
12859                 }
12860 
12861                 ReductionClusteredExpandHelper(op, tmpType, m_currShader->m_SIMDSize, clusterSize, secondHalf, temp, tmpDst);
12862 
12863                 if (type == VISA_Type::ISA_TYPE_B || type == VISA_Type::ISA_TYPE_UB)
12864                 {
12865                     m_encoder->SetSecondHalf(secondHalf);
12866                     m_encoder->Cast(dst, tmpDst);
12867                     m_encoder->Push();
12868                     m_encoder->SetSecondHalf(false);
12869                 }
12870             }
12871         }
12872     }
12873 }
12874 
12875 // do a prefix op across all active channels
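// For illustration (hypothetical values): a postfix (inclusive) ADD scan of
// lanes {2, 3, 5, 9, ...} produces running sums {2, 5, 10, 19, ...}; with
// isPrefix set the source is first shifted by one lane and lane 0 receives the
// identity, so the same input yields {0, 2, 5, 10, ...}.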
12876 void EmitPass::emitPreOrPostFixOp(
12877     e_opcode op, uint64_t identityValue, VISA_Type type, bool negateSrc,
12878     CVariable* pSrc, CVariable* pSrcsArr[2], CVariable* Flag,
12879     bool isPrefix, bool isQuad)
12880 {
12881     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) && CEncoder::GetCISADataTypeSize(type) == 8);
12882 
12883     if (m_currShader->m_Platform->doScalar64bScan() && CEncoder::GetCISADataTypeSize(type) == 8 && !isQuad)
12884     {
12885         emitPreOrPostFixOpScalar(
12886             op, identityValue, type, negateSrc,
12887             pSrc, pSrcsArr, Flag,
12888             isPrefix);
12889         return;
12890     }
12891 
12892     bool isSimd32 = m_currShader->m_numberInstance == 2;
12893     int counter = isSimd32 ? 2 : 1;
12894 
12895     CVariable* maskedSrc[2] = { 0 };
12896     for (int i = 0; i < counter; ++i)
12897     {
12898         // This is to handle cases when not all lanes are enabled. In that case we fill the disabled lanes with the identity value.
12899         CVariable* pSrcCopy = ScanReducePrepareSrc(type, identityValue, negateSrc, i == 1 /*secondHalf*/,
12900             pSrc, nullptr /*dst*/, Flag);
12901 
12902         m_encoder->SetSecondHalf(i == 1);
12903 
12904         // For the case where we need the prefix, shift the source by 1 lane
12905         if (isPrefix)
12906         {
12907             maskedSrc[i] = pSrcCopy;
12908             pSrcCopy = m_currShader->GetNewVariable(pSrcCopy);
12909             // Copy identity
12910             m_encoder->SetSimdSize(SIMDMode::SIMD1);
12911             m_encoder->SetNoMask();
12912             if (i == 0)
12913             {
12914                 CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
12915                 m_encoder->Copy(pSrcCopy, pIdentityValue);
12916             }
12917             else
12918             {
12919                 m_encoder->SetSrcSubReg(0, 15);
12920                 m_encoder->Copy(pSrcCopy, maskedSrc[i - 1]);
12921             }
12922             m_encoder->Push();
12923             // Copy the remaining data
12924             unsigned int simdsize = numLanes(m_currShader->m_SIMDSize);
12925             unsigned int offset = 1;
12926             while (simdsize > 1)
12927             {
12928                 simdsize = simdsize >> 1;
12929                 int numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
12930                     simdsize == 8 ? 2 : 1;
12931                 for (int instNum = 0; instNum < numInst; ++instNum)
12932                 {
12933                     m_encoder->SetSimdSize(lanesToSIMDMode(simdsize / numInst));
12934                     m_encoder->SetDstSubReg(offset + instNum * 4);
12935                     m_encoder->SetSrcSubReg(0, offset - 1 + instNum * 4);
12936                     m_encoder->SetNoMask();
12937                     m_encoder->Copy(pSrcCopy, maskedSrc[i]);
12938                     m_encoder->Push();
12939                 }
12940                 offset += simdsize;
12941             }
12942         }
12943         pSrcsArr[i] = pSrcCopy;
12944     }
12945 
12946     auto CreateAlu = [this, op, type, isInt64Mul](
12947         const SIMDMode simdSize,
12948         const uint numInst,
12949         CVariable* pDst,
12950         CVariable* pSrc0,
12951         CVariable* pSrc1,
12952         const uint src0SubReg,
12953         const uint src0Region[3],
12954         const uint src1SubReg,
12955         const uint src1Region[3],
12956         const uint dstSubReg,
12957         const uint dstRegion)->void
12958     {
12959         if (isInt64Mul)
12960         {
12961             // 64 bit integer multiply case is done in 3 steps:
12962             // - copy source data to temporary registers to apply
12963             //   sources regioning and subregister values
12964             //   call Mul64() emulation using temporary sources and
12965             //   a temporary destination
12966             // - copy the result from the temporary destination
12967             //   and apply destination regioning and subregister
12968             //   values
12969             // Note: Consider passing regioning information
12970             // directly to the Mul64() emulation function instead
12971             // of using the temporary registers.
12972             CVariable* pMulSrc[2] = {};
12973             const uint16_t maxNumLanes = numLanes(simdSize);
12974             pMulSrc[0] = m_currShader->GetNewVariable(
12975                 maxNumLanes,
12976                 type,
12977                 EALIGN_GRF,
12978                 false,
12979                 pSrc0->getName());
12980             pMulSrc[1] = m_currShader->GetNewVariable(
12981                 maxNumLanes,
12982                 type,
12983                 EALIGN_GRF,
12984                 false,
12985                 pSrc1->getName());
12986             CVariable* pMulDst = m_currShader->GetNewVariable(
12987                 maxNumLanes,
12988                 type,
12989                 EALIGN_GRF,
12990                 false,
12991                 pDst->getName());
12992 
12993             for (uint instNum = 0; instNum < numInst; ++instNum)
12994             {
12995                 // copy sources with regioning
12996                 m_encoder->SetSimdSize(simdSize);
12997                 m_encoder->SetNoMask();
12998                 m_encoder->SetSrcSubVar(0, instNum * 2);
12999                 m_encoder->SetSrcRegion(0, src0Region[0], src0Region[1], src0Region[2]);
13000                 m_encoder->SetSrcSubReg(0, src0SubReg);
13001                 m_encoder->Copy(pMulSrc[0], pSrc0);
13002                 m_encoder->SetSrcRegion(0, src1Region[0], src1Region[1], src1Region[2]);
13003                 m_encoder->SetSrcSubReg(0, src1SubReg);
13004                 m_encoder->Copy(pMulSrc[1], pSrc1);
13005                 m_encoder->Push();
13006                 // create emulation code
13007                 Mul64(pMulDst, pMulSrc, simdSize, true /*noMask*/);
13008                 // copy destination with regioning
13009                 m_encoder->SetSimdSize(simdSize);
13010                 m_encoder->SetNoMask();
13011                 m_encoder->SetDstSubVar(instNum * 2);
13012                 m_encoder->SetDstRegion(dstRegion);
13013                 m_encoder->SetDstSubReg(dstSubReg);
13014                 m_encoder->Copy(pDst, pMulDst);
13015                 m_encoder->Push();
13016             }
13017         }
13018         else
13019         {
13020             for (uint instNum = 0; instNum < numInst; ++instNum)
13021             {
13022                 m_encoder->SetSimdSize(simdSize);
13023                 m_encoder->SetNoMask();
13024                 m_encoder->SetSrcSubVar(0, instNum * 2);
13025                 m_encoder->SetSrcRegion(0, src0Region[0], src0Region[1], src0Region[2]);
13026                 m_encoder->SetSrcSubReg(0, src0SubReg);
13027                 m_encoder->SetSrcSubVar(1, instNum * 2);
13028                 m_encoder->SetSrcRegion(1, src1Region[0], src1Region[1], src1Region[2]);
13029                 m_encoder->SetSrcSubReg(1, src1SubReg);
13030                 m_encoder->SetDstSubVar(instNum * 2);
13031                 m_encoder->SetDstRegion(dstRegion);
13032                 m_encoder->SetDstSubReg(dstSubReg);
13033                 m_encoder->GenericAlu(op, pDst, pSrc0, pSrc1);
13034                 m_encoder->Push();
13035             }
13036         }
13037     };
13038 
13039 
13040     for (int i = 0; i < counter; ++i)
13041     {
13042         /*
13043         Copy the adjacent elements.
13044         for example: let r10 be the register
13045         Assume we are performing addition for this example
13046            ____      ____      ____      ____
13047         __|____|____|____|____|____|____|____|_
13048         |  7 |  6 |  5 |  4 |  9 |  5 |  3 |  2 |
13049         ---------------------------------------
13050         */
13051 
13052         {
13053             // So then start adding from r10.0 & r10.1
13054             uint numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
13055                 m_currShader->m_SIMDSize != SIMDMode::SIMD8 ? 2 : 1;
13056             auto simdSize = m_encoder->GetCISADataTypeSize(type) == 8 ||
13057                 m_currShader->m_SIMDSize == SIMDMode::SIMD8 ? SIMDMode::SIMD4 : SIMDMode::SIMD8;
13058             const uint srcRegion[3] = { 2, 1, 0 };
13059             CreateAlu(
13060                 simdSize, numInst, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
13061                 0 /*src0 subreg*/, srcRegion /*src0 region*/,
13062                 1 /*src1 subreg*/, srcRegion /*src1 region*/,
13063                 1 /*dst subreg*/, 2 /*dst region*/);
13064         }
13065 
13066         /*
13067                 ____                  ____
13068         _______|____|________________|____|______            ___________________________________________
13069         |  13 |  6 |  9 |  4 |  14 |  5 |  5 |  2 |    ==>  |  13 |  15 |  9 |  4 |  14 |  10 |  5 |  2 |
13070          -----------------------------------------           -------------------------------------------
13071         */
13072         // Now we have a weird copy happening. This will be done by SIMD 2 instructions.
13073 
13074         {
13075             uint numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
13076                 m_currShader->m_SIMDSize != SIMDMode::SIMD8 ? 2 : 1;
13077             auto simdSize = m_encoder->GetCISADataTypeSize(type) == 8 ||
13078                 m_currShader->m_SIMDSize == SIMDMode::SIMD8 ? SIMDMode::SIMD2 : SIMDMode::SIMD4;
13079             const uint srcRegion[3] = { 4, 1, 0 };
13080             CreateAlu(
13081                 simdSize, numInst, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
13082                 2 /*src0 subreg*/, srcRegion /*src0 region*/,
13083                 1 /*src1 subreg*/, srcRegion /*src1 region*/,
13084                 2 /*dst subreg*/, 4 /*dst region*/);
13085         }
13086 
13087         /*
13088            ___________           ___________
13089         __|___________|_________|___________|______         ___________________________________________
13090         |  13 |  15 |  9 |  4 |  14 |  10 |  5 |  2 |  ==>  |  22 |  15 |  9 |  4 |  19 |  10 |  5 |  2 |
13091         -------------------------------------------         -------------------------------------------
13092         */
13093 
13094         {
13095             uint numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
13096                 m_currShader->m_SIMDSize != SIMDMode::SIMD8 ? 2 : 1;
13097             auto simdSize = m_encoder->GetCISADataTypeSize(type) == 8 ||
13098                 m_currShader->m_SIMDSize == SIMDMode::SIMD8 ? SIMDMode::SIMD2 : SIMDMode::SIMD4;
13099             const uint srcRegion[3] = { 4, 1, 0 };
13100             CreateAlu(
13101                 simdSize, numInst, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
13102                 3 /*src0 subreg*/, srcRegion /*src0 region*/,
13103                 1 /*src1 subreg*/, srcRegion /*src1 region*/,
13104                 3 /*dst subreg*/, 4 /*dst region*/);
13105         }
13106 
13107         if (isQuad)
13108         {
13109             // For quads, we don't want ALU ops across SIMD4 lanes, so stop here
13110             continue;
13111         }
13112 
13113         /*
13114                            ____
13115         __________________|____|_________________         ____________________________________________
13116         | 22 |  15 |  9 |  4 | 19 |  10 |  5 |  2 |  ==>  |  22 |  15 |  9 |  23 |  19 |  10 |  5 |  2 |
13117         -----------------------------------------         --------------------------------------------
13118                       _________
13119         _____________|_________|_________________         _____________________________________________
13120         | 22 |  15 |  9 |  4 | 19 |  10 |  5 |  2 |  ==>  |  22 |  15 |  28 |  23 |  19 |  10 |  5 |  2 |
13121         -----------------------------------------         ---------------------------------------------
13122 
13123                  ______________
13124         ________|______________|_________________         _____________________________________________
13125         | 22 |  15 |  9 |  4 | 19 |  10 |  5 |  2 |  ==>  |  22 |  34 |  28 |  23 |  19 |  10 |  5 |  2 |
13126         -----------------------------------------         ---------------------------------------------
13127 
13128            ____________________
13129         __|____________________|_________________         _____________________________________________
13130         | 22 |  15 |  9 |  4 | 19 |  10 |  5 |  2 |  ==>  |  41 |  34 |  28 |  23 |  19 |  10 |  5 |  2 |
13131         -----------------------------------------         ---------------------------------------------
13132         */
13133 
13134         // Because we write contiguous elements in the step above, for SIMD16 we have to split into
13135         // 2 SIMD4's.
13136         const unsigned int numLanesForSimd8 = numLanes(SIMDMode::SIMD8);
13137         IGC_ASSERT(numLanesForSimd8);
13138         const unsigned int numTimesToLoop = numLanes(m_currShader->m_SIMDSize) / numLanesForSimd8;
13139 
13140         for (uint loop_counter = 0; loop_counter < numTimesToLoop; ++loop_counter)
13141         {
13142             const uint src0Region[3] = { 0, 1, 0 };
13143             const uint src1Region[3] = { 4, 4, 1 };
13144             CreateAlu(
13145                 SIMDMode::SIMD4, 1, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
13146                 (loop_counter * 8 + 3) /*src0 subreg*/, src0Region /*src0 region*/,
13147                 (loop_counter * 8 + 4) /*src1 subreg*/, src1Region /*src1 region*/,
13148                 (loop_counter * 8 + 4) /*dst subreg*/, 1 /*dst region*/);
13149         }
13150 
13151         if (m_currShader->m_SIMDSize == SIMDMode::SIMD16 || isSimd32)
13152         {
13153             // Add the last element of the 1st GRF to all the elements of the 2nd GRF
13154             const uint src0Region[3] = { 0, 1, 0 };
13155             const uint src1Region[3] = { 1, 1, 0 };
13156             CreateAlu(
13157                 SIMDMode::SIMD8, 1, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
13158                 7 /*src0 subreg*/, src0Region /*src0 region*/,
13159                 8 /*src1 subreg*/, src1Region /*src1 region*/,
13160                 8 /*dst subreg*/, 1 /*dst region*/);
13161         }
13162     }
13163 
13164     if (isSimd32 && !isQuad)
13165     {
13166         // For SIMD32 we need to apply the op between the last element of the first half and all 16 elements of the second half
13167         const uint src0Region[3] = { 0, 1, 0 };
13168         const uint src1Region[3] = { 1, 1, 0 };
13169         CreateAlu(
13170             SIMDMode::SIMD16, 1, pSrcsArr[1], pSrcsArr[0], pSrcsArr[1],
13171             (numLanes(m_currShader->m_SIMDSize) - 1) /*src0 subreg*/, src0Region /*src0 region*/,
13172             0 /*src1 subreg*/, src1Region /*src1 region*/,
13173             0 /*dst subreg*/, 1 /*dst region*/);
13174     }
13175     // reset second half state
13176     m_encoder->SetSecondHalf(false);
13177 }
13178 
13179 // scalar version of the scan operation for 64b types
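// Sketch of the emitted sequence: each lane is produced serially with SIMD1 ops -
// result[0] is the identity (prefix) or srcCopy[0] (postfix), and then
// result[k] = srcCopy[k-1 (prefix) / k (postfix)] OP result[k-1] for the
// remaining lanes of each instance.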
13180 void EmitPass::emitPreOrPostFixOpScalar(
13181     e_opcode op,
13182     uint64_t identityValue,
13183     VISA_Type type,
13184     bool negateSrc,
13185     CVariable* src,
13186     CVariable* result[2],
13187     CVariable* Flag,
13188     bool isPrefix)
13189 {
13190     const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
13191         CEncoder::GetCISADataTypeSize(type) == 8);
13192 
13193     bool isSimd32 = m_currShader->m_numberInstance == 2;
13194     int counter = isSimd32 ? 2 : 1;
13195     CVariable* pSrcCopy[2] = {};
13196     for (int i = 0; i < counter; ++i)
13197     {
13198         // This handles cases where not all lanes are enabled: disabled lanes are filled with the identity value.
13199         pSrcCopy[i] = ScanReducePrepareSrc(type, identityValue, negateSrc, i == 1 /*secondHalf*/,
13200             src, nullptr /*dst*/, Flag);
13201 
13202         result[i] = m_currShader->GetNewVariable(
13203             numLanes(m_currShader->m_SIMDSize),
13204             type,
13205             EALIGN_GRF,
13206             false,
13207             CName::NONE);
13208 
13209         m_encoder->SetSecondHalf(i == 1);
13210 
13211         int srcIdx = 0;
13212         m_encoder->SetSimdSize(SIMDMode::SIMD1);
13213         m_encoder->SetNoMask();
13214         if (isPrefix)
13215         {
13216             // For the prefix case, shift the source by one lane.
13217             if (i == 0)
13218             {
13219                 // (W) mov (1) result[0] identity
13220                 CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
13221                 m_encoder->Copy(result[i], pIdentityValue);
13222             }
13223             else
13224             {
13225                 // (W) mov (1) result[16] srcCopy[15]
13226                 m_encoder->SetSrcSubReg(0, 15);
13227                 m_encoder->SetSrcRegion(0, 0, 1, 0);
13228                 m_encoder->Copy(result[i], pSrcCopy[0]);
13229             }
13230         }
13231         else
13232         {
13233             // (W) mov (1) result[0/16] srcCopy[0/16]
13234             m_encoder->SetSrcSubReg(0, 0);
13235             m_encoder->SetSrcRegion(0, 0, 1, 0);
13236             m_encoder->Copy(result[i], pSrcCopy[i]);
13237             srcIdx = 1;
13238         }
13239         m_encoder->Push();
13240 
13241         CVariable* tmpDst = isInt64Mul ?
13242             m_currShader->GetNewVariable(
13243                 1,
13244                 type,
13245                 EALIGN_GRF,
13246                 true,
13247                 result[0]->getName()) : nullptr;
13248 
13249         for (int dstIdx = 1; dstIdx < numLanes(m_currShader->m_SIMDSize); ++dstIdx, ++srcIdx)
13250         {
13251             // do the scan one by one
13252             // (W) op (1) result[dstIdx] srcCopy[srcIdx] result[dstIdx-1]
13253             if (isInt64Mul)
13254             {
13255                 CVariable* pMulSrc[2] = {
13256                     m_currShader->GetNewAlias(pSrcCopy[i], type, srcIdx * sizeof(QWORD), 1, true),
13257                     m_currShader->GetNewAlias(result[i], type, (dstIdx - 1) * sizeof(QWORD), 1, true) };
13258                 Mul64(tmpDst, pMulSrc, SIMDMode::SIMD1, true /*noMask*/);
13259                 // (W) mov (1) result[dstIdx] tmpDst
13260                 m_encoder->SetSimdSize(SIMDMode::SIMD1);
13261                 m_encoder->SetNoMask();
13262                 m_encoder->SetDstSubReg(dstIdx);
13263                 m_encoder->Copy(result[i], tmpDst);
13264                 m_encoder->Push();
13265             }
13266             else
13267             {
13268                 m_encoder->SetSimdSize(SIMDMode::SIMD1);
13269                 m_encoder->SetNoMask();
13270                 m_encoder->SetSrcSubReg(0, srcIdx);
13271                 m_encoder->SetSrcRegion(0, 0, 1, 0);
13272                 m_encoder->SetSrcRegion(1, 0, 1, 0);
13273                 m_encoder->SetSrcSubReg(1, dstIdx - 1);
13274                 m_encoder->SetDstSubReg(dstIdx);
13275                 m_encoder->GenericAlu(op, result[i], pSrcCopy[i], result[i]);
13276                 m_encoder->Push();
13277             }
13278         }
13279 
13280         m_encoder->SetSecondHalf(false);
13281     }
13282 
13283     if (isSimd32)
13284     {
13285         m_encoder->SetSecondHalf(true);
13286 
13287         // For SIMD32, combine the last element of the first half into all 16 elements of the second half.
13288         if (isInt64Mul)
13289         {
13290             CVariable* pMulSrc[2] = {
13291                  m_currShader->GetNewAlias(result[0], type, 15 * sizeof(QWORD), 1, true),
13292                  result[1] };
13293             Mul64(result[1], pMulSrc, SIMDMode::SIMD16, true /*noMask*/);
13294         }
13295         else
13296         {
13297             m_encoder->SetSimdSize(SIMDMode::SIMD16);
13298             m_encoder->SetNoMask();
13299             m_encoder->SetSrcRegion(0, 0, 1, 0);
13300             m_encoder->SetSrcSubReg(0, 15);
13301             m_encoder->GenericAlu(op, result[1], result[0], result[1]);
13302             m_encoder->Push();
13303         }
13304 
13305         m_encoder->SetSecondHalf(false);
13306     }
13307 }
13308 
13309 /*
13310 ScalarAtomics: This optimization attempts to reduce the number of atomic instructions issued when
13311 the destination address and the source are both uniform. For example, suppose an atomic add is done
13312 with a constant destination address <addr> and a constant source <src>. At SIMD8 this means 8 lanes
13313 try to write to the same address; the hardware serializes that into 8 back-to-back atomic
13314 instructions, which are extremely slow to execute.
13315 */
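/*
Illustrative example (not generated code): for a SIMD8 uniform atomic add with <src> = 3, the per-lane
sources are reduced to a single value (8 active lanes * 3 = 24) and one SIMD1 atomic add of 24 is
issued. If the return value is needed, a prefix sum of the per-lane sources is computed first and
lane i receives oldValue + (sum of the sources of lanes 0..i-1), which is what the serialized
sequence would have returned.
*/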
13316 void EmitPass::emitScalarAtomics(
13317     llvm::Instruction* pInst,
13318     ResourceDescriptor& resource,
13319     AtomicOp atomic_op,
13320     CVariable* pDstAddr,
13321     CVariable* pSrc,
13322     bool isA64,
13323     int bitWidth)
13324 {
13325     e_opcode op = EOPCODE_ADD;
13326     // find the value for which opcode(x, identity) == x
13327     unsigned int identityValue = 0;
13328     switch (atomic_op)
13329     {
13330     case EATOMIC_IADD:
13331     case EATOMIC_SUB:
13332     case EATOMIC_INC:
13333     case EATOMIC_DEC:
13334         identityValue = 0;
13335         op = EOPCODE_ADD;
13336         break;
13337     case EATOMIC_UMAX:
13338         identityValue = 0;
13339         op = EOPCODE_MAX;
13340         break;
13341     case EATOMIC_IMAX:
13342         identityValue = 0x80000000;
13343         op = EOPCODE_MAX;
13344         break;
13345     case EATOMIC_UMIN:
13346         identityValue = 0xFFFFFFFF;
13347         op = EOPCODE_MIN;
13348         break;
13349     case EATOMIC_IMIN:
13350         identityValue = 0X7FFFFFFF;
13351         op = EOPCODE_MIN;
13352         break;
13353     default:
13354         IGC_ASSERT_MESSAGE(0, "unsupported scalar atomic type");
13355         break;
13356     }
13357 
13358     VISA_Type type =
13359         bitWidth == 16 ? ISA_TYPE_W :
13360         bitWidth == 32 ? ISA_TYPE_D :
13361                          ISA_TYPE_Q;
13362     IGC_ASSERT_MESSAGE((bitWidth == 16) || (bitWidth == 32) || (bitWidth == 64), "invalid bitsize");
13363     if (atomic_op == EATOMIC_INC || atomic_op == EATOMIC_DEC)
13364     {
13365         if (atomic_op == EATOMIC_INC)
13366         {
13367             atomic_op = EATOMIC_IADD;
13368         }
13369         else
13370         {
13371             atomic_op = EATOMIC_SUB;
13372         }
13373 
13374         pSrc = m_currShader->ImmToVariable(1, type);
13375     }
13376     if (atomic_op == EATOMIC_UMAX || atomic_op == EATOMIC_UMIN)
13377     {
13378         type = GetUnsignedType(type);
13379     }
13380     AtomicOp uniformAtomicOp = atomic_op;
13381     bool negateSrc = false;
13382     if (atomic_op == EATOMIC_SUB)
13383     {
13384         negateSrc = true;
13385         uniformAtomicOp = EATOMIC_IADD;
13386     }
13387     bool returnsImmValue = (!pInst->use_empty());
13388     CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
13389         1,
13390         type,
13391         isA64 ? EALIGN_2GRF : EALIGN_GRF,
13392         true,
13393         CName::NONE);
13394     CVariable* pSrcsArr[2] = { nullptr, nullptr };
13395     if (returnsImmValue)
13396     {
13397         // sum all the lanes
13398         emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
13399 
13400         CVariable* pSrcCopy = pSrcsArr[0];
13401         if (m_currShader->m_numberInstance == 2)
13402         {
13403             pSrcCopy = pSrcsArr[1];
13404         }
13405 
13406         m_encoder->SetSrcRegion(0, 0, 1, 0);
13407         m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
13408         m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
13409         m_encoder->Push();
13410     }
13411     else
13412     {
13413         emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
13414     }
13415 
13416     if (pDstAddr->IsImmediate())
13417     {
13418         CVariable* pDstAddrCopy = m_currShader->GetNewVariable(
13419             1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
13420         m_encoder->SetSimdSize(SIMDMode::SIMD1);
13421         m_encoder->SetNoMask();
13422         m_encoder->Copy(pDstAddrCopy, pDstAddr);
13423         m_encoder->Push();
13424         pDstAddr = pDstAddrCopy;
13425     }
13426 
13427     m_encoder->SetSimdSize(SIMDMode::SIMD1);
13428     m_encoder->SetNoMask();
13429 
13430     CVariable* pReturnVal = returnsImmValue ?
13431         m_currShader->GetNewVariable(
13432             1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
13433         nullptr;
13434 
13435     if (bitWidth == 16)
13436     {
13437         CVariable* pCastAtomicSrcVal =
13438             m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
13439 
13440         m_encoder->Cast(pCastAtomicSrcVal, pFinalAtomicSrcVal);
13441         pFinalAtomicSrcVal = pCastAtomicSrcVal;
13442     }
13443 
13444     if (isA64)
13445     {
13446         m_encoder->AtomicRawA64(
13447             uniformAtomicOp, resource,
13448             pReturnVal, pDstAddr,
13449             pFinalAtomicSrcVal, nullptr,
13450             bitWidth);
13451     }
13452     else
13453     {
13454         m_encoder->DwordAtomicRaw(
13455             uniformAtomicOp, resource,
13456             pReturnVal, pDstAddr,
13457             pFinalAtomicSrcVal,
13458             nullptr, bitWidth == 16);
13459     }
13460     m_encoder->Push();
13461 
13462     if (returnsImmValue)
13463     {
13464         unsigned int counter = m_currShader->m_numberInstance;
13465         IGC_ASSERT_MESSAGE(op == EOPCODE_ADD, "we can only get the return value for add right now");
13466         for (unsigned int i = 0; i < counter; ++i)
13467         {
13468             m_encoder->SetNoMask();
13469             m_encoder->Add(pSrcsArr[i], pSrcsArr[i], pReturnVal);
13470             m_encoder->Push();
13471 
13472             if (atomic_op == EATOMIC_IADD)
13473             {
13474                 m_encoder->SetSrcModifier(1, EMOD_NEG);
13475             }
13476 
13477             m_encoder->SetSecondHalf(i == 1);
13478             m_encoder->Add(m_destination, pSrcsArr[i], pSrc);
13479             m_encoder->Push();
13480         }
13481     }
13482 }
13483 
13484 //
13485 // We emulate an atomic_load as an atomic_or with zero.
13486 // When the atomic is uniform we can directly generate a SIMD1 atomic_or.
13487 //
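// Illustrative example (not generated code): atomic_load(p) is emitted as atomic_or(p, 0), which
// leaves the memory contents unchanged and returns the old (i.e. current) value.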
13488 void EmitPass::emitScalarAtomicLoad(
13489     llvm::Instruction* pInst,
13490     ResourceDescriptor& resource,
13491     CVariable* pDstAddr,
13492     CVariable* pSrc,
13493     bool isA64,
13494     int bitWidth)
13495 {
13496     if (pDstAddr->IsImmediate())
13497     {
13498         CVariable* pDstAddrCopy = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, pDstAddr->getName());
13499         m_encoder->SetSimdSize(SIMDMode::SIMD1);
13500         m_encoder->SetNoMask();
13501         m_encoder->Copy(pDstAddrCopy, pDstAddr);
13502         m_encoder->Push();
13503         pDstAddr = pDstAddrCopy;
13504     }
13505 
13506     {
13507         // pSrc is imm zero
13508         CVariable* pSrcCopy = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, pSrc->getName());
13509         m_encoder->SetSimdSize(SIMDMode::SIMD1);
13510         m_encoder->SetNoMask();
13511         m_encoder->Copy(pSrcCopy, pSrc);
13512         m_encoder->Push();
13513         pSrc = pSrcCopy;
13514     }
13515 
13516     m_encoder->SetSimdSize(SIMDMode::SIMD1);
13517     m_encoder->SetNoMask();
13518 
13519     CVariable* atomicDst = !pInst->use_empty() ?
13520         m_currShader->GetNewVariable(
13521             1,
13522             ISA_TYPE_UD,
13523             isA64 ? EALIGN_2GRF : EALIGN_GRF,
13524             true,
13525             pDstAddr->getName()) : nullptr;
13526 
13527     if (isA64)
13528     {
13529         m_encoder->AtomicRawA64(
13530             EATOMIC_OR, resource,
13531             atomicDst, pDstAddr,
13532             pSrc, nullptr,
13533             bitWidth);
13534     }
13535     else
13536     {
13537         m_encoder->DwordAtomicRaw(
13538             EATOMIC_OR, resource,
13539             atomicDst, pDstAddr,
13540             pSrc,
13541             nullptr, bitWidth == 16);
13542     }
13543     m_encoder->Push();
13544 
13545     if (!pInst->use_empty())
13546     {
13547         // we need to broadcast the return value
13548         // ToDo: change divergence analysis to mark scalar atomic load as uniform
13549         unsigned int counter = m_currShader->m_numberInstance;
13550         for (unsigned int i = 0; i < counter; ++i)
13551         {
13552             m_encoder->SetSecondHalf(i == 1);
13553             m_encoder->Copy(m_destination, atomicDst);
13554             m_encoder->Push();
13555         }
13556     }
13557 }
13558 
13559 bool EmitPass::IsUniformAtomic(llvm::Instruction* pInst)
13560 {
13561     if (llvm::GenIntrinsicInst * pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(pInst))
13562     {
13563         GenISAIntrinsic::ID id = pIntrinsic->getIntrinsicID();
13564 
13565         // Dst address in bytes.
13566         if (id == GenISAIntrinsic::GenISA_intatomicraw ||
13567             id == GenISAIntrinsic::GenISA_intatomicrawA64)
13568         {
13569             Function* F = pInst->getParent()->getParent();
13570             if (IGC_IS_FLAG_ENABLED(DisableScalarAtomics) ||
13571                 F->hasFnAttribute("KMPLOCK") ||
13572                 m_currShader->m_DriverInfo->WASLMPointersDwordUnit())
13573                 return false;
13574             llvm::Value* pllDstAddr = pInst->getOperand(1);
13575             CVariable* pDstAddr = GetSymbol(pllDstAddr);
13576             if (pDstAddr->IsUniform())
13577             {
13578                 AtomicOp atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInst->getOperand(3))->getZExtValue());
13579 
13580                 bool isAddAtomic = atomic_op == EATOMIC_IADD ||
13581                     atomic_op == EATOMIC_INC ||
13582                     atomic_op == EATOMIC_SUB;
13583                 bool isMinMaxAtomic =
13584                     atomic_op == EATOMIC_UMAX ||
13585                     atomic_op == EATOMIC_UMIN ||
13586                     atomic_op == EATOMIC_IMIN ||
13587                     atomic_op == EATOMIC_IMAX;
13588 
13589                 // capture the special case of atomic_or with 0 (it's used to simulate atomic_load)
13590                 bool isOrWith0Atomic = atomic_op == EATOMIC_OR &&
13591                     isa<ConstantInt>(pInst->getOperand(2)) && cast<ConstantInt>(pInst->getOperand(2))->isZero();
13592 
13593                 if (isAddAtomic || (isMinMaxAtomic && pInst->use_empty()) || isOrWith0Atomic)
13594                     return true;
13595             }
13596         }
13597     }
13598 
13599     return false;
13600 }
13601 
13602 CVariable* EmitPass::UnpackOrBroadcastIfUniform(CVariable* pVar)
13603 {
13604     if (pVar->GetElemSize() == 4 || pVar->GetElemSize() == 8)
13605         return BroadcastIfUniform(pVar);
13606 
13607     IGC_ASSERT(pVar->GetElemSize() == 2);
13608 
13609     uint16_t elts = numLanes(m_currShader->m_SIMDSize);
13610     // 16-bit atomics are still aligned at dword boundaries
13611     // with the upper 16-bits ignored.
13612     CVariable* pUnpacked =
13613         m_currShader->GetNewVariable(elts, ISA_TYPE_UD, EALIGN_GRF, CName(pVar->getName(), "Unpacked"));
13614 
13615     m_encoder->Cast(pUnpacked, m_currShader->BitCast(pVar, ISA_TYPE_UW));
13616     return pUnpacked;
13617 }
13618 
13619 void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
13620 {
13621     ForceDMask();
13622     // Currently, DWORD atomics can be reached by matching two kinds of intrinsics: the plain
13623     // atomic-raw form, and the cmpxchg form (which takes two sources, unlike the other atomics).
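    // For reference, the operand layout read by the code below:
    //   plain form:    (buffer, dstAddr, src0, atomic_op)  -- operand 3 is the atomic opcode
    //   cmpxchg forms: (buffer, dstAddr, src0, src1)       -- operand 3 is the second source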
13624     IGC_ASSERT(pInsn->getNumArgOperands() == 4);
13625 
13626     /// Immediate atomics return the value from before the atomic operation is performed, so the
13627     /// return-value flag needs to be set for them.
13628     bool returnsImmValue = !pInsn->use_empty();
13629 
13630     llvm::Value* pllbuffer = pInsn->getOperand(0);
13631     llvm::Value* pllDstAddr = pInsn->getOperand(1);
13632     llvm::Value* pllSrc0 = pInsn->getOperand(2);
13633     ResourceDescriptor resource = GetResourceVariable(pllbuffer);
13634     CountStatelessIndirectAccess(pllbuffer, resource);
13635     AtomicOp atomic_op = EATOMIC_UNDEF;
13636 
13637     if (pllbuffer->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
13638     {
13639         m_currShader->SetHasGlobalAtomics();
13640     }
13641 
13642     CVariable* pSrc0 = nullptr;
13643     CVariable* pSrc1 = nullptr;
13644     llvm::GenIntrinsicInst* pIntrinCall = llvm::cast<llvm::GenIntrinsicInst>(pInsn);
13645     GenISAIntrinsic::ID IID = pIntrinCall->getIntrinsicID();
13646     if (IID == GenISAIntrinsic::GenISA_icmpxchgatomicraw ||
13647         IID == GenISAIntrinsic::GenISA_fcmpxchgatomicraw ||
13648         IID == GenISAIntrinsic::GenISA_icmpxchgatomicrawA64 ||
13649         IID == GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64)
13650     {
13651         llvm::Value* pllSrc1 = pInsn->getOperand(3);
13652         pSrc1 = GetSymbol(pllSrc1);
13653 
13654         Function* F = pInsn->getParent()->getParent();
13655         if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
13656         {
13657             m_encoder->SetSimdSize(SIMDMode::SIMD1);
13658             m_encoder->SetNoMask();
13659         }
13660 
13661         pSrc1 = UnpackOrBroadcastIfUniform(pSrc1);
13662         if (IID == GenISAIntrinsic::GenISA_fcmpxchgatomicraw ||
13663             IID == GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64)
13664         {
13665             atomic_op = EATOMIC_FCMPWR;
13666         }
13667         else
13668         {
13669             atomic_op = EATOMIC_CMPXCHG;
13670         }
13671     }
13672     else
13673     {
13674         atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInsn->getOperand(3))->getZExtValue());
13675     }
13676 
13677 
13678     unsigned short bitwidth = pInsn->getType()->getScalarSizeInBits();
13679     const bool is16Bit = (pInsn->getType()->getScalarSizeInBits() == 16);
13680 
13681 
13682     // atomic_inc/dec (and their 64-bit/predec variants) carry no explicit source, so src0 is not read for them.
13683     if (atomic_op != EATOMIC_INC && atomic_op != EATOMIC_DEC &&
13684         atomic_op != EATOMIC_INC64 && atomic_op != EATOMIC_DEC64 &&
13685         atomic_op != EATOMIC_PREDEC && atomic_op != EATOMIC_PREDEC64)
13686     {
13687         pSrc0 = GetSymbol(pllSrc0);
13688     }
13689 
13690     // Dst address in bytes.
13691     CVariable* pDstAddr = GetSymbol(pllDstAddr);
13692     // Don't use scalar atomics if the DisableScalarAtomics or DisableIGCOptimizations regkey is
13693     // enabled; also never use them for 64-bit operations.
13694     if (IsUniformAtomic(pInsn) && bitwidth != 64)
13695     {
13696         PointerType* PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
13697         bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
13698         e_alignment uniformAlign = isA64 ? EALIGN_2GRF : EALIGN_GRF;
13699         // Re-align the pointer if it's not GRF aligned.
13700         pDstAddr = ReAlignUniformVariable(pDstAddr, uniformAlign);
13701         if (atomic_op == EATOMIC_OR)
13702         {
13703             // special case of atomic_load
13704             emitScalarAtomicLoad(pInsn, resource, pDstAddr, pSrc0, isA64, bitwidth);
13705         }
13706         else
13707         {
13708             emitScalarAtomics(pInsn, resource, atomic_op, pDstAddr, pSrc0, isA64, bitwidth);
13709             ResetVMask();
13710         }
13711         return;
13712     }
13713 
13714     Function* F = pInsn->getParent()->getParent();
13715     if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
13716     {
13717         m_encoder->SetSimdSize(SIMDMode::SIMD1);
13718         m_encoder->SetNoMask();
13719     }
13720     pDstAddr = BroadcastIfUniform(pDstAddr);
13721 
13722     if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
13723     {
13724         m_encoder->SetSimdSize(SIMDMode::SIMD1);
13725         m_encoder->SetNoMask();
13726     }
13727     if (pSrc0)
13728     {
13729         pSrc0 = UnpackOrBroadcastIfUniform(pSrc0);
13730     }
13731 
13732     if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
13733     {
13734         m_encoder->SetSimdSize(SIMDMode::SIMD1);
13735         m_encoder->SetNoMask();
13736     }
13737 
13738     {
13739         CVariable* pDst = returnsImmValue ?
13740             m_currShader->GetNewVariable(
13741                 numLanes(m_currShader->m_SIMDSize),
13742                 bitwidth != 64 ? ISA_TYPE_UD : ISA_TYPE_UQ,
13743                 EALIGN_GRF, CName::NONE) :
13744             nullptr;
13745 
13746         PointerType* PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
13747         bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
13748         bool extendPointer = (bitwidth == 64 && !isA64);
13749         if (isA64 || extendPointer)
13750         {
13751             if (extendPointer)
13752             {
13753                 pDstAddr = m_currShader->BitCast(pDstAddr, GetUnsignedIntegerType(pDstAddr->GetType()));
13754                 CVariable* pDstAddr2 = m_currShader->GetNewVariable(
13755                     pDstAddr->GetNumberElement(), ISA_TYPE_UQ, EALIGN_GRF, CName::NONE);
13756                 m_encoder->Cast(pDstAddr2, pDstAddr);
13757                 m_encoder->AtomicRawA64(atomic_op, resource, pDst, pDstAddr2, pSrc0, pSrc1, bitwidth);
13758                 m_encoder->Push();
13759             }
13760             else
13761             {
13762                 m_encoder->AtomicRawA64(atomic_op, resource, pDst, pDstAddr, pSrc0, pSrc1, bitwidth);
13763                 m_encoder->Push();
13764             }
13765 
13766             if (returnsImmValue) // Needed to repack 16-bit atomics; otherwise this is a plain mov
13767             {
13768                 m_encoder->Cast(
13769                     m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType())),
13770                     pDst);
13771                 m_encoder->Push();
13772             }
13773         }
13774         else
13775         {
13776             // TODO: SEND SLM OFFSET IN BYTES
13777             CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
13778             if (resource.m_surfaceType == ESURFACE_SLM && ctx->m_DriverInfo.WASLMPointersDwordUnit())
13779             {
13780                 CVariable* pDwordAddr =
13781                     m_currShader->GetNewVariable(numLanes(m_currShader->m_SIMDSize),
13782                         ISA_TYPE_D, EALIGN_GRF, CName::NONE);
13783 
13784                 m_encoder->Shl(pDwordAddr, pDstAddr,
13785                     m_currShader->ImmToVariable(0x2, ISA_TYPE_D));
13786                 m_encoder->Push();
13787                 pDstAddr = pDwordAddr;
13788             }
13789             pDstAddr = m_currShader->BitCast(pDstAddr, ISA_TYPE_UD);
13790 
13791             if (pSrc0)
13792             {
13793                 pSrc0 = m_currShader->BitCast(pSrc0, bitwidth != 64 ? ISA_TYPE_UD : ISA_TYPE_UQ);
13794             }
13795 
13796             if (pSrc1)
13797             {
13798                 pSrc1 = m_currShader->BitCast(pSrc1, bitwidth != 64 ? ISA_TYPE_UD : ISA_TYPE_UQ);
13799             }
13800             uint label = 0;
13801             CVariable* flag = nullptr;
13802             bool needLoop = ResourceLoopHeader(resource, flag, label);
13803             m_encoder->DwordAtomicRaw(
13804                 atomic_op,
13805                 resource,
13806                 pDst,
13807                 pDstAddr,
13808                 pSrc0,
13809                 pSrc1,
13810                 is16Bit);
13811             m_encoder->Push();
13812             if (returnsImmValue)
13813             {
13814                 m_encoder->Cast(
13815                     m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType())),
13816                     pDst);
13817                 m_encoder->Push();
13818             }
13819             ResourceLoopBackEdge(needLoop, flag, label);
13820         }
13821 
13822 
13823     }
13824     ResetVMask();
13825     m_currShader->isMessageTargetDataCacheDataPort = true;
13826 }
13827 
13828 void EmitPass::emitAtomicTyped(GenIntrinsicInst* pInsn)
13829 {
13830     ForceDMask();
13831     // Typed atomics can be reached by matching two kinds of intrinsics: the plain typed-atomic
13832     // form, and the cmpxchg form (which takes two sources, unlike the other atomics).
13833     IGC_ASSERT(pInsn->getNumArgOperands() == 6);
13834 
13835     /// Immediate atomics return the value from before the atomic operation is performed, so the
13836     /// return-value flag needs to be set for them.
13837     bool returnsImmValue = !pInsn->user_empty();
13838 
13839     llvm::Value* pllbuffer = pInsn->getOperand(0);
13840     llvm::Value* pllU = pInsn->getOperand(1);
13841     llvm::Value* pllV = pInsn->getOperand(2);
13842     llvm::Value* pllR = pInsn->getOperand(3);
13843     llvm::Value* pllSrc0 = pInsn->getOperand(4);
13844 
13845     AtomicOp atomic_op = EATOMIC_UNDEF;
13846 
13847     CVariable* pSrc0 = nullptr;
13848     CVariable* pSrc1 = nullptr;
13849 
13850     if (pInsn->getIntrinsicID() == GenISAIntrinsic::GenISA_icmpxchgatomictyped)
13851     {
13852         llvm::Value* pllSrc1 = pInsn->getOperand(5);
13853         pSrc1 = GetSymbol(pllSrc1);
13854         pSrc1 = UnpackOrBroadcastIfUniform(pSrc1);
13855         atomic_op = EATOMIC_CMPXCHG;
13856     }
13857     else
13858     {
13859         atomic_op = static_cast<AtomicOp>(cast<ConstantInt>(pInsn->getOperand(5))->getZExtValue());
13860     }
13861 
13862     if (atomic_op != EATOMIC_INC && atomic_op != EATOMIC_DEC)
13863     {
13864         pSrc0 = GetSymbol(pllSrc0);
13865         pSrc0 = UnpackOrBroadcastIfUniform(pSrc0);
13866     }
13867 
13868     ResourceDescriptor resource = GetResourceVariable(pllbuffer);
13869 
13870     CVariable* pU = GetSymbol(pllU);
13871     CVariable* pV = GetSymbol(pllV);
13872     CVariable* pR = GetSymbol(pllR);
13873 
13874     pU = BroadcastIfUniform(pU);
13875     pV = BroadcastIfUniform(pV);
13876     pR = BroadcastIfUniform(pR);
13877 
13878     if (m_currShader->GetIsUniform(pInsn))
13879     {
13880         IGC_ASSERT_MESSAGE(0, "Uniform DWordAtomicTyped not implemented yet");
13881     }
13882     else
13883     {
13884         uint addrDimension = 3;
13885         while (addrDimension > 1 && isUndefOrConstInt0(pInsn->getOperand(addrDimension)))
13886         {
13887             addrDimension--;
13888         }
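        // Illustrative example: for a 1D typed buffer where V and R are undef/zero, addrDimension
        // collapses from 3 to 1, so the payload built below carries only U (plus the header).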
13889 
13890         TODO("Adding headers to atomic typed ops is a workaround, verify if this is needed");
13891         const bool headerPresent = true;
13892 
13893         const uint parameterLength =
13894             addrDimension + (pSrc0 != nullptr) + (pSrc1 != nullptr) + headerPresent;
13895 
13896         auto hw_atomic_op_enum = getHwAtomicOpEnum(atomic_op);
13897         uint responseLength = returnsImmValue;
13898 
13899         unsigned int bti = 0;
13900         if (resource.m_surfaceType == ESURFACE_BINDLESS)
13901         {
13902             bti = BINDLESS_BTI;
13903         }
13904         else if (resource.m_resource->IsImmediate())
13905         {
13906             bti = (uint)resource.m_resource->GetImmediateValue();
13907         }
13908 
13909         const auto messageType = EU_GEN7_5_DATA_CACHE_1_MESSAGE_TYPE_TYPED_ATOMIC_OPERATION;
13910 
13911         uint messageSpecificControl = encodeMessageDescriptorForAtomicUnaryOp(
13912             parameterLength,
13913             responseLength,
13914             headerPresent,
13915             messageType,
13916             returnsImmValue,
13917             m_currShader->m_SIMDSize,
13918             hw_atomic_op_enum,
13919             bti);
13920 
13921         CVariable* pMessDesc = m_currShader->ImmToVariable(messageSpecificControl, ISA_TYPE_D);
13922         CVariable* exDesc =
13923             m_currShader->ImmToVariable(EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, ISA_TYPE_D);
13924         if (resource.m_surfaceType == ESURFACE_BINDLESS)
13925         {
13926             CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
13927             m_encoder->Add(temp, resource.m_resource, exDesc);
13928             m_encoder->Push();
13929 
13930             exDesc = temp;
13931         }
13932         CVariable* tempdst = returnsImmValue ?
13933             m_currShader->GetNewVariable(
13934                 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName::NONE) :
13935             nullptr;
13936         CVariable* pPayload[2] = { nullptr, nullptr };
13937 
13938         const unsigned int numLanesForSimd8 = numLanes(SIMDMode::SIMD8);
13939         IGC_ASSERT(numLanesForSimd8);
13940         const unsigned int loopIter = numLanes(m_currShader->m_SIMDSize) / numLanesForSimd8;
13941 
13942         for (uint i = 0; i < loopIter; ++i)
13943         {
13944             pPayload[i] = m_currShader->GetNewVariable(
13945                 parameterLength * numLanes(SIMDMode::SIMD8),
13946                 ISA_TYPE_F,
13947                 EALIGN_GRF,
13948                 CName::NONE);
13949 
13950             int writeIndex = 0;
13951             if (headerPresent)
13952             {
13953                 m_encoder->SetSimdSize(SIMDMode::SIMD1);
13954                 m_encoder->SetDstSubReg(7);
13955                 m_encoder->SetNoMask();
13956                 m_encoder->Copy(pPayload[i], m_currShader->ImmToVariable(0xFF, ISA_TYPE_D));
13957                 m_encoder->Push();
13958                 ++writeIndex;
13959             }
13960 
13961             auto CopyVar = [&](CVariable* pVar)
13962             {
13963                 m_encoder->SetSimdSize(SIMDMode::SIMD8);
13964                 m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
13965                 if (!pVar->IsUniform())
13966                 {
13967                     m_encoder->SetSrcSubVar(0, i);
13968                 }
13969                 m_encoder->SetDstSubVar(writeIndex);
13970                 m_encoder->Copy(pPayload[i], pVar);
13971                 m_encoder->Push();
13972                 ++writeIndex;
13973             };
13974 
13975             CopyVar(pU);
13976 
13977             if (addrDimension > 1)
13978                 CopyVar(pV);
13979 
13980             if (addrDimension > 2)
13981                 CopyVar(pR);
13982 
13983             if (pSrc0)
13984                 CopyVar(pSrc0);
13985 
13986             if (pSrc1)
13987                 CopyVar(pSrc1);
13988         }
13989 
13990         uint label = 0;
13991         CVariable* flag = nullptr;
13992         bool needLoop = ResourceLoopHeader(resource, flag, label);
13993         if (resource.m_surfaceType == ESURFACE_BINDLESS && !exDesc->IsUniform())
13994         {
13995             exDesc = UniformCopy(exDesc);
13996         }
13997         if (resource.m_surfaceType == ESURFACE_NORMAL && !resource.m_resource->IsImmediate())
13998         {
13999             CVariable* indirectMess = m_currShader->GetNewVariable(
14000                 1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
14001             m_encoder->Or(indirectMess, pMessDesc, resource.m_resource);
14002             m_encoder->Push();
14003             pMessDesc = indirectMess;
14004         }
14005         for (uint i = 0; i < loopIter; ++i)
14006         {
14007             m_encoder->SetPredicate(flag);
14008             m_encoder->SetSimdSize(SIMDMode::SIMD8);
14009             m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
14010             m_encoder->SetDstSubVar(i);
14011             m_encoder->Send(tempdst, pPayload[i],
14012                 EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, exDesc, pMessDesc);
14013             m_encoder->Push();
14014         }
14015         ResourceLoopBackEdge(needLoop, flag, label);
14016 
14017         if (returnsImmValue)
14018         {
14019             m_encoder->Cast(
14020                 m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType())),
14021                 tempdst);
14022             m_encoder->Push();
14023         }
14024     }
14025     ResetVMask();
14026     m_currShader->isMessageTargetDataCacheDataPort = true;
14027 }
14028 
14029 void setSIMDSizeMask(CEncoder* m_encoder, const CShader* m_currShader, int i)
14030 {
14031     m_encoder->SetSimdSize(SIMDMode::SIMD8);
14032     m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
14033 
14034 
14035     return;
14036 }
14037 
14038 void EmitPass::emitTypedRead(llvm::Instruction* pInsn)
14039 {
14040     const CShader::ExtractMaskWrapper writeMask(m_currShader, pInsn);
14041     IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
14042 
14043     llvm::Value* pllSrcBuffer = pInsn->getOperand(0);
14044     llvm::Value* pllU = pInsn->getOperand(1);
14045     llvm::Value* pllV = pInsn->getOperand(2);
14046     llvm::Value* pllR = pInsn->getOperand(3);
14047     llvm::Value* pllLOD = getOperandIfExist(pInsn, 4);
14048 
14049     CVariable* pLOD = isUndefOrConstInt0(pllLOD) ? nullptr : GetSymbol(pllLOD);
14050     CVariable* pR = (pLOD == nullptr && isUndefOrConstInt0(pllR)) ? nullptr : GetSymbol(pllR);
14051     CVariable* pV = (pR == nullptr && isUndefOrConstInt0(pllV)) ? nullptr : GetSymbol(pllV);
14052     CVariable* pU = GetSymbol(pllU);
14053 
14054     pU = BroadcastIfUniform(pU, m_currShader->GetIsUniform(pInsn));
14055     pV = pV ? BroadcastIfUniform(pV, m_currShader->GetIsUniform(pInsn)) : nullptr;
14056     pR = pR ? BroadcastIfUniform(pR, m_currShader->GetIsUniform(pInsn)) : nullptr;
14057     pLOD = pLOD ? BroadcastIfUniform(pLOD, m_currShader->GetIsUniform(pInsn)) : nullptr;
14058 
14059     ResourceDescriptor resource = GetResourceVariable(pllSrcBuffer);
14060 
14061     uint numChannels = iSTD::BitCount(writeMask.getEM());
14062 
14063     if (m_currShader->GetIsUniform(pInsn))
14064     {
14065         SIMDMode nativeDispatchMode = m_currShader->m_Platform->getMinDispatchMode();
14066         CVariable* tempdst = nullptr;
14067         tempdst = m_currShader->GetNewVariable(
14068             numChannels * numLanes(nativeDispatchMode),
14069             ISA_TYPE_F,
14070             EALIGN_GRF,
14071             CName("tyReadDest"));
14072         m_encoder->SetSimdSize(nativeDispatchMode);
14073         m_encoder->SetPredicate(nullptr);
14074         m_encoder->SetNoMask();
14075         m_encoder->TypedRead4(resource, pU, pV, pR, pLOD, tempdst, writeMask.getEM());
14076 
14077         m_encoder->Push();
14078 
14079         // Mov the required channel values to m_destination
14080         m_encoder->SetSimdSize(SIMDMode::SIMD1);
14081         m_encoder->SetNoMask();
14082 
14083         for (uint i = 0; i < numChannels; ++i)
14084         {
14085             m_encoder->SetSrcSubReg(0, i * numLanes(nativeDispatchMode));
14086             m_encoder->SetDstSubReg(i);
14087             m_encoder->Copy(m_destination, tempdst);
14088             m_encoder->Push();
14089         }
14090     }
14091     else
14092     {
14093         uint label = 0;
14094         CVariable* flag = nullptr;
14095         bool needLoop = ResourceLoopHeader(resource, flag, label);
14096         CVariable* tempdst[4] = { nullptr, nullptr, nullptr, nullptr };
14097         SIMDMode instWidth = std::min(
14098             m_currShader->m_Platform->supportsSIMD16TypedRW() ? SIMDMode::SIMD16 : SIMDMode::SIMD8,
14099             m_currShader->m_SIMDSize);
14100         bool needsSplit = m_currShader->m_SIMDSize > instWidth;
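        // Illustrative example: a SIMD16 shader on a platform without SIMD16 typed reads gets
        // instWidth == SIMD8, so splitInstCount == 2 and two SIMD8 messages (Q1/Q2 masks) are sent.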
14101 
14102         if (!needsSplit)
14103         {
14104             m_encoder->SetPredicate(flag);
14105             m_encoder->TypedRead4(resource, pU, pV, pR, pLOD, m_destination, writeMask.getEM());
14106 
14107             m_encoder->Push();
14108         }
14109         else
14110         {
14111             const unsigned int numLanesForInstWidth = numLanes(instWidth);
14112             IGC_ASSERT(numLanesForInstWidth);
14113             const unsigned int splitInstCount = numLanes(m_currShader->m_SIMDSize) / numLanesForInstWidth;
14114 
14115             for (uint i = 0; i < splitInstCount; ++i)
14116             {
14117                 tempdst[i] = m_currShader->GetNewVariable(
14118                     numChannels * numLanes(instWidth),
14119                     ISA_TYPE_F,
14120                     EALIGN_GRF,
14121                     CName::NONE);
14122 
14123                 setSIMDSizeMask(m_encoder, m_currShader, i);
14124                 m_encoder->SetSrcSubVar(0, i);
14125                 m_encoder->SetSrcSubVar(1, i);
14126                 m_encoder->SetSrcSubVar(2, i);
14127                 m_encoder->SetPredicate(flag);
14128                 m_encoder->TypedRead4(resource, pU, pV, pR, pLOD, tempdst[i], writeMask.getEM());
14129                 m_encoder->Push();
14130             }
14131         }
14132         ResourceLoopBackEdge(needLoop, flag, label);
14133 
14134         if (m_currShader->m_SIMDSize != instWidth)
14135         {
14136             JoinSIMD(tempdst, numChannels, instWidth);
14137         }
14138     }
14139     m_currShader->isMessageTargetDataCacheDataPort = true;
14140 }
14141 
14142 void EmitPass::emitTypedWrite(llvm::Instruction* pInsn)
14143 {
14144     ForceDMask();
14145     llvm::Value* pllDstBuffer = pInsn->getOperand(0);
14146     llvm::Value* pllU = pInsn->getOperand(1);
14147     llvm::Value* pllV = pInsn->getOperand(2);
14148     llvm::Value* pllR = pInsn->getOperand(3);
14149     llvm::Value* pllLOD = pInsn->getOperand(4);
14150     llvm::Value* pllSrc_X = pInsn->getOperand(5);
14151     llvm::Value* pllSrc_Y = pInsn->getOperand(6);
14152     llvm::Value* pllSrc_Z = pInsn->getOperand(7);
14153     llvm::Value* pllSrc_W = pInsn->getOperand(8);
14154 
14155     CVariable* pLOD = isUndefOrConstInt0(pllLOD) ? nullptr : GetSymbol(pllLOD);
14156     CVariable* pR = (pLOD == nullptr && isUndefOrConstInt0(pllR)) ? nullptr : GetSymbol(pllR);
14157     CVariable* pV = (pR == nullptr && isUndefOrConstInt0(pllV)) ? nullptr : GetSymbol(pllV);
14158     CVariable* pU = GetSymbol(pllU);
14159 
14160     CVariable* pSrc_X = GetSymbol(pllSrc_X);
14161     CVariable* pSrc_Y = GetSymbol(pllSrc_Y);
14162     CVariable* pSrc_Z = GetSymbol(pllSrc_Z);
14163     CVariable* pSrc_W = GetSymbol(pllSrc_W);
14164 
14165     pU = BroadcastIfUniform(pU);
14166     pV = pV ? BroadcastIfUniform(pV) : nullptr;
14167     pR = pR ? BroadcastIfUniform(pR) : nullptr;
14168     pLOD = pLOD ? BroadcastIfUniform(pLOD) : nullptr;
14169 
14170     uint writeMask =
14171         (!llvm::isa<UndefValue>(pllSrc_X) ? 1 : 0) |
14172         (!llvm::isa<UndefValue>(pllSrc_Y) ? 2 : 0) |
14173         (!llvm::isa<UndefValue>(pllSrc_Z) ? 4 : 0) |
14174         (!llvm::isa<UndefValue>(pllSrc_W) ? 8 : 0);
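    // Illustrative example: if only X and Y are written (Z and W are undef), writeMask == 0b0011 == 3.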
14175 
14176     ResourceDescriptor resource = GetResourceVariable(pllDstBuffer);
14177 
14178     if (m_currShader->GetIsUniform(pInsn))
14179     {
14180         IGC_ASSERT_MESSAGE(0, "Uniform store_uav_typed not implemented yet");
14181     }
14182     else
14183     {
14184         uint label = 0;
14185         CVariable* flag = nullptr;
14186         bool needLoop = ResourceLoopHeader(resource, flag, label);
14187         uint parameterLength = 4;
14188 
14189         SIMDMode instWidth = std::min(
14190             m_currShader->m_Platform->supportsSIMD16TypedRW() ? SIMDMode::SIMD16 : SIMDMode::SIMD8,
14191             m_currShader->m_SIMDSize);
14192         bool needsSplit = m_currShader->m_SIMDSize > instWidth;
14193 
14194         if (!needsSplit)
14195         {
14196             CVariable* pPayload = m_currShader->GetNewVariable(
14197                 parameterLength * numLanes(m_currShader->m_SIMDSize),
14198                 ISA_TYPE_F,
14199                 EALIGN_GRF,
14200                 CName::NONE);
14201             // pSrc_X, Y, Z and W are broadcast, if uniform, by CopyVariable itself.
14202             m_currShader->CopyVariable(pPayload, pSrc_X, 0);
14203             m_currShader->CopyVariable(pPayload, pSrc_Y, 1);
14204             m_currShader->CopyVariable(pPayload, pSrc_Z, 2);
14205             m_currShader->CopyVariable(pPayload, pSrc_W, 3);
14206             m_encoder->SetPredicate(flag);
14207             m_encoder->TypedWrite4(resource, pU, pV, pR, pLOD, pPayload, writeMask);
14208 
14209             m_encoder->Push();
14210         }
14211         else
14212         {
14213             IGC_ASSERT(instWidth == SIMDMode::SIMD8 ||
14214                 instWidth == SIMDMode::SIMD16);
14215             IGC_ASSERT(m_currShader->m_SIMDSize > instWidth);
14216             const uint numInst = numLanes(m_currShader->m_SIMDSize) / numLanes(instWidth);
14217             std::vector<CVariable*> pPayload(numInst);
14218             for (uint i = 0; i < numInst; ++i)
14219             {
14220                 pPayload[i] = m_currShader->GetNewVariable(
14221                     parameterLength * numLanes(instWidth),
14222                     ISA_TYPE_F,
14223                     EALIGN_GRF, CName::NONE);
14224                 setSIMDSizeMask(m_encoder, m_currShader, i);
14225                 if (!pSrc_X->IsUniform())
14226                 {
14227                     m_encoder->SetSrcSubVar(0, i);
14228                 }
14229                 m_encoder->SetDstSubVar(0);
14230                 m_encoder->Copy(pPayload[i], pSrc_X);
14231                 m_encoder->Push();
14232 
14233                 setSIMDSizeMask(m_encoder, m_currShader, i);
14234                 if (!pSrc_Y->IsUniform())
14235                 {
14236                     m_encoder->SetSrcSubVar(0, i);
14237                 }
14238                 m_encoder->SetDstSubVar(1);
14239                 m_encoder->Copy(pPayload[i], pSrc_Y);
14240                 m_encoder->Push();
14241 
14242                 setSIMDSizeMask(m_encoder, m_currShader, i);
14243                 if (!pSrc_Z->IsUniform())
14244                 {
14245                     m_encoder->SetSrcSubVar(0, i);
14246                 }
14247                 m_encoder->SetDstSubVar(2);
14248                 m_encoder->Copy(pPayload[i], pSrc_Z);
14249                 m_encoder->Push();
14250 
14251                 setSIMDSizeMask(m_encoder, m_currShader, i);
14252                 if (!pSrc_W->IsUniform())
14253                 {
14254                     m_encoder->SetSrcSubVar(0, i);
14255                 }
14256                 m_encoder->SetDstSubVar(3);
14257                 m_encoder->Copy(pPayload[i], pSrc_W);
14258                 m_encoder->Push();
14259                 if (!m_currShader->m_Platform->canFuseTypedWrite())
14260                 {
14261                     setSIMDSizeMask(m_encoder, m_currShader, i);
14262                     m_encoder->SetSrcSubVar(0, i);
14263                     m_encoder->SetSrcSubVar(1, i);
14264                     m_encoder->SetSrcSubVar(2, i);
14265                     m_encoder->SetSrcSubVar(3, i);
14266                     m_encoder->SetPredicate(flag);
14267                     m_encoder->TypedWrite4(resource, pU, pV, pR, pLOD, pPayload[i], writeMask);
14268                     m_encoder->Push();
14269                 }
14270             }
14271             if (m_currShader->m_Platform->canFuseTypedWrite())
14272             {
14273                 for (uint i = 0; i < numInst; ++i)
14274                 {
14275                     setSIMDSizeMask(m_encoder, m_currShader, i);
14276                     m_encoder->SetSrcSubVar(0, i);
14277                     m_encoder->SetSrcSubVar(1, i);
14278                     m_encoder->SetSrcSubVar(2, i);
14279                     m_encoder->SetSrcSubVar(3, i);
14280                     m_encoder->SetPredicate(flag);
14281                     m_encoder->TypedWrite4(resource, pU, pV, pR, pLOD, pPayload[i], writeMask);
14282                     m_encoder->Push();
14283                 }
14284             }
14285         }
14286         ResourceLoopBackEdge(needLoop, flag, label);
14287     }
14288     ResetVMask();
14289     m_currShader->isMessageTargetDataCacheDataPort = true;
14290 }
14291 
14292 static void divergentBarrierCheck(
14293     const CShader* Shader, const CodeGenContext &Ctx, const Instruction* I)
14294 {
14295     if (IGC_IS_FLAG_DISABLED(EnableDivergentBarrierCheck))
14296         return;
14297 
14298     if (Shader->InsideWorkgroupDivergentCF(I))
14299     {
14300         Debug::DumpName name =
14301             IGC::Debug::GetDumpNameObj(Shader, "divergent_barrier.log");
14302         std::string Path = name.str();
14303         std::ofstream OS(Path, std::ios::app);
14304         if (OS.is_open())
14305         {
14306             std::string Repr;
14307             raw_string_ostream SS(Repr);
14308             I->print(SS, true);
14309             SS.flush();
14310             OS << '\n' << Repr;
14311             Ctx.EmitError(OS, "Possible divergent barrier found", I);
14312         }
14313     }
14314 }
14315 
14316 void EmitPass::emitThreadGroupBarrier(llvm::Instruction* inst)
14317 {
14318     if (m_currShader->GetShaderType() == ShaderType::HULL_SHADER)
14319     {
14320         // set barrier counter bits in R0.2 (for use by VISA barrier instruction)
14321 
14322         CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
14323         int instanceCount = hsProgram->DetermineInstanceCount();
14324         // This sets the barrier message counter bits, which are needed for HS.
14325         unsigned int counterBits = m_currShader->m_Platform->getBarrierCountBits(instanceCount);
14326         CVariable* tmpVar = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
14327 
14328         if (m_currShader->m_Platform->needsHSBarrierIDWorkaround())
14329         {
14330             // Move the barrier id into bits 27:24 of R0.2 in the payload to match the GPGPU payload
14331             // layout. VISA assumes the barrier id is found in bits 27:24, as in the GPGPU payload, so
14332             // this simple workaround avoids any IGC/VISA interface change.
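            // Worked example of the shift/mask below (bit positions derived from the constants used):
            // a barrier id sitting in R0.2 bits 16:13 ends up in bits 27:24 after (R0.2 << 11) & 0x0F000000,
            // and the counter bits are then OR'ed into the low bits.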
14333 
14334             CVariable* masklower24bit = m_currShader->ImmToVariable(0xf000000, ISA_TYPE_UD);
14335             m_encoder->SetSrcRegion(0, 0, 1, 0);
14336             m_encoder->SetSrcSubReg(0, 2);
14337             m_encoder->Shl(tmpVar, hsProgram->GetR0(), m_currShader->ImmToVariable(11, ISA_TYPE_UD));
14338             m_encoder->Push();
14339             m_encoder->And(tmpVar, tmpVar, masklower24bit);
14340             m_encoder->Push();
14341             m_encoder->Or(tmpVar, tmpVar, m_currShader->ImmToVariable(counterBits, ISA_TYPE_UD));
14342             m_encoder->Push();
14343         }
14344         else
14345         {
14346             // Barrier id bits already match the GPGPU payload layout.
14347             m_encoder->SetSrcRegion(0, 0, 1, 0);
14348             m_encoder->SetSrcSubReg(0, 2);
14349             m_encoder->Or(tmpVar, hsProgram->GetR0(), m_currShader->ImmToVariable(counterBits, ISA_TYPE_UD));
14350             m_encoder->Push();
14351         }
14352 
14353         m_encoder->SetDstSubReg(2);
14354         m_encoder->SetSimdSize(SIMDMode::SIMD1);
14355         m_encoder->SetNoMask();
14356         m_encoder->Copy(hsProgram->GetR0(), tmpVar);
14357         m_encoder->Push();
14358     }
14359 
14360     // OPT: Remove the barrier instruction when the thread group size is less than or equal to the SIMD size.
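    // Illustrative example: a compute shader with a thread group size of 16 compiled at SIMD16 runs the
    // whole group in a single HW thread, so the barrier can be dropped.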
14361     bool skipBarrierInstructionInCS = false;
14362     if (m_currShader->GetShaderType() == ShaderType::COMPUTE_SHADER)
14363     {
14364         unsigned int threadGroupSizeCS = (static_cast<CComputeShader*>(m_currShader))->GetThreadGroupSize();
14365         if (threadGroupSizeCS <= numLanes(m_SimdMode))
14366         {
14367             skipBarrierInstructionInCS = true;
14368         }
14369     }
14370     else if (m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER) {
14371         Function* F = inst->getParent()->getParent();
14372         MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
14373         uint32_t sz = IGCMetaDataHelper::getThreadGroupSize(*pMdUtils, F);
14374         if (sz != 0 && sz <= numLanes(m_SimdMode)) {
14375             skipBarrierInstructionInCS = true;
14376         }
14377     }
14378 
14379     if (!skipBarrierInstructionInCS)
14380     {
14381         e_barrierKind BarrierKind = EBARRIER_NORMAL; // default
14382         GenIntrinsicInst* geninst = cast<GenIntrinsicInst>(inst);
14383         if (geninst->getIntrinsicID() == GenISAIntrinsic::GenISA_threadgroupbarrier_signal) {
14384             BarrierKind = EBARRIER_SIGNAL;
14385         }
14386         else if (geninst->getIntrinsicID() == GenISAIntrinsic::GenISA_threadgroupbarrier_wait) {
14387             BarrierKind = EBARRIER_WAIT;
14388         }
14389         m_encoder->Barrier(BarrierKind);
14390         m_encoder->Push();
14391 
14392         // Set if barrier was used for this function
14393         m_encoder->SetFunctionHasBarrier(inst->getFunction());
14394 
14395         divergentBarrierCheck(m_currShader, *m_pCtx, inst);
14396     }
14397 }
14398 
14399 
14400 void EmitPass::emitMemoryFence(llvm::Instruction* inst)
14401 {
14402     CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
14403 
14404     bool CommitEnable = llvm::cast<llvm::ConstantInt>((inst->getOperand(0)))->getValue().getBoolValue();
14405     bool L3_Flush_RW_Data = llvm::cast<llvm::ConstantInt>((inst->getOperand(1)))->getValue().getBoolValue();
14406     bool L3_Flush_Constant_Data = llvm::cast<llvm::ConstantInt>((inst->getOperand(2)))->getValue().getBoolValue();
14407     bool L3_Flush_Texture_Data = llvm::cast<llvm::ConstantInt>((inst->getOperand(3)))->getValue().getBoolValue();
14408     bool L3_Flush_Instructions = llvm::cast<llvm::ConstantInt>((inst->getOperand(4)))->getValue().getBoolValue();
14409     bool Global_Mem_Fence = true;
14410     bool L1_Invalidate = ctx->platform.hasL1ReadOnlyCache();
14411 
14412     // If passed a non-constant parameter, be conservative and assume that the parameter is true
14413     if (ConstantInt* globalConst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(6)))
14414     {
14415         L1_Invalidate &= globalConst->getValue().getBoolValue();
14416     }
14417 
14418     bool EmitFence = true;
14419     // If passed a non-constant parameter, be conservative and emit a fence.
14420     // We really don't want to add control-flow at this point.
14421     if (ConstantInt* globalConst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(5)))
14422     {
14423         Global_Mem_Fence = globalConst->getValue().getBoolValue();
14424         if (globalConst->isZero())
14425         {
14426             // Check whether we know this is a local fence (case CLK_LOCAL_MEM_FENCE). If so, skip
14427             // emitting the fence, but only on BDW and SKL/BXT.
14428 
14429             if (ctx->platform.localMemFenceSupress())
14430             {
14431                 EmitFence = false;
14432             }
14433         }
14434     }
14435 
14436     // for untyped memory fence L3 flush is never necessary.
14437     L3_Flush_RW_Data = false;
14438     if (L3_Flush_RW_Data)
14439     {
14440         // don't flush L1 if L3 is also being flushed
14441         L1_Invalidate = false;
14442     }
14443 
14444 
14445     m_encoder->Fence(CommitEnable,
14446         L3_Flush_RW_Data,
14447         L3_Flush_Constant_Data,
14448         L3_Flush_Texture_Data,
14449         L3_Flush_Instructions,
14450         Global_Mem_Fence,
14451         L1_Invalidate,
14452         !EmitFence);
14453 
14454     m_encoder->Push();
14455 }
14456 
14457 void EmitPass::emitMemoryFence()
14458 {
14459     m_encoder->Fence(true,
14460         false,
14461         false,
14462         false,
14463         false,
14464         true,
14465         false,
14466         false);
14467     m_encoder->Push();
14468 }
14469 
14470 void EmitPass::emitTypedMemoryFence(llvm::Instruction* inst)
14471 {
14472     CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
14473 
14474     bool CommitEnable = true;
14475     bool L3_Flush_RW_Data = m_currShader->m_Platform->flushL3ForTypedMemory();
14476     bool L3_Flush_Constant_Data = false;
14477     bool L3_Flush_Texture_Data = false;
14478     bool L3_Flush_Instructions = false;
14479     bool Global_Mem_Fence = true;
14480     bool L1_Invalidate = ctx->platform.hasL1ReadOnlyCache();
14481 
14482     // If passed a non-constant parameter, be conservative and assume that the parameter is true
14483     if (ConstantInt* globalConst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0)))
14484     {
14485         L1_Invalidate &= globalConst->getValue().getBoolValue();
14486     }
14487 
14488 
14489 
14490     m_encoder->Fence(CommitEnable,
14491         L3_Flush_RW_Data,
14492         L3_Flush_Constant_Data,
14493         L3_Flush_Texture_Data,
14494         L3_Flush_Instructions,
14495         Global_Mem_Fence,
14496         L1_Invalidate,
14497         false);
14498     emitFlushSamplerCache();
14499 }
14500 
14501 
14502 void EmitPass::emitFlushSamplerCache()
14503 {
14504     m_encoder->FlushSamplerCache();
14505     m_encoder->Push();
14506 }
14507 
14508 void EmitPass::emitPhaseOutput(llvm::GenIntrinsicInst* inst)
14509 {
14510     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
14511     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
14512     IGC_ASSERT(nullptr != psProgram);
14513     IGC_ASSERT(psProgram->GetPhase() == PSPHASE_COARSE);
14514 
14515     unsigned int outputIndex = (unsigned int)cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
14516     CVariable* output = GetSymbol(inst->getOperand(0));
14517     if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_PHASE_OUTPUT)
14518     {
14519         CVariable* temp =
14520             m_currShader->GetNewVariable(numLanes(m_SimdMode), output->GetType(), EALIGN_GRF, CName::NONE);
14521         m_encoder->Copy(temp, output);
14522         output = temp;
14523     }
14524 
14525     psProgram->AddCoarseOutput(output, outputIndex);
14526 }
14527 
14528 void EmitPass::emitPhaseInput(llvm::GenIntrinsicInst* inst)
14529 {
14530     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
14531     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
14532     IGC_ASSERT(psProgram->GetPhase() == PSPHASE_PIXEL);
14533 
14534     unsigned int inputIndex = (unsigned int)cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
14535     bool isVectorInput = inst->getIntrinsicID() == GenISAIntrinsic::GenISA_PHASE_INPUTVEC;
14536     uint16_t vectorSize = isVectorInput ?
14537         int_cast<uint16_t>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue()) : (uint16_t)1;
14538     CVariable* input = psProgram->GetCoarseInput(inputIndex, vectorSize, m_destination->GetType());
14539 
14540     // address variable represents register a0
14541     CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
14542         numLanes(m_currShader->m_SIMDSize),
14543         input->GetType(),
14544         false,
14545         true,
14546         input->getName());
14547 
14548     // we add offsets to the base, which is the beginning of the vector variable
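    // Illustrative sketch of the per-lane address math below (names are
    // descriptive only, not part of the code):
    //   byteAddress  = coarseParentIndex * sizeof(element)             // via Shl
    //   for the vector form (GenISA_PHASE_INPUTVEC):
    //   byteAddress += elementIndex * (numLanes * sizeof(element))
    // The byte offset is then added to the a0-relative base of the coarse input.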
14549     CVariable* index = psProgram->GetCoarseParentIndex();
14550     CVariable* byteAddress = psProgram->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_UW, EALIGN_OWORD, CName::NONE);
14551     DWORD shiftAmount = iSTD::Log2(CEncoder::GetCISADataTypeSize(input->GetType()));
14552     m_encoder->Shl(byteAddress, index, psProgram->ImmToVariable(shiftAmount, ISA_TYPE_UW));
14553     m_encoder->Push();
14554 
14555     if (isVectorInput)
14556     {
14557         CVariable* elementOffset = psProgram->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_UW, EALIGN_OWORD, CName::NONE);
14558         uint elementSize = numLanes(m_SimdMode) * CEncoder::GetCISADataTypeSize(input->GetType());
14559         m_encoder->Mul(elementOffset, GetSymbol(inst->getArgOperand(1)), psProgram->ImmToVariable(elementSize, ISA_TYPE_UW));
14560         m_encoder->Push();
14561         CVariable* adjustedByteAddress = psProgram->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_UW, EALIGN_OWORD, CName::NONE);
14562         m_encoder->Add(adjustedByteAddress, byteAddress, elementOffset);
14563         m_encoder->Push();
14564         byteAddress = adjustedByteAddress;
14565     }
14566 
14567     m_encoder->AddrAdd(pDstArrElm, input, byteAddress);
14568     m_encoder->Push();
14569 
14570     m_encoder->Copy(m_destination, pDstArrElm);
14571     m_encoder->Push();
14572 }
14573 
14574 void EmitPass::emitUniformAtomicCounter(llvm::GenIntrinsicInst* pInsn)
14575 {
14576     ForceDMask();
14577     IGC_ASSERT(pInsn->getNumOperands() == 2);
14578     GenISAIntrinsic::ID IID = pInsn->getIntrinsicID();
14579     /// Immediate Atomics return the value before the atomic operation is performed. So that flag
14580     /// needs to be set for this.
14581     bool returnsImmValue = !pInsn->user_empty();
14582 
14583     llvm::Value* pllbuffer = pInsn->getOperand(0);
14584     ResourceDescriptor resource = GetResourceVariable(pllbuffer);
14585     uint binding_table_index = 0;
14586 
14587     CVariable* prefixVar[2] = { nullptr, nullptr };
14588     CVariable* dst = m_destination;
14589     bool hasheader = m_currShader->m_Platform->needsHeaderForAtomicCounter();
14590 
14591     EU_DATA_PORT_ATOMIC_OPERATION_TYPE atomicType = EU_DATA_PORT_ATOMIC_OPERATION_ADD;
14592     // for SIMD dispatch greater than 8 it is more efficient to emit a SIMD1 atomic
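    // Rough sketch of the idea (assuming the prefix op is an inclusive scan,
    // which the uses below appear to rely on):
    //   1) prefix[i] = sum of the per-lane +1/-1 deltas for lanes 0..i
    //   2) a single SIMD1 atomic add of the last lane's prefix (the total)
    //      returns the pre-op counter value ("base")
    //   3) each lane's result is base + prefix[i] (with an extra -1 for the
    //      post-increment case, applied at the end of this function)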
14593     CVariable* src = m_currShader->ImmToVariable(
14594         IID == GenISAIntrinsic::GenISA_atomiccounterinc ? 1 : -1, ISA_TYPE_D);
14595     emitPreOrPostFixOp(EOPCODE_ADD, 0, ISA_TYPE_D, false, src, prefixVar);
14596     CVariable* pSrcCopy = prefixVar[0];
14597     if (m_currShader->m_numberInstance == 2)
14598     {
14599         pSrcCopy = prefixVar[1];
14600     }
14601 
14602     CVariable* pHeader = nullptr;
14603     if (hasheader)
14604     {
14605         pHeader = m_currShader->GetNewVariable(
14606             numLanes(SIMDMode::SIMD8),
14607             ISA_TYPE_UD,
14608             EALIGN_GRF, CName::NONE);
14609 
14610         m_encoder->SetNoMask();
14611         m_encoder->SetSimdSize(SIMDMode::SIMD1);
14612         m_encoder->SetDstSubReg(7);
14613         m_encoder->Copy(pHeader, m_currShader->ImmToVariable(0xFFFF, ISA_TYPE_UD));
14614         m_encoder->Push();
14615     }
14616 
14617     CVariable* pPayload = m_currShader->GetNewVariable(
14618         8,
14619         ISA_TYPE_D,
14620         EALIGN_GRF,
14621         true, CName::NONE);
14622     m_encoder->SetSimdSize(SIMDMode::SIMD1);
14623     m_encoder->SetSrcRegion(0, 0, 1, 0);
14624     m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
14625     m_encoder->Copy(pPayload, pSrcCopy);
14626     m_encoder->Push();
14627     dst = m_currShader->GetNewVariable(
14628         8,
14629         ISA_TYPE_D,
14630         EALIGN_GRF,
14631         true, CName::NONE);
14632 
14633     if (resource.m_surfaceType == ESURFACE_SSHBINDLESS)
14634         binding_table_index = SSH_BINDLESS_BTI;
14635     else if (resource.m_surfaceType == ESURFACE_BINDLESS)
14636         binding_table_index = BINDLESS_BTI;
14637     else
14638         binding_table_index = (uint)resource.m_resource->GetImmediateValue();
14639 
14640     uint messageDescriptor = encodeMessageDescriptorForAtomicUnaryOp(
14641         1,
14642         returnsImmValue ? 1 : 0,
14643         hasheader,
14644         EU_GEN7_5_DATA_CACHE_1_MESSAGE_TYPE_ATOMIC_COUNTER_OPERATION,
14645         returnsImmValue,
14646         SIMDMode::SIMD8,
14647         atomicType,
14648         binding_table_index);
14649 
14650     CVariable* pMessDesc = m_currShader->ImmToVariable(messageDescriptor, ISA_TYPE_D);
14651     // src1 len = 1, SFID = DC1
14652     uint32_t src1Len = hasheader ? 1 : 0;
14653     // src1Len is not encoded in the ext descriptor in the case of the 26-bit BSO format
14654     if (m_currShader->m_Platform->support26BitBSOFormat() &&
14655        (resource.m_surfaceType == ESURFACE_BINDLESS || resource.m_surfaceType == ESURFACE_SCRATCH))
14656     {
14657         src1Len = 0;
14658     }
14659     uint32_t exDescVal = (src1Len << 6) | EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1;
14660     CVariable* exDesc =
14661         m_currShader->ImmToVariable(exDescVal, ISA_TYPE_D);
14662 
14663     if (resource.m_surfaceType == ESURFACE_BINDLESS || resource.m_surfaceType == ESURFACE_SSHBINDLESS)
14664     {
14665         CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
14666         m_encoder->Add(temp, resource.m_resource, exDesc);
14667         m_encoder->Push();
14668         exDesc = temp;
14669     }
14670 
14671     m_encoder->SetSimdSize(SIMDMode::SIMD1);
14672     m_encoder->SetNoMask();
14673 
14674     if (hasheader)
14675     {
14676         m_encoder->Sends(returnsImmValue ? dst : nullptr, pHeader, pPayload,
14677             EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, exDesc, pMessDesc);
14678     }
14679     else
14680     {
14681         m_encoder->Send(
14682             returnsImmValue ? dst : NULL,
14683             pPayload,
14684             EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1,
14685             exDesc,
14686             pMessDesc);
14687     }
14688     m_encoder->Push();
14689 
14690     if (returnsImmValue)
14691     {
14692         unsigned int counter = m_currShader->m_numberInstance;
14693         for (unsigned int i = 0; i < counter; ++i)
14694         {
14695             m_encoder->SetSecondHalf(i == 1);
14696             m_encoder->Add(m_destination, prefixVar[i], dst);
14697             m_encoder->Push();
14698 
14699             if (IID == GenISAIntrinsic::GenISA_atomiccounterinc)
14700             {
14701                 CVariable* src = m_currShader->ImmToVariable(-1, ISA_TYPE_D);
14702                 m_encoder->Add(m_destination, m_destination, src);
14703                 m_encoder->Push();
14704             }
14705         }
14706     }
14707 
14708     ResetVMask();
14709     m_currShader->isMessageTargetDataCacheDataPort = true;
14710 }
14711 
14712 void EmitPass::emitAtomicCounter(llvm::GenIntrinsicInst* pInsn)
14713 {
14714 
14715     IGC_ASSERT(pInsn->getNumOperands() == 2);
14716 
14717     bool uniformAtomic = IsUniformAtomic(pInsn) &&
14718         (m_currShader->m_SIMDSize != SIMDMode::SIMD8 || !m_currShader->m_Platform->HDCCoalesceAtomicCounterAccess());
14719     if (uniformAtomic)
14720     {
14721         emitUniformAtomicCounter(pInsn);
14722         return;
14723     }
14724 
14725     ForceDMask();
14726     GenISAIntrinsic::ID IID = pInsn->getIntrinsicID();
14727     /// Immediate Atomics return the value before the atomic operation is performed. So that flag
14728     /// needs to be set for this.
14729     bool returnsImmValue = !pInsn->user_empty();
14730 
14731     llvm::Value* pllbuffer = pInsn->getOperand(0);
14732     ResourceDescriptor resource = GetResourceVariable(pllbuffer);
14733 
14734     CVariable* dst = m_destination;
14735 
14736     bool hasheader = true;
14737     unsigned int num_split = m_currShader->m_SIMDSize == SIMDMode::SIMD16 ? 2 : 1;
14738 
14739     // header
14740     CVariable* pPayload = m_currShader->GetNewVariable(
14741         numLanes(SIMDMode::SIMD8),
14742         ISA_TYPE_UD,
14743         EALIGN_GRF, CName::NONE);
14744     m_encoder->SetNoMask();
14745     m_encoder->SetSimdSize(SIMDMode::SIMD1);
14746     m_encoder->SetDstSubReg(7);
14747     m_encoder->Copy(pPayload, m_currShader->ImmToVariable(0xFFFF, ISA_TYPE_UD));
14748     m_encoder->Push();
14749 
14750     EU_DATA_PORT_ATOMIC_OPERATION_TYPE atomicType = EU_DATA_PORT_ATOMIC_OPERATION_INC;
14751     if (IID == GenISAIntrinsic::GenISA_atomiccounterpredec)
14752     {
14753         atomicType = m_currShader->m_Platform->hasAtomicPreDec() ?
14754             EU_DATA_PORT_ATOMIC_OPERATION_PREDEC : EU_DATA_PORT_ATOMIC_OPERATION_DEC;
14755     }
14756 
14757     uint label = 0;
14758     CVariable* flag = nullptr;
14759     bool needLoop = ResourceLoopHeader(resource, flag, label);
14760 
14761     uint messageDescriptor = encodeMessageDescriptorForAtomicUnaryOp(
14762         1,
14763         returnsImmValue ? 1 : 0,
14764         hasheader,
14765         EU_GEN7_5_DATA_CACHE_1_MESSAGE_TYPE_ATOMIC_COUNTER_OPERATION,
14766         returnsImmValue,
14767         SIMDMode::SIMD8,
14768         atomicType,
14769         resource.m_surfaceType == ESURFACE_BINDLESS ? BINDLESS_BTI : (uint)resource.m_resource->GetImmediateValue());
14770 
14771     CVariable* pMessDesc = m_currShader->ImmToVariable(messageDescriptor, ISA_TYPE_D);
14772     CVariable* exDesc =
14773         m_currShader->ImmToVariable(EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, ISA_TYPE_D);
14774 
14775     if (resource.m_surfaceType == ESURFACE_BINDLESS)
14776     {
14777         CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
14778         m_encoder->Add(temp, resource.m_resource, exDesc);
14779         m_encoder->Push();
14780 
14781         exDesc = temp;
14782     }
14783 
14784     for (uint32_t i = 0; i < num_split; ++i)
14785     {
14786         m_encoder->SetSimdSize(SIMDMode::SIMD8);
14787         m_encoder->SetDstSubVar(i);
14788         m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
14789 
14790         m_encoder->Send(
14791             returnsImmValue ? dst : NULL,
14792             pPayload,
14793             EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1,
14794             exDesc,
14795             pMessDesc);
14796         m_encoder->Push();
14797     }
14798 
14799     if (IID == GenISAIntrinsic::GenISA_atomiccounterpredec &&
14800         !m_currShader->m_Platform->hasAtomicPreDec())
14801     {
14802         unsigned int counter = m_currShader->m_numberInstance;
14803         for (unsigned int i = 0; i < counter; ++i)
14804         {
14805             m_encoder->SetSecondHalf(i == 1);
14806             CVariable* src = m_currShader->ImmToVariable(-1, ISA_TYPE_D);
14807             m_encoder->Add(m_destination, m_destination, src);
14808             m_encoder->Push();
14809         }
14810     }
14811 
14812     ResourceLoopBackEdge(needLoop, flag, label);
14813     ResetVMask();
14814     m_currShader->isMessageTargetDataCacheDataPort = true;
14815 }
14816 
14817 void EmitPass::CmpBoolOp(llvm::BinaryOperator* inst,
14818     llvm::CmpInst::Predicate predicate,
14819     const SSource cmpSources[2],
14820     const SSource& bitSource,
14821     const DstModifier& modifier)
14822 {
14823 
14824     DstModifier init;
14825     Cmp(predicate, cmpSources, init);
14826 
14827     IGC_ASSERT(bitSource.mod == EMOD_NONE);
14828     CVariable* boolOpSource = GetSrcVariable(bitSource);
14829     m_encoder->SetDstModifier(modifier);
14830 
14831     EmitSimpleAlu(inst, m_destination, m_destination, boolOpSource);
14832 }
14833 
14834 void EmitPass::emitAluConditionMod(Pattern* aluPattern, Instruction* alu, CmpInst* cmp, int aluOprdNum)
14835 {
14836     CVariable* temp = m_currShader->GetNewVector(alu);
14837     CVariable* dst = m_destination;
14838     m_destination = temp;
14839     DstModifier init;
14840 
14841     aluPattern->Emit(this, init);
14842 
14843     // condMod is in the form of "alu cmpOp 0". If the pattern is in the form of
14844     // "0 cmpOp alu", cmp's predicate should be swapped. aluOprdNum indicates
14845     // which form this pattern is in.
14846     auto llvmPredicate = (aluOprdNum == 0 ? cmp->getPredicate() : cmp->getSwappedPredicate());
14847     e_predicate predicate = GetPredicate(llvmPredicate);
14848     if (IsUnsignedCmp(llvmPredicate))
14849     {
14850         temp = m_currShader->BitCast(temp, GetUnsignedType(temp->GetType()));
14851     }
14852     m_encoder->Cmp(predicate, dst, temp, m_currShader->ImmToVariable(0, temp->GetType()));
14853     m_encoder->Push();
14854     m_destination = dst;
14855 }
14856 
14857 void EmitPass::emitHSTessFactors(llvm::Instruction* pInst)
14858 {
14859     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::HULL_SHADER);
14860     CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
14861     CVariable* payload[8];
14862 
14863     for (uint32_t channel = 2; channel < 8; channel++)
14864     {
14865         payload[channel] = GetSymbol(pInst->getOperand(channel - 2));
14866     }
14867 
14868     bool endOfThread = llvm::isa<llvm::ReturnInst>(pInst->getNextNode());
14869     hsProgram->EmitPatchConstantHeader(payload, endOfThread);
14870 }
14871 
14872 void EmitPass::emitRenderTargetRead(llvm::GenIntrinsicInst* inst)
14873 {
14874     IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
14875     CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
14876     uint RTIndex = 0;
14877     bool isRTIndexConstant = false;
14878     if (llvm::ConstantInt * pRenderTargetCnst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0)))
14879     {
14880         RTIndex = (uint)llvm::cast<llvm::ConstantInt>(pRenderTargetCnst)->getZExtValue();
14881         isRTIndexConstant = true;
14882     }
14883 
14884     uint bindingTableIndex = m_currShader->m_pBtiLayout->GetRenderTargetIndex(RTIndex);
14885     m_currShader->SetBindingTableEntryCountAndBitmap(isRTIndexConstant, RENDER_TARGET, RTIndex, bindingTableIndex);
14886     CVariable* pSampleIndexR0 = nullptr;
14887 
14888     uint hasSampleIndex = (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_RenderTargetReadSampleFreq);
14889     if (hasSampleIndex)
14890     {
14891         CVariable* pShiftedSampleIndex = nullptr;
14892         if (llvm::isa<llvm::ConstantInt>(inst->getOperand(1)))
14893         {
14894             uint sampleIndex = int_cast<uint>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
14895             pShiftedSampleIndex = m_currShader->ImmToVariable((sampleIndex << 6), ISA_TYPE_D);
14896         }
14897         else
14898         {
14899             CVariable* SampleIndex = GetSymbol(inst->getOperand(1));
14900             if (!SampleIndex->IsUniform())
14901             {
14902                 SampleIndex = UniformCopy(SampleIndex);
14903             }
14904             pShiftedSampleIndex = m_currShader->GetNewVariable(SampleIndex);
14905             m_encoder->Shl(pShiftedSampleIndex, SampleIndex, m_currShader->ImmToVariable(6, ISA_TYPE_D));
14906             m_encoder->Push();
14907         }
14908 
14909         // and       (1) r15.0<1>:ud  r0.0<0;1,0>:ud 0xFFFFFC3F:ud
14910         // or       (1) r16.0<1>:ud  r15.0<0;1,0>:ud  r14.0<0;1,0>:ud
14911         pSampleIndexR0 = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_D, EALIGN_GRF, CName::NONE);
14912         m_encoder->SetSimdSize(SIMDMode::SIMD8);
14913         m_encoder->SetNoMask();
14914         m_encoder->Copy(pSampleIndexR0, psProgram->GetR0());
14915         m_encoder->Push();
14916 
14917         m_encoder->SetSimdSize(SIMDMode::SIMD1);
14918         m_encoder->SetSrcRegion(0, 0, 1, 0);
14919         m_encoder->SetSrcSubReg(0, 0);
14920         m_encoder->SetNoMask();
14921         m_encoder->And(pSampleIndexR0, pSampleIndexR0, m_currShader->ImmToVariable(0xFFFFFC3F, ISA_TYPE_UD));
14922         m_encoder->Push();
14923 
14924         m_encoder->SetSimdSize(SIMDMode::SIMD1);
14925         m_encoder->SetSrcRegion(0, 0, 1, 0);
14926         m_encoder->SetSrcSubReg(0, 0);
14927         m_encoder->SetSrcRegion(1, 0, 1, 0);
14928         m_encoder->SetSrcSubReg(1, 0);
14929         m_encoder->SetNoMask();
14930         m_encoder->Or(pSampleIndexR0, pSampleIndexR0, pShiftedSampleIndex);
14931         m_encoder->Push();
14932     }
14933 
14934     // RT read header is 2 GRF
14935     uint messageLength = 2;
14936     uint responseLength = 4 * numLanes(m_currShader->m_SIMDSize) / 8;
14937     bool headerRequired = true;
14938 
14939     // We shouldn't need any copies since R0 and R1 are already aligned
14940     // but we don't want to declare R0 and R1 as one variable in V-ISA
14941     // The problem could be fixed by moving away from raw_send for this message
14942     CVariable* payload =
14943         m_currShader->GetNewVariable(messageLength * (getGRFSize() >> 2), ISA_TYPE_D, EALIGN_GRF, CName::NONE);
14944     m_encoder->SetNoMask();
14945     m_encoder->SetSimdSize(SIMDMode::SIMD8);
14946     m_encoder->Copy(payload, (hasSampleIndex ? pSampleIndexR0 : psProgram->GetR0()));
14947     m_encoder->Push();
14948 
14949     // The following bits must be set to 0 for render target read messages:
14950     //  Bit 11 - Source0 Alpha Present to Render Target
14951     //  Bit 12 - oMask to Render Target
14952     //  Bit 13 - Source Depth Present to Render Target
14953     //  Bit 14 - Stencil Present to Render Target
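    // Mask derivation for the immediate used in the And below:
    //   bits 11..14 => (1 << 11) | (1 << 12) | (1 << 13) | (1 << 14) = 0x00007800
    //   ~0x00007800 as a 32-bit UD                                   = 0xFFFF87FF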
14954     m_encoder->SetSimdSize(SIMDMode::SIMD1);
14955     m_encoder->SetSrcRegion(0, 0, 1, 0);
14956     m_encoder->SetSrcSubReg(0, 0);
14957     m_encoder->SetNoMask();
14958     m_encoder->And(payload, payload, m_currShader->ImmToVariable(0xFFFF87FF, ISA_TYPE_UD));
14959     m_encoder->Push();
14960 
14961     m_encoder->SetNoMask();
14962     m_encoder->SetSimdSize(SIMDMode::SIMD8);
14963     m_encoder->SetDstSubVar(1);
14964     m_encoder->Copy(payload, psProgram->GetR1());
14965     m_encoder->Push();
14966 
14967     uint msgControl =
14968         (m_SimdMode == SIMDMode::SIMD8)
14969         ? EU_GEN9_DATA_PORT_RENDER_TARGET_READ_CONTROL_SIMD8_SINGLE_SOURCE_LOW
14970         : EU_GEN9_DATA_PORT_RENDER_TARGET_READ_CONTROL_SIMD16_SINGLE_SOURCE;
14971     msgControl |=
14972         m_encoder->IsSecondHalf() ? EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_SLOTGRP_HI : EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_SLOTGRP_LO;
14973     msgControl |= psProgram->IsPerSample() ? EU_GEN9_DATA_PORT_RENDER_TARGET_READ_CONTROL_PER_SAMPLE_ENABLE : 0;
14974 
14975     uint Desc = DataPortRead(
14976         messageLength,
14977         responseLength,
14978         headerRequired,
14979         EU_DATA_PORT_READ_MESSAGE_TYPE_RENDER_TARGET_READ,
14980         msgControl,
14981         hasSampleIndex ? true : false,
14982         DATA_PORT_TARGET_RENDER_CACHE,
14983         bindingTableIndex);
14984 
14985     uint exDesc = EU_MESSAGE_TARGET_DATA_PORT_WRITE;
14986 
14987     CVariable* messDesc;
14988     if (isRTIndexConstant)
14989     {
14990         messDesc = psProgram->ImmToVariable(Desc, ISA_TYPE_UD);
14991     }
14992     else
14993     {
14994         messDesc = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
14995         // RTIndex is not a constant, so OR the value with desc to get the correct RTIndex
14996         m_encoder->Add(messDesc, GetSymbol(inst->getOperand(0)), psProgram->ImmToVariable(Desc, ISA_TYPE_UD));
14997         m_encoder->Push();
14998     }
14999     //sendc
15000     m_encoder->SendC(m_destination, payload, exDesc, messDesc);
15001     m_encoder->Push();
15002 }
15003 
15004 ERoundingMode EmitPass::GetRoundingMode_FPCvtInt(Instruction* pInst)
15005 {
15006     if (isa<FPToSIInst>(pInst) || isa <FPToUIInst>(pInst))
15007     {
15008         const ERoundingMode defaultRoundingMode_FPCvtInt = static_cast<ERoundingMode>(
15009             m_pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode);
15010         return defaultRoundingMode_FPCvtInt;
15011     }
15012 
15013     if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(pInst))
15014     {
15015         switch (GII->getIntrinsicID())
15016         {
15017         default:
15018             break;
15019         case GenISAIntrinsic::GenISA_ftoui_rtn:
15020         case GenISAIntrinsic::GenISA_ftoi_rtn:
15021             return ERoundingMode::ROUND_TO_NEGATIVE;
15022         case GenISAIntrinsic::GenISA_ftoui_rtp:
15023         case GenISAIntrinsic::GenISA_ftoi_rtp:
15024             return ERoundingMode::ROUND_TO_POSITIVE;
15025         case GenISAIntrinsic::GenISA_ftoui_rte:
15026         case GenISAIntrinsic::GenISA_ftoi_rte:
15027             return ERoundingMode::ROUND_TO_NEAREST_EVEN;
15028         }
15029     }
15030     // rounding not needed!
15031     return ERoundingMode::ROUND_TO_ANY;
15032 }
15033 
15034 ERoundingMode EmitPass::GetRoundingMode_FP(Instruction* inst)
15035 {
15036     // Float rounding mode
15037     ERoundingMode RM = static_cast<ERoundingMode>(m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
15038     if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(inst))
15039     {
15040         switch (GII->getIntrinsicID())
15041         {
15042         case GenISAIntrinsic::GenISA_f32tof16_rtz:
15043         case GenISAIntrinsic::GenISA_ftof_rtz:
15044         case GenISAIntrinsic::GenISA_itof_rtz:
15045         case GenISAIntrinsic::GenISA_uitof_rtz:
15046         case GenISAIntrinsic::GenISA_add_rtz:
15047         case GenISAIntrinsic::GenISA_mul_rtz:
15048         case GenISAIntrinsic::GenISA_fma_rtz:
15049             RM = ERoundingMode::ROUND_TO_ZERO;
15050             break;
15051         case GenISAIntrinsic::GenISA_ftof_rtn:
15052         case GenISAIntrinsic::GenISA_itof_rtn:
15053         case GenISAIntrinsic::GenISA_uitof_rtn:
15054         case GenISAIntrinsic::GenISA_fma_rtn:
15055             RM = ERoundingMode::ROUND_TO_NEGATIVE;
15056             break;
15057         case GenISAIntrinsic::GenISA_ftof_rtp:
15058         case GenISAIntrinsic::GenISA_itof_rtp:
15059         case GenISAIntrinsic::GenISA_uitof_rtp:
15060         case GenISAIntrinsic::GenISA_fma_rtp:
15061             RM = ERoundingMode::ROUND_TO_POSITIVE;
15062             break;
15063         case GenISAIntrinsic::GenISA_ftof_rte:
15064             RM = ERoundingMode::ROUND_TO_NEAREST_EVEN;
15065             break;
15066         case GenISAIntrinsic::GenISA_ftobf:
15067         case GenISAIntrinsic::GenISA_2fto2bf:
15068         {
15069             ConstantInt* rmVal;
15070             if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_2fto2bf) {
15071                 rmVal = cast<ConstantInt>(GII->getArgOperand(2));
15072             }
15073             else {
15074                 rmVal = cast<ConstantInt>(GII->getArgOperand(1));
15075             }
15076             RM = (ERoundingMode)rmVal->getZExtValue();
15077             break;
15078         }
15079         default:
15080             break;
15081         }
15082     }
15083     return RM;
15084 }
15085 
15086 bool EmitPass::ignoreRoundingMode(llvm::Instruction* inst) const
15087 {
15088     auto isFZero = [](Value* V)->bool {
15089         if (ConstantFP* FCST = dyn_cast<ConstantFP>(V))
15090         {
15091             return FCST->isZero();
15092         }
15093         return false;
15094     };
15095 
15096     if (isa<InsertElementInst>(inst) ||
15097         isa<ExtractElementInst>(inst) ||
15098         isa<BitCastInst>(inst) ||
15099         isa<ICmpInst>(inst) ||
15100         isa<FCmpInst>(inst) ||
15101         isa<SelectInst>(inst) ||
15102         isa<TruncInst>(inst) ||
15103         isa<LoadInst>(inst) ||
15104         isa<StoreInst>(inst))
15105     {
15106         // these are not affected by rounding mode.
15107         return true;
15108     }
15109 
15110     if (BinaryOperator* BOP = dyn_cast<BinaryOperator>(inst))
15111     {
15112         if (BOP->getType()->isIntOrIntVectorTy()) {
15113             // Integer binary op does not need rounding mode
15114             return true;
15115         }
15116 
15117         // float operations on EM use RTNE only and are not affected
15118         // by the rounding mode.
15119         if (BOP->getType()->isFPOrFPVectorTy())
15120         {
15121             switch (BOP->getOpcode())
15122             {
15123             default:
15124                 break;
15125             case Instruction::FDiv:
15126                 return true;
15127             case Instruction::FSub:
15128                 // Negation is okay for any rounding mode
15129                 if (isFZero(BOP->getOperand(0))) {
15130                     return true;
15131                 }
15132                 break;
15133             }
15134         }
15135     }
15136     if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(inst))
15137     {
15138         switch (II->getIntrinsicID())
15139         {
15140         default:
15141             break;
15142         case IGCLLVM::Intrinsic::exp2:
15143         case IGCLLVM::Intrinsic::sqrt:
15144             return true;
15145         }
15146     }
15147 
15148     if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(inst))
15149     {
15150         GenISAIntrinsic::ID id = GII->getIntrinsicID();
15151         switch (id)
15152         {
15153         case GenISAIntrinsic::GenISA_bftof:
15154             return true;
15155         default:
15156             break;
15157         }
15158     }
15159     // add more instr as needed
15160     return false;
15161 }
15162 
15163 void EmitPass::initDefaultRoundingMode()
15164 {
15165     const ERoundingMode defaultRM_FP = static_cast<ERoundingMode>(m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
15166     const ERoundingMode defaultRM_FPCvtInt = static_cast<ERoundingMode>(m_pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode);
15167 
15168     // Rounding modes must meet the following restrictions
15169     // in order to be used as default:
15170     //   1. if FPCvtInt's RM is rtz, FP's RM can be any;
15171     //   2. otherwise, FPCvtInt's RM must be the same as FP's RM
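    //
    // For example (illustrative only):
    //   FP = RTE, FPCvtInt = RTZ -> supported (case 1)
    //   FP = RTP, FPCvtInt = RTP -> supported (case 2)
    //   FP = RTE, FPCvtInt = RTP -> rejected by the assert below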
15172     const bool supportedDefaultRoundingModes =
15173         ((defaultRM_FPCvtInt == ERoundingMode::ROUND_TO_ZERO) ||
15174         (defaultRM_FPCvtInt == defaultRM_FP));
15175 
15176     IGC_ASSERT_EXIT(supportedDefaultRoundingModes);
15177 
15178     m_roundingMode_FPCvtInt = defaultRM_FPCvtInt;
15179     m_roundingMode_FP = defaultRM_FP;
15180 }
15181 
15182 void EmitPass::SetRoundingMode_FP(ERoundingMode newRM_FP)
15183 {
15184     if (newRM_FP != ERoundingMode::ROUND_TO_ANY &&
15185         newRM_FP != m_roundingMode_FP)
15186     {
15187         m_encoder->SetRoundingMode_FP(m_roundingMode_FP, newRM_FP);
15188         m_roundingMode_FP = newRM_FP;
15189 
15190         if (m_roundingMode_FPCvtInt != ERoundingMode::ROUND_TO_ZERO)
15191         {
15192             // If FPCvtInt's RM is not RTZ, it must be the same as FP's
15193             m_roundingMode_FPCvtInt = m_roundingMode_FP;
15194         }
15195     }
15196 }
15197 
15198 void EmitPass::SetRoundingMode_FPCvtInt(ERoundingMode newRM_FPCvtInt)
15199 {
15200     if (newRM_FPCvtInt != ERoundingMode::ROUND_TO_ANY &&
15201         newRM_FPCvtInt != m_roundingMode_FPCvtInt)
15202     {
15203         m_encoder->SetRoundingMode_FPCvtInt(m_roundingMode_FPCvtInt, newRM_FPCvtInt);
15204         m_roundingMode_FPCvtInt = newRM_FPCvtInt;
15205 
15206         if (m_roundingMode_FPCvtInt != ERoundingMode::ROUND_TO_ZERO)
15207         {
15208             // If FPCvtInt's RM is not RTZ, it must be the same as FP's
15209             m_roundingMode_FP = m_roundingMode_FPCvtInt;
15210         }
15211     }
15212 }
15213 
15214 // Return true if inst needs a specific rounding mode; false otherwise.
15215 //
15216 // Currently, only gen intrinsics need a rounding mode other than the default.
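// For example, GenISA_ftof_rtz (listed below) sets RTZ explicitly on its own,
// so ResetRoundingMode does not have to restore a default ahead of it.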
15217 bool EmitPass::setRMExplicitly(Instruction* inst)
15218 {
15219     if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(inst))
15220     {
15221         switch (GII->getIntrinsicID())
15222         {
15223         case GenISAIntrinsic::GenISA_f32tof16_rtz:
15224         case GenISAIntrinsic::GenISA_ftof_rtz:
15225         case GenISAIntrinsic::GenISA_itof_rtz:
15226         case GenISAIntrinsic::GenISA_uitof_rtz:
15227         case GenISAIntrinsic::GenISA_add_rtz:
15228         case GenISAIntrinsic::GenISA_mul_rtz:
15229         case GenISAIntrinsic::GenISA_fma_rtz:
15230         case GenISAIntrinsic::GenISA_fma_rtp:
15231         case GenISAIntrinsic::GenISA_fma_rtn:
15232         case GenISAIntrinsic::GenISA_ftof_rtn:
15233         case GenISAIntrinsic::GenISA_itof_rtn:
15234         case GenISAIntrinsic::GenISA_uitof_rtn:
15235         case GenISAIntrinsic::GenISA_ftof_rtp:
15236         case GenISAIntrinsic::GenISA_itof_rtp:
15237         case GenISAIntrinsic::GenISA_uitof_rtp:
15238         case GenISAIntrinsic::GenISA_ftobf:
15239         case GenISAIntrinsic::GenISA_2fto2bf:
15240             return true;
15241         default:
15242             break;
15243         }
15244     }
15245     return false;
15246 }
15247 
15248 void EmitPass::ResetRoundingMode(Instruction* inst)
15249 {
15250     // Reset rounding modes to default if they are not. However, if the
15251     // next inst requires a non-default RM, which it has to set
15252     // explicitly, don't restore the default rounding modes here and let
15253     // that next inst set them explicitly.
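    // Illustrative example: if the only RM-sensitive instruction before the
    // next explicit-RM intrinsic is an FP->int conversion (e.g. fptosi), only
    // nextImplicitFPCvtInt is set by the scan below, so just the FPCvtInt
    // default is restored and the FP rounding mode is left as-is.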
15254     const ERoundingMode defaultRoundingMode_FP = static_cast<ERoundingMode>(
15255         m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
15256     const ERoundingMode defaultRoundingMode_FPCvtInt = static_cast<ERoundingMode>(
15257         m_pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode);
15258 
15259     if (m_roundingMode_FP == defaultRoundingMode_FP &&
15260         m_roundingMode_FPCvtInt == defaultRoundingMode_FPCvtInt)
15261     {
15262         // Already in default mode.
15263         return;
15264     }
15265 
15266     // Those two variables are set to true if default RM is required before the next
15267     // explicit-RM setting instruction (genintrinsic).
15268     bool nextImplicitFPCvtInt = false;
15269     bool nextImplicitFP = false;
15270     for (auto nextInst = GetNextInstruction(inst);
15271          nextInst != nullptr;
15272          nextInst = GetNextInstruction(nextInst))
15273     {
15274         if (ignoreRoundingMode(nextInst))
15275         {
15276             continue;
15277         }
15278         if (setRMExplicitly(nextInst))
15279         {
15280             // As nextInst will set RM explicitly, no need to go further.
15281             break;
15282         }
15283 
15284         // At this point, a default RM is needed. For FPCvtInt, we know
15285         // precisely whether FPCvtInt RM is needed or not; but for FP, we
15286         // do it conservatively as we do not scan all instructions here.
15287         ERoundingMode intRM = GetRoundingMode_FPCvtInt(nextInst);
15288 
15289         // If it is not ROUND_TO_ANY, it uses FPCvtInt RM;
15290         // otherwise, it does not use FPCvtInt RM.
15291         if (intRM != ERoundingMode::ROUND_TO_ANY) {
15292             nextImplicitFPCvtInt = true;
15293         }
15294         else {
15295             // Conservatively assume FP default RM is used.
15296             nextImplicitFP = true;
15297         }
15298 
15299         if (nextImplicitFPCvtInt && nextImplicitFP) {
15300             break;
15301         }
15302     }
15303 
15304     if (nextImplicitFPCvtInt && !nextImplicitFP)
15305     {
15306         SetRoundingMode_FPCvtInt(defaultRoundingMode_FPCvtInt);
15307     }
15308     else if (nextImplicitFP && !nextImplicitFPCvtInt)
15309     {
15310         SetRoundingMode_FP(defaultRoundingMode_FP);
15311     }
15312     else  if (nextImplicitFP  && nextImplicitFPCvtInt)
15313     {
15314         // Need to set default for both
15315         if (defaultRoundingMode_FPCvtInt == ERoundingMode::ROUND_TO_ZERO)
15316         {
15317             SetRoundingMode_FP(defaultRoundingMode_FP);
15318         }
15319         else
15320         {
15321             SetRoundingMode_FPCvtInt(defaultRoundingMode_FPCvtInt);
15322         }
15323     }
15324 }
15325 
15326 void EmitPass::emitf32tof16_rtz(llvm::GenIntrinsicInst* inst)
15327 {
15328     CVariable* src = GetSymbol(inst->getOperand(0));
15329     CVariable imm0_hf(0, ISA_TYPE_HF);
15330     CVariable* dst_hf = m_currShader->BitCast(m_destination, ISA_TYPE_HF);
15331 
15332     SetRoundingMode_FP(ERoundingMode::ROUND_TO_ZERO);
15333 
15334     m_encoder->SetDstRegion(2);
15335     m_encoder->Cast(dst_hf, src);
15336     m_encoder->Push();
15337 
15338     m_encoder->SetDstRegion(2);
15339     m_encoder->SetDstSubReg(1);
15340     m_encoder->Copy(dst_hf, &imm0_hf);
15341     m_encoder->Push();
15342 
15343     ResetRoundingMode(inst);
15344 }
15345 
15346 void EmitPass::emitfitof(llvm::GenIntrinsicInst* inst)
15347 {
15348     CVariable* src = GetSymbol(inst->getOperand(0));
15349     ERoundingMode RM = GetRoundingMode_FP(inst);
15350     CVariable* dst = m_destination;
15351 
15352     GenISAIntrinsic::ID id = inst->getIntrinsicID();
15353     if (id == GenISAIntrinsic::GenISA_uitof_rtn ||
15354         id == GenISAIntrinsic::GenISA_uitof_rtp ||
15355         id == GenISAIntrinsic::GenISA_uitof_rtz)
15356     {
15357         src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
15358     }
15359 
15360     SetRoundingMode_FP(RM);
15361 
15362     m_encoder->Cast(dst, src);
15363     m_encoder->Push();
15364 
15365     ResetRoundingMode(inst);
15366 }
15367 
15368 // Emit FP Operations (FPO) using round-to-zero (rtz)
15369 void EmitPass::emitFPOrtz(llvm::GenIntrinsicInst* inst)
15370 {
15371     IGC_ASSERT_MESSAGE(inst->getNumArgOperands() >= 2, "ICE: incorrect gen intrinsic");
15372 
15373     GenISAIntrinsic::ID GID = inst->getIntrinsicID();
15374     CVariable* src0 = GetSymbol(inst->getOperand(0));
15375     CVariable* src1 = GetSymbol(inst->getOperand(1));
15376     CVariable* dst = m_destination;
15377 
15378     SetRoundingMode_FP(ERoundingMode::ROUND_TO_ZERO);
15379 
15380     switch (GID)
15381     {
15382     default:
15383         IGC_ASSERT_MESSAGE(0, "ICE: unexpected Gen Intrinsic");
15384         break;
15385     case GenISAIntrinsic::GenISA_mul_rtz:
15386         m_encoder->Mul(dst, src0, src1);
15387         m_encoder->Push();
15388         break;
15389     case  GenISAIntrinsic::GenISA_add_rtz:
15390         m_encoder->Add(dst, src0, src1);
15391         m_encoder->Push();
15392         break;
15393     case GenISAIntrinsic::GenISA_fma_rtz:
15394     {
15395         CVariable* src2 = GetSymbol(inst->getOperand(2));
15396         m_encoder->Mad(dst, src0, src1, src2);
15397         m_encoder->Push();
15398         break;
15399     }
15400     }
15401 
15402     ResetRoundingMode(inst);
15403 }
15404 
15405 // Emit FP mad (FMA) using round-to-positive-infinity (rtp)
15406 void EmitPass::emitFMArtp(llvm::GenIntrinsicInst *inst) {
15407   IGC_ASSERT_MESSAGE(inst->getNumArgOperands() == 3, "ICE: incorrect gen intrinsic");
15408 
15409   CVariable *src0 = GetSymbol(inst->getOperand(0));
15410   CVariable *src1 = GetSymbol(inst->getOperand(1));
15411   CVariable *src2 = GetSymbol(inst->getOperand(2));
15412   CVariable *dst = m_destination;
15413 
15414   SetRoundingMode_FP(ERoundingMode::ROUND_TO_POSITIVE);
15415 
15416   m_encoder->Mad(dst, src0, src1, src2);
15417   m_encoder->Push();
15418 
15419   ResetRoundingMode(inst);
15420 }
15421 
15422 // Emit FP mad (FMA) using round-to-negative-infinity (rtn)
15423 void EmitPass::emitFMArtn(llvm::GenIntrinsicInst *inst) {
15424   IGC_ASSERT_MESSAGE(inst->getNumArgOperands() == 3, "ICE: incorrect gen intrinsic");
15425 
15426   CVariable *src0 = GetSymbol(inst->getOperand(0));
15427   CVariable *src1 = GetSymbol(inst->getOperand(1));
15428   CVariable *src2 = GetSymbol(inst->getOperand(2));
15429   CVariable *dst = m_destination;
15430 
15431   SetRoundingMode_FP(ERoundingMode::ROUND_TO_NEGATIVE);
15432 
15433   m_encoder->Mad(dst, src0, src1, src2);
15434   m_encoder->Push();
15435 
15436   ResetRoundingMode(inst);
15437 }
15438 
15439 void EmitPass::emitftoi(llvm::GenIntrinsicInst* inst)
15440 {
15441     IGC_ASSERT_MESSAGE(inst->getOperand(0)->getType()->isFloatingPointTy(), "Unsupported type");
15442     CVariable* src = GetSymbol(inst->getOperand(0));
15443     CVariable* dst = m_destination;
15444     ERoundingMode RM = GetRoundingMode_FPCvtInt(inst);
15445     IGC_ASSERT_MESSAGE(RM != ERoundingMode::ROUND_TO_ANY, "Not valid FP->int rounding mode!");
15446 
15447     GenISAIntrinsic::ID id = inst->getIntrinsicID();
15448     if (id == GenISAIntrinsic::GenISA_ftoui_rtn ||
15449         id == GenISAIntrinsic::GenISA_ftoui_rtp ||
15450         id == GenISAIntrinsic::GenISA_ftoui_rte)
15451     {
15452         dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
15453     }
15454 
15455     SetRoundingMode_FPCvtInt(RM);
15456 
15457     m_encoder->Cast(dst, src);
15458     m_encoder->Push();
15459 
15460     ResetRoundingMode(inst);
15461 }
15462 
15463 bool EmitPass::isUniformStoreOCL(Value* ptr, Value* storeVal)
15464 {
15465     if (m_currShader->GetShaderType() != ShaderType::OPENCL_SHADER ||
15466         !m_currShader->GetIsUniform(ptr))
15467     {
15468         return false;
15469     }
15470 
15471     Type* Ty = storeVal->getType();
15472     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
15473     uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
15474     Type* eltTy = VTy ? VTy->getElementType() : Ty;
15475 
15476     // use TypeSize to be consistent with VectorLoad/Store
15477     uint32_t totalBytes = elts * ((uint32_t)m_DL->getTypeSizeInBits(eltTy) / 8);
15478 
15479     // Note that when elts > 1, VectorProcess makes sure that the element
15480     // size is 4 or 8. Also, note that if totalBytes = 4, elts must be 1.
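    // A few illustrative cases (assuming 32-bit elements, per the note above):
    //   <2 x i32> with uniform ptr and value -> totalBytes = 8  -> uniform store
    //   <4 x i32> with uniform ptr and value -> totalBytes = 16 -> uniform store
    //   <8 x i32>                            -> totalBytes = 32 -> not treated as
    //                                           a uniform store here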
15481     bool doUniformStore = (elts == 1 ||
15482         (m_currShader->GetIsUniform(storeVal) &&
15483             (totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
15484     return doUniformStore;
15485 }
15486 
15487 // Return true if this store will be emitted as a uniform store
15488 bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
15489 {
15490     return isUniformStoreOCL(SI->getPointerOperand(), SI->getValueOperand());
15491 }
15492 
15493 void EmitPass::emitVectorBitCast(llvm::BitCastInst* BCI)
15494 {
15495     const CShader::ExtractMaskWrapper destMask(m_currShader, BCI);
15496 
15497     CVariable* src = GetSymbol(BCI->getOperand(0));
15498     llvm::Type* srcTy = BCI->getOperand(0)->getType();
15499     llvm::Type* dstTy = BCI->getType();
15500     llvm::Type* srcEltTy, * dstEltTy;
15501     uint32_t srcNElts, dstNElts;
15502 
15503     IGC_ASSERT_MESSAGE((srcTy->isVectorTy() || dstTy->isVectorTy()), "No vector type !");
15504 
15505     if (srcTy->isVectorTy())
15506     {
15507         srcEltTy = cast<VectorType>(srcTy)->getElementType();
15508         srcNElts = (uint32_t)cast<IGCLLVM::FixedVectorType>(srcTy)->getNumElements();
15509     }
15510     else
15511     {
15512         srcEltTy = srcTy;
15513         srcNElts = 1;
15514     }
15515     if (dstTy->isVectorTy())
15516     {
15517         dstEltTy = cast<VectorType>(dstTy)->getElementType();
15518         dstNElts = (uint32_t)cast<IGCLLVM::FixedVectorType>(dstTy)->getNumElements();
15519     }
15520     else
15521     {
15522         dstEltTy = dstTy;
15523         dstNElts = 1;
15524     }
15525 
15526     if (src->IsImmediate())
15527     {
15528         CVariable* reg = m_currShader->GetNewVariable(
15529             1,
15530             src->GetType(),
15531             m_encoder->GetCISADataTypeAlignment(src->GetType()),
15532             true,
15533             1, CName::NONE);
15534 
15535         m_encoder->Copy(reg, src);
15536         m_encoder->Push();
15537 
15538         src = reg;
15539     }
15540 
15541     uint32_t width = numLanes(m_currShader->m_SIMDSize);
15542     uint32_t dstEltBytes = GetPrimitiveTypeSizeInRegister(dstEltTy);
15543     uint32_t srcEltBytes = GetPrimitiveTypeSizeInRegister(srcEltTy);
15544     bool srcUniform = src->IsUniform();
15545     bool dstUniform = m_destination->IsUniform();
15546     if (srcUniform && dstUniform &&
15547         (dstNElts == 2 || dstNElts == 4 || dstNElts == 8) &&
15548         m_destination != src &&
15549         destMask.getEM() == ((1U << dstNElts) - 1)/* Full mask */ &&
15550         /* If alignment of source is safe to be aliased to the dst type. */
15551         src->GetAlign() >= CEncoder::GetCISADataTypeAlignment(m_destination->GetType()) &&
15552         /* Exclude bitcast from/to 16-bit */
15553         srcEltBytes != 2 && dstEltBytes != 2) {
15554         // TODO: Add uniform vector bitcast support. A simple copy is enough, but
15555         // the ideal resolution is to teach DeSSA to handle that.
15556         CVariable* dst = m_destination;
15557         src = m_currShader->BitCast(src, dst->GetType());
15558         m_encoder->SetNoMask();
15559         m_encoder->SetUniformSIMDSize(lanesToSIMDMode(dstNElts));
15560         m_encoder->SetSrcRegion(0, dstNElts, dstNElts, 1);
15561         m_encoder->Copy(dst, src);
15562         m_encoder->Push();
15563         return;
15564     }
15565     if (srcEltBytes == dstEltBytes)
15566     {
15567         // This should not happen now, but generate code anyway.
15568         // CISABuilder does the split if any splitting is needed.
15569 
15570         // Special case for: 1 element vectors to scalars
15571         //    %15 = bitcast <1 x i64> %4 to i64
15572         if ((srcEltTy == dstEltTy) &&
15573             (srcNElts == dstNElts) && (srcNElts == 1))
15574         {
15575             m_encoder->Copy(m_destination, src);
15576             m_encoder->Push();
15577         }
15578         else if (m_destination != src)
15579         {
15580             for (uint32_t i = 0, offset = 0; i < dstNElts; ++i)
15581             {
15582                 if (destMask.isSet(i))
15583                 {
15584                     m_encoder->SetSrcRegion(0,
15585                         srcUniform ? 0 : 1,
15586                         srcUniform ? 1 : 1,
15587                         srcUniform ? 0 : 0);
15588                     m_encoder->SetSrcSubReg(0, srcUniform ? i : i * width);
15589                     m_encoder->SetDstRegion(1);
15590                     m_encoder->SetDstSubReg(dstUniform ? offset : offset * width);
15591                     m_encoder->Copy(m_destination, src);
15592                     m_encoder->Push();
15593                     offset++;
15594                 }
15595             }
15596         }
15597     }
15598     else if (dstEltBytes > srcEltBytes)
15599     {
15600         IGC_ASSERT(0 < srcEltBytes);
15601         CVariable* aliasDst = m_currShader->GetNewAlias(m_destination, src->GetType(), 0, 0);
15602         uint32_t N = dstEltBytes / srcEltBytes;
15603         IGC_ASSERT_MESSAGE((dstEltBytes % srcEltBytes) == 0, "Basic types should be power of 2");
15604         // Since srcEltBytes can be at most the second-largest element type (32-bit)
15605         // and region hstride == 1, Src will not need splitting!
15606         // Only dst might need splitting.
15607         bool splitDst = (!dstUniform && (dstEltBytes * width > m_currShader->getGRFSize() * 2));
15608         IGC_ASSERT_MESSAGE((!splitDst || (width == 16) || (width == 32)),
15609             "Internal Error: Dst needs splitting only under SIMD16 or SIMD32!");
15610         if (N > 4)
15611         {
15612             // Special case for N = 8 as dst's stride can be 1/2/4, not 8.
15613             //   for example, <1xi64> Y = bitcast <8xi8> X
15614             // we will do the following (simd8)
15615             //   .decl X  type=q num_elts=8
15616             //   .decl Y  type=b num_elts=64
15617             //   .decl Y_alias type=d num_elts=16 alias=<Y,0>
15618             //   .decl V0  type=d num_elts=8
15619             //   .decl V1  type=d num_elts=8
15620             //   .decl V0_alias type=b num_elts=32 alias=<V0, 0>
15621             //   .decl V1_alias type=b num_elts=32 alias=<V1, 0>
15622             //
15623             //   mov (8) V0_alias.0<4> X(0,0)<8;8:1>
15624             //   mov (8) V0_alias.1<4> X(0,8)<8;8:1>
15625             //   mov (8) V0_alias.2<4> X(0,16)<8;8:1>
15626             //   mov (8) V0_alias.3<4> X(0,24)<8;8:1>
15627             //   mov (8) V1_alias.0<4> X(1,0)<8;8:1>
15628             //   mov (8) V1_alias.1<4> X(1,8)<8;8:1>
15629             //   mov (8) V1_alias.2<4> X(1,16)<8;8:1>
15630             //   mov (8) V1_alias.3<4> X(1,24)<8;8:1>
15631             //
15632             // then, combine V0 and V1 to create Y
15633             //   mov (8) Y_alias.0<2> V0(0,0)<8;8,1>
15634             //   mov (8) Y_alias.1<2> V1(0,0)<8;8,1>
15635             //
15636             // For SIMD16, the above two movs will span across two GRFs for their
15637             // dst operands, therefore, they need splitting, that is
15638             //   mov (16) Y_alias.0<2> V0(0,0)<16;16,1>
15639             //   mov (16) Y_alias.1<2> V1(0,0)<16;16,1>
15640             // should be split into the following:
15641             //   mov (8, Q1) Y_alias.0<2>   V0(0,0)<8;8,1>
15642             //   mov (8, Q2) Y_alias.16<2>  V0(1,0)<8;8,1>
15643             //   mov (8, Q1) Y_alias.1<2>   V1(0,0)<8;8,1>
15644             //   mov (8, Q2) Y_alias.17<2>  V1(1,0)<8;8,1>
15645             //
15646             IGC_ASSERT(N == 8);
15647             IGC_ASSERT(srcEltBytes == 1);
15648             const uint32_t N2 = N / 2; // 4
15649             VISA_Type TyD = (src->GetType() == ISA_TYPE_UB) ? ISA_TYPE_UD : ISA_TYPE_D;
15650             CVariable* V0 = m_currShader->GetNewVariable(dstUniform ? 1 : width, TyD, EALIGN_GRF, dstUniform, CName::NONE);
15651             CVariable* V1 = m_currShader->GetNewVariable(dstUniform ? 1 : width, TyD, EALIGN_GRF, dstUniform, CName::NONE);
15652             CVariable* V0_alias = m_currShader->GetNewAlias(V0, src->GetType(), 0, 0);
15653             CVariable* V1_alias = m_currShader->GetNewAlias(V1, src->GetType(), 0, 0);
15654             CVariable* dst_alias = m_currShader->GetNewAlias(m_destination, V0->GetType(), 0, 0);
15655             for (unsigned i = 0, offset = 0; i < dstNElts; ++i)
15656             {
15657                 if (destMask.isSet(i))
15658                 {
15659                     for (unsigned j = 0; j < N; ++j)
15660                     {
15661                         bool useV0 = (j < N2);
15662                         uint32_t oft = useV0 ? j : j - N2;
15663                         m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15664                         m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j)));
15665                         m_encoder->SetDstRegion(dstUniform ? 1 : N2);
15666                         m_encoder->SetDstSubReg(oft);
15667                         m_encoder->Copy(useV0 ? V0_alias : V1_alias, src);
15668                         m_encoder->Push();
15669                     }
15670                     // combine V0 and V1 into dst
15671                     if (splitDst)
15672                     {
15673                         SIMDMode simdSize = SIMDMode::SIMD8;
15674                         int exSize = simdSize == SIMDMode::SIMD16 ? 16 : 8;
15675                         // Dst must not be uniform and it must be SIMD16!
15676                         // first simd8/simd16 : dst_alias = V0
15677                         m_encoder->SetDstRegion(2);
15678                         m_encoder->SetDstSubReg(2 * offset * width);
15679                         m_encoder->SetSimdSize(simdSize);
15680                         m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q1 : EMASK_H1);
15681                         m_encoder->Copy(dst_alias, V0);
15682                         m_encoder->Push();
15683                         // second simd8/simd16: dst_alias=V0
15684                         m_encoder->SetSrcSubReg(0, exSize);
15685                         m_encoder->SetDstRegion(2);
15686                         m_encoder->SetDstSubReg(2 * offset * width + 2 * exSize);
15687                         m_encoder->SetSimdSize(simdSize);
15688                         m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q2 : EMASK_H2);
15689                         m_encoder->Copy(dst_alias, V0);
15690                         m_encoder->Push();
15691 
15692                         // first simd8/simd16 : dst_alias = V1
15693                         m_encoder->SetDstRegion(2);
15694                         m_encoder->SetDstSubReg(2 * offset * width + 1);
15695                         m_encoder->SetSimdSize(simdSize);
15696                         m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q1 : EMASK_H1);
15697                         m_encoder->Copy(dst_alias, V1);
15698                         m_encoder->Push();
15699                         // second simd8/simd16 : dst_alias = V1
15700                         m_encoder->SetSrcSubReg(0, exSize);
15701                         m_encoder->SetDstRegion(2);
15702                         m_encoder->SetDstSubReg(2 * offset * width + 2 * exSize + 1);
15703                         m_encoder->SetSimdSize(simdSize);
15704                         m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q2 : EMASK_H2);
15705                         m_encoder->Copy(dst_alias, V1);
15706                         m_encoder->Push();
15707                     }
15708                     else
15709                     {
15710                         m_encoder->SetDstRegion(dstUniform ? 1 : 2);
15711                         m_encoder->SetDstSubReg(dstUniform ? (2 * offset) : (2 * offset * width));
15712                         m_encoder->Copy(dst_alias, V0);
15713                         m_encoder->Push();
15714                         m_encoder->SetDstRegion(dstUniform ? 1 : 2);
15715                         m_encoder->SetDstSubReg(dstUniform ? (2 * offset + 1) : (2 * offset * width + 1));
15716                         m_encoder->Copy(dst_alias, V1);
15717                         m_encoder->Push();
15718                     }
15719                     offset++;
15720                 }
15721             }
15722         }
15723         else
15724         {
15725             for (unsigned i = 0, offset = 0; i < dstNElts; ++i)
15726             {
15727                 if (destMask.isSet(i))
15728                 {
15729                     for (unsigned j = 0; j < N; ++j)
15730                     {
15731                         if (splitDst)
15732                         {
15733                             // !dstUniform
15734                             // first half
15735                             SIMDMode mode = m_currShader->m_SIMDSize == SIMDMode::SIMD32 ? SIMDMode::SIMD16 : SIMDMode::SIMD8;
15736                             int exSize = mode == SIMDMode::SIMD16 ? 16 : 8;
15737                             m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15738                             m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j)));
15739                             m_encoder->SetDstRegion(N);
15740                             m_encoder->SetDstSubReg(offset * N * width + j);
15741                             m_encoder->SetSimdSize(mode);
15742                             m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H1 : EMASK_Q1);
15743                             m_encoder->Copy(aliasDst, src);
15744                             m_encoder->Push();
15745 
15746                             // second half
15747                             m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15748                             m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j) + exSize));
15749                             m_encoder->SetDstRegion(N);
15750                             m_encoder->SetDstSubReg(offset * N * width + N * exSize + j);
15751                             m_encoder->SetSimdSize(mode);
15752                             m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H2 : EMASK_Q2);
15753                             m_encoder->Copy(aliasDst, src);
15754                             m_encoder->Push();
15755                         }
15756                         else
15757                         {
15758                             m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15759                             m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j)));
15760                             m_encoder->SetDstRegion(dstUniform ? 1 : N);
15761                             m_encoder->SetDstSubReg(dstUniform ? (offset * N + j) : (offset * N * width + j));
15762                             m_encoder->Copy(aliasDst, src);
15763                             m_encoder->Push();
15764                         }
15765                     }
15766                     offset++;
15767                 }
15768             }
15769         }
15770     }
15771     else // (dstEltBytes < srcEltBytes)
15772     {
15773         IGC_ASSERT(0 < dstEltBytes);
15774         // Create an alias of src and mov the alias to the dst
15775         CVariable* aliasSrc = m_currShader->GetNewAlias(src, m_destination->GetType(), 0, 0);
15776         uint32_t N = srcEltBytes / dstEltBytes;
15777         // Similar to the dstEltBytes > srcEltBytes case, dstEltBytes can be 32-bit
15778         // at most and dst's stride == 1, so it will not need splitting.
15779         bool splitSrc = (!srcUniform && (srcEltBytes * width > m_currShader->getGRFSize() * 2));
15780         IGC_ASSERT_MESSAGE((!splitSrc || (width == 16) || (width == 32)),
15781             "Internal Error: Src needs splitting only under SIMD16 or SIMD32!");
15782         IGC_ASSERT_MESSAGE((srcEltBytes % dstEltBytes) == 0, "Basic types should be power of 2");
15783         // avoid coalescing the dst variable if all of its uses are EEI with constant index;
15784         // this gives RA more freedom (e.g. for bank conflict assignments)
15785         auto allUsesAreEEwithImm = [this](BitCastInst* BCI)
15786         {
15787             for (auto I = BCI->user_begin(), E = BCI->user_end(); I != E; ++I)
15788             {
15789                 if (auto EEInst = dyn_cast<ExtractElementInst>(*I))
15790                 {
15791                     if (dyn_cast<ConstantInt>(EEInst->getIndexOperand()))
15792                     {
15793                         continue;
15794                     }
15795                 }
15796                 return false;
15797             }
15798             return true;
15799         };
15800 
15801         SmallVector<CVariable*, 8> VectorBCICVars;
15802         bool useSeparateCVar = m_currShader->m_numberInstance == 1 &&
15803             !dstUniform && srcNElts == 1 && N <= 8 &&
15804             allUsesAreEEwithImm(BCI);
15805 
15806         // Once BCI has been coalesced, don't use separate CVars for BCI
15807         // [todo evaluate the performance impact and let alias handle it
15808         //  if needed]
15809         if (m_currShader->IsCoalesced(BCI))
15810             useSeparateCVar = false;
15811 
15812         for (unsigned i = 0, offset = 0; i < srcNElts; ++i)
15813         {
15814             for (unsigned j = 0; j < N; ++j)
15815             {
15816                 if (destMask.isSet(i * N + j))
15817                 {
15818                     if (useSeparateCVar)
15819                     {
15820                         CVariable* newDst = m_currShader->GetNewVariable(
15821                             width, m_destination->GetType(),
15822                             m_destination->GetAlign(),
15823                             CName::NONE);
15824                         VectorBCICVars.push_back(newDst);
15825                         m_destination = newDst;
15826                     }
15827                     if (splitSrc)
15828                     {
15829                         // !srcUniform
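                        // The non-uniform source spans more than two GRFs here
                        // (srcEltBytes * width > 2 * GRF size), so the copy is emitted as two
                        // halves using H1/H2 (SIMD32) or Q1/Q2 (SIMD16) execution masks.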
15830                         // first half
15831                         SIMDMode mode = m_currShader->m_SIMDSize == SIMDMode::SIMD32 ? SIMDMode::SIMD16 : SIMDMode::SIMD8;
15832                         int exSize = mode == SIMDMode::SIMD16 ? 16 : 8;
15833                         m_encoder->SetSrcRegion(0, N, 1, 0); // = (0, width*N, width, N)
15834                         m_encoder->SetSrcSubReg(0, i * N * width + j);
15835                         m_encoder->SetDstSubReg(dstUniform ? offset : (width * offset));
15836                         m_encoder->SetDstRegion(1);
15837                         m_encoder->SetSimdSize(mode);
15838                         m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H1 : EMASK_Q1);
15839                         m_encoder->Copy(m_destination, aliasSrc);
15840                         m_encoder->Push();
15841 
15842                         // second half
15843                         m_encoder->SetSrcRegion(0, N, 1, 0); // = (0, width*N, width, N)
15844                         m_encoder->SetSrcSubReg(0, i * N * width + N * exSize + j);
15845                         m_encoder->SetDstSubReg(dstUniform ? offset : (width * offset + exSize));
15846                         m_encoder->SetDstRegion(1);
15847                         m_encoder->SetSimdSize(mode);
15848                         m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H2 : EMASK_Q2);
15849                         m_encoder->Copy(m_destination, aliasSrc);
15850                         m_encoder->Push();
15851                     }
15852                     else
15853                     {
15854                         m_encoder->SetSrcRegion(0, srcUniform ? 0 : N, 1, 0); // = (0, width*N, width, N)
15855                         m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (i * N * width + j));
15856                         m_encoder->SetDstSubReg(dstUniform ? offset : (width * offset));
15857                         m_encoder->SetDstRegion(1);
15858                         m_encoder->Copy(m_destination, aliasSrc);
15859                         m_encoder->Push();
15860                     }
15861                     if (!useSeparateCVar)
15862                     {
15863                         // offset stays at zero if we are using distinct variables for each EEI
15864                         offset++;
15865                     }
15866                 }
15867             }
15868         }
15869 
15870         if (useSeparateCVar)
15871         {
15872             m_currShader->addCVarsForVectorBC(BCI, VectorBCICVars);
15873         }
15874     }
15875 }
15876 
15877 unsigned int EmitPass::GetPrimitiveTypeSizeInRegisterInBits(const Type* Ty) const
15878 {
15879     return m_currShader->GetPrimitiveTypeSizeInRegisterInBits(Ty);
15880 }
15881 
15882 unsigned int EmitPass::GetPrimitiveTypeSizeInRegister(const Type* Ty) const
15883 {
15884     return m_currShader->GetPrimitiveTypeSizeInRegister(Ty);
15885 }
15886 
15887 unsigned int EmitPass::GetScalarTypeSizeInRegisterInBits(const Type* Ty) const
15888 {
15889     return m_currShader->GetScalarTypeSizeInRegisterInBits(Ty);
15890 }
15891 
15892 unsigned int EmitPass::GetScalarTypeSizeInRegister(const Type* Ty) const
15893 {
15894     return m_currShader->GetScalarTypeSizeInRegister(Ty);
15895 }
15896 
15897 
15898 void EmitPass::A64LSLoopHead(
15899     CVariable* addr, CVariable*& curMask, CVariable*& lsPred, uint& label)
15900 {
15901     // Create a loop to calculate the LS's predicate (lsPred), making sure that for every
15902     // active lane of the LS the high part of the address is the same
15903     //
15904     // pseudo code (including A64LSLoopHead and A64LSLoopTail):
15905     //          addrHigh = packed addr hi part
15906     //          curMask = executionMask
15907     //      label:
15908     //          uniformAddrHi = the_first_active_lane_of_CurMask(addrHigh)
15909     //          lsPred = cmp(uniformAddrHi, addrHigh)
15910     //          (lsPred) send // the original LS instruction
15911     //          lsPred = ~lsPred
15912     //          CurMask = lsPred & CurMask
15913     //          lsPred = CurMask
15914     //          (lsPred) jmp label
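    //
    // Each iteration services the group of lanes whose address-high matches the
    // first active lane's and then clears them from curMask, so the loop runs at
    // most once per distinct high DWORD among the active lanes.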
15915 
15916     SIMDMode simdMode = m_encoder->GetSimdSize();
15917     uint16_t execSize = numLanes(simdMode);
15918     IGC_ASSERT(simdMode == SIMDMode::SIMD8 || simdMode == SIMDMode::SIMD16);
15919 
15920     // get address hi part
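    // The 64-bit per-lane addresses are viewed as 2 * execSize DWORDs; starting at
    // subreg 1 with a <2;1,0> region selects the high DWORD of every lane.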
15921     CVariable* addrAlias = m_currShader->GetNewAlias(addr, ISA_TYPE_UD, 0, execSize * 2);
15922     CVariable* addrHigh = m_currShader->GetNewVariable(
15923         execSize, ISA_TYPE_UD, EALIGN_GRF, false, CName::NONE);
15924     m_encoder->SetSrcSubReg(0, 1);
15925     m_encoder->SetSrcRegion(0, 2, 1, 0);
15926     m_encoder->Copy(addrHigh, addrAlias);
15927     m_encoder->Push();
15928 
15929     curMask = GetHalfExecutionMask();
15930 
15931     // create loop
15932     label = m_encoder->GetNewLabelID("a64_loop");
15933     m_encoder->Label(label);
15934     m_encoder->Push();
15935 
15936     // Get the first active lane's address-hi
15937     CVariable* ufoffset = nullptr;
15938     CVariable* uniformAddrHi = UniformCopy(addrHigh, ufoffset, curMask, true);
15939 
15940     // Set the predicate lsPred to true for all lanes with the same address_hi
15941     lsPred = m_currShader->GetNewVariable(
15942         numLanes(m_currShader->m_dispatchSize), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
15943     m_encoder->Cmp(EPREDICATE_EQ, lsPred, uniformAddrHi, addrHigh);
15944     m_encoder->Push();
15945 }
15946 
15947 void EmitPass::A64LSLoopTail(CVariable* curMask, CVariable* lsPred, uint label)
15948 {
15949     // Unset the bits in the mask for lanes that were executed
15950     bool tmpSh = m_encoder->IsSecondHalf();
15951     m_encoder->SetSecondHalf(false);
15952 
15953     CVariable* tmpLsPred = m_currShader->GetNewVariable(1, curMask->GetType(), curMask->GetAlign(), true, CName::NONE);
15954     m_encoder->Cast(tmpLsPred, lsPred);
15955 
15956     m_encoder->SetSrcModifier(1, EMOD_NOT);
15957     m_encoder->And(curMask, curMask, tmpLsPred);
15958     m_encoder->Push();
15959     m_encoder->SetP(lsPred, curMask);
15960     m_encoder->Push();
15961     m_encoder->Jump(lsPred, label);
15962     m_encoder->Push();
15963 
15964     m_encoder->SetSecondHalf(tmpSh);
15965 }
15966 
15967 bool EmitPass::hasA64WAEnable() const
15968 {
15969     // Check WA table entry for current platform.
15970     if (!m_currShader->m_Platform->WaEnableA64WA())
15971         return false;
15972 
15973     // -intel-force-enable-a64WA
15974     if (m_pCtx->getModuleMetaData()->compOpt.ForceEnableA64WA)
15975         return true;
15976 
15977     // -intel-disable-a64WA
15978     if (m_pCtx->getModuleMetaData()->compOpt.DisableA64WA)
15979         return false;
15980 
15981     // Disable A64WA for kernels which specify work_group_size_hint(1, 1, 1).
15982     MetaDataUtils* pMdUtils =  m_currShader->GetMetaDataUtils();
15983     uint32_t WGSize = IGCMetaDataHelper::getThreadGroupSizeHint(*pMdUtils, m_currShader->entry);
15984     if (WGSize == 1)
15985         return false;
15986 
15987     return true;
15988 }
15989 
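// emitGatherA64/emitGather4A64/emitScatterA64/emitScatter4A64:
//   When the A64 workaround is enabled and the address is not uniform, wrap the
//   send in the A64LSLoopHead/Tail loop so that each send only executes lanes
//   whose address high DWORD matches the first active lane's; otherwise emit the
//   plain send.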
15990 void EmitPass::emitGatherA64(Value* loadInst, CVariable* dst, CVariable* offset, unsigned elemSize, unsigned numElems, bool addrUniform)
15991 {
15992     if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
15993         CVariable* curMask = nullptr;
15994         CVariable* lsPred = nullptr;
15995         uint label = 0;
15996         A64LSLoopHead(offset, curMask, lsPred, label);
15997 
15998         // do send with pred
15999         if (isa<LoadInst>(loadInst) && !m_currShader->IsCoalesced(loadInst))
16000         {
16001             // load inst is the single def of the vISA variable and therefore a kill
16002             m_encoder->Lifetime(LIFETIME_START, dst);
16003         }
16004         m_encoder->SetPredicate(lsPred);
16005         m_encoder->GatherA64(dst, offset, elemSize, numElems);
16006         m_encoder->Push();
16007 
16008         A64LSLoopTail(curMask, lsPred, label);
16009 
16010     } else {
16011         m_encoder->GatherA64(dst, offset, elemSize, numElems);
16012     }
16013 }
16014 
16015 void EmitPass::emitGather4A64(Value* loadInst, CVariable* dst, CVariable* offset, bool addrUniform)
16016 {
16017     if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
16018         CVariable* curMask = nullptr;
16019         CVariable* lsPred = nullptr;
16020         uint label = 0;
16021         A64LSLoopHead(offset, curMask, lsPred, label);
16022 
16023         // do send with pred
16024         if (isa<LoadInst>(loadInst) && !m_currShader->IsCoalesced(loadInst))
16025         {
16026             // load inst is the single def of the vISA variable and therefore a kill
16027             m_encoder->Lifetime(LIFETIME_START, dst);
16028         }
16029         m_encoder->SetPredicate(lsPred);
16030         m_encoder->Gather4A64(dst, offset);
16031         m_encoder->Push();
16032 
16033         A64LSLoopTail(curMask, lsPred, label);
16034 
16035     }
16036     else {
16037         m_encoder->Gather4A64(dst, offset);
16038     }
16039 }
16040 
16041 void EmitPass::emitScatterA64(CVariable* val, CVariable* offset, unsigned elementSize, unsigned numElems, bool addrUniform)
16042 {
16043     if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
16044         CVariable* curMask = nullptr;
16045         CVariable* lsPred = nullptr;
16046         uint label = 0;
16047         A64LSLoopHead(offset, curMask, lsPred, label);
16048 
16049         // do send with pred
16050         m_encoder->SetPredicate(lsPred);
16051         m_encoder->ScatterA64(val, offset, elementSize, numElems);
16052         m_encoder->Push();
16053 
16054         A64LSLoopTail(curMask, lsPred, label);
16055 
16056     }
16057     else {
16058         m_encoder->ScatterA64(val, offset, elementSize, numElems);
16059     }
16060 }
16061 
16062 void EmitPass::emitScatter4A64(CVariable* src, CVariable* offset, bool addrUniform)
16063 {
16064     if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
16065         CVariable* curMask = nullptr;
16066         CVariable* lsPred = nullptr;
16067         uint label = 0;
16068         A64LSLoopHead(offset, curMask, lsPred, label);
16069 
16070         // do send with pred
16071         m_encoder->SetPredicate(lsPred);
16072         m_encoder->Scatter4A64(src, offset);
16073         m_encoder->Push();
16074 
16075         A64LSLoopTail(curMask, lsPred, label);
16076 
16077     }
16078     else {
16079         m_encoder->Scatter4A64(src, offset);
16080     }
16081 }
16082 
16083 
16084 
16085 void EmitPass::emitVectorLoad(LoadInst* inst, Value* offset, ConstantInt* immOffset)
16086 {
16087     int immOffsetInt = 0;
16088     if (immOffset)
16089         immOffsetInt = static_cast<int>(immOffset->getSExtValue());
16090 
16091     Value* Ptr = inst->getPointerOperand();
16092     PointerType* ptrType = cast<PointerType>(Ptr->getType());
16093     bool useA32 = !IGC::isA64Ptr(ptrType, m_currShader->GetContext());
16094 
16095     ResourceDescriptor resource = GetResourceVariable(Ptr);
16096     CountStatelessIndirectAccess(Ptr, resource);
16097     // eOffset is in bytes
16098     // offset corresponds to Int2Ptr operand obtained during pattern matching
16099     CVariable* eOffset = GetSymbol(immOffset ? offset : Ptr);
16100     if (useA32)
16101     {
16102         eOffset = TruncatePointer(eOffset);
16103     }
16104 
16105     Type* Ty = inst->getType();
16106     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
16107     Type* eltTy = VTy ? VTy->getElementType() : Ty;
16108     uint32_t eltBytes = GetScalarTypeSizeInRegister(eltTy);
16109     IGC_ASSERT_MESSAGE((eltBytes == 1) || (eltBytes == 2) || (eltBytes == 4) || (eltBytes == 8),
16110         "Load's type (element type if vector) must be 1/2/4/8-byte long");
16111 
16112     uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
16113     uint32_t totalBytes = eltBytes * elts;
16114 
16115     bool destUniform = m_destination->IsUniform();
16116     bool srcUniform = eOffset->IsUniform();
16117     // Not possible to have uniform dest AND non-uniform src.
16118     IGC_ASSERT_MESSAGE(!(destUniform && !srcUniform),
16119         "If ld's dest is uniform, ld's src must be uniform");
16120 
16121     unsigned align = inst->getAlignment();
16122     VISA_Type destType = m_destination->GetType();
16123     uint32_t width = numLanes(m_currShader->m_SIMDSize);
16124     uint bufferIndex = 0;
16125     bool directIndexing = false;
16126     BufferType bufType = DecodeAS4GFXResource(ptrType->getAddressSpace(), directIndexing, bufferIndex);
16127 
16128     if (bufType == STATELESS_A32)
16129     {
16130         // Lower address space (5) loads to A32 oword ld
16131         CVariable* loadDest = m_destination;
16132         uint size = loadDest->GetSize();
16133         auto newDest = loadDest;
16134         if (bufType == STATELESS_A32)
16135         {
16136             auto r0 = m_currShader->GetR0();
16137             m_encoder->SetSimdSize(SIMDMode::SIMD1);
16138             m_encoder->SetNoMask();
16139             m_encoder->SetSrcRegion(0, 0, 1, 0);
16140             m_encoder->SetSrcSubReg(0, 0);
16141             auto dst = m_currShader->GetNewVariable(1, VISA_Type::ISA_TYPE_D, eOffset->GetAlign(), true, "OWOff");
16142             m_encoder->And(dst, r0, m_currShader->ImmToVariable(0xffffffe0, ISA_TYPE_UD));
16143             m_encoder->Push();
16144             m_encoder->SetSimdSize(SIMDMode::SIMD1);
16145             m_encoder->SetNoMask();
16146             m_encoder->Add(dst, dst, eOffset);
16147             m_encoder->Push();
16148             eOffset = dst;
16149             if (!iSTD::IsPowerOfTwo(size) || size < SIZE_OWORD)
16150             {
16151                 // Ensure the payload size is a power of 2 and at least one OWORD (16 bytes)
16152                 if (size < SIZE_OWORD)
16153                 {
16154                     size = std::max<unsigned int>(size, SIZE_OWORD);
16155                 }
16156                 else if (!iSTD::IsPowerOfTwo(size))
16157                 {
16158                     // The llvm optimizer converts a vector load <4 x i64> into <3 x i64> if the
16159                     // last element isn't used. Recompute size to the next higher power of 2.
16160                     size = (uint)std::pow(2, std::ceil(std::log2(size)));
16161                 }
16162                 newDest = m_currShader->GetNewVariable(size / loadDest->GetElemSize(), loadDest->GetType(), EALIGN_GRF, true, CName::NONE);
16163             }
16164         }
16165         m_encoder->OWLoad(newDest, resource, eOffset, false, size);
16166         if (newDest != loadDest)
16167         {
16168             emitVectorCopy(loadDest, newDest, loadDest->GetNumberElement());
16169         }
16170         return;
16171     }
16172 
16173     // First, special handling for less than 4 bytes of loaded value
16174     if (totalBytes < 4)
16175     {
16176         // totalBytes is either 1 or 2, and the value must be a scalar or a 1-element vector;
16177         // do not expect <2 x i8> or <3 x i8>
16178         IGC_ASSERT(elts == 1);
16179         IGC_ASSERT(totalBytes != 3);
16180 
16181         uint16_t nbelts = srcUniform ? 1 : width;
16182         e_alignment align = EALIGN_GRF;
16183 
16184         eOffset = ReAlignUniformVariable(eOffset, align);
16185 
16186         bool needTemporary = (totalBytes < 4) || !IsGRFAligned(m_destination, EALIGN_GRF);
16187         CVariable* gatherDst = m_destination;
16188         if (needTemporary)
16189         {
16190             gatherDst = m_currShader->GetNewVariable(nbelts, ISA_TYPE_UD, align, srcUniform, CName::NONE);
16191         }
16192 
16193         if (srcUniform)
16194         {
16195             m_encoder->SetNoMask();
16196             m_encoder->SetUniformSIMDSize(SIMDMode::SIMD1);
16197         }
16198 
16199         if (useA32)
16200         {
16201             m_encoder->ByteGather(gatherDst, resource, eOffset, 8, totalBytes);
16202         }
16203         else
16204         {
16205             emitGatherA64(inst, gatherDst, eOffset, 8, totalBytes, srcUniform);
16206         }
16207 
16208         m_encoder->Push();
16209 
16210         if (needTemporary)
16211         {
16212             gatherDst = m_currShader->GetNewAlias(gatherDst, destType, 0, 0);
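            // The byte gather returned a full DWORD per lane; after aliasing to the
            // destination type, a vertical stride of 4 (byte dest) or 2 (word dest)
            // picks the low byte/word of each returned DWORD.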
16213             uint32_t vStride = srcUniform ? 0 : ((totalBytes == 1) ? 4 : 2);
16214             m_encoder->SetSrcRegion(0, vStride, 1, 0);
16215             m_encoder->Copy(m_destination, gatherDst);
16216             m_encoder->Push();
16217         }
16218         return;
16219     }
16220 
16221 
16222     bool bEmulateDWAligned = false;
16223 
16224     // generate oword-load if possible
16225     if (VTy && srcUniform)
16226     {
16227         //uint32_t totalBytes = eltBytes * VTy->getNumElements();
16228         bool rightBlockSize = (totalBytes == 16 || totalBytes == 32 || totalBytes == 64 || totalBytes == 128);
16229         bool useDWAligned = (resource.m_surfaceType != ESURFACE_SLM && align && align >= 4);
16230         // If unaligned DW would originally be used for SSS on XeHP_SDV and above, emulate it with Gather4Scaled
16231         bEmulateDWAligned = (rightBlockSize && useDWAligned &&
16232             m_currShader->m_Platform->hasScratchSurface() && resource.m_surfaceType == ESURFACE_SCRATCH && align && align >= 4);
16233         useDWAligned &= (!bEmulateDWAligned);
16234         bool useOWAligned = (resource.m_surfaceType == ESURFACE_SLM && align && align >= 16 &&
16235             m_currShader->m_Platform->supportSLMBlockMessage());
16236 
16237         if (rightBlockSize && (useDWAligned || useOWAligned))
16238         {
16239             bool needTemp = (!destUniform || !IsGRFAligned(m_destination, EALIGN_GRF));
16240             CVariable * loadDest = m_destination;
16241 
16242             if (useOWAligned)
16243             {
16244                 // Offset needs to be in OW!
16245                 // Need to create a new cvar as eOffset could be used by others.
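                // An OWORD is 16 bytes, hence the logical shift right by 4 below.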
16246 
16247                 CVariable* tmp = m_currShader->GetNewVariable(eOffset);
16248                 m_encoder->Shr(tmp, eOffset, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
16249                 m_encoder->Push();
16250                 eOffset = tmp;
16251             }
16252             eOffset = ReAlignUniformVariable(eOffset, EALIGN_GRF);
16253             if (needTemp)
16254             {
16255                 loadDest = m_currShader->GetNewVariable(
16256                     int_cast<uint16_t>(VTy->getNumElements()),
16257                     m_destination->GetType(),
16258                     EALIGN_GRF,
16259                     true, CName::NONE);
16260             }
16261 
16262             if (useA32)
16263             {
16264                 m_encoder->OWLoad(loadDest, resource, eOffset, useOWAligned, loadDest->GetSize());
16265             }
16266             else
16267             {
16268                 IGC_ASSERT_MESSAGE(!useOWAligned, "SLM's pointer size must be 32 bit!");
16269                 // emit svm block read
16270                 m_encoder->OWLoadA64(loadDest, eOffset, loadDest->GetSize());
16271             }
16272             m_encoder->Push();
16273 
16274             if (needTemp)
16275             {
16276                 emitVectorCopy(m_destination, loadDest, int_cast<unsigned>(VTy->getNumElements()));
16277             }
16278             return;
16279         }
16280     }
16281 
16282     // Only handle 4/8/12/16/32 bytes here. For aligned 16/32 bytes, it should've been handled
16283     // by oword already (except for SLM).  We have 12 bytes for load of int3 (either aligned or
16284     // unaligned[vload]).
16285     //
16286     // Note that for simplicity, don't do it if totalBytes=32 and 64bit integer adds are needed
16287     // on platforms that do not support 64-bit integer add.
16288     // Note: it doesn't seem to be necessary to check hasNoFP64Inst() here.
16289     if (srcUniform && (totalBytes == 4 || totalBytes == 8 || totalBytes == 12 || totalBytes == 16 ||
16290         (totalBytes == 32 && (useA32 || !m_currShader->m_Platform->hasNoFullI64Support()))))
16291     {
16292         bool needTemp = !destUniform ||
16293             !IsGRFAligned(m_destination, EALIGN_GRF) ||
16294             totalBytes == 12;
16295         // For uniform src, we can map value to messages (vector re-layout) as follows
16296         //   1. A64:
16297         //      <1 x i64> for align=8 && totalBytes=8 (eltBytes == 4 or 8);
16298         //        [ (blksize, nblk) = (64, 1) ]
16299         //      <n x i32> for align=4; [ (blksize, nblk) = (32, 1) ]
16300         //      <n x S> for align < 4,
16301         //         where S = <8xi8> if eltBytes = 8, or S = <4xi8> otherwise;
16302         //         [ (blksize, nblk) = (8, 8) or (8, 4) ]
16303         //   2. A32:
16304         //      <n x S>, where S = <4 x i8>, ie, block size = 8 bits and #blocks = 4
16305         //         [ (blksize, nblk) = (8, 4) ]
16306         //   where n is the number of elements
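        //   For example (illustrative): a uniform "load <3 x i32>, align 4" over A64
        //   takes the (blksize, nblk) = (32, 1) path with activelanes = 3 rounded up
        //   to nbelts = 4.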
16307 
16308         // use A64 scattered RW with QW block size; Note that totalBytes == 16 with align >=4
16309         // should be handled by oword already (except for SLM).
16310         bool useQW = (!useA32) && (totalBytes == 8 || totalBytes == 16) &&
16311             (align >= 8 || eltBytes == 8);
16312 
16313         // activelanes is the number of lanes that are needed.
16314         // nbelts is activelanes rounded up to the next power of 2.
16315         uint16_t activelanes = useQW ? (totalBytes / 8) : (totalBytes / 4);
16316         uint16_t nbelts = (activelanes == 3 ? 4 : activelanes);
16317 
16318         // For scattered RW
16319         uint32_t blkBits = useA32 ? 8 : (align < 4 ? 8 : (useQW ? 64 : 32));
16320         uint32_t nBlks = useA32 ? 4 : (align < 4 ? (useQW ? 8 : 4) : 1);
16321 
16322         VISA_Type ldType = useQW ? ISA_TYPE_UQ : ISA_TYPE_UD;
16323         CVariable* gatherDst;
16324         if (needTemp)
16325         {
16326             gatherDst = m_currShader->GetNewVariable(
16327                 nbelts, ldType, EALIGN_GRF, true /*srcUniform*/, CName::NONE);
16328         }
16329         else
16330         {
16331             gatherDst = m_destination;
16332             if (m_destination->GetType() != ldType)
16333             {
16334                 gatherDst = m_currShader->GetNewAlias(gatherDst, ldType, 0, nbelts);
16335             }
16336         }
16337 
16338         SIMDMode simdmode = lanesToSIMDMode(nbelts);
16339         eOffset = ReAlignUniformVariable(eOffset, useA32 ? EALIGN_GRF : EALIGN_2GRF);
16340         CVariable* gatherOff = eOffset;
16341         if (nbelts > 1)
16342         {
16343             gatherOff = m_currShader->GetNewVariable(
16344                 nbelts, eOffset->GetType(), eOffset->GetAlign(), true /*srcUniform*/, CName::NONE);
16345             // May have the following
16346             //   lane   0   1   2   3   4    5   6   7
16347             //   eOff   0   4   8   C   10   14  18  1C // DW per lane
16348             //   eOff   0   8                           // QW per lane
16349             // When activelanes = 3, lane 3 is not used. Since we don't have simd3,
16350             // use simd4 and set lane3 to lane2.
16351             uint32_t incImm = 0;
16352             uint32_t incImm1 = 0;  // for activelanes=8
16353             switch (activelanes) {
16354             default:
16355                 IGC_ASSERT_MESSAGE(0, "ICE: something went wrong in computing activelanes!");
16356                 break;
16357             case 2:
16358                 // only can have QW in this case
16359                 incImm = useQW ? 0x80 : 0x40;
16360                 break;
16361             case 3:
16362                 // set lane3 to be the same as lane2 (it is 8)
16363                 incImm = 0x8840;
16364                 break;
16365             case 4:
16366                 incImm = 0xC840;
16367                 break;
16368             case 8:
16369                 // Make sure incImm + incImm1 = {0  4  8  C  10   14  18  1C}
16370                 incImm = 0xD951C840;
16371                 incImm1 = 0xFFFF0000;
16372                 break;
16373             }
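            // Note: a :UV immediate packs eight 4-bit lane values (lane 0 in the lowest
            // nibble), so 0xC840 encodes the per-lane byte offsets {0, 4, 8, C}. For
            // activelanes == 8, incImm {0,4,8,C,1,5,9,D} plus incImm1 {0,0,0,0,F,F,F,F}
            // yields {0, 4, 8, C, 10, 14, 18, 1C}.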
16374 
16375             CVariable* immVar = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
16376             if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16377                 emitAddPair(gatherOff, eOffset, immVar);
16378             }
16379             else {
16380                 m_encoder->SetNoMask();
16381                 m_encoder->SetUniformSIMDSize(simdmode);
16382                 m_encoder->SetSrcRegion(0, 0, 1, 0);
16383                 m_encoder->Add(gatherOff, eOffset, immVar);
16384                 m_encoder->Push();
16385             }
16386 
16387             if (activelanes == 8) {
16388                 CVariable* immVar1 = m_currShader->ImmToVariable(incImm1, ISA_TYPE_UV);
16389                 m_encoder->SetNoMask();
16390                 m_encoder->SetUniformSIMDSize(simdmode);
16391                 m_encoder->SetSrcRegion(0, 8, 8, 1);
16392                 m_encoder->Add(gatherOff, gatherOff, immVar1);
16393                 m_encoder->Push();
16394             }
16395         }
16396 
16397         m_encoder->SetNoMask();
16398         m_encoder->SetUniformSIMDSize(simdmode);
16399         if (useA32)
16400         {
16401             m_encoder->SetNoMask();
16402             m_encoder->SetUniformSIMDSize(simdmode);
16403             if (m_currShader->m_Platform->hasScratchSurface() &&
16404                 align >= 4 &&
16405                 (m_currShader->m_Platform->emulateByteScraterMsgForSS() || bEmulateDWAligned) &&
16406                 (ESURFACE_SCRATCH == resource.m_surfaceType))
16407             {
16408                 m_encoder->Gather4Scaled(gatherDst, resource, gatherOff);
16409             }
16410             else
16411             {
16412                 m_encoder->ByteGather(gatherDst, resource, gatherOff, blkBits, nBlks);
16413             }
16414         }
16415         else
16416         {
16417             emitGatherA64(inst, gatherDst, gatherOff, blkBits, nBlks, srcUniform);
16418         }
16419         m_encoder->Push();
16420 
16421         if (needTemp)
16422         {
16423             CVariable* newDst = m_currShader->GetNewAlias(
16424                 gatherDst, destType, 0, m_destination->GetNumberElement());
16425             emitVectorCopy(m_destination, newDst, elts);
16426         }
16427         return;
16428     }
16429     CVariable* subLoadDst;
16430     CVariable* rawAddrVar;
16431 
16432     // some drivers describe the constant buffer as typed, which forces us to use a byte scatter message
16433     bool forceByteScatteredRW =
16434         bufType == CONSTANT_BUFFER &&
16435         UsesTypedConstantBuffer(m_currShader->GetContext(), bufType);
16436 
16437     VectorMessage VecMessInfo(this);
16438     VecMessInfo.getInfo(Ty, align, useA32, forceByteScatteredRW);
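    // VectorMessage splits the vector access into a sequence of legal messages;
    // each insts[i] records the start byte, block size, block count and message
    // kind of one send.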
16439 
16440     // Handle uniform case in general
16441     if (srcUniform)
16442     {
16443         // Use a width of getGRFSize()/4 (8 on 32-byte-GRF platforms); only the value of the
16444         // first lane is used. Need to set noMask in order to have a valid value in
16445         // the first lane.
16446         uint32_t width8 = getGRFSize() / 4;
16447         for (uint32_t i = 0; i < VecMessInfo.numInsts; ++i)
16448         {
16449             // raw operand, eltOffBytes is in bytes.
16450             uint32_t eltOffBytes = VecMessInfo.insts[i].startByte;
16451             uint32_t blkInBytes = VecMessInfo.insts[i].blkInBytes;
16452             uint32_t numBlks = VecMessInfo.insts[i].numBlks;
16453 
16454             uint32_t eltOff = eltOffBytes / eltBytes;  // in unit of element
16455             uint32_t blkBits = 8 * blkInBytes;
16456             uint32_t instTotalBytes = blkInBytes * numBlks;
16457             uint32_t instElts = instTotalBytes / eltBytes;
16458             uint32_t nbelts = instElts * width8;
16459 
16460             if (i > 0)
16461             {
16462                 // Calculate the new element offset
16463                 rawAddrVar = m_currShader->GetNewVariable(eOffset);
16464                 CVariable* ImmVar = m_currShader->ImmToVariable(eltOffBytes, ISA_TYPE_UD);
16465                 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16466                     emitAddPair(rawAddrVar, eOffset, ImmVar);
16467                 }
16468                 else {
16469                     m_encoder->SetNoMask();
16470                     m_encoder->Add(rawAddrVar, eOffset, ImmVar);
16471                     m_encoder->Push();
16472                 }
16473             }
16474             else
16475             {
16476                 rawAddrVar = eOffset;
16477             }
16478             CVariable* addrVarSIMD8 = m_currShader->GetNewVariable(
16479                 getGRFSize() / 4, rawAddrVar->GetType(), EALIGN_GRF, CName::NONE);
16480             m_encoder->SetNoMask();
16481             m_encoder->SetSimdSize(lanesToSIMDMode(addrVarSIMD8->GetNumberElement()));
16482             m_encoder->Copy(addrVarSIMD8, rawAddrVar);
16483 
16484             subLoadDst = m_currShader->GetNewVariable(
16485                 (uint16_t)nbelts, destType, EALIGN_GRF, CName::NONE);
16486             m_encoder->SetNoMask();
16487             m_encoder->SetSimdSize(lanesToSIMDMode(addrVarSIMD8->GetNumberElement()));
16488             VectorMessage::MESSAGE_KIND messageType = VecMessInfo.insts[i].kind;
16489             switch (messageType) {
16490             case VectorMessage::MESSAGE_A32_BYTE_SCATTERED_RW:
16491                 m_encoder->ByteGather(subLoadDst, resource, addrVarSIMD8, blkBits, numBlks);
16492                 break;
16493             case VectorMessage::MESSAGE_A32_UNTYPED_SURFACE_RW:
16494                 m_encoder->Gather4Scaled(subLoadDst, resource, addrVarSIMD8);
16495                 break;
16496             case VectorMessage::MESSAGE_A64_UNTYPED_SURFACE_RW:
16497                 emitGather4A64(inst, subLoadDst, addrVarSIMD8, true);
16498                 break;
16499             case VectorMessage::MESSAGE_A64_SCATTERED_RW:
16500                 emitGatherA64(inst, subLoadDst, addrVarSIMD8, blkBits, numBlks, true);
16501                 break;
16502             default:
16503                 IGC_ASSERT_MESSAGE(0, "Something's wrong!");
16504             }
16505             m_encoder->Push();
16506 
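            // Copy the result back: each loaded element occupies a group of width8
            // lanes in subLoadDst (all lanes loaded the same address), so read lane 0
            // of each group with a <0;1,0> region and place it at the destination's
            // corresponding sub-register.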
16507             for (uint32_t n = 0; n < instElts; ++n)
16508             {
16509                 m_encoder->SetSrcRegion(0, 0, 1, 0);
16510                 m_encoder->SetSrcSubReg(0, n * width8);
16511                 m_encoder->SetDstSubReg(eltOff + (destUniform ? n : n * width));
16512                 m_encoder->Copy(m_destination, subLoadDst);
16513                 m_encoder->Push();
16514             }
16515         }
16516 
16517         return;
16518     }
16519 
16520     // Second, src isn't uniform
16521     for (uint32_t i = 0; i < VecMessInfo.numInsts; ++i)
16522     {
16523         // raw operand, eltOffBytes is in bytes.
16524         uint32_t eltOffBytes = VecMessInfo.insts[i].startByte * width;
16525         uint32_t blkInBytes = VecMessInfo.insts[i].blkInBytes;
16526         uint32_t numBlks = VecMessInfo.insts[i].numBlks;
16527         uint32_t eltOff = eltOffBytes / eltBytes;
16528         uint32_t blkBits = 8 * blkInBytes;
16529         uint32_t instTotalBytes = blkInBytes * numBlks;
16530         uint32_t instElts = instTotalBytes / eltBytes;
16531         uint32_t nbelts = instElts * width;
16532 
16533         if (i > 0)
16534         {
16535             // Calculate the new element offset
16536             rawAddrVar = m_currShader->GetNewVariable(eOffset);
16537             CVariable* ImmVar = m_currShader->ImmToVariable(VecMessInfo.insts[i].startByte, ISA_TYPE_UD);
16538             if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16539                 emitAddPair(rawAddrVar, eOffset, ImmVar);
16540             }
16541             else {
16542                 m_encoder->Add(rawAddrVar, eOffset, ImmVar);
16543                 m_encoder->Push();
16544             }
16545         }
16546         else
16547         {
16548             rawAddrVar = eOffset;
16549         }
16550 
16551         bool needTemp = (!IsGRFAligned(m_destination, EALIGN_GRF));
16552         CVariable* gatherDst;
16553         if (needTemp)
16554         {
16555             gatherDst = m_currShader->GetNewVariable(
16556                 (uint16_t)nbelts, destType, EALIGN_GRF, CName::NONE);
16557         }
16558         else
16559         {
16560             // No need to copy, load directly into m_destination
16561             gatherDst = m_currShader->GetNewAlias(m_destination,
16562                 destType, (uint16_t)eltOffBytes, (uint16_t)nbelts);
16563         }
16564         VectorMessage::MESSAGE_KIND messageType = VecMessInfo.insts[i].kind;
16565         switch (messageType) {
16566         case VectorMessage::MESSAGE_A32_BYTE_SCATTERED_RW:
16567             m_encoder->ByteGather(gatherDst, resource, rawAddrVar, blkBits, numBlks);
16568             break;
16569         case VectorMessage::MESSAGE_A32_UNTYPED_SURFACE_RW:
16570             m_encoder->Gather4Scaled(gatherDst, resource, rawAddrVar);
16571             break;
16572         case VectorMessage::MESSAGE_A64_UNTYPED_SURFACE_RW:
16573             emitGather4A64(inst, gatherDst, rawAddrVar, false);
16574             break;
16575         case VectorMessage::MESSAGE_A64_SCATTERED_RW:
16576             emitGatherA64(inst, gatherDst, rawAddrVar, blkBits, numBlks, false);
16577             break;
16578         default:
16579             IGC_ASSERT_MESSAGE(0, "Internal Error: unexpected message kind for load!");
16580         }
16581         m_encoder->Push();
16582 
16583         if (needTemp)
16584         {
16585             emitVectorCopy(m_destination, gatherDst, instElts, eltOff, 0);
16586         }
16587     }
16588 }
16589 
16590 void EmitPass::emitVectorStore(StoreInst* inst, Value* offset, ConstantInt* immOffset)
16591 {
16592     int immOffsetInt = 0;
16593     if (immOffset)
16594         immOffsetInt = static_cast<int>(immOffset->getSExtValue());
16595 
16596     Value* Ptr = inst->getPointerOperand();
16597     PointerType* ptrType = cast<PointerType>(Ptr->getType());
16598 
16599     ResourceDescriptor resource = GetResourceVariable(Ptr);
16600     CountStatelessIndirectAccess(Ptr, resource);
16601     if (ptrType->getPointerAddressSpace() != ADDRESS_SPACE_PRIVATE)
16602     {
16603         ForceDMask(false);
16604     }
16605     // eOffset is in bytes
16606     // offset corresponds to Int2Ptr operand obtained during pattern matching
16607     CVariable* eOffset = GetSymbol(immOffset ? offset : Ptr);
16608     bool useA32 = !isA64Ptr(ptrType, m_currShader->GetContext());
16609     if (useA32)
16610     {
16611         eOffset = TruncatePointer(eOffset);
16612     }
16613 
16614     // In case eOffset isn't GRF aligned, need to create a copy
16615     // For non-uniform variable, it should be already GRF-aligned.
16616     eOffset = ReAlignUniformVariable(eOffset, EALIGN_GRF);
16617 
16618     Value* storedVal = inst->getValueOperand();
16619     Type* Ty = storedVal->getType();
16620     IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
16621     Type* eltTy = VTy ? VTy->getElementType() : Ty;
16622     uint32_t eltBytes = GetScalarTypeSizeInRegister(eltTy);
16623 
16624     IGC_ASSERT_MESSAGE((eltBytes == 1) || (eltBytes == 2) || (eltBytes == 4) || (eltBytes == 8),
16625         "Store type must be 1/2/4/8-bytes long");
16626 
16627     uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
16628     uint32_t totalBytes = elts * eltBytes;
16629     unsigned align = inst->getAlignment();
16630     CVariable* storedVar = GetSymbol(storedVal);
16631     unsigned int width = numLanes(m_currShader->m_SIMDSize);
16632 
16633     bool srcUniform = storedVar->IsUniform();
16634     bool dstUniform = eOffset->IsUniform();
16635 
16636     // Handle two cases:
16637     //   1. less than 4 bytes: need to extend it to 4 bytes
16638     //   2. destination is scalar and uniform (handle vector if needed)
16639     if (totalBytes < 4 || isUniformStoreOCL(inst))
16640     {
16641         IGC_ASSERT_MESSAGE((totalBytes == 1) || (totalBytes == 2) || (totalBytes == 4) || (totalBytes == 8) || (totalBytes == 12) || (totalBytes == 16),
16642             "Wrong total Bytes!");
16643 
16644         SIMDMode simdmode = SIMDMode::SIMD1;
16645         e_alignment grfAlign = useA32 ? EALIGN_GRF : EALIGN_2GRF;
16646         uint32_t blkBits, nBlks;
16647         if (elts > 1)
16648         {
16649             // Vector uniform store: handle uniform value only.
16650             // For elts > 1, the eltBytes must be either 4 or 8; only elts = 2|3|4 are handled.
16651             IGC_ASSERT_MESSAGE((eltBytes == 4) || (eltBytes == 8), "ICE: wrong element bytes!");
16652             IGC_ASSERT_MESSAGE(dstUniform, "ICE: for vector uniform store, both dst and src must be uniform!");
16653             IGC_ASSERT_MESSAGE(srcUniform, "ICE: for vector uniform store, both dst and src must be uniform!");
16654 
16655             // As we use simd8 for vectors (SKL HW WA), converting DW to QW
16656             // makes sense only if the final value is a scalar (a single QW).
16657             bool useQW = (!useA32) &&
16658                 (eltBytes == 8 ||        // requested by vector layout
16659                 (eltBytes == 4 && totalBytes == 8 && align >= 8)); // convert DW to QW
16660 
16661             // activelanes is the number of lanes that are needed.
16662             // nbelts is activelanes rounded up to the next power of 2.
16663             uint16_t activelanes = useQW ? (totalBytes / 8) : (totalBytes / 4);
16664             uint16_t nbelts = (activelanes == 3 ? 4 : activelanes);
16665 
16666             // Work around a possible SKL HW bug when using send(4) for "store <4xi32> v, *p";
16667             // therefore, use simd8 for A64 vector stores to get around
16668             // this issue.
16669 
16670             // This is the simdmode we wanted, but we need to work around the A64 HW bug
16671             SIMDMode simdWanted = lanesToSIMDMode(nbelts);
16672             uint16_t nbeltsWanted = nbelts;
16673             if (!useA32 && nbelts > 1) {
16674                 nbelts = 8;
16675             }
16676             simdmode = lanesToSIMDMode(nbelts);
16677 
16678             // compute offset
16679             // We have the following :
16680             //    lane   0   1   2   3
16681             //    eOff   0   4   8   C               // DW per lane
16682             //    eOff   0   8                       // QW per lane
16683             // When elts = 3, lane 3 is not used. Since we don't have simd3,
16684             // use simd4 and set lane3 to the same as lane2(8).
16685             //
16686             // When using simd8, all unused lanes will be the same as lane0.
16687             // Make sure offset & stored value are correctly set up.
16688             if (nbelts > 1)
16689             {
16690                 CVariable* NewOff = m_currShader->GetNewVariable(
16691                     nbelts, eOffset->GetType(), grfAlign, true /*dstUniform*/, CName::NONE);
16692                 uint32_t incImm =
16693                     useQW ? 0x80 : (activelanes == 2 ? 0x40 : (activelanes == 3 ? 0x8840 : 0xC840));
16694                 CVariable* immVar = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
16695 
16696                 // When working around the A64 SKL Si limitation of SIMD4, we use SIMD8 (nbelts > nbeltsWanted),
16697                 // in which all upper four channels are zero, meaning they read eOffset[0]. Later, the stored value
16698                 // must use storedVar[0] for those extra lanes.
16699                 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16700                     emitAddPair(NewOff, eOffset, immVar);
16701                 }
16702                 else {
16703                     m_encoder->SetNoMask();
16704                     m_encoder->SetUniformSIMDSize(simdmode);
16705                     m_encoder->SetSrcRegion(0, 0, 1, 0);
16706                     m_encoder->Add(NewOff, eOffset, immVar);
16707                     m_encoder->Push();
16708                 }
16709 
16710                 eOffset = NewOff;
16711             }
16712             else
16713             {
16714                 eOffset = ReAlignUniformVariable(eOffset, grfAlign);
16715             }
16716 
16717 
16718             // (We could have useQW == true AND eltBytes == 4. Note that if useQW
16719             // is false, eltBytes must be 4.)
16720             IGC_ASSERT_MESSAGE(useQW || (eltBytes == 4), "ICE: wrong vector element type!");
16721 
16722             // Since we might change element type, need to create copy.
16723             if (useQW && eltBytes == 4)
16724             {
16725                 CVariable* tmp = m_currShader->GetNewVariable(
16726                     nbeltsWanted, ISA_TYPE_UQ, grfAlign, true /*srcUniform*/, CName::NONE);
16727                 CVariable* tmpAlias = m_currShader->GetNewAlias(tmp,
16728                     storedVar->GetType(), 0, 2 * nbeltsWanted);
16729                 IGC_ASSERT_MESSAGE((2 * nbeltsWanted) == storedVar->GetNumberElement(),
16730                     "Mismatch of the number of elements: sth wrong!");
16731                 emitVectorCopy(tmpAlias, storedVar, 2 * nbeltsWanted);
16732                 storedVar = tmp;
16733             }
16734 
16735             // Prepare stored value
16736             if (storedVar->IsImmediate() || activelanes < nbelts ||
16737                 !IsGRFAligned(storedVar, grfAlign))
16738             {
16739                 CVariable* NewVar = m_currShader->GetNewVariable(
16740                     nbelts, storedVar->GetType(), grfAlign, true /*srcUniform*/, CName::NONE);
16741 
16742                 // A64 SKL HW issue work-around: set remaining lanes to storedVar[0]
16743                 // as eOffset has been set to the first element already.
16744                 if (nbeltsWanted < nbelts)
16745                 {
16746                     m_encoder->SetNoMask();
16747                     m_encoder->SetUniformSIMDSize(simdmode);
16748                     m_encoder->SetSrcRegion(0, 0, 1, 0);
16749                     m_encoder->Copy(NewVar, storedVar);
16750                     m_encoder->Push();
16751                 }
16752 
16753                 // Values that we care about
16754                 if (activelanes == 3)
16755                 {
16756                     m_encoder->SetNoMask();
16757                     m_encoder->SetUniformSIMDSize(SIMDMode::SIMD2);
16758                     m_encoder->SetSrcRegion(0, 2, 2, 1);
16759                     m_encoder->Copy(NewVar, storedVar);
16760                     m_encoder->Push();
16761 
16762                     // offset is 0x8840, so duplicate lane2
16763                     m_encoder->SetNoMask();
16764                     m_encoder->SetUniformSIMDSize(SIMDMode::SIMD2);
16765                     m_encoder->SetDstSubReg(2);
16766                     m_encoder->SetSrcSubReg(0, 2);
16767                     m_encoder->SetSrcRegion(0, 0, 1, 0);
16768                     m_encoder->Copy(NewVar, storedVar);
16769                     m_encoder->Push();
16770                 }
16771                 else
16772                 {
16773                     m_encoder->SetNoMask();
16774                     m_encoder->SetUniformSIMDSize(simdWanted);
16775                     m_encoder->SetSrcRegion(0, nbeltsWanted, nbeltsWanted, 1);
16776                     m_encoder->Copy(NewVar, storedVar);
16777                     m_encoder->Push();
16778                 }
16779                 storedVar = NewVar;
16780             }
16781 
16782             // each lane will store either DW or QW
16783             blkBits = useQW ? (align >= 8 ? 64 : 8)
16784                 : (!useA32 && align >= 4) ? 32 : 8;
16785             nBlks = useQW ? (64 / blkBits) : (32 / blkBits);
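            // For example (illustrative): an A64 "store <2 x i64>" with align >= 8 gives
            // blkBits = 64, nBlks = 1 (one QW per lane); an A64 "store <4 x i32>" with
            // align = 4 gives blkBits = 32, nBlks = 1; with align < 4 both fall back to
            // blkBits = 8 with nBlks = 8 or 4.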
16786         }
16787         else
16788         {
16789             // scalar case (elts == 1)
16790             if (dstUniform)
16791             {
16792                 eOffset = ReAlignUniformVariable(eOffset, grfAlign);
16793                 if (!srcUniform)
16794                 {
16795                     storedVar = UniformCopy(storedVar);
16796                 }
16797                 else
16798                 {
16799                     storedVar = ReAlignUniformVariable(storedVar, grfAlign);
16800                 }
16801                 storedVar = ExtendVariable(storedVar, grfAlign);
16802             }
16803             else
16804             {
16805                 storedVar = BroadcastAndExtend(storedVar);
16806             }
16807 
16808             // use either A32 byte scatter or A64 scatter messages.
16809             //   A32 should use byte as block size always here.
16810             //   A64 uses byte/DW/QW as block size based on align and element size.
16811             // Note that this is for elts = 1, so totalBytes is bytes per-lane.
16812             blkBits = useA32 ? 8 : ((eltBytes >= 4 && align >= eltBytes) ? eltBytes * 8 : 8);
16813             nBlks = (totalBytes * 8) / blkBits;
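            // For example (illustrative): a uniform i64 store with align 8 over A64 uses
            // blkBits = 64, nBlks = 1, while an i32 store over A32 uses blkBits = 8,
            // nBlks = 4.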
16814         }
16815         setPredicateForDiscard();
16816 
16817         if (useA32)
16818         {
16819             m_encoder->ByteScatter(storedVar, resource, eOffset, blkBits, nBlks);
16820         }
16821         else
16822         {
16823             emitScatterA64(storedVar, eOffset, blkBits, nBlks, true);
16824         }
16825 
16826         if (dstUniform)
16827         {
16828             m_encoder->SetNoMask();
16829             m_encoder->SetUniformSIMDSize(simdmode);
16830         }
16831         m_encoder->Push();
16832     }
16833     else
16834     {
16835         eOffset = BroadcastIfUniform(eOffset);
16836         storedVar = BroadcastIfUniform(storedVar);
16837 
16838         VectorMessage VecMessInfo(this);
16839         VecMessInfo.getInfo(Ty, align, useA32);
16840 
16841         for (uint32_t i = 0; i < VecMessInfo.numInsts; ++i)
16842         {
16843             // raw operand, eltOff is in bytes
16844             uint32_t eltOffBytes = VecMessInfo.insts[i].startByte * width;
16845             uint32_t blkInBytes = VecMessInfo.insts[i].blkInBytes;
16846             uint32_t numBlks = VecMessInfo.insts[i].numBlks;
16847             uint32_t blkBits = 8 * blkInBytes;
16848             uint32_t instTotalBytes = blkInBytes * numBlks;
16849             uint32_t instElts = instTotalBytes / eltBytes;
16850             uint32_t nbelts = instElts * width;
16851 
16852             CVariable* rawAddrVar;
16853             if (i > 0)
16854             {
16855                 // Calculate the new element offset
16856                 rawAddrVar = m_currShader->GetNewVariable(eOffset);
16857                 CVariable* ImmVar = m_currShader->ImmToVariable(VecMessInfo.insts[i].startByte, ISA_TYPE_UD);
16858                 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16859                     emitAddPair(rawAddrVar, eOffset, ImmVar);
16860                 }
16861                 else {
16862                     m_encoder->Add(rawAddrVar, eOffset, ImmVar);
16863                     m_encoder->Push();
16864                 }
16865             }
16866             else
16867             {
16868                 rawAddrVar = eOffset;
16869             }
16870             setPredicateForDiscard();
16871             VISA_Type storedType = storedVar->GetType();
16872             IGC_ASSERT_MESSAGE((eltOffBytes < (UINT16_MAX)), "eltOffBytes is higher than 64k");
16873             IGC_ASSERT_MESSAGE((nbelts < (UINT16_MAX)), "nbelts is higher than 64k");
16874             CVariable* subStoredVar = m_currShader->GetNewAlias(storedVar, storedType, (uint16_t)eltOffBytes, (uint16_t)nbelts);
16875             switch (VecMessInfo.insts[i].kind) {
16876             case VectorMessage::MESSAGE_A32_BYTE_SCATTERED_RW:
16877                 m_encoder->ByteScatter(subStoredVar, resource, rawAddrVar, blkBits, numBlks);
16878                 break;
16879             case VectorMessage::MESSAGE_A32_UNTYPED_SURFACE_RW:
16880                 m_encoder->Scatter4Scaled(subStoredVar, resource, rawAddrVar);
16881                 break;
16882             case VectorMessage::MESSAGE_A64_UNTYPED_SURFACE_RW:
16883                 emitScatter4A64(subStoredVar, rawAddrVar, false);
16884                 break;
16885             case VectorMessage::MESSAGE_A64_SCATTERED_RW:
16886                 emitScatterA64(subStoredVar, rawAddrVar, blkBits, numBlks, false);
16887                 break;
16888             default:
16889                 IGC_ASSERT_MESSAGE(0, "Internal Error: unexpected Message kind for store");
16890             }
16891             m_encoder->Push();
16892         }
16893     }
16894     if (ptrType->getPointerAddressSpace() != ADDRESS_SPACE_PRIVATE)
16895     {
16896         ResetVMask(false);
16897     }
16898 }
16899 
16900 // prepareAddressForUniform():       for both load and store
16901 // prepareDataForUniform():          for store only
16902 //    Used for under-aligned (alignment less than 4 bytes) uniform loads/stores. One prepares the
16903 //    address payload, and the other the data payload.
16904 //
16905 //  Example 1:  "store <4xi32> V,  <4xi32>* P, align 2"
16906 //     A new pointer pVar is created with 4 elements.
16907 //
16908 //     add  (4|M0_NM) pVar<1>:ud  P<0;1,0>:UD 0xC840:UV
16909 //     send (4|M0_NM) pVar   V
16910 //
16911 //         prepareAddressForUniform() : create pVar
16912 //         prepareDataForUniform() : return V (assuming V can be used directly)
16913 //
16914 //  Example 2:  "store <3xi32> V,  <3xi32>* P, align 2"
16915 //     Non-power of 2 vector size is rounded up to the next power of 2.
16916 //     Additional elements are filled with a copy of the first vector element.
16917 
16918 //     add  (4|M0_NM) pVar<1>:ud  P<0;1,0>:UD 0x0840:UV
16919 //     mov  (4|M0_NM) vVar<1>:ud  V<0;1,0>:ud
16920 //     mov  (2|M0_NM) vVar<1>:ud  V<1;1,0>:ud
16921 //     mov  (1|M0_NM) vVar.2<1>:ud  V.2<1;1,0>:ud
16922 //     send (4|M0_NM) vVar  pVar
16923 //
16924 //         prepareAddressForUniform() : create pVar
16925 //         prepareDataForUniform() : return vVar
16926 //
16927 // This function handles vector size up to 8. It also handles QW element size.
16928 // When vector size > 4, it uses 0x76543210, left-shifted by 2 (DW) or 3 (QW)
16929 // as an immediate to be added to 'AddrVar' to form a new address var.
16930 //
16931 // In addition, if 64-bit add is not supported, emitAddPair() is used to emulate
16932 // the 64-bit add with 32-bit add/addc.
16933 //
16934 // Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is
16935 // its return var. The argument 'DataVar' in prepareDataForUniform() is uniform,
16936 // so is its return var.
16937 //
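// For example (illustrative, assuming a 32-bit address and DW elements), with
// NElts = 8 the offsets no longer fit a single UV immediate, so roughly:
//
//     mov (8|M0_NM) off<1>:ud   0x76543210:uv
//     mul (8|M0_NM) off<1>:ud   off<1;1,0>:ud  4:uw
//     add (8|M0_NM) pVar<1>:ud  P<0;1,0>:ud    off<1;1,0>:ud
//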
16938 CVariable* EmitPass::prepareAddressForUniform(
16939     CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t RequiredNElts, e_alignment Align)
16940 {
16941     // If RequiredNElts == 0,  use next power of 2 of NElts as return var's num of elements.
16942     //    otherwise,           use RequiredNElts as return var's num of elements.
16943     uint32_t pow2NElts = (uint32_t)PowerOf2Ceil(NElts);
16944     uint32_t allocNElts = (RequiredNElts > 0 ? RequiredNElts : pow2NElts);
16945     IGC_ASSERT(NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
16946     IGC_ASSERT(allocNElts >= pow2NElts);
16947     if (allocNElts == NElts && NElts == 1 && IsGRFAligned(AddrVar, Align))
16948     {
16949         // No need to create a new var.
16950         return AddrVar;
16951     }
16952     bool isA64 = (AddrVar->GetElemSize() == 8);
16953     SIMDMode simdmode = lanesToSIMDMode(pow2NElts);
16954     CVariable* newVar = m_currShader->GetNewVariable(allocNElts, AddrVar->GetType(), Align, true, CName::NONE);
16955 
16956     CVariable* off;
16957     uint32_t incImm = (0x76543210 & maskTrailingOnes<uint32_t>(NElts * 4));
16958     if ((pow2NElts <= 4 && EltBytes == 4) || (pow2NElts <= 2 && EltBytes == 8))
16959     {
16960         // This case needs a single UV immediate
16961         incImm = incImm << (EltBytes == 4 ? 2 : 3);
16962         off = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
16963     }
16964     else
16965     {
16966         // Need a temporary var to calculate offsets.
16967         // (Note that the temp is non-uniform, otherwise emitAddPair() won't work.)
16968         off = m_currShader->GetNewVariable(pow2NElts, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
16969 
16970         // Need a mov and mul
16971         m_encoder->SetNoMask();
16972         m_encoder->SetSimdSize(simdmode);
16973         m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
16974         m_encoder->Push();
16975 
16976         m_encoder->SetNoMask();
16977         m_encoder->SetSimdSize(simdmode);
16978         m_encoder->SetSrcRegion(0, 1, 1, 0);
16979         m_encoder->SetSrcRegion(1, 0, 1, 0);
16980         m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
16981         m_encoder->Push();
16982     }
16983 
16984     // Only need to initialize pow2NElts elements.
16985     if (allocNElts > pow2NElts)
16986     {
16987         newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
16988     }
16989 
16990     // Currently, splitting is never needed because NElts <= 8. In the future, NElts
16991     // could be 32 and we might need to split.
16992     bool needSplit = ((pow2NElts * newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
16993     if (needSplit)
16994     {
16995         IGC_ASSERT(!off->IsImmediate());
16996         uint32_t halfNElts = pow2NElts / 2;
16997         uint32_t bytes1 = halfNElts * newVar->GetElemSize();
16998         uint32_t bytes2 = halfNElts * off->GetElemSize();
16999         CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, halfNElts);
17000         CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, halfNElts);
17001         CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, halfNElts);
17002         CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, halfNElts);
17003 
17004         if (isA64 && m_currShader->m_Platform->hasNoInt64AddInst())
17005         {
17006             emitAddPair(newVarHi, AddrVar, offHi);
17007             emitAddPair(newVarLo, AddrVar, offLo);
17008         }
17009         else
17010         {
17011             SIMDMode sm = lanesToSIMDMode(halfNElts);
17012             m_encoder->SetNoMask();
17013             m_encoder->SetUniformSIMDSize(sm);
17014             m_encoder->SetSrcRegion(0, 0, 1, 0);
17015             m_encoder->SetSrcRegion(1, 1, 1, 0);
17016             m_encoder->Add(newVarHi, AddrVar, offHi);
17017             m_encoder->Push();
17018 
17019             m_encoder->SetNoMask();
17020             m_encoder->SetUniformSIMDSize(sm);
17021             m_encoder->SetSrcRegion(0, 0, 1, 0);
17022             m_encoder->SetSrcRegion(1, 1, 1, 0);
17023             m_encoder->Add(newVarLo, AddrVar, offLo);
17024             m_encoder->Push();
17025         }
17026     }
17027     else if (isA64 && m_currShader->m_Platform->hasNoInt64AddInst() && pow2NElts > 1)
17028     {
17029         emitAddPair(newVar, AddrVar, off);
17030     }
17031     else
17032     {
17033         m_encoder->SetNoMask();
17034         m_encoder->SetUniformSIMDSize(simdmode);
17035         m_encoder->SetSrcRegion(0, 0, 1, 0);
17036         m_encoder->SetSrcRegion(1, 1, 1, 0);
17037         if (pow2NElts > 1) {
17038             m_encoder->Add(newVar, AddrVar, off);
17039         }
17040         else {
17041             m_encoder->Copy(newVar, AddrVar);
17042         }
17043         m_encoder->Push();
17044     }
17045     return newVar;
17046 }
17047 
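// Illustrative example for prepareDataForUniform() (assumed values, not from
// the original source): for a uniform DataVar with NElts = 3 and
// RequiredNElts = 0, pow2NElts = 4, so a 4-element GRF-aligned temporary is
// created, all four elements are first initialized with DataVar's element 0
// (broadcast), and elements 0..2 are then overwritten via emitVectorCopy().
// This guarantees the padding element holds a defined value.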
17048 CVariable* EmitPass::prepareDataForUniform(
17049     CVariable* DataVar, uint32_t RequiredNElts, e_alignment Align)
17050 {
17051     uint32_t NElts = DataVar->GetNumberElement();
17052     uint32_t EltBytes = DataVar->GetElemSize();
17053     uint32_t pow2NElts = (uint32_t)PowerOf2Ceil(NElts);
17054     uint32_t allocNElts = RequiredNElts > 0 ? RequiredNElts : pow2NElts;
17055     IGC_ASSERT(allocNElts >= pow2NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
17056     if (NElts == allocNElts && !DataVar->IsImmediate() && IsGRFAligned(DataVar, Align))
17057     {
17058         return DataVar;
17059     }
17060     CVariable* newVar = m_currShader->GetNewVariable(allocNElts, DataVar->GetType(), Align, true, CName::NONE);
17061 
17062     // Need to return a var with pow2NElts elements
17063     if (allocNElts > pow2NElts)
17064     {
17065         newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
17066     }
17067 
17068     // Initialize the padding elements (from NElts up to pow2NElts) to DataVar's first element.
17069     bool initWithElem0 = (pow2NElts > NElts);
17070     bool needSplit = ((pow2NElts * newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
17071     if (initWithElem0)
17072     {
17073         if (needSplit)
17074         {
17075             uint32_t esz = pow2NElts / 2;
17076             uint32_t bytes = esz * newVar->GetElemSize();
17077             CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
17078             CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
17079 
17080             m_encoder->SetNoMask();
17081             m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
17082             m_encoder->SetSrcRegion(0, 0, 1, 0);
17083             m_encoder->Copy(newVarHi, DataVar);
17084             m_encoder->Push();
17085 
17086             m_encoder->SetNoMask();
17087             m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
17088             m_encoder->SetSrcRegion(0, 0, 1, 0);
17089             m_encoder->Copy(newVarLo, DataVar);
17090             m_encoder->Push();
17091         }
17092         else
17093         {
17094             m_encoder->SetNoMask();
17095             m_encoder->SetUniformSIMDSize(lanesToSIMDMode(pow2NElts));
17096             m_encoder->SetSrcRegion(0, 0, 1, 0);
17097             m_encoder->Copy(newVar, DataVar);
17098             m_encoder->Push();
17099         }
17100     }
17101 
17102     if (!initWithElem0 || NElts != 1)
17103     {
17104         emitVectorCopy(newVar, DataVar, NElts);
17105     }
17106     return newVar;
17107 }
17108 
17109 
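// Illustrative sketch of the uniform-to-uniform path in emitVectorCopy()
// below (nElts is an assumed value, not from the original source): for
// nElts = 13 the copy is emitted as progressively smaller chunks at growing
// sub-register offsets:
//   SIMD8 copy of elements [0..7], SIMD4 copy of [8..11], SIMD1 copy of [12].
// Each chunk addresses its slice via SetSrcSubReg()/SetDstSubReg().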
17110 void EmitPass::emitVectorCopy(CVariable* Dst, CVariable* Src, uint32_t nElts,
17111     uint32_t DstSubRegOffset, uint32_t SrcSubRegOffset)
17112 {
17113     unsigned int width = numLanes(m_currShader->m_SIMDSize);
17114     bool srcUniform = Src->IsUniform();
17115     bool dstUniform = Dst->IsUniform();
17116     unsigned doff = DstSubRegOffset, soff = SrcSubRegOffset;
17117 
17118     // Uniform vector copy.
17119     if (srcUniform && dstUniform)
17120     {
17121         // The starting index of elements to be copied.
17122         unsigned i = 0;
17123         auto partialCopy = [=, &i](SIMDMode mod)
17124         {
17125             unsigned w = numLanes(mod);
17126             if (i + w > nElts)
17127             {
17128                 return false;
17129             }
17130 
17131             unsigned vStride = (mod == SIMDMode::SIMD1) ? 0 : 1;
17132             m_encoder->SetUniformSIMDSize(mod);
17133             m_encoder->SetSrcRegion(0, vStride, 1, 0);
17134             m_encoder->SetSrcSubReg(0, soff + i);
17135             m_encoder->SetDstSubReg(doff + i);
17136             m_encoder->Copy(Dst, Src);
17137             m_encoder->Push();
17138 
17139             i += w;
17140             return true;
17141         };
17142 
17143         // We may select the initial simd size based on the element type.
17144         while (partialCopy(SIMDMode::SIMD8))
17145             ;
17146         partialCopy(SIMDMode::SIMD4);
17147         partialCopy(SIMDMode::SIMD2);
17148         partialCopy(SIMDMode::SIMD1);
17149         return;
17150     }
17151 
17152     for (uint32_t i = 0; i < nElts; ++i)
17153     {
17154         uint SrcSubReg = srcUniform ? soff + i : soff + width * i;
17155         uint DstSubReg = dstUniform ? doff + i : doff + width * i;
17156 
17157         uint SrcWidth = srcUniform ? 1 : width;
17158         uint DstWidth = dstUniform ? 1 : width;
17159 
17160         if (SrcSubReg >= Src->GetNumberElement() ||
17161             DstSubReg >= Dst->GetNumberElement())
17162         {
17163             break;
17164         }
17165 
17166         bool SrcOverflow = (SrcSubReg + SrcWidth > Src->GetNumberElement());
17167         bool DstOverflow = (DstSubReg + DstWidth > Dst->GetNumberElement());
17168 
17169         // This is currently used for VME payloads whose LLVM type doesn't
17170         // necessarily match the associated CVariable size (the LLVM type
17171         // will be at least as big as the CVariable). Here, we make sure that,
17172         // if an entire vector element is not copied, we emit movs to just
17173         // read or write the appropriate number of bytes.
17174         if (SrcOverflow || DstOverflow)
17175         {
17176             if (srcUniform)
17177             {
17178                 auto partialCopy = [&](SIMDMode mode)
17179                 {
17180                     unsigned w = numLanes(mode);
17181 
17182                     if (DstSubReg + w > Dst->GetNumberElement())
17183                         return;
17184 
17185                     m_encoder->SetSimdSize(mode);
17186                     m_encoder->SetSrcSubReg(0, SrcSubReg);
17187                     m_encoder->SetDstSubReg(DstSubReg);
17188                     m_encoder->Copy(Dst, Src);
17189                     m_encoder->Push();
17190 
17191                     DstSubReg += w;
17192                 };
17193 
17194                 partialCopy(SIMDMode::SIMD8);
17195                 partialCopy(SIMDMode::SIMD4);
17196                 partialCopy(SIMDMode::SIMD2);
17197                 partialCopy(SIMDMode::SIMD1);
17198             }
17199             else
17200             {
17201                 auto partialCopy = [&](SIMDMode mode)
17202                 {
17203                     unsigned w = numLanes(mode);
17204 
17205                     if (DstSubReg + w > Dst->GetNumberElement() ||
17206                         SrcSubReg + w > Src->GetNumberElement())
17207                         return;
17208 
17209                     m_encoder->SetSimdSize(mode);
17210                     m_encoder->SetSrcSubReg(0, SrcSubReg);
17211                     m_encoder->SetDstSubReg(DstSubReg);
17212                     m_encoder->Copy(Dst, Src);
17213                     m_encoder->Push();
17214 
17215                     DstSubReg += w;
17216                     SrcSubReg += w;
17217                 };
17218 
17219                 partialCopy(SIMDMode::SIMD8);
17220                 partialCopy(SIMDMode::SIMD4);
17221                 partialCopy(SIMDMode::SIMD2);
17222                 partialCopy(SIMDMode::SIMD1);
17223             }
17224 
17225             break;
17226         }
17227 
17228         m_encoder->SetSrcSubReg(0, SrcSubReg);
17229         m_encoder->SetDstSubReg(DstSubReg);
17230         m_encoder->Copy(Dst, Src);
17231         m_encoder->Push();
17232     }
17233 }
17234 
17235 // Handle Copy intrinsic
17236 void EmitPass::emitGenISACopy(GenIntrinsicInst* GenCopyInst)
17237 {
17238     CVariable* Dst = m_destination;
17239     CVariable* Src = GetSymbol(GenCopyInst->getArgOperand(0));
17240     Type* Ty = GenCopyInst->getType();
17241     emitCopyAll(Dst, Src, Ty);
17242 }
17243 
17244 // Push a new frame onto the stack by:
17245 //  1. Updating FP to the current SP
17246 //  2. Incrementing SP by pushSize
17247 //  3. Storing the previous frame's FP at the address of the updated FP (for stack-walk)
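//
// Example of the size rounding below (assumed numbers, not from the original
// source): with pushSize = 40 bytes and EnableWriteOldFPToStack set, one extra
// OWord (16 bytes) is reserved for the caller's FP, giving 56 bytes, which is
// then rounded up to the next OWord multiple, 64, before SP is advanced.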
17248 void EmitPass::emitPushFrameToStack(unsigned& pushSize)
17249 {
17250     CVariable* pFP = m_currShader->GetFP();
17251     CVariable* pSP = m_currShader->GetSP();
17252 
17253     // Set FP = SP
17254     m_encoder->Copy(pFP, pSP);
17255     m_encoder->Push();
17256 
17257     if (IGC_IS_FLAG_ENABLED(EnableWriteOldFPToStack))
17258     {
17259         // Allocate 1 extra oword to store previous frame's FP
17260         pushSize += SIZE_OWORD;
17261     }
17262 
17263     // Since we use unaligned OWord writes, pushSize should be OWord-aligned
17264     if (pushSize % SIZE_OWORD > 0)
17265         pushSize += (SIZE_OWORD - (pushSize % SIZE_OWORD));
17266 
17267     // Update SP by pushSize
17268     emitAddPointer(pSP, pSP, m_currShader->ImmToVariable(pushSize, ISA_TYPE_UD));
17269 
17270     if (IGC_IS_FLAG_ENABLED(EnableWriteOldFPToStack))
17271     {
17272         // Store old FP value to current FP
17273         CVariable* pOldFP = m_currShader->GetPrevFP();
17274         // If previous FP is null (for kernel frame), we initialize it to 0
17275         if (pOldFP == nullptr)
17276         {
17277             pOldFP = m_currShader->GetNewVariable(pFP);
17278             m_encoder->Copy(pOldFP, m_currShader->ImmToVariable(0, ISA_TYPE_UQ));
17279             m_encoder->Push();
17280         }
17281 
17282         pFP = ReAlignUniformVariable(pFP, EALIGN_GRF);
17283         {
17284             m_encoder->OWStoreA64(pOldFP, pFP, SIZE_OWORD, 0);
17285             m_encoder->Push();
17286         }
17287     }
17288 }
17289 
17290 void EmitPass::emitAddPointer(CVariable* Dst, CVariable* Src, CVariable* offset)
17291 {
17292     if (m_currShader->m_Platform->hasNoInt64AddInst() &&
17293         (Dst->GetType() == ISA_TYPE_Q || Dst->GetType() == ISA_TYPE_UQ) &&
17294         (Src->GetType() == ISA_TYPE_Q || Src->GetType() == ISA_TYPE_UQ))
17295     {
17296         emitAddPair(Dst, Src, offset);
17297     }
17298     else
17299     {
17300         m_encoder->Add(Dst, Src, offset);
17301         m_encoder->Push();
17302     }
17303 }
17304 
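// Scalar model of the 64-bit add emulation performed by emitAddPair() below.
// This is an illustrative sketch only; the helper and its name are not part
// of this pass, and a signed 32-bit offset is first sign-extended into a
// lo/hi pair (the ISA_TYPE_D branch below) before the same computation runs.
//
//   #include <cstdint>
//   // Add an unsigned 32-bit offset to a 64-bit address using 32-bit ops only.
//   static inline uint64_t addPair(uint64_t a, uint32_t b) {
//       uint32_t lo = (uint32_t)a + b;               // low 32-bit add
//       uint32_t carry = lo < (uint32_t)a ? 1u : 0u; // carry-out of the low add
//       uint32_t hi = (uint32_t)(a >> 32) + carry;   // propagate carry into the high DW
//       return ((uint64_t)hi << 32) | lo;
//   }
//
// The vISA AddPair instruction performs this lo/hi/carry computation per lane;
// the code below only prepares the lo/hi aliases and writes the result back
// with a stride-2 destination region.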
17305 void EmitPass::emitAddPair(CVariable* Dst, CVariable* Src0, CVariable* Src1) {
17306     IGC_ASSERT(Dst->GetType() == ISA_TYPE_Q || Dst->GetType() == ISA_TYPE_UQ);
17307     IGC_ASSERT(Src0->GetType() == ISA_TYPE_Q || Src0->GetType() == ISA_TYPE_UQ);
17308     IGC_ASSERT(Src1->GetType() == ISA_TYPE_UV || Src1->GetType() == ISA_TYPE_UD || Src1->GetType() == ISA_TYPE_D);
17309 
17310     bool IsUniformDst = Dst->IsUniform();
17311 
17312     unsigned short NumElts = Dst->GetNumberElement();
17313     SIMDMode Mode = lanesToSIMDMode(NumElts);
17314 
17315     VISA_Type NewType = ISA_TYPE_UD;
17316     CVariable* SrcAlias = m_currShader->GetNewAlias(Src0, NewType, 0, 0);
17317     CVariable* L0 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src0->getName(), "Lo32"));
17318     CVariable* H0 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src0->getName(), "Hi32"));
17319 
17320     // Split Src0 into L0 and H0
17321     // L0 := Offset[0];
17322     if (IsUniformDst) {
17323         m_encoder->SetNoMask();
17324         m_encoder->SetUniformSIMDSize(Mode);
17325     }
17326     if (Src0->IsUniform())
17327         m_encoder->SetSrcRegion(0, 0, 1, 0);
17328     else
17329         m_encoder->SetSrcRegion(0, 2, 1, 0);
17330     m_encoder->Copy(L0, SrcAlias);
17331     m_encoder->Push();
17332     // H0 := Offset[1];
17333     if (IsUniformDst) {
17334         m_encoder->SetNoMask();
17335         m_encoder->SetUniformSIMDSize(Mode);
17336     }
17337     m_encoder->SetSrcSubReg(0, 1);
17338     if (Src0->IsUniform())
17339         m_encoder->SetSrcRegion(0, 0, 1, 0);
17340     else
17341         m_encoder->SetSrcRegion(0, 2, 1, 0);
17342     m_encoder->Copy(H0, SrcAlias);
17343     m_encoder->Push();
17344 
17345     // If Src1 is a signed value, sign-extend it into L1 and H1. Otherwise we can
17346     // ignore its high 32-bit part, which will be all zeros.
17347     CVariable* L1 = nullptr;
17348     CVariable* H1 = nullptr;
17349     if (Src1->GetType() == ISA_TYPE_D) {
17350          L1 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src1->getName(), "Lo32"));
17351          H1 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src1->getName(), "Hi32"));
17352 
17353          // L1 := Offset[0];
17354          if (IsUniformDst) {
17355              m_encoder->SetNoMask();
17356              m_encoder->SetUniformSIMDSize(Mode);
17357          }
17358          if (Src1->IsUniform())
17359              m_encoder->SetSrcRegion(0, 0, 1, 0);
17360          else
17361              m_encoder->SetSrcRegion(0, 1, 1, 0);
17362          m_encoder->Copy(L1, Src1);
17363          m_encoder->Push();
17364          // H1 := Offset[1];
17365          if (IsUniformDst) {
17366              m_encoder->SetNoMask();
17367              m_encoder->SetUniformSIMDSize(Mode);
17368          }
17369          if (Src1->IsUniform())
17370              m_encoder->SetSrcRegion(0, 0, 1, 0);
17371          else
17372              m_encoder->SetSrcRegion(0, 1, 1, 0);
17373          m_encoder->IShr(H1, Src1, m_currShader->ImmToVariable(31, ISA_TYPE_UD));
17374          m_encoder->Push();
17375      }
17376 
17377     CVariable* Lo = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Dst->getName(), "Lo32"));
17378     CVariable* Hi = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Dst->getName(), "Hi32"));
17379     // (Lo, Hi) := AddPair(L0, H0, ImmLo, ImmHi);
17380     if (IsUniformDst) {
17381         m_encoder->SetNoMask();
17382         m_encoder->SetUniformSIMDSize(Mode);
17383         m_encoder->SetSrcRegion(0, 1, 1, 0);
17384         m_encoder->SetSrcRegion(1, 1, 1, 0);
17385     }
17386     if (L1 != nullptr)
17387         m_encoder->AddPair(Lo, Hi, L0, H0, L1, H1);
17388     else
17389         m_encoder->AddPair(Lo, Hi, L0, H0, Src1);
17390     m_encoder->Push();
17391 
17392     CVariable* DstAlias = m_currShader->GetNewAlias(Dst, NewType, 0, 0);
17393     // Offset[0] := Lo;
17394     if (IsUniformDst) {
17395         m_encoder->SetNoMask();
17396         m_encoder->SetUniformSIMDSize(Mode);
17397         m_encoder->SetSrcRegion(0, 1, 1, 0);
17398     }
17399     m_encoder->SetDstRegion(2);
17400     m_encoder->Copy(DstAlias, Lo);
17401     m_encoder->Push();
17402     // Offset[1] := Hi;
17403     if (IsUniformDst) {
17404         m_encoder->SetNoMask();
17405         m_encoder->SetUniformSIMDSize(Mode);
17406         m_encoder->SetSrcRegion(0, 1, 1, 0);
17407     }
17408     m_encoder->SetDstSubReg(1);
17409     m_encoder->SetDstRegion(2);
17410     m_encoder->Copy(DstAlias, Hi);
17411     m_encoder->Push();
17412 }
17413 
17414 /// \brief Copy all values from the src variable to the dst variable.
17415 /// The last argument is the underlying value type.
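/// Illustrative example for the struct path below (assumed layout, not from
/// the original source): for struct { float; i32 } at SIMD16 with a
/// non-uniform Dst, element 1 sits at byte offset 4 in the layout, so its
/// alias starts at byte offset 4 * dstLanes within Dst and covers
/// 1 * dstLanes i32 elements; each struct element is then copied recursively.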
17416 void EmitPass::emitCopyAll(CVariable* Dst, CVariable* Src, llvm::Type* Ty)
17417 {
17418     if (Src->GetVarType() == EVARTYPE_PREDICATE)
17419     {
17420         IGC_ASSERT_MESSAGE(!Ty->isVectorTy(), "vector of predicates?");
17421         IGC_ASSERT(Dst->GetVarType() == Src->GetVarType());
17422         CVariable* Zero = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
17423         m_encoder->Or(Dst, Src, Zero);
17424         m_encoder->Push();
17425     }
17426     else if (Ty->isVectorTy())
17427     {
17428         unsigned NElts = (unsigned)cast<IGCLLVM::FixedVectorType>(Ty)->getNumElements();
17429         emitVectorCopy(Dst, Src, NElts);
17430     }
17431     else if (Ty->isStructTy())
17432     {
17433         IGC_ASSERT(Dst->GetType() == ISA_TYPE_B);
17434         IGC_ASSERT(Src->GetType() == ISA_TYPE_B);
17435 
17436         if (!Src->IsUniform() && Dst->IsUniform())
17437         {
17438             IGC_ASSERT_MESSAGE(0, "Does not support non-uniform to uniform struct copy");
17439         }
17440 
17441         StructType* STy = dyn_cast<StructType>(Ty);
17442         const StructLayout* SL = m_DL->getStructLayout(STy);
17443         unsigned srcLanes = Src->IsUniform() ? 1 : numLanes(m_currShader->m_dispatchSize);
17444         unsigned dstLanes = Dst->IsUniform() ? 1 : numLanes(m_currShader->m_dispatchSize);
17445         for (unsigned i = 0; i < STy->getNumElements(); i++)
17446         {
17447             unsigned elementOffset = (unsigned)SL->getElementOffset(i);
17448             Type* elementType = STy->getElementType(i);
17449 
17450             unsigned numElements = 1;
17451             if (auto elementVectorType = dyn_cast<IGCLLVM::FixedVectorType>(elementType))
17452             {
17453                 numElements = (unsigned)elementVectorType->getNumElements();
17454             }
17455 
17456             VISA_Type visaTy = m_currShader->GetType(elementType);
17457 
17458             CVariable* srcElement = m_currShader->GetNewAlias(Src, visaTy, elementOffset * srcLanes, numElements * srcLanes, Src->IsUniform());
17459             CVariable* dstElement = m_currShader->GetNewAlias(Dst, visaTy, elementOffset * dstLanes, numElements * dstLanes, Dst->IsUniform());
17460             emitCopyAll(dstElement, srcElement, elementType);
17461         }
17462     }
17463     else
17464     {
17465         IGC_ASSERT_MESSAGE(Ty->isSingleValueType(), "not supported");
17466         m_encoder->Copy(Dst, Src);
17467         m_encoder->Push();
17468     }
17469 }
17470 
17471 void EmitPass::emitSqrt(Instruction* inst)
17472 {
17473     GenIntrinsicInst* intrinCall = llvm::cast<GenIntrinsicInst>(inst);
17474     CVariable* src0 = GetSymbol(intrinCall->getArgOperand(0));
17475     src0 = BroadcastIfUniform(src0);
17476 
17477     m_encoder->Sqrt(m_destination, src0);
17478 }
17479 
17480 void EmitPass::emitFrc(llvm::GenIntrinsicInst* inst)
17481 {
17482     CVariable* src0 = GetSymbol(inst->getArgOperand(0));
17483     src0 = BroadcastIfUniform(src0);
17484 
17485     m_encoder->Frc(m_destination, src0);
17486 }
17487 
17488 void IGC::EmitPass::emitCanonicalize(llvm::Instruction* inst, const DstModifier& modifier)
17489 {
17490     // Force denormal fp values to be flushed to zero. Select one of two possible solutions:
17491     // 1. add inputVal, -0.0
17492     // 2. mul inputVal, 1.0
17493     // A normalized fp value isn't changed.
17494     // The operation is done only if particular flags are set.
17495     // If the instruction should be emitted anyway, flushing a subnormal to zero has to be implemented in another way.
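    // Example (illustrative, assumed values): with m_floatDenormMode32 set to
    // FLOAT_DENORM_FLUSH_TO_ZERO, a subnormal input such as 1e-40f becomes
    // add(1e-40f, -0.0f) == +0.0f, while a normal input passes through unchanged.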
17496     CodeGenContext* pCodeGenContext = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
17497     bool flushVal = pCodeGenContext->m_floatDenormMode16 == ::IGC::FLOAT_DENORM_FLUSH_TO_ZERO && inst->getType()->isHalfTy();
17498     flushVal = flushVal || (pCodeGenContext->m_floatDenormMode32 == ::IGC::FLOAT_DENORM_FLUSH_TO_ZERO && inst->getType()->isFloatTy());
17499     flushVal = flushVal || (pCodeGenContext->m_floatDenormMode64 == ::IGC::FLOAT_DENORM_FLUSH_TO_ZERO && inst->getType()->isDoubleTy());
17500     if (flushVal || modifier.sat)
17501     {
17502         CVariable* inputVal = GetSymbol(inst->getOperand(0));
17503         CVariable* negativeZero = m_currShader->GetScalarConstant(llvm::ConstantFP::get(inst->getType(), -0.0));
17504         m_encoder->SetDstModifier(modifier);
17505         m_encoder->Add(m_destination, inputVal, negativeZero);
17506     }
17507 }
17508 
17509 void IGC::EmitPass::emitStaticConstantPatchValue(llvm::StaticConstantPatchIntrinsic* staticConstantPatch32)
17510 {
17511     std::string patchName = staticConstantPatch32->getPatchName().str();
17512     m_encoder->AddVISASymbol(patchName, m_destination);
17513 }
17514 
17515 // emit llvm.bswap
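// Scalar reference for the 32-bit case handled below (an illustrative sketch,
// not part of the pass):
//
//   #include <cstdint>
//   static inline uint32_t bswap32(uint32_t x) {
//       return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
//              ((x << 8) & 0x00FF0000u) | (x << 24);
//   }
//
// The emitter realizes the same permutation with byte movs: for nBytes = 4 it
// swaps byte i with byte 3 - i, and for nBytes = 8 it swaps bytes within each
// DWord first and then exchanges the two DWords.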
17516 void EmitPass::emitLLVMbswap(IntrinsicInst* inst)
17517 {
17518     Type* Ty = inst->getType();
17519     Value* Arg = inst->getArgOperand(0);
17520     uint32_t nBytes = int_cast<uint32_t>(m_DL->getTypeSizeInBits(Ty));
17521     IGC_ASSERT_MESSAGE(nBytes % 16 == 0, "Incorrect llvm.bswap");
17522     IGC_ASSERT_MESSAGE(!Ty->isVectorTy(), "Incorrect llvm.bswap");
17523     nBytes >>= 3;  // Now, nBytes is in units of bytes.
17524 
17525     CVariable* Src = GetSymbol(Arg);
17526     CVariable* Dst = m_destination;
17527     uint32_t width = numLanes(m_currShader->m_SIMDSize);
17528     bool srcUniform = Src->IsUniform();
17529     bool dstUniform = Dst->IsUniform();
17530 
17531     CVariable* SrcB = m_currShader->GetNewAlias(Src, ISA_TYPE_UB, 0, 0);
17532     if (nBytes == 2 || nBytes == 4)
17533     {
17534         CVariable* DstB = m_currShader->GetNewAlias(Dst, ISA_TYPE_UB, 0, 0);
17535 
17536         // Generating byte mov
17537         for (unsigned i = 0; i < nBytes / 2; ++i)
17538         {
17539             // swap bytes[i] with bytes[j].
17540             uint32_t j = (nBytes - 1) - i;
17541 
17542             m_encoder->SetSrcSubReg(0, i);
17543             m_encoder->SetSrcRegion(0, srcUniform ? 0 : nBytes, 1, 0);
17544             m_encoder->SetDstSubReg(j);
17545             m_encoder->SetDstRegion(dstUniform ? 1 : nBytes);
17546             m_encoder->Copy(DstB, SrcB);
17547             m_encoder->Push();
17548 
17549             m_encoder->SetSrcSubReg(0, j);
17550             m_encoder->SetSrcRegion(0, srcUniform ? 0 : nBytes, 1, 0);
17551             m_encoder->SetDstSubReg(i);
17552             m_encoder->SetDstRegion(dstUniform ? 1 : nBytes);
17553             m_encoder->Copy(DstB, SrcB);
17554             m_encoder->Push();
17555         }
17556     }
17557     else if (nBytes == 8)
17558     {
17559         // Need to do the lower DW and the upper DW separately first.
17560         m_currShader->GetNewAlias(Src, ISA_TYPE_UD, 0, 0);
17561         CVariable* DstH = m_currShader->GetNewVariable(
17562             Src->GetNumberElement(),
17563             ISA_TYPE_UD,
17564             Src->GetAlign(),
17565             srcUniform,
17566             CName::NONE);
17567         CVariable* DstL = m_currShader->GetNewVariable(
17568             Src->GetNumberElement(),
17569             ISA_TYPE_UD,
17570             Src->GetAlign(),
17571             srcUniform,
17572             CName::NONE);
17573         CVariable* DstHB = m_currShader->GetNewAlias(DstH, ISA_TYPE_UB, 0, 0);
17574         CVariable* DstLB = m_currShader->GetNewAlias(DstL, ISA_TYPE_UB, 0, 0);
17575 
17576         bool split = (width == 16);
17577         for (unsigned n = 0; n < 2; ++n)
17578         {
17579             for (unsigned i = 0; i < 4; ++i)
17580             {
17581                 // swap bytes[i] and bytes[j]
17582                 uint32_t j = 3 - i;
17583                 if (split && !srcUniform)
17584                 {
17585                     m_encoder->SetSrcSubReg(0, 4 * n + i);
17586                     m_encoder->SetSrcRegion(0, 8, 1, 0);
17587                     m_encoder->SetDstSubReg(j);
17588                     m_encoder->SetDstRegion(4);
17589                     m_encoder->SetSimdSize(SIMDMode::SIMD8);
17590                     m_encoder->SetMask(EMASK_Q1);
17591                     m_encoder->Copy(n == 0 ? DstHB : DstLB, SrcB);
17592                     m_encoder->Push();
17593 
17594                     m_encoder->SetSrcSubReg(0, 2 * getGRFSize() + 4 * n + i);
17595                     m_encoder->SetSrcRegion(0, 8, 1, 0);
17596                     m_encoder->SetDstSubReg(getGRFSize() + j);
17597                     m_encoder->SetDstRegion(4);
17598                     m_encoder->SetSimdSize(SIMDMode::SIMD8);
17599                     m_encoder->SetMask(EMASK_Q2);
17600                     m_encoder->Copy(n == 0 ? DstHB : DstLB, SrcB);
17601                     m_encoder->Push();
17602                 }
17603                 else
17604                 {
17605                     // DstH[B]/DstL[B] have the same uniformity as Src!
17606                     m_encoder->SetSrcSubReg(0, 4 * n + i);
17607                     m_encoder->SetSrcRegion(0, srcUniform ? 0 : 8, 1, 0);
17608                     m_encoder->SetDstSubReg(j);
17609                     m_encoder->SetDstRegion(srcUniform ? 1 : 4);
17610                     m_encoder->Copy(n == 0 ? DstHB : DstLB, SrcB);
17611                     m_encoder->Push();
17612                 }
17613             }
17614         }
17615 
17616         // Now, mov DstH and DstL to Dst
17617         CVariable* DstD = m_currShader->GetNewAlias(Dst, ISA_TYPE_UD, 0, 0);
17618 
17619         // When dst is uniform, it does not cross 2 GRFs, so no split is needed.
17620         if (split && !dstUniform)
17621         {
17622             m_encoder->SetSimdSize(SIMDMode::SIMD8);
17623             m_encoder->SetMask(EMASK_Q1);
17624             m_encoder->SetDstRegion(2);
17625             m_encoder->Copy(DstD, DstL);
17626             m_encoder->Push();
17627 
17628             m_encoder->SetSimdSize(SIMDMode::SIMD8);
17629             m_encoder->SetMask(EMASK_Q2);
17630             m_encoder->SetSrcSubReg(0, srcUniform ? 0 : 8);
17631             m_encoder->SetDstSubReg(16);
17632             m_encoder->SetDstRegion(2);
17633             m_encoder->Copy(DstD, DstL);
17634             m_encoder->Push();
17635 
17636             m_encoder->SetSimdSize(SIMDMode::SIMD8);
17637             m_encoder->SetMask(EMASK_Q1);
17638             m_encoder->SetDstSubReg(1);
17639             m_encoder->SetDstRegion(2);
17640             m_encoder->Copy(DstD, DstH);
17641             m_encoder->Push();
17642 
17643             m_encoder->SetSimdSize(SIMDMode::SIMD8);
17644             m_encoder->SetMask(EMASK_Q2);
17645             m_encoder->SetSrcSubReg(0, srcUniform ? 0 : 8);
17646             m_encoder->SetDstSubReg(17);
17647             m_encoder->SetDstRegion(2);
17648             m_encoder->Copy(DstD, DstH);
17649             m_encoder->Push();
17650         }
17651         else
17652         {
17653             m_encoder->SetDstRegion(dstUniform ? 1 : 2);
17654             m_encoder->Copy(DstD, DstL);
17655             m_encoder->Push();
17656             m_encoder->SetDstSubReg(1);
17657             m_encoder->SetDstRegion(dstUniform ? 1 : 2);
17658             m_encoder->Copy(DstD, DstH);
17659             m_encoder->Push();
17660         }
17661     }
17662     else
17663     {
17664         IGC_ASSERT_MESSAGE(0, "Unsupported type for llvm.bswap!");
17665         return;
17666     }
17667 }
17668 
17669 void EmitPass::setPredicateForDiscard(CVariable* pPredicate)
17670 {
17671     // Input predicate parameter is used when resource variable is non-uniform
17672     // and compiler needs to create the resource loop.
17673     bool isInversePredicate = false;
17674     if (m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
17675     {
17676         CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
17677         if (psProgram->HasDiscard() && psProgram->GetDiscardPixelMask())
17678         {
17679             if (pPredicate != nullptr)
17680             {
17681                 m_encoder->SetNoMask();
17682                 m_encoder->GenericAlu(EOPCODE_NOT, pPredicate, pPredicate, nullptr);
17683                 m_encoder->Push();
17684                 m_encoder->SetNoMask();
17685                 m_encoder->GenericAlu(EOPCODE_OR, pPredicate, pPredicate, psProgram->GetDiscardPixelMask());
17686                 m_encoder->Push();
17687             }
17688             else
17689             {
17690                 pPredicate = psProgram->GetDiscardPixelMask();
17691             }
17692             isInversePredicate = true;
17693         }
17694     }
17695     if (pPredicate != nullptr)
17696     {
17697         m_encoder->SetPredicate(pPredicate);
17698         m_encoder->SetInversePredicate(isInversePredicate);
17699     }
17700 }
17701 
17702 void EmitPass::ForceDMask(bool createJmpForDiscard)
17703 {
17704     if (createJmpForDiscard &&
17705         m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
17706     {
17707         CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
17708         if (psProgram->HasDiscard() && psProgram->GetDiscardPixelMask())
17709         {
17710             m_labelForDMaskJmp = m_encoder->GetNewLabelID("discard");
17711             m_encoder->Jump(psProgram->GetDiscardPixelMask(),
17712                 m_labelForDMaskJmp);
17713             m_encoder->Push();
17714         }
17715     }
17716 
17717     if (m_pattern->NeedVMask())
17718     {
17719         m_encoder->SetVectorMask(false);
17720     }
17721 }
17722 
17723 void EmitPass::ResetVMask(bool createJmpForDiscard)
17724 {
17725     if (m_pattern->NeedVMask())
17726     {
17727         m_encoder->SetVectorMask(true);
17728     }
17729 
17730     if (createJmpForDiscard &&
17731         m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
17732     {
17733         CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
17734         if (psProgram->HasDiscard() && psProgram->GetDiscardPixelMask())
17735         {
17736             m_encoder->Label(m_labelForDMaskJmp);
17737             m_encoder->Push();
17738         }
17739     }
17740 }
17741 
17742 void EmitPass::emitGetBufferPtr(GenIntrinsicInst* inst)
17743 {
17744     Value* buf_idxv = inst->getOperand(0);
17745     Value* bufTyVal = inst->getOperand(1);
17746     IGC_ASSERT(isa<ConstantInt>(bufTyVal));
17747     BufferType bufType = (BufferType)(cast<ConstantInt>(bufTyVal)->getZExtValue());
17748 
17749     uint bti = 0;
17750     switch (bufType)
17751     {
17752     case UAV:
17753         bti = m_currShader->m_pBtiLayout->GetUavIndex(0);
17754         break;
17755     case CONSTANT_BUFFER:
17756         bti = m_currShader->m_pBtiLayout->GetConstantBufferIndex(0);
17757         break;
17758     case RESOURCE:
17759         bti = m_currShader->m_pBtiLayout->GetTextureIndex(0);
17760         break;
17761     case RENDER_TARGET:
17762         bti = m_currShader->m_pBtiLayout->GetRenderTargetIndex(0);
17763         break;
17764     case SAMPLER:
17765         bti = 0;
17766         break;
17767     default:
17768         IGC_ASSERT_MESSAGE(0, "unexpect buffer type for GetBufferPtr");
17769         break;
17770     }
17771     CVariable* indexCVar = GetSymbol(buf_idxv);
17772 
17773     if (bti)
17774     {
17775         CVariable* btiCVar = m_currShader->ImmToVariable(bti, ISA_TYPE_UD);
17776         m_encoder->Add(m_destination, indexCVar, btiCVar);
17777     }
17778     else
17779     {
17780         m_encoder->Copy(m_destination, indexCVar);
17781     }
17782     m_encoder->Push();
17783 
17784     // Set BTI; a BTI of zero is also a valid value.
17785     bool directIdx = (llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0))) ? true : false;
17786     m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, bufType, 0, bti);
17787 }
17788 
17789 ResourceDescriptor EmitPass::GetResourceVariable(Value* resourcePtr)
17790 {
17791     ResourceDescriptor resource;
17792     BufferType bufType = BUFFER_TYPE_UNKNOWN;
17793     uint as = 0;
17794     if (auto *GII = dyn_cast<GenIntrinsicInst>(resourcePtr); GII &&
17795         GII->getIntrinsicID() == GenISAIntrinsic::GenISA_GetBufferPtr)
17796     {
17797         // from GetBufferPtr
17798         Value* bufTyVal = GII->getOperand(1);
17799         IGC_ASSERT(isa<ConstantInt>(bufTyVal));
17800         bufType = (BufferType)(cast<ConstantInt>(bufTyVal)->getZExtValue());
17801         resource.m_resource = GetSymbol(resourcePtr);
17802     }
17803     else
17804     {
17805         as = resourcePtr->getType()->getPointerAddressSpace();
17806         uint bufferIndex = 0;
17807         bool directIndexing = false;
17808 
17809         bufType = DecodeAS4GFXResource(as, directIndexing, bufferIndex);
17810 
17811         if (IsBindless(bufType) || !directIndexing)
17812         {
17813             if (isa<IntToPtrInst>(resourcePtr))
17814             {
17815                 IntToPtrInst* i2p = dyn_cast<IntToPtrInst>(resourcePtr);
17816                 resource.m_resource = GetSymbol(i2p->getOperand(0));
17817             }
17818             else
17819             {
17820                 resource.m_resource = GetSymbol(resourcePtr);
17821             }
17822 
17823             if (resource.m_resource->GetElemSize() < 4)
17824             {
17825                 // vISA assumes all BTIs are 32-bit. Need to cast; otherwise the higher bits would be uninitialized.
17826                 unsigned numInstance = resource.m_resource->GetNumberInstance();
17827                 CVariable* newResource = m_currShader->GetNewVariable(
17828                     resource.m_resource->GetNumberElement(),
17829                     ISA_TYPE_UD,
17830                     resource.m_resource->IsUniform() ? EALIGN_DWORD : EALIGN_GRF,
17831                     resource.m_resource->IsUniform(),
17832                     numInstance,
17833                     CName::NONE);
17834 
17835                 m_encoder->Cast(newResource, resource.m_resource);
17836 
17837                 if (numInstance == 2)
17838                 {
17839                     m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
17840                     m_encoder->Cast(newResource, resource.m_resource);
17841                     m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
17842                 }
17843 
17844                 resource.m_resource = newResource;
17845             }
17846 
17847             if (!directIndexing)
17848             {
17849                 m_currShader->SetBindingTableEntryCountAndBitmap(false, bufType, 0, 0);
17850             }
17851         }
17852         else
17853         {
17854             uint bti = 0;
17855             switch (bufType)
17856             {
17857             case UAV:
17858                 bti = m_currShader->m_pBtiLayout->GetUavIndex(bufferIndex);
17859                 break;
17860             case CONSTANT_BUFFER:
17861                 bti = m_currShader->m_pBtiLayout->GetConstantBufferIndex(bufferIndex);
17862                 break;
17863             case RESOURCE:
17864                 bti = m_currShader->m_pBtiLayout->GetTextureIndex(bufferIndex);
17865                 break;
17866             case RENDER_TARGET:
17867                 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
17868                 bti = m_currShader->m_pBtiLayout->GetRenderTargetIndex(bufferIndex);
17869                 break;
17870             case SLM:
17871                 bti = 254;  // \todo, remove hard-coding
17872                 break;
17873             default:
17874                 bti = m_currShader->m_pBtiLayout->GetStatelessBindingTableIndex();
17875                 break;
17876             }
17877             resource.m_resource = m_currShader->ImmToVariable(bti, ISA_TYPE_UD);
17878             m_currShader->SetBindingTableEntryCountAndBitmap(directIndexing, bufType, bufferIndex, bti);
17879         }
17880     }
17881 
17882 
17883     if (IsBindless(bufType))
17884     {
17885         resource.m_surfaceType = ESURFACE_BINDLESS;
17886     }
17887     else if (IsSSHbindless(bufType))
17888     {
17889         resource.m_surfaceType = ESURFACE_SSHBINDLESS;
17890     }
17891     else if (bufType == SLM)
17892     {
17893         resource.m_surfaceType = ESURFACE_SLM;
17894     }
17895     else if (bufType == CONSTANT_BUFFER || bufType == UAV ||
17896         bufType == RESOURCE || bufType == RENDER_TARGET)
17897     {
17898         resource.m_surfaceType = ESURFACE_NORMAL;
17899     }
17900     else
17901     {
17902         resource.m_surfaceType = ESURFACE_STATELESS;
17903     }
17904     return resource;
17905 }
17906 
17907 SamplerDescriptor EmitPass::GetSamplerVariable(Value* sampleOp)
17908 {
17909     SamplerDescriptor sampler;
17910     unsigned int samplerIdx = 0;
17911     BufferType sampType = BUFFER_TYPE_UNKNOWN;
17912 
17913     if (GenIntrinsicInst* sample = dyn_cast<GenIntrinsicInst>(sampleOp))
17914     {
17915         if (sample->getIntrinsicID() == GenISAIntrinsic::GenISA_GetBufferPtr)
17916         {
17917             Value* bufTyVal = cast<GenIntrinsicInst>(sampleOp)->getOperand(1);
17918             IGC_ASSERT(isa<ConstantInt>(bufTyVal));
17919             sampType = (BufferType)(cast<ConstantInt>(bufTyVal)->getZExtValue());
17920             sampler.m_sampler = GetSymbol(sampleOp);
17921             IGC_ASSERT(sampType == SAMPLER);
17922             sampler.m_samplerType = ESAMPLER_NORMAL;
17923             return sampler;
17924         }
17925     }
17926 
17927     bool isBindless = false;
17928     bool directIdx = false;
17929 
17930     sampType = DecodeAS4GFXResource(
17931         sampleOp->getType()->getPointerAddressSpace(),
17932         directIdx, samplerIdx);
17933     isBindless = (sampType == BINDLESS_SAMPLER);
17934     sampler.m_samplerType =
17935         isBindless ? ESAMPLER_BINDLESS : ESAMPLER_NORMAL;
17936 
17937     if (isBindless || !directIdx)
17938     {
17939         sampler.m_sampler = GetSymbol(sampleOp);
17940     }
17941     else
17942     {
17943         sampler.m_sampler = m_currShader->ImmToVariable(
17944             samplerIdx, ISA_TYPE_UD);
17945     }
17946     return sampler;
17947 }
17948 
17949 bool EmitPass::ResourceLoopHeader(
17950     ResourceDescriptor& resource,
17951     CVariable*& flag,
17952     uint& label)
17953 {
17954     SamplerDescriptor sampler;
17955     return ResourceLoopHeader(resource, sampler, flag, label);
17956 }
17957 
17958 // Insert a loop header to handle a non-uniform resource and sampler.
17959 // This generates sub-optimal code for SIMD32; it can be revisited if we need better code generation.
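// The generated structure is roughly (illustrative pseudo-code, not vISA
// syntax emitted by this pass):
//
//   resource_loop:
//     uniformRes = broadcast(resource[first active lane])
//     flag       = (resource == uniformRes)     // per-lane compare
//     (flag)  <memory/sample operation using uniformRes>
//     (~flag) jmp resource_loop                 // redo the remaining lanes
//
// so each iteration services the subset of lanes sharing one descriptor.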
17960 bool EmitPass::ResourceLoopHeader(
17961     ResourceDescriptor& resource,
17962     SamplerDescriptor& sampler,
17963     CVariable*& flag,
17964     uint& label)
17965 {
17966     if (resource.m_surfaceType != ESURFACE_BINDLESS &&
17967         resource.m_surfaceType != ESURFACE_SSHBINDLESS &&
17968         resource.m_surfaceType != ESURFACE_NORMAL)
17969     {
17970         // Loop only needed for access with surface state
17971         return false;
17972     }
17973     bool uniformResource = resource.m_resource == nullptr || resource.m_resource->IsUniform();
17974     bool uniformSampler = sampler.m_sampler == nullptr || sampler.m_sampler->IsUniform();
17975     if (uniformResource && uniformSampler)
17976     {
17977         return false;
17978     }
17979     CVariable* resourceFlag = nullptr;
17980     CVariable* samplerFlag = nullptr;
17981     CVariable* offset = nullptr;
17982     label = m_encoder->GetNewLabelID("resource_loop");
17983     m_encoder->Label(label);
17984     m_encoder->Push();
17985     if (!uniformResource)
17986     {
17987         ResourceDescriptor uniformResource;
17988         resourceFlag = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
17989         uniformResource.m_surfaceType = resource.m_surfaceType;
17990         uniformResource.m_resource = UniformCopy(resource.m_resource, offset);
17991         m_encoder->Cmp(EPREDICATE_EQ, resourceFlag, uniformResource.m_resource, resource.m_resource);
17992         m_encoder->Push();
17993         resource = uniformResource;
17994     }
17995     if (!uniformSampler)
17996     {
17997         SamplerDescriptor uniformSampler;
17998         samplerFlag = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
17999         uniformSampler.m_samplerType = sampler.m_samplerType;
18000         uniformSampler.m_sampler = UniformCopy(sampler.m_sampler, offset);
18001         m_encoder->Cmp(EPREDICATE_EQ, samplerFlag, uniformSampler.m_sampler, sampler.m_sampler);
18002         m_encoder->Push();
18003         sampler = uniformSampler;
18004     }
18005     if (resourceFlag && samplerFlag)
18006     {
18007         flag = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
18008         m_encoder->And(flag, resourceFlag, samplerFlag);
18009         m_encoder->Push();
18010     }
18011     else
18012     {
18013         flag = resourceFlag != nullptr ? resourceFlag : samplerFlag;
18014     }
18015     if (m_SimdMode == SIMDMode::SIMD32 && m_currShader->m_numberInstance == 2)
18016     {
18017         // For SIMD32 we need to initialize the other half of the flag to 1.
18018         // ToDo: check if this is actually necessary, as the other half should not get used.
18019         m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
18020         m_encoder->SetSrcRegion(0, 0, 1, 0);
18021         m_encoder->SetSrcRegion(1, 0, 1, 0);
18022         m_encoder->Cmp(EPREDICATE_EQ, flag, m_currShader->GetR0(), m_currShader->GetR0());
18023         m_encoder->Push();
18024         m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
18025     }
18026     return true;
18027 }
18028 
18029 void EmitPass::ResourceLoopBackEdge(bool needLoop, CVariable* flag, uint label)
18030 {
18031     if (needLoop)
18032     {
18033         m_encoder->SetInversePredicate(true);
18034         m_encoder->Jump(flag, label);
18035         m_encoder->Push();
18036 
18037         m_currShader->GetContext()->Stats().IncreaseI64("ResourceLoopCount", 1, numLanes(m_currShader->m_dispatchSize));
18038     }
18039 }
18040 
18041 void EmitPass::emitStateRegID(uint32_t BitStart, uint32_t BitEnd)
18042 {
18043     // For example, emitStateRegID(14, 18) would extract SR0 bits [18:14]
18044     // into the destination.
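    // Equivalently (illustrative scalar form): dst = (sr0.0 & BITMASK_RANGE(14, 18)) >> 14.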
18045     uint32_t and_imm = BITMASK_RANGE(BitStart, BitEnd);
18046     uint32_t shr_imm = BitStart;
18047     m_encoder->And(m_destination, m_currShader->GetSR0(), m_currShader->ImmToVariable(and_imm, ISA_TYPE_UD));
18048     m_encoder->Shr(m_destination, m_destination, m_currShader->ImmToVariable(shr_imm, ISA_TYPE_UD));
18049     m_encoder->Push();
18050 }
18051 
18052 void EmitPass::emitMulAdd16(Instruction* I, const SSource Sources[2], const DstModifier& DstMod)
18053 {
18054     CVariable* LVar = GetSrcVariable(Sources[0]);
18055     CVariable* RVar = GetSrcVariable(Sources[1]);
18056     VISA_Type LTy = LVar->GetType();
18057     VISA_Type RTy = RVar->GetType();
18058 
18059     // Use SetSourceModifiers() to set subReg correctly.
18060     SetSourceModifiers(0, Sources[0]);
18061     SetSourceModifiers(1, Sources[1]);
18062     if (!LVar->IsUniform() && (!Sources[0].region_set) &&
18063         (LTy == ISA_TYPE_W || LTy == ISA_TYPE_UW))
18064     {
18065         m_encoder->SetSrcRegion(0, 16, 8, 2);
18066     }
18067     if (!RVar->IsUniform() && (!Sources[1].region_set) &&
18068         (RTy == ISA_TYPE_W || RTy == ISA_TYPE_UW))
18069     {
18070         m_encoder->SetSrcRegion(1, 16, 8, 2);
18071     }
18072 
18073     unsigned opc = I->getOpcode();
18074     if (opc == Instruction::Mul) {
18075         m_encoder->Mul(m_destination, LVar, RVar);
18076     }
18077     else if (opc == Instruction::Sub) {
18078         e_modifier mod = CombineModifier(EMOD_NEG, Sources[1].mod);
18079         m_encoder->SetSrcModifier(1, mod); // override modifier
18080         m_encoder->Add(m_destination, LVar, RVar);
18081     }
18082     else {
18083         IGC_ASSERT_MESSAGE(I->getOpcode() == Instruction::Add, "Unknown Opcode.");
18084         m_encoder->Add(m_destination, LVar, RVar);
18085     }
18086     m_encoder->Push();
18087 }
18088 
18089 CVariable* EmitPass::GetDispatchMask()
18090 {
18091     return m_currShader->GetNewAlias(
18092         m_currShader->GetSR0(),
18093         ISA_TYPE_UD,
18094         (m_pattern->NeedVMask() ? 3 : 2) * SIZE_DWORD,
18095         1);
18096 }
18097 
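// Illustrative mapping of the pause-length operand handled below (assumed
// inputs, not from the original source): an operand of 10 is encoded as
// 10 << 5 = 0x140, operands <= 4 are clamped to 0x0080, and operands >= 32 to
// 0x03E0, before the value is written into the TSC pause field.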
18098 void EmitPass::emitThreadPause(llvm::GenIntrinsicInst* inst)
18099 {
18100     CVariable* TSC_reg = m_currShader->GetTSC();
18101     CVariable* TSC_pause = m_currShader->GetNewAlias(TSC_reg, ISA_TYPE_UD, 16, 1);
18102     uint64_t var = GetImmediateVal(inst->getOperand(0));
18103     if (var >= 32)
18104         var = 0x03E0;
18105     else if (var <= 4)
18106         var = 0x0080;
18107     else
18108         var <<= 5;
18109     m_encoder->Copy(TSC_pause, m_currShader->ImmToVariable(var, ISA_TYPE_UD));
18110     m_encoder->Push();
18111 }
18112 
18113 
18114 void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
18115 {
18116     CVariable* destination = m_destination;
18117     if (!m_destination->IsUniform())
18118     {
18119         destination = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
18120     }
18121 
18122     bool uniform_active_lane = false;
18123     if (ConstantInt * pConst = dyn_cast<ConstantInt>(inst->getOperand(0)))
18124     {
18125         if (pConst->getZExtValue() == 1)
18126             uniform_active_lane = true;
18127     }
18128 
18129 
18130     if (!m_currShader->InsideDivergentCF(inst))
18131     {
18132         CVariable* f0 = GetSymbol(inst->getOperand(0));
18133 
18134         if (m_currShader->m_dispatchSize == SIMDMode::SIMD8 && m_currShader->HasFullDispatchMask())
18135         {
18136             // for SIMD8 make sure the higher 8 bits of the flag are not copied
18137             destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE);
18138         }
18139         m_encoder->BoolToInt(destination, f0);
18140         if (!m_currShader->HasFullDispatchMask())
18141         {
18142             m_encoder->And(destination, GetDispatchMask(), destination);
18143         }
18144     }
18145     else
18146     {
18147         CVariable* exeMask = GetExecutionMask();
18148         if (!uniform_active_lane)
18149         {
18150             // (W)     and (1|M0)   r1.0:ud r0.0<0;1;0>:ud f0.0:uw
18151             CVariable* f0 = GetSymbol(inst->getOperand(0));
18152             CVariable* vf0 = m_currShader->GetNewVariable(
18153                 1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
18154             m_encoder->SetSimdSize(SIMDMode::SIMD1);
18155             m_encoder->SetNoMask();
18156             m_encoder->BoolToInt(vf0, f0);
18157             m_encoder->Push();
18158 
18159             m_encoder->SetSimdSize(SIMDMode::SIMD1);
18160             m_encoder->SetNoMask();
18161             m_encoder->And(destination, exeMask, vf0);
18162             m_encoder->Push();
18163         }
18164         else
18165         {
18166             m_encoder->Cast(destination, exeMask);
18167             m_encoder->Push();
18168         }
18169     }
18170 
18171     if (destination != m_destination)
18172     {
18173         m_encoder->Cast(m_destination, destination);
18174         m_encoder->Push();
18175     }
18176 }
18177 
18178 void EmitPass::emitWaveInverseBallot(llvm::GenIntrinsicInst* inst)
18179 {
18180     CVariable* Mask = GetSymbol(inst->getOperand(0));
18181 
18182     if (Mask->IsUniform())
18183     {
18184         if (m_encoder->IsSecondHalf())
18185             return;
18186 
18187         m_encoder->SetP(m_destination, Mask);
18188         return;
18189     }
18190 
18191     // The uniform case should by far be the most common.  Otherwise,
18192     // fall back and compute:
18193     //
18194     // (val & (1 << id)) != 0
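    // Worked example (assumed values): with Mask = 0b1010, lane 1 computes
    // (0b1010 & (1 << 1)) != 0 -> true, while lane 0 computes
    // (0b1010 & (1 << 0)) != 0 -> false.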
18195     CVariable* Temp = m_currShader->GetNewVariable(
18196         numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
18197 
18198     m_currShader->GetSimdOffsetBase(Temp);
18199     m_encoder->Shl(Temp, m_currShader->ImmToVariable(1, ISA_TYPE_UD), Temp);
18200     m_encoder->And(Temp, Mask, Temp);
18201     m_encoder->Cmp(EPREDICATE_NE,
18202         m_destination, Temp, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
18203 }
18204 
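// Informal summary of the identity values chosen by GetReductionOp() below
// (derived from the switch that follows):
//   SUM/FSUM -> 0, PROD -> 1, FPROD -> 1.0,
//   IMAX/UMAX -> smallest value of the type, IMIN/UMIN -> largest value,
//   OR/XOR -> 0, AND -> all ones, FMIN -> +inf, FMAX -> -inf.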
18205 static void GetReductionOp(WaveOps op, Type* opndTy, uint64_t& identity, e_opcode& opcode, VISA_Type& type)
18206 {
18207     auto getISAType = [](Type* ty, bool isSigned = true)
18208     {
18209         if (ty->isHalfTy())
18210         {
18211             return ISA_TYPE_HF;
18212         }
18213         if (ty->isFloatTy())
18214         {
18215             return ISA_TYPE_F;
18216         }
18217         if (ty->isDoubleTy())
18218         {
18219             return ISA_TYPE_DF;
18220         }
18221         IGC_ASSERT_MESSAGE(ty->isIntegerTy(), "expect integer type");
18222         auto width = dyn_cast<IntegerType>(ty)->getBitWidth();
18223         IGC_ASSERT(width == 8 || width == 16 || width == 32 || width == 64);
18224         if (isSigned)
18225         {
18226             return width == 64 ? ISA_TYPE_Q : (width == 16 ? ISA_TYPE_W : (width == 8 ? ISA_TYPE_B : ISA_TYPE_D));
18227         }
18228         else
18229         {
18230             return width == 64 ? ISA_TYPE_UQ : (width == 16 ? ISA_TYPE_UW : (width == 8 ? ISA_TYPE_UB : ISA_TYPE_UD));
18231         }
18232     };
18233     auto getMaxVal = [](VISA_Type ty) -> uint64_t
18234     {
18235         switch (ty)
18236         {
18237         case ISA_TYPE_D:
18238             return std::numeric_limits<int>::max();
18239         case ISA_TYPE_UD:
18240             return std::numeric_limits<uint32_t>::max();
18241         case ISA_TYPE_B:
18242             return std::numeric_limits<int8_t>::max();
18243         case ISA_TYPE_UB:
18244             return std::numeric_limits<uint8_t>::max();
18245         case ISA_TYPE_W:
18246             return std::numeric_limits<int16_t>::max();
18247         case ISA_TYPE_UW:
18248             return std::numeric_limits<uint16_t>::max();
18249         case ISA_TYPE_Q:
18250             return std::numeric_limits<int64_t>::max();
18251         case ISA_TYPE_UQ:
18252             return std::numeric_limits<uint64_t>::max();
18253         default:
18254             IGC_ASSERT_MESSAGE(0, "unexpected visa type");
18255             return std::numeric_limits<int>::max();
18256         }
18257     };
18258     auto getMinVal = [](VISA_Type ty) -> uint64_t
18259     {
18260         switch (ty)
18261         {
18262         case ISA_TYPE_D:
18263             return std::numeric_limits<int>::min();
18264         case ISA_TYPE_UD:
18265             return std::numeric_limits<uint32_t>::min();
18266         case ISA_TYPE_B:
18267             return std::numeric_limits<int8_t>::min();
18268         case ISA_TYPE_UB:
18269             return std::numeric_limits<uint8_t>::min();
18270         case ISA_TYPE_W:
18271             return std::numeric_limits<int16_t>::min();
18272         case ISA_TYPE_UW:
18273             return std::numeric_limits<uint16_t>::min();
18274         case ISA_TYPE_Q:
18275             return std::numeric_limits<int64_t>::min();
18276         case ISA_TYPE_UQ:
18277             return std::numeric_limits<uint64_t>::min();
18278         default:
18279             IGC_ASSERT_MESSAGE(0, "unexpected visa type");
18280             return std::numeric_limits<int>::min();
18281         }
18282     };
18283 
    switch (op)
    {
    case WaveOps::SUM:
        identity = 0;
        opcode = EOPCODE_ADD;
        type = getISAType(opndTy);
        break;
    case WaveOps::PROD:
        identity = 1;
        opcode = EOPCODE_MUL;
        type = getISAType(opndTy);
        break;
    case WaveOps::UMAX:
        opcode = EOPCODE_MAX;
        type = getISAType(opndTy, false);
        identity = getMinVal(type);
        break;
    case WaveOps::UMIN:
        opcode = EOPCODE_MIN;
        type = getISAType(opndTy, false);
        identity = getMaxVal(type);
        break;
    case WaveOps::IMAX:
        opcode = EOPCODE_MAX;
        type = getISAType(opndTy);
        identity = getMinVal(type);
        break;
    case WaveOps::IMIN:
        opcode = EOPCODE_MIN;
        type = getISAType(opndTy);
        identity = getMaxVal(type);
        break;
    case WaveOps::OR:
        identity = 0;
        opcode = EOPCODE_OR;
        type = getISAType(opndTy, false);
        break;
    case WaveOps::XOR:
        identity = 0;
        opcode = EOPCODE_XOR;
        type = getISAType(opndTy, false);
        break;
    case WaveOps::AND:
        opcode = EOPCODE_AND;
        type = getISAType(opndTy, false);
        identity = dyn_cast<IntegerType>(opndTy)->getBitMask();
        break;
    case WaveOps::FSUM:
        opcode = EOPCODE_ADD;
        type = getISAType(opndTy);
        identity = 0;
        break;
    case WaveOps::FPROD:
        opcode = EOPCODE_MUL;
        type = getISAType(opndTy);
        identity = getFPOne(type);
        break;
    case WaveOps::FMIN:
        opcode = EOPCODE_MIN;
        type = getISAType(opndTy);
        identity = dyn_cast<ConstantFP>(ConstantFP::getInfinity(opndTy))->getValueAPF().bitcastToAPInt().getZExtValue();
        break;
    case WaveOps::FMAX:
        opcode = EOPCODE_MAX;
        type = getISAType(opndTy);
        identity = dyn_cast<ConstantFP>(ConstantFP::getInfinity(opndTy, true))->getValueAPF().bitcastToAPInt().getZExtValue();
        break;
    default:
        IGC_ASSERT(0);
    }
}

void EmitPass::emitWavePrefix(WavePrefixIntrinsic* I)
{
    Value* Mask = I->getMask();
    if (auto * CI = dyn_cast<ConstantInt>(Mask))
    {
        // If the mask is all set, then we just pass a null
        // mask to emitScan() indicating we don't want to
        // emit any predication.
        if (CI->isAllOnesValue())
            Mask = nullptr;
    }
    emitScan(
        I->getSrc(), I->getOpKind(), I->isInclusiveScan(), Mask, false);
}

void EmitPass::emitQuadPrefix(QuadPrefixIntrinsic* I)
{
    emitScan(
        I->getSrc(), I->getOpKind(), I->isInclusiveScan(), nullptr, true);
}

void EmitPass::emitScan(
    Value* Src, IGC::WaveOps Op,
    bool isInclusiveScan, Value* Mask, bool isQuad)
{
    VISA_Type type;
    e_opcode opCode;
    uint64_t identity = 0;
    GetReductionOp(Op, Src->getType(), identity, opCode, type);
    CVariable* src = GetSymbol(Src);
    CVariable* dst[2] = { nullptr, nullptr };
    CVariable* Flag = Mask ? GetSymbol(Mask) : nullptr;

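    // An inclusive scan of {a0, a1, a2, ...} yields {a0, a0 op a1, a0 op a1 op a2, ...};
    // an exclusive scan shifts that by one lane, with lane 0 receiving the
    // identity element.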
    emitPreOrPostFixOp(
        opCode, identity, type,
        false, src, dst, Flag,
        !isInclusiveScan, isQuad);

    // Now that we've computed the result in temporary registers,
    // make sure we only write the results to lanes participating in the
    // scan as specified by 'mask'.
    if (Flag)
        m_encoder->SetPredicate(Flag);
    m_encoder->Copy(m_destination, dst[0]);
    if (m_currShader->m_numberInstance == 2)
    {
        m_encoder->SetSecondHalf(true);
        m_encoder->Copy(m_destination, dst[1]);
    }
    m_encoder->Push();
}

void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
{
    CVariable* src = GetSymbol(inst->getOperand(0));
    const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
    VISA_Type type;
    e_opcode opCode;
    uint64_t identity = 0;
    GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
    CVariable* dst = m_destination;
    emitReductionAll(opCode, identity, type, false, src, dst);
}

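// Clustered reduction: the subgroup is split into consecutive clusters of
// 'clusterSize' lanes and each cluster is reduced independently, so every lane
// receives the result of its own cluster.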
void EmitPass::emitWaveClustered(llvm::GenIntrinsicInst* inst)
{
    CVariable* src = GetSymbol(inst->getOperand(0));
    const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
    const unsigned int clusterSize = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(2))->getZExtValue());
    VISA_Type type;
    e_opcode opCode;
    uint64_t identity = 0;
    GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
    CVariable *dst = m_destination;
    emitReductionClustered(opCode, identity, type, false, clusterSize, src, dst);
}

void EmitPass::emitDP4A(GenIntrinsicInst* GII, const SSource* Sources, const DstModifier& modifier) {
    GenISAIntrinsic::ID GIID = GII->getIntrinsicID();
    CVariable* dst = m_destination;
    CVariable *src0, *src1, *src2;

    // Check if Sources was set in PatternMatch
    if (!Sources)
    {
        src0 = GetSymbol(GII->getOperand(0));
        src1 = GetSymbol(GII->getOperand(1));
        src2 = GetSymbol(GII->getOperand(2));
    }
    else
    {
        m_encoder->SetSrcRegion(1, Sources[1].region[0], Sources[1].region[1], Sources[1].region[2]);
        src0 = GetSrcVariable(Sources[0]);
        src1 = GetSrcVariable(Sources[1]);
        src2 = GetSrcVariable(Sources[2]);
    }

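    // dp4a computes, per lane, dst = src0 + dot(src1, src2), where src1 and
    // src2 are each a dword holding four packed 8-bit integers. The intrinsic
    // suffix encodes the signedness of those bytes (first letter for src1,
    // second for src2; 's' = signed, 'u' = unsigned), which is expressed below
    // by retyping the operands to D or UD.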
    // Set correct signedness of src1.
    if (GIID == GenISAIntrinsic::GenISA_dp4a_ss ||
        GIID == GenISAIntrinsic::GenISA_dp4a_su)
        src1 = m_currShader->BitCast(src1, ISA_TYPE_D);
    if (GIID == GenISAIntrinsic::GenISA_dp4a_uu ||
        GIID == GenISAIntrinsic::GenISA_dp4a_us)
        src1 = m_currShader->BitCast(src1, ISA_TYPE_UD);
    // Set correct signedness of src2.
    if (GIID == GenISAIntrinsic::GenISA_dp4a_ss ||
        GIID == GenISAIntrinsic::GenISA_dp4a_us)
        src2 = m_currShader->BitCast(src2, ISA_TYPE_D);
    if (GIID == GenISAIntrinsic::GenISA_dp4a_uu ||
        GIID == GenISAIntrinsic::GenISA_dp4a_su)
        src2 = m_currShader->BitCast(src2, ISA_TYPE_UD);
    // Emit dp4a.
    m_encoder->SetDstModifier(modifier);
    m_encoder->dp4a(dst, src0, src1, src2);
    m_encoder->Push();
}

void EmitPass::emitUnmaskedRegionBoundary(bool start)
{
    if (start) {
      m_encoder->BeginForcedNoMaskRegion();
    } else {
      m_encoder->EndForcedNoMaskRegion();
    }
}

void EmitPass::emitDebugPlaceholder(llvm::GenIntrinsicInst* I)
{
    m_encoder->Loc(I->getDebugLoc().getLine());
    m_encoder->DebugLinePlaceholder();
}

// Dummy instruction that won't be optimized away.
void EmitPass::emitDummyInst(llvm::GenIntrinsicInst* GII)
{
    CVariable* dst = m_currShader->GetNULL();
    CVariable* src = m_currShader->GetR0();
    m_encoder->Copy(dst, src);
    m_encoder->Push();
}

void EmitPass::emitImplicitArgIntrinsic(llvm::GenIntrinsicInst* I)
{
    Function* parentFunc = I->getParent()->getParent();
    MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();

    // We can just drop the intrinsic if it has no uses.
    // It should have been lowered in the LowerImplicitArgIntrinsics pass but was not cleaned up.
    if (I->getNumUses() == 0) return;

    if (I->getIntrinsicID() == GenISAIntrinsic::ID::GenISA_getR0)
    {
        // Returns the predefined R0 register
        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(m_currShader->getGRFSize() / SIZE_DWORD));
        m_encoder->SetNoMask();
        m_currShader->CopyVariable(GetSymbol(I), m_currShader->GetR0());
        return;
    }

    Function* groupHead = nullptr;
    if (!m_FGA || m_FGA->isGroupHead(parentFunc)) {
        groupHead = parentFunc;
    }
    else {
        groupHead = m_FGA->getSubGroupMap(parentFunc);
    }

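    // For a function that belongs to a function group, the implicit arguments
    // are attached to the group head; resolve the corresponding argument there
    // and copy its symbol into this intrinsic's destination.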
    if (isEntryFunc(pMdUtils, groupHead))
    {
        // Map to the root kernel's implicit arg symbol
        ImplicitArgs IAS(*groupHead, pMdUtils);
        ImplicitArg::ArgType IAtype = ImplicitArgs::getArgType(I->getIntrinsicID());
        Argument* arg = IAS.getImplicitArg(*groupHead, IAtype);
        IGC_ASSERT_MESSAGE(arg, "Implicit argument not found!");
        if (arg)
        {
            m_encoder->SetNoMask();
            m_currShader->CopyVariable(GetSymbol(I), m_currShader->getOrCreateArgumentSymbol(arg, false));
        }
    }
    else
    {
        IGC_ASSERT_MESSAGE(0, "Intrinsics used in stackcalls have not been lowered!");
    }
}

void EmitPass::emitStoreImplBufferPtr(llvm::GenIntrinsicInst* I)
{
    if (m_currShader->HasStackCalls())
        m_currShader->CopyVariable(m_currShader->GetImplArgBufPtr(), GetSymbol(I->getArgOperand(0)));
}

void EmitPass::emitStoreLocalIdBufferPtr(llvm::GenIntrinsicInst* I)
{
    if (m_currShader->HasStackCalls())
        m_currShader->CopyVariable(m_currShader->GetLocalIdBufPtr(), GetSymbol(I->getArgOperand(0)));
}

void EmitPass::emitLoadImplBufferPtr(llvm::GenIntrinsicInst* I)
{
    m_encoder->SetUniformSIMDSize(lanesToSIMDMode(1));
    m_encoder->SetNoMask();
    m_encoder->SetSrcSubReg(0, 0);
    m_encoder->Copy(m_destination, m_currShader->GetImplArgBufPtr());
    m_encoder->Push();
}

void EmitPass::emitLoadLocalIdBufferPtr(llvm::GenIntrinsicInst* I)
{
    m_encoder->SetUniformSIMDSize(lanesToSIMDMode(1));
    m_encoder->SetNoMask();
    m_encoder->SetSrcSubReg(0, 0);
    m_encoder->Copy(m_destination, m_currShader->GetLocalIdBufPtr());
    m_encoder->Push();
}


void EmitPass::emitDpas(GenIntrinsicInst* GII, const SSource* Sources, const DstModifier& modifier)
{
    // Note that in the intrinsic's arguments, activation comes before weight,
    // but in vISA (Gen ISA) weight comes before activation.
    CVariable* dst = m_destination;
    CVariable* activation = GetSrcVariable(Sources[1]);
    CVariable* weight = GetSrcVariable(Sources[2]);

    // The input may be null if it is integer 0 or positive float 0.0f.
    CVariable* input = nullptr;
    Constant* CSTVal = dyn_cast<Constant>(Sources[0].value);
    if (!(CSTVal && CSTVal->isNullValue()))
    {
        input = GetSrcVariable(Sources[0]);
    }

    // Float dpas uses short, interpreted as bfloat16, for either input or dst.
    ConstantInt* pa = dyn_cast<ConstantInt>(GII->getOperand(3)); // Activation's precision
    ConstantInt* pb = dyn_cast<ConstantInt>(GII->getOperand(4)); // Weight's precision
    ConstantInt* sdepth = dyn_cast<ConstantInt>(GII->getOperand(5));
    ConstantInt* rcount = dyn_cast<ConstantInt>(GII->getOperand(6));
    ConstantInt* dpasw = dyn_cast<ConstantInt>(GII->getOperand(7));
    int PA = (int)pa->getSExtValue();
    int PB = (int)pb->getSExtValue();
    int SD = (int)sdepth->getSExtValue();
    int RC = (int)rcount->getSExtValue();
    bool IsDpasw = dpasw->getValue().getBoolValue();

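    // dpas is a dot-product-accumulate-systolic operation: conceptually
    // dst = input + activation * weight, where each operand dword holds packed
    // sub-dword elements whose precision is selected by PA/PB, SD is the
    // systolic depth (number of operand dwords accumulated per channel), and
    // RC is the repeat count (number of result rows). IsDpasw selects the
    // fused-EU (dpasw) form of the instruction.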
    // Make sure all operands are non-uniform. If any of them is uniform,
    // broadcast it to a non-uniform variable.
    // (Note that activation should be uniform for non-subgroup dpas.)
    if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_sub_group_dpas) {
        activation = BroadcastIfUniform(activation);
    }
    weight = BroadcastIfUniform(weight);
    if (input) {
        input = BroadcastIfUniform(input);
    }

    // Sanity check: make sure that activation and weight are always D/UD.
    if (activation->GetType() != ISA_TYPE_UD && activation->GetType() != ISA_TYPE_D)
    {
        activation = m_currShader->GetNewAlias(activation, ISA_TYPE_UD, 0, 0);
    }
    if (weight->GetType() != ISA_TYPE_UD && weight->GetType() != ISA_TYPE_D)
    {
        weight = m_currShader->GetNewAlias(weight, ISA_TYPE_UD, 0, 0);
    }

    m_encoder->dpas(dst, input, weight, (PrecisionType)PB, activation, (PrecisionType)PA,
        (uint8_t)SD, (uint8_t)RC, IsDpasw);
    m_encoder->Push();
}

// Conversion between float types
void EmitPass::emitfcvt(llvm::GenIntrinsicInst* GII)
{
    /// Divide N into multiples of 16 and the remainder into 8, 4, 2, 1.
    /// Each chunk takes two elements in execsizeSeq: the first is the execsize,
    /// the second the starting offset.
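    /// For example, N = 27 produces {16, 0}, {8, 16}, {2, 24}, {1, 26}.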
    auto getAllExecsize = [=](SmallVector<uint32_t, 16> & execsizeSeq, uint32_t N) {
        // Max execution size is 16.
        int n = (int)N / 16;
        uint32_t offset = 0;
        for (int i = 0; i < n; ++i) {
            execsizeSeq.push_back(16);
            execsizeSeq.push_back(offset);
            offset += 16;
        }

        int m = (int)N % 16;
        for (uint32_t s = 8; m > 0; s = s / 2)
        {
            if (m >= (int)s)
            {
                execsizeSeq.push_back(s);
                execsizeSeq.push_back(offset);
                offset += s;
                m -= s;
            }
        }
    };

    Value* sVal = GII->getOperand(0);
    CVariable* src = GetSymbol(sVal);
    CVariable* dst = m_destination;

    Type* dTy = GII->getType();
    IGCLLVM::FixedVectorType* dVTy = dyn_cast<IGCLLVM::FixedVectorType>(dTy);
    Type* sTy = sVal->getType();
    IGCLLVM::FixedVectorType* sVTy = dyn_cast<IGCLLVM::FixedVectorType>(sTy);
    int nelts = dVTy ? (int)dVTy->getNumElements() : 1;
    int src_nelts = sVTy ? (int)sVTy->getNumElements() : 1;
    if (nelts != src_nelts)
    {
        IGC_ASSERT_MESSAGE(0, "Different #elements in src and dst of conversion intrinsic!");
        return;
    }

    bool isSrcUniform = src->IsUniform();
    bool isDstUniform = dst->IsUniform();
    uint16_t nsimdsize = numLanes(m_currShader->m_SIMDSize);
    GenISAIntrinsic::ID id = GII->getIntrinsicID();

    ERoundingMode FP_RM = static_cast<ERoundingMode>(
        m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
    if (id == GenISAIntrinsic::GenISA_ftobf) {
        ConstantInt* CI = cast<ConstantInt>(GII->getOperand(1));
        FP_RM = (ERoundingMode)CI->getZExtValue();
    }
    else if (id == GenISAIntrinsic::GenISA_2fto2bf)
    {
        ConstantInt* CI = cast<ConstantInt>(GII->getOperand(2));
        FP_RM = (ERoundingMode)CI->getZExtValue();
    }
    else {
        FP_RM = ERoundingMode::ROUND_TO_ANY;
    }

    if (FP_RM != ERoundingMode::ROUND_TO_ANY)
        SetRoundingMode_FP(FP_RM);

    // vISA instructions don't support an immediate source of type BF.
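    // A bfloat16 value is the upper 16 bits of the corresponding float32, so
    // shifting the 16-bit immediate left by 16 reproduces the float bit
    // pattern and the conversion becomes a plain float copy.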
    if (id == GenISAIntrinsic::GenISA_bftof && src->IsImmediate())
    {
        uint32_t imm32 = ((uint32_t)src->GetImmediateValue()) & 0xFFFF;
        imm32 = imm32 << 16; // widen the BF bits into a float immediate
        CVariable* fSrc = m_currShader->ImmToVariable((uint64_t)imm32, ISA_TYPE_F);
        m_encoder->Copy(dst, fSrc);
        m_encoder->Push();
        return;
    }

    if (id == GenISAIntrinsic::GenISA_ftobf ||
        id == GenISAIntrinsic::GenISA_bftof)
    {
        CVariable* tDst = nullptr, * tSrc = nullptr;
        if (id == GenISAIntrinsic::GenISA_ftobf) {
            tDst = m_currShader->GetNewAlias(dst, ISA_TYPE_BF, 0, 0);
            tSrc = src;
        }
        else if (id == GenISAIntrinsic::GenISA_bftof) {
            tDst = dst;
            tSrc = m_currShader->GetNewAlias(src, ISA_TYPE_BF, 0, 0);
        }
        else {
            IGC_ASSERT_EXIT_MESSAGE(0, "Something wrong in cvt!");
        }

        if (isSrcUniform && isDstUniform)
        {
            SmallVector<uint32_t, 16> insts;
            getAllExecsize(insts, nelts);
            for (int i = 0, s = (int)insts.size(); i < s; i += 2)
            {
                uint32_t esize = insts[i];
                SIMDMode simdMode = lanesToSIMDMode(esize);
                uint32_t offset = insts[i + 1];

                m_encoder->SetNoMask();
                m_encoder->SetUniformSIMDSize(simdMode);
                m_encoder->SetDstSubReg(offset);
                m_encoder->SetSrcSubReg(0, offset);
                // by default, uniform's region is (0, 1, 0)
                if (esize > 1) {
                    uint32_t stride = (esize >= 8 ? 8 : esize);
                    m_encoder->SetSrcRegion(0, stride, stride, 1);
                }
                m_encoder->Cast(tDst, tSrc);
                m_encoder->Push();
            }
        }
        else
        {
            uint32_t dstOff = 0, srcOff = 0;
            for (int i = 0; i < nelts; ++i)
            {
                m_encoder->SetDstSubReg(dstOff);
                m_encoder->SetSrcSubReg(0, srcOff);
                m_encoder->Cast(tDst, tSrc);
                m_encoder->Push();

                dstOff += (isDstUniform ? 1 : nsimdsize);
                srcOff += (isSrcUniform ? 1 : nsimdsize);
            }
        }
    }
    else if (id == GenISAIntrinsic::GenISA_2fto2bf)
    {
        CVariable* srcs[2];
        srcs[0] = src;
        srcs[1] = GetSymbol(GII->getOperand(1));
        CVariable* tDst = m_currShader->GetNewAlias(dst, ISA_TYPE_BF, 0, 0);
        SmallVector<uint32_t, 16> insts;
        getAllExecsize(insts, nelts);
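        // Each pair of bf16 results is packed into one dword of the destination:
        // source e (0 or 1) lands in the low/high 16 bits respectively, hence the
        // destination stride of 2 and the sub-register offset of 2*offset + e.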
        for (int e = 0; e < 2; ++e)
        {
            CVariable* tSrc = srcs[e];
            isSrcUniform = tSrc->IsUniform();
            if (isSrcUniform && isDstUniform)
            {
                for (int i = 0, s = (int)insts.size(); i < s; i += 2)
                {
                    uint32_t esize = insts[i];
                    SIMDMode simdMode = lanesToSIMDMode(esize);
                    uint32_t offset = insts[i + 1];

                    m_encoder->SetNoMask();
                    m_encoder->SetUniformSIMDSize(simdMode);
                    m_encoder->SetDstSubReg(2 * offset + e);
                    m_encoder->SetDstRegion(2);
                    m_encoder->SetSrcSubReg(0, offset);
                    // by default, uniform's region is (0, 1, 0)
                    if (esize > 1) {
                        uint32_t stride = (esize >= 8 ? 8 : esize);
                        m_encoder->SetSrcRegion(0, stride, stride, 1);
                    }
                    m_encoder->Cast(tDst, tSrc);
                    m_encoder->Push();
                }
            }
            else
            {
                uint32_t dstOff = 0, srcOff = 0;
                for (int i = 0; i < nelts; ++i)
                {
                    m_encoder->SetDstSubReg(2 * dstOff + e);
                    m_encoder->SetDstRegion(2);
                    m_encoder->SetSrcSubReg(0, srcOff);
                    m_encoder->Cast(tDst, tSrc);
                    m_encoder->Push();

                    dstOff += (isDstUniform ? 1 : nsimdsize);
                    srcOff += (isSrcUniform ? 1 : nsimdsize);
                }
            }
        }
    }
    else
    {
        IGC_ASSERT_MESSAGE(0, "ICE: unhandled gen intrinsic within cvt!");
    }

    if (FP_RM != ERoundingMode::ROUND_TO_ANY) {
        ResetRoundingMode(GII);
    }
}
