/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "IGC/common/StringMacros.hpp"
#include "EmitVISAPass.hpp"
#include "CISABuilder.hpp"
#include "VertexShaderCodeGen.hpp"
#include "GeometryShaderCodeGen.hpp"
#include "PixelShaderCodeGen.hpp"
#include "OpenCLKernelCodeGen.hpp"
#include "ComputeShaderCodeGen.hpp"
#include "HullShaderCodeGen.hpp"
#include "DomainShaderCodeGen.hpp"
#include "DeSSA.hpp"
#include "messageEncoding.hpp"
#include "PayloadMapping.hpp"
#include "VectorProcess.hpp"
#include "ShaderCodeGen.hpp"
#include "common/allocator.h"
#include "common/debug/Dump.hpp"
#include "common/igc_regkeys.hpp"
#include "common/Stats.hpp"
#include "Compiler/CISACodeGen/helper.h"
#include "Compiler/DebugInfo/ScalarVISAModule.h"
#include "common/secure_mem.h"
#include "DebugInfo/VISAIDebugEmitter.hpp"
#include "DebugInfo/EmitterOpts.hpp"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
#include "AdaptorCommon/ImplicitArgs.hpp"
#include "Compiler/IGCPassSupport.h"
#include "common/LLVMWarningsPush.hpp"
#include "llvmWrapper/IR/Instructions.h"
#include "llvmWrapper/IR/DerivedTypes.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvmWrapper/IR/Intrinsics.h"
#include "common/LLVMWarningsPop.hpp"
#include "Probe/Assertion.h"

#include <fstream>

using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;
using namespace std;

char EmitPass::ID = 0;

/// Divide N into multiples of M (M must be a power of two), and the remainder
/// into M/2, M/4, ..., 1. Each chunk adds two elements to execsizeSeq: the
/// first is the execsize, the second the starting offset.
/// For example, with M = 16 and N = 47:
/// {16, 0}, {16, 16}, {8, 32}, {4, 40}, {2, 44}, {1, 46}
static void splitIntoPowerOfTwo(SmallVector<uint32_t, 16>& execsizeSeq, uint32_t N, uint32_t M)
{
    // Max execution size is 16.
    int n = (int)N / (int)M;
    uint32_t offset = 0;
    for (int i = 0; i < n; ++i) {
        execsizeSeq.push_back(16);
        execsizeSeq.push_back(offset);
        offset += 16;
    }

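    // Split the remainder into decreasing power-of-two chunks: M/2, M/4, ..., 1.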
    int m = (int)(N % M);
    for (uint32_t s = M / 2; m > 0; s = s / 2)
    {
        if (m >= (int)s)
        {
            execsizeSeq.push_back(s);
            execsizeSeq.push_back(offset);
            offset += s;
            m -= s;
        }
    }
}

namespace IGC
{
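    // Annotator for the "visa.ll" dump: decorates the printed LLVM IR with the
    // basic-block ids and vISA instruction ids recorded during emission.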
    class VisaIdAnnotator : public llvm::AssemblyAnnotationWriter
    {
        DenseMap<const Value*, uint32_t> m_rootToVISAId;
        DenseMap<const BasicBlock*, uint32_t> m_blockId;

    public:
        VisaIdAnnotator() {}

        void emitBasicBlockStartAnnot(const BasicBlock* BB, formatted_raw_ostream& OS) override
        {
            OS << "; BB";
            if (m_blockId.count(BB)) {
                OS << m_blockId[BB] << " ";
            }
            OS << ":\n";
        }

        void printInfoComment(const Value& V, formatted_raw_ostream& OS) override
        {
            if (m_rootToVISAId.count(&V))
                OS << "\t\t; visa id: " << m_rootToVISAId[&V];
        }

        void trackVisaId(const Instruction* I, uint32_t vid) { m_rootToVISAId[I] = vid; }
        void trackBlockId(const BasicBlock* BB, uint32_t bbid) { m_blockId[BB] = bbid; }
    };
}


EmitPass::EmitPass(CShaderProgram::KernelShaderMap& shaders, SIMDMode mode, bool canAbortOnSpill, ShaderDispatchMode shaderMode, PSSignature* pSignature)
    : FunctionPass(ID),
    m_SimdMode(mode),
    m_ShaderDispatchMode(shaderMode),
    m_shaders(shaders),
    m_currShader(nullptr),
    m_encoder(nullptr),
    m_canAbortOnSpill(canAbortOnSpill),
    m_roundingMode_FP(ERoundingMode::ROUND_TO_NEAREST_EVEN),
    m_roundingMode_FPCvtInt(ERoundingMode::ROUND_TO_ZERO),
    m_pSignature(pSignature),
    m_isDuplicate(false)
{
    // Before calling getAnalysisUsage() for EmitPass, the passes that it depends on need to be initialized
    initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
    initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
    initializeCodeGenPatternMatchPass(*PassRegistry::getPassRegistry());
    initializeDeSSAPass(*PassRegistry::getPassRegistry());
    initializeBlockCoalescingPass(*PassRegistry::getPassRegistry());
    initializeCoalescingEnginePass(*PassRegistry::getPassRegistry());
    initializeMetaDataUtilsWrapperPass(*PassRegistry::getPassRegistry());
    initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
    initializeVariableReuseAnalysisPass(*PassRegistry::getPassRegistry());
    initializeLiveVariablesPass(*PassRegistry::getPassRegistry());
}

EmitPass::~EmitPass()
{
}

// Switch to the payload section.
// When switching to the payload section, the code redirects vKernel to point to the payload section.
// m_destination (the live-out of interpolation) will be allocated before compiling the kernel.
void EmitPass::ContextSwitchPayloadSection(bool first)
{
    if (m_encoder->IsCodePatchCandidate())
    {
        if (first)
        {
            m_tmpDest = m_destination;
        }
        m_isDuplicate = first ? m_currShader->AppendPayloadSetup(m_destination) : false;
        // When duplication happens, multiple instructions in divergent branches write to the same VR.
        if (m_isDuplicate)
        {
            auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
            CVariable* src = m_destination;
            uint16_t size = m_destination->IsUniform() ? numLanes(uniformSIMDMode) :
                numLanes(m_currShader->m_SIMDSize);
            CVariable* newSource = m_currShader->GetNewVariable(
                size,
                src->GetType(),
                EALIGN_GRF,
                m_destination->IsUniform(),
                src->getName());
            m_currShader->AppendPayloadSetup(newSource);
            m_destination = newSource;
        }
        m_encoder->SetPayloadSectionAsPrimary();
    }
}

void EmitPass::ContextSwitchShaderBody(bool last)
{
    if (m_encoder->IsCodePatchCandidate())
    {
        m_encoder->SetPayloadSectionAsSecondary();
        if (last && m_isDuplicate)
        {
            m_encoder->Copy(m_tmpDest, m_destination);
            m_encoder->Push();
            m_destination = m_tmpDest;
        }
    }
}

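// Returns true when 'dst' occupies only half a GRF: minimal dispatch width
// with a packed 2-byte element type.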
bool EmitPass::isHalfGRFReturn(CVariable* dst, SIMDMode simdMode)
{
    auto typeSize = CEncoder::GetCISADataTypeSize(dst->GetType());
    return simdMode == m_currShader->m_Platform->getMinDispatchMode() &&
        typeSize == 2 && !dst->isUnpacked();
}

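// Returns true if 'use' is reachable from 'def' by following def-use chains
// for at most 'level' steps.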
static bool DefReachUseWithinLevel(llvm::Value* def, const llvm::Instruction* use, uint level)
{
    if (level == 0 || !def || !use)
        return false;
    for (auto useIter = def->user_begin(), E = def->user_end(); useIter != E; ++useIter)
    {
        llvm::Instruction* useInst = dyn_cast<llvm::Instruction>(*useIter);
        if (useInst)
        {
            if (useInst == use)
                return true;
            else
            {
                if (DefReachUseWithinLevel(useInst, use, level - 1))
                    return true;
            }
        }
    }
    return false;
}

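// Decide how many SIMD instances (1 or 2) are needed to emit the pattern rooted
// at 'sdag', and whether SIMD32 slicing may continue across it.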
uint EmitPass::DecideInstanceAndSlice(const llvm::BasicBlock& blk, SDAG& sdag, bool& slicing)
{
    m_encoder->SetSubSpanDestination(false);
    uint numInstance = m_currShader->m_numberInstance;

    slicing = (m_SimdMode == SIMDMode::SIMD32); // set to false if we don't want slicing

    bool hasValidDestination = (sdag.m_root->getType()->getTypeID() != llvm::Type::VoidTyID);

    // Disable for struct type destinations
    if (sdag.m_root->getType()->isStructTy())
    {
        hasValidDestination = false;
    }

    if (hasValidDestination)
    {
        m_destination = GetSymbol(sdag.m_root);
        numInstance = m_destination->GetNumberInstance();

        if (m_pattern->IsSubspanUse(sdag.m_root))
        {
            m_encoder->SetSubSpanDestination(true);
        }

        if (isa<CmpInst>(sdag.m_root))
        {
            if (DefReachUseWithinLevel(sdag.m_root, blk.getTerminator(), 4))
                slicing = false;
        }
        else if (IsUniformAtomic(sdag.m_root))
        {
            numInstance = 1;
            slicing = false;
        }
        else if (IsAtomicIntrinsic(GetOpCode(sdag.m_root)))
        {
            slicing = false;
        }
        else if (IsMediaIOIntrinsic(sdag.m_root))
        {
            numInstance = 1;
            slicing = false;
        }
        else if (getGRFSize() != 32 && IsSIMDBlockIntrinsic(sdag.m_root))
        {
            numInstance = 1;
            slicing = false;
        }
        else if (IsSubGroupIntrinsicWithSimd32Implementation(GetOpCode(sdag.m_root)))
        {
            numInstance = 1;
            slicing = false;
        }
        else if (m_destination->IsUniform())
        {
            // If this uniform value is involved in a phi-congruent class, its
            // live interval changes with slicing; therefore, we need to stop slicing.
            // \todo: is it a good idea to pre-schedule all uniform operations to the beginning of the block?
            if (m_deSSA->getRootValue(sdag.m_root))
                slicing = false;
        }
    }
    else
    {
        m_destination = nullptr;
        if (StoreInst* ST = dyn_cast<StoreInst>(sdag.m_root))
        {
            // Limit to OpenCL so far as it has uniform load/store support.
            if (isUniformStoreOCL(ST))
                numInstance = 1;
            slicing = false;
        }
        else if (sdag.m_root->isTerminator())
        {
            numInstance = 1;
            slicing = false;
        }
        else if (m_currShader->GetIsUniform(sdag.m_root))
        {
            numInstance = 1;
            // If this uniform value is involved in a phi-congruent class, its
            // live interval changes with slicing; therefore, we need to stop slicing.
            // \todo: is it a good idea to pre-schedule all uniform operations to the beginning of the block?
            if (m_deSSA->getRootValue(sdag.m_root))
                slicing = false;
        }
        else if (llvm::GenIntrinsicInst* pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(sdag.m_root))
        {
            GenISAIntrinsic::ID id = pIntrinsic->getIntrinsicID();
            if (id == GenISAIntrinsic::GenISA_threadgroupbarrier ||
                id == GenISAIntrinsic::GenISA_memoryfence ||
                id == GenISAIntrinsic::GenISA_flushsampler ||
                id == GenISAIntrinsic::GenISA_typedmemoryfence ||
                id == GenISAIntrinsic::GenISA_vaErode ||
                id == GenISAIntrinsic::GenISA_vaDilate ||
                id == GenISAIntrinsic::GenISA_vaMinMax ||
                id == GenISAIntrinsic::GenISA_vaMinMaxFilter ||
                id == GenISAIntrinsic::GenISA_vaConvolve ||
                id == GenISAIntrinsic::GenISA_vaConvolveGRF_16x1 ||
                id == GenISAIntrinsic::GenISA_vaConvolveGRF_16x4 ||
                id == GenISAIntrinsic::GenISA_vaCentroid ||
                id == GenISAIntrinsic::GenISA_vaBoolSum ||
                id == GenISAIntrinsic::GenISA_vaBoolCentroid ||
                id == GenISAIntrinsic::GenISA_MediaBlockWrite ||
                id == GenISAIntrinsic::GenISA_eu_thread_pause ||
                id == GenISAIntrinsic::GenISA_simdBlockWrite ||
                id == GenISAIntrinsic::GenISA_simdBlockWriteBindless)
            {
                numInstance = 1;
                slicing = false;
            }
        }
    }

    if (CallInst* callInst = dyn_cast<CallInst>(sdag.m_root))
    {
        // Disable slicing for function calls
        Function* F = dyn_cast<Function>(IGCLLVM::getCalledValue(callInst));
        if (!F || F->hasFnAttribute("visaStackCall"))
        {
            numInstance = 1;
            slicing = false;
        }
    }
    return numInstance;
}

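// Returns true for undef, a zero integer constant, or a floating-point constant
// whose bit pattern is all zeros (i.e. +0.0).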
bool EmitPass::IsUndefOrZeroImmediate(const Value* value)
{
    if (isUndefOrConstInt0(value))
    {
        return true;
    }

    if (const llvm::ConstantFP* CFP = llvm::dyn_cast<llvm::ConstantFP>(value))
    {
        APInt api = CFP->getValueAPF().bitcastToAPInt();
        if (api.getZExtValue() == 0)
        {
            return true;
        }
    }
    return false;
}

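// Select the shader (and its encoder) for the kernel that F belongs to, i.e. the
// head of F's function group when function-group analysis is available.
// Returns false if no shader is mapped for that kernel.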
bool EmitPass::setCurrentShader(llvm::Function* F)
{
    llvm::Function* Kernel = F;
    if (m_FGA)
    {
        if (!m_FGA->getModule())
        {
            m_FGA->rebuild(F->getParent());
        }
        auto FG = m_FGA->getGroup(F);
        if (!FG)
        {
            return false;
        }
        Kernel = FG->getHead();
    }
    else
    {
        // no analysis result available.
        m_FGA = nullptr;
    }

    auto Iter = m_shaders.find(Kernel);
    if (Iter == m_shaders.end())
    {
        return false;
    }
    m_currShader = Iter->second->GetOrCreateShader(m_SimdMode, m_ShaderDispatchMode);
    m_encoder = &(m_currShader->GetEncoder());
    return true;
}

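// The symbol-table dummy kernel only needs to be compiled when it actually
// exports something: external functions in its group, global symbols that
// require export or relocation, or symbols for imported
// ("referenced-indirectly") functions.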
bool EmitPass::compileSymbolTableKernel(llvm::Function* F)
{
    IGC_ASSERT(IGC::isIntelSymbolTableVoidProgram(F));

    // Check if there are external functions attached
    if ((m_FGA && m_FGA->getGroup(F) && !m_FGA->getGroup(F)->isSingle()))
    {
        return true;
    }
    // Check if there are global symbols attached
    else if (!m_moduleMD->inlineProgramScopeOffsets.empty())
    {
        for (auto it : m_moduleMD->inlineProgramScopeOffsets)
        {
            GlobalVariable* pGlobal = it.first;
            // Export the symbol if the global has external/common linkage
            if (m_moduleMD->compOpt.EnableTakeGlobalAddress && (pGlobal->hasCommonLinkage() || pGlobal->hasExternalLinkage()))
            {
                return true;
            }

            // Remove dead users at this point
            pGlobal->removeDeadConstantUsers();

            // Check if relocation is required by checking uses
            for (auto user : pGlobal->users())
            {
                if (isa<Instruction>(user))
                {
                    return true;
                }
            }
        }
    }
    // Check if symbols are required for imported function calls
    else
    {
        for (auto& FI : F->getParent()->getFunctionList())
        {
            if (FI.isDeclaration() &&
                FI.hasFnAttribute("referenced-indirectly") &&
                !FI.use_empty())
            {
                return true;
            }
        }
    }
    return false;
}

void EmitPass::CreateKernelShaderMap(CodeGenContext* ctx, MetaDataUtils* pMdUtils, llvm::Function& F)
{
    /* Moving CShaderProgram instantiation to EmitPass from codegen */
    // Instantiate CShaderProgram and create map only if m_shaders is empty
    if (m_shaders.empty())
    {
        /* OpenCL shader */
        if (ctx->type == ShaderType::OPENCL_SHADER)
        {
            for (auto i = pMdUtils->begin_FunctionsInfo(), e = pMdUtils->end_FunctionsInfo(); i != e; ++i)
            {
                Function* pFunc = i->first;
                // Skip non-kernel functions.
                if (!isEntryFunc(pMdUtils, pFunc))
                    continue;

                if (ctx->m_retryManager.kernelSet.empty() ||
                    ctx->m_retryManager.kernelSet.count(pFunc->getName().str()))
                {
                    m_shaders[pFunc] = new CShaderProgram(ctx, pFunc);
                    COMPILER_SHADER_STATS_INIT(m_shaders[pFunc]->m_shaderStats);
                }
            }
        }
        /* Pixel Shader */
        else if (ctx->type == ShaderType::PIXEL_SHADER)
        {
            Function* coarsePhase = nullptr;
            Function* pixelPhase = nullptr;
            NamedMDNode* coarseNode = ctx->getModule()->getNamedMetadata(NAMED_METADATA_COARSE_PHASE);
            NamedMDNode* pixelNode = ctx->getModule()->getNamedMetadata(NAMED_METADATA_PIXEL_PHASE);
            if (coarseNode)
            {
                coarsePhase = mdconst::dyn_extract<Function>(coarseNode->getOperand(0)->getOperand(0));
            }
            if (pixelNode)
            {
                pixelPhase = mdconst::dyn_extract<Function>(pixelNode->getOperand(0)->getOperand(0));
            }
            if (coarsePhase && pixelPhase)
            {
                // Multi stage PS
                CShaderProgram* pProgram = new CShaderProgram(ctx, &F);
                CPixelShader* pProgram8 =
                    static_cast<CPixelShader*>(pProgram->GetOrCreateShader(SIMDMode::SIMD8));
                CPixelShader* pProgram16 =
                    static_cast<CPixelShader*>(pProgram->GetOrCreateShader(SIMDMode::SIMD16));
                pProgram8->SetPSSignature(m_pSignature);
                pProgram16->SetPSSignature(m_pSignature);
                m_shaders[&F] = pProgram;
                COMPILER_SHADER_STATS_INIT(pProgram->m_shaderStats);
            }
            else
            {
                // Single PS
                // Assuming single shader information in metadata
                Function* pFunc = getUniqueEntryFunc(pMdUtils, ctx->getModuleMetaData());

                CShaderProgram* pProgram = new CShaderProgram(ctx, pFunc);
                m_shaders[pFunc] = pProgram;
                COMPILER_SHADER_STATS_INIT(pProgram->m_shaderStats);
            }
        }
        /* All other shader types */
        else
        {
            for (auto i = pMdUtils->begin_FunctionsInfo(), e = pMdUtils->end_FunctionsInfo(); i != e; ++i)
            {
                Function* pFunc = i->first;
                // Skip non-entry functions.
                if (!isEntryFunc(pMdUtils, pFunc))
                {
                    continue;
                }
                m_shaders[pFunc] = new CShaderProgram(ctx, pFunc);
                COMPILER_SHADER_STATS_INIT(m_shaders[pFunc]->m_shaderStats);
            }
        }
    }
}

bool EmitPass::runOnFunction(llvm::Function& F)
{
    m_currFuncHasSubroutine = false;

    m_pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
    if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
    {
        return false;
    }
    m_moduleMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();

    CreateKernelShaderMap(m_pCtx, pMdUtils, F);

    m_FGA = getAnalysisIfAvailable<GenXFunctionGroupAnalysis>();

    if ((IsStage1BestPerf(m_pCtx->m_CgFlag, m_pCtx->m_StagingCtx) ||
        IGC_IS_FLAG_ENABLED(ForceBestSIMD)) &&
        m_SimdMode == SIMDMode::SIMD8)
    {
        /* Don't do SIMD8 if SIMD16 has no spill */
        auto Iter = m_shaders.find(&F);
        if (Iter == m_shaders.end())
        {
            return false;
        }

        CShader* simd16Program = Iter->second->GetShader(SIMDMode::SIMD16);
        if (simd16Program &&
            simd16Program->ProgramOutput()->m_programBin != 0 &&
            simd16Program->ProgramOutput()->m_scratchSpaceUsedBySpills == 0)
            return false;
    }

    if (!setCurrentShader(&F))
    {
        return false;
    }

    // Dummy program is only used for symbol table info, check if compilation is required
    if (IGC::isIntelSymbolTableVoidProgram(&F))
    {
        if (!compileSymbolTableKernel(&F))
        {
            return false;
        }
    }

    m_DL = &F.getParent()->getDataLayout();
    m_pattern = &getAnalysis<CodeGenPatternMatch>();
    m_deSSA = &getAnalysis<DeSSA>();
    m_blockCoalescing = &getAnalysis<BlockCoalescing>();
    m_CE = &getAnalysis<CoalescingEngine>();
    m_VRA = &getAnalysis<VariableReuseAnalysis>();

    m_currShader->SetUniformHelper(&getAnalysis<WIAnalysis>());
    m_currShader->SetCodeGenHelper(m_pattern);
    m_currShader->SetDominatorTreeHelper(&getAnalysis<DominatorTreeWrapperPass>().getDomTree());
    m_currShader->SetMetaDataUtils(getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils());
    m_currShader->SetShaderSpecificHelper(this);
    m_currShader->SetDataLayout(m_DL);
    m_currShader->SetFunctionGroupAnalysis(m_FGA);
    m_currShader->SetPushInfoHelper(&(m_moduleMD->pushInfo));
    m_currShader->SetVariableReuseAnalysis(m_VRA);
    if (IGC_IS_FLAG_DISABLED(DisableDeSSA))
    {
        m_currShader->SetDeSSAHelper(m_deSSA);
    }
    // Add CCtuple root variables.
    if (IGC_IS_FLAG_DISABLED(DisablePayloadCoalescing)) {
        m_currShader->SetCoalescingEngineHelper(m_CE);
    }

    CShader* prevShader = m_pCtx->m_prevShader;
    bool isFuncGroupHead = !m_FGA || m_FGA->isGroupHead(&F);
    bool hasStackCall = m_FGA && m_FGA->getGroup(&F) && m_FGA->getGroup(&F)->hasStackCall();
    if (isFuncGroupHead)
    {
        if (hasStackCall)
        {
            m_currShader->SetHasStackCalls();
        }
        if (isIntelSymbolTableVoidProgram(&F))
        {
            m_currShader->SetIsIntelSymbolTableVoidProgram();
        }

        m_currShader->InitEncoder(m_SimdMode, m_canAbortOnSpill, m_ShaderDispatchMode);
        // Pre-analysis pass to be executed before call to visa builder so we can pass scratch space offset
        m_currShader->PreAnalysisPass();
        if (!m_currShader->CompileSIMDSize(m_SimdMode, *this, F))
        {
            return false;
        }

        VISAKernel* prevKernel = nullptr;

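        // Code patching: if the previous shader is a compatible patch candidate
        // (same SIMD width, compiled binary, no spills), reuse its payload
        // section as the base for this compilation.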
        if (prevShader &&
            m_currShader->IsPatchablePS() &&
            m_encoder->GetSimdSize() == prevShader->GetEncoder().GetSimdSize() &&
            prevShader->GetEncoder().IsCodePatchCandidate() &&
            prevShader->ProgramOutput()->m_programBin &&
            prevShader->ProgramOutput()->m_scratchSpaceUsedBySpills == 0)
        {
            prevKernel = prevShader->GetEncoder().GetVISAKernel();
            m_encoder->SetPayloadEnd(prevShader->GetEncoder().GetPayloadEnd());
        }

        if (IGC_GET_FLAG_VALUE(CodePatch) &&
            ((!m_pCtx->hash.nosHash) || IGC_GET_FLAG_VALUE(CodePatch) > CodePatch_Enable_NoLTO) &&
            m_currShader->IsPatchablePS() &&
            m_SimdMode == SIMDMode::SIMD16 &&
            (m_ShaderDispatchMode != ShaderDispatchMode::NOT_APPLICABLE || prevKernel) &&
            (IGC_GET_FLAG_VALUE(CodePatchLimit) == 0 || 2 <= IGC_GET_FLAG_VALUE(CodePatchLimit)))
        {
            m_encoder->SetIsCodePatchCandidate(true);

            // FIXME: Skip corner cases for now. Remove this later.
            for (uint i = 0; i < m_pattern->m_numBlocks && m_encoder->IsCodePatchCandidate(); i++)
            {
                SBasicBlock& block = m_pattern->m_blocks[i];
                auto I = block.m_dags.rbegin(), E = block.m_dags.rend();
                while (I != E && m_encoder->IsCodePatchCandidate())
                {
                    Instruction* llvmInst = I->m_root;
                    if (llvmInst->getOpcode() == Instruction::Call)
                    {
                        if (GenIntrinsicInst* I = dyn_cast<GenIntrinsicInst>(llvmInst))
                        {
                            switch (I->getIntrinsicID())
                            {
                            case GenISAIntrinsic::GenISA_PullSampleIndexBarys:
                            {
                                if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_PullSampleIndex) {
                                    m_encoder->SetIsCodePatchCandidate(false);
                                }
                            }
                            break;
                            case GenISAIntrinsic::GenISA_PullSnappedBarys:
                            {
                                if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_PullSnapped) {
                                    m_encoder->SetIsCodePatchCandidate(false);
                                }
                            }
                            break;
                            case GenISAIntrinsic::GenISA_PullCentroidBarys:
                            {
                                if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_PullCentroid) {
                                    m_encoder->SetIsCodePatchCandidate(false);
                                }
                            }
                            break;
                            case GenISAIntrinsic::GenISA_DCL_SystemValue:
                            {
                                // This is where we will have ZWDelta
                                if (IGC_GET_FLAG_VALUE(CodePatchFilter) & CODE_PATCH_NO_ZWDelta &&
                                    m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
                                {
                                    CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
                                    SGVUsage usage = (SGVUsage)llvm::cast<llvm::ConstantInt>(I->getOperand(0))->getZExtValue();
                                    if (usage == POSITION_Z &&
                                        (psProgram->GetPhase() == PSPHASE_PIXEL || psProgram->GetPhase() == PSPHASE_COARSE))
                                    {
                                        m_encoder->SetIsCodePatchCandidate(false);
                                    }
                                }
                            }
                            break;
                            default:
                                break;
                            }
                        }
                    }
                    ++I;
                }
            }
            if ((IGC_GET_FLAG_VALUE(CodePatchFilter) & (0x1 << 0x4)) &&
                m_pCtx->platform.getPlatformInfo().eProductFamily == IGFX_ALDERLAKE_P) {
                m_encoder->SetIsCodePatchCandidate(false);
            }
        }
        else
        {
            m_encoder->SetIsCodePatchCandidate(false);
        }

        // Check if the function, or the FG, has inline asm calls.
        // We need this to set the correct builder mode to parse inline asm.
        bool hasInlineAsmCall = m_pCtx->m_instrTypes.hasInlineAsm &&
            m_pCtx->m_DriverInfo.SupportInlineAssembly() &&
            (!m_FGA ? IGC::hasInlineAsmInFunc(F) : m_FGA->getGroup(&F)->hasInlineAsm());

        // call builder after pre-analysis pass where scratchspace offset to VISA is calculated
        m_encoder->InitEncoder(m_canAbortOnSpill, hasStackCall, hasInlineAsmCall, prevKernel);
        initDefaultRoundingMode();
        m_currShader->PreCompile();

        // initialize stack if having stack usage
        bool hasVLA = (m_FGA && m_FGA->getGroup(&F) && m_FGA->getGroup(&F)->hasVariableLengthAlloca()) || F.hasFnAttribute("hasVLA");
        if (hasStackCall || hasVLA)
        {
            m_encoder->InitFuncAttribute(&F, true);
            InitializeKernelStack(&F);
        }
        if (m_encoder->IsCodePatchCandidate())
        {
            m_currShader->SplitPayloadFromShader(&F);
        }
        m_currShader->AddPrologue();
    }
    else
    {
        // If kernel function is not compiled for the SIMD size then VISABuilder == nullptr
        if (m_currShader->GetEncoder().GetVISABuilder() == nullptr)
        {
            return false;
        }
        if (!m_currShader->CompileSIMDSize(m_SimdMode, *this, F))
        {
            return false;
        }
        m_currShader->BeginFunction(&F);
        if (m_FGA && m_FGA->useStackCall(&F))
        {
            m_encoder->InitFuncAttribute(&F, false);
            emitStackFuncEntry(&F);
        }
    }

    // Only apply WA to OCL shaders with stackcall enabled
    // TODO: Remove this WA once vISA handles the register copy
    bool needKernelArgOverrideWA = isFuncGroupHead && hasStackCall && m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER;
    if (needKernelArgOverrideWA)
    {
        // Requires early payload allocation to know the kernel arg offsets
        m_currShader->CacheArgumentsList();
        m_currShader->MapPushedInputs();
        m_currShader->AllocatePayload();

        // This WA copies all kernel args > r26.0 into a temp register when stackcalls are enabled.
        // Since the vISA stackcall ABI predefines the argument register as r26.0, if the payload is larger
        // than 26 GRFs then doing a stackcall will overwrite the payload registers.
        const int visaStackCallArgRegStart = 26;
        static const int64_t maxGRFOffset = visaStackCallArgRegStart * m_currShader->getGRFSize();
        llvm::IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt());
        for (auto& arg : F.args())
        {
            // Skip unused arguments
            if (arg.user_empty()) continue;

            Argument* kernArg = &arg;
            CVariable* kernArgV = m_currShader->GetSymbol(kernArg);
            // Get the allocated payload offset for this kernel arg
            int64_t offset = m_currShader->GetKernelArgOffset(kernArgV);
            // If kernel payload size exceeds maxGRFOffset, we must copy the kernel args into another register.
            if (offset >= maxGRFOffset)
            {
                // Create a dummy instruction using RTV, just so we can use the LLVM replaceAllUsesWith to replace the kernelArg usages.
                Function* pFunc = GenISAIntrinsic::getDeclaration(F.getParent(), GenISAIntrinsic::GenISA_RuntimeValue, kernArg->getType());
                Value* tempCall = builder.CreateCall(pFunc, builder.getInt32(kernArg->getArgNo()), "kernArgCopy");
                kernArg->replaceAllUsesWith(tempCall);

                // Create another CVar to hold the copied kernelArg, and map it to the dummy instruction.
                // When doing vISA codegen, all usages of the dummy instruction will get the value of the copied kernelArg.
                CVariable* copiedArg = m_currShader->GetNewVariable(kernArgV);
                emitCopyAll(copiedArg, kernArgV, kernArg->getType());
                m_currShader->UpdateSymbolMap(tempCall, copiedArg);
                // Temp instruction needs the same uniform analysis attribute as the kernel arg
                m_currShader->SetDependency(tempCall, m_currShader->GetDependency(kernArg));
            }
        }
    }


    if (IGC_IS_FLAG_ENABLED(DumpHasNonKernelArgLdSt)) {
        ModuleMetaData* modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
        FunctionMetaData* funcMD = &modMD->FuncMD[&F];
        if (hasStackCall || m_currFuncHasSubroutine) {
            // conservatively set hasNonKernelArgLoad/Store to true
            funcMD->hasNonKernelArgLoad = true;
            funcMD->hasNonKernelArgStore = true;
            funcMD->hasNonKernelArgAtomic = true;
        }
        // then write the result to the shader
        if (m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER) {
            COpenCLKernel* kernel = static_cast<COpenCLKernel*>(m_currShader);
            kernel->m_kernelInfo.m_hasNonKernelArgLoad = funcMD->hasNonKernelArgLoad ? 1 : 0;
            kernel->m_kernelInfo.m_hasNonKernelArgStore = funcMD->hasNonKernelArgStore ? 1 : 0;
            kernel->m_kernelInfo.m_hasNonKernelArgAtomic = funcMD->hasNonKernelArgAtomic ? 1 : 0;
        }
    }

    // Create a symbol relocation entry for each symbol used by F
    emitSymbolRelocation(F);

    m_VRA->BeginFunction(&F, numLanes(m_SimdMode));
    if (isFuncGroupHead)
    {
        Function* Entry = m_currShader->entry;
        // owned by m_pDebugEmitter
        const bool IsPrimary = true;
        auto vMod = IGC::ScalarVisaModule::BuildNew(m_currShader, Entry, IsPrimary);
        IGC::DebugEmitterOpts DebugOpts;
        DebugOpts.DebugEnabled = DebugInfoData::hasDebugInfo(m_currShader);
        DebugOpts.EnableGTLocationDebugging = IGC_IS_FLAG_ENABLED(EnableGTLocationDebugging);
        DebugOpts.UseOffsetInLocation = IGC_IS_FLAG_ENABLED(UseOffsetInLocation);
        DebugOpts.EmitDebugLoc = IGC_IS_FLAG_ENABLED(EmitDebugLoc);
        DebugOpts.EmitOffsetInDbgLoc = IGC_IS_FLAG_ENABLED(EmitOffsetInDbgLoc);
        DebugOpts.ZeBinCompatible = IGC_IS_FLAG_ENABLED(ZeBinCompatibleDebugging) && IGC_IS_FLAG_ENABLED(EnableZEBinary);
        DebugOpts.EnableRelocation = IGC_IS_FLAG_ENABLED(EnableRelocations) || DebugOpts.ZeBinCompatible;
        DebugOpts.EnforceAMD64Machine = IGC_IS_FLAG_ENABLED(DebugInfoEnforceAmd64EM) || DebugOpts.ZeBinCompatible;
        DebugOpts.EnableDebugInfoValidation = IGC_IS_FLAG_ENABLED(DebugInfoValidation);
        m_pDebugEmitter = IDebugEmitter::Create();
        m_pDebugEmitter->Initialize(std::move(vMod), DebugOpts);
    }

    IGC_ASSERT(m_pDebugEmitter);

    if (DebugInfoData::hasDebugInfo(m_currShader))
    {
        m_currShader->GetDebugInfoData().m_pShader = m_currShader;
        m_currShader->GetDebugInfoData().m_pDebugEmitter = m_pDebugEmitter;

        const bool IsPrimary = isFuncGroupHead;
        m_pDebugEmitter->resetModule(
            IGC::ScalarVisaModule::BuildNew(m_currShader, &F, IsPrimary));
    }

    // We only invoke EndEncodingMark() to update last VISA id.
    m_pDebugEmitter->EndEncodingMark();

    phiMovToBB.clear();
    unsigned int lineNo = 0;
    bool disableSlicing =
        IGC_IS_FLAG_ENABLED(DisableSIMD32Slicing) ||
        !m_currShader->GetContext()->m_retryManager.AllowSimd32Slicing() ||
        m_currShader->GetContext()->getModuleMetaData()->compOpt.OptDisable ||
        m_pattern->m_samplertoRenderTargetEnable;

    IGC::Debug::Dump* llvmtoVISADump = nullptr;
    if (IGC_IS_FLAG_ENABLED(ShaderDumpEnable))
    {
        auto name = IGC::Debug::GetDumpNameObj(m_currShader, "visa.ll");
        // If the function is in a function group, set the postfix string of
        // DumpName as "entry name" + "_f" + "id".
        if (m_FGA && !m_FGA->isGroupHead(&F)) {
            FunctionGroup* group = m_FGA->getGroup(&F);
            // To align with visa suffixing, make id start from 0.
            unsigned id = -1;
            for (auto it = group->begin(), ie = group->end(); it != ie; ++it)
            {
                if (*it == &F)
                    break;
                ++id;
            }
            std::string postfix = group->getHead()->getName().str() + "_f" + std::to_string(id);
            name = name.PostFix(postfix);
        }
        if (name.allow())
            llvmtoVISADump = new IGC::Debug::Dump(name, IGC::Debug::DumpType::PASS_IR_TEXT);
    }
    VisaIdAnnotator VidAnnotator; // for visa.ll dump
    StringRef curSrcFile, curSrcDir;

    for (uint i = 0; i < m_pattern->m_numBlocks; i++)
    {
        SBasicBlock& block = m_pattern->m_blocks[i];
        block.m_activeMask = nullptr; // clear for each SIMD size
        m_currentBlock = i;
        if (m_blockCoalescing->IsEmptyBlock(block.bb))
        {
            continue;
        }

        if (llvmtoVISADump)
        {
            VidAnnotator.trackBlockId(block.bb, i);
        }

        if (i != 0)
        {
            m_pDebugEmitter->BeginEncodingMark();
            // create a label
            m_encoder->Label(block.id);
            m_encoder->Push();
            m_pDebugEmitter->EndEncodingMark();
        }

        // remove cached per lane offset variables if any.
        PerLaneOffsetVars.clear();

        // Variable reuse per-block states.
        VariableReuseAnalysis::EnterBlockRAII EnterBlock(m_VRA, block.bb);

        // go through the list in reverse order
        auto I = block.m_dags.rbegin(), E = block.m_dags.rend();
        while (I != E)
        {
            Instruction* llvmInst = I->m_root;
            if (llvmInst->getDebugLoc())
            {
                unsigned int curLineNumber = llvmInst->getDebugLoc().getLine();
                auto&& srcFile = llvmInst->getDebugLoc()->getScope()->getFilename();
                auto&& srcDir = llvmInst->getDebugLoc()->getScope()->getDirectory();
                if (!curSrcFile.equals(srcFile) || !curSrcDir.equals(srcDir))
                {
                    curSrcFile = srcFile;
                    curSrcDir = srcDir;
                    m_pDebugEmitter->BeginEncodingMark();
                    llvm::SmallVector<char, 1024> fileName;
                    llvm::sys::path::append(fileName, curSrcDir);
                    llvm::sys::path::append(fileName, curSrcFile);
                    std::string fileNameStr(fileName.begin(), fileName.end());
                    m_encoder->File(fileNameStr);
                    m_pDebugEmitter->EndEncodingMark();
                }
                if (curLineNumber != lineNo)
                {
                    m_pDebugEmitter->BeginEncodingMark();
                    m_encoder->Loc(curLineNumber);
                    m_pDebugEmitter->EndEncodingMark();
                    lineNo = curLineNumber;
                }
            }

            bool slicing = false;
            uint numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
            IGC_ASSERT(numInstance == 1 || numInstance == 2);

            if (slicing && !disableSlicing)
            {
                m_pDebugEmitter->BeginEncodingMark();
                I = emitInSlice(block, I);
                m_pDebugEmitter->EndEncodingMark();
                llvmInst = I->m_root;
            }

            if (I != E)
            {
                m_pDebugEmitter->BeginInstruction(llvmInst);

                // before inserting the terminator, initialize constant pool & insert the de-ssa moves
                if (isa<BranchInst>(llvmInst))
                {
                    m_encoder->SetSecondHalf(false);
                    // insert constant initializations.
                    InitConstant(block.bb);
                    // Insert lifetime start if there are any
                    emitLifetimeStartAtEndOfBB(block.bb);
                    // insert the de-ssa movs.
                    MovPhiSources(block.bb);
                }

                // If slicing happens, then recalculate the number of instances.
                if (slicing)
                {
                    numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
                }

                if (llvmtoVISADump)
                {
                    VidAnnotator.trackVisaId(llvmInst, m_encoder->GetVISAKernel()->getvIsaInstCount() + 1);
                }

                // Insert lifetime start if legal. Note that m_destination
                // shall be nullptr if this instruction has no dst.
                emitLifetimeStart(m_destination, block.bb, llvmInst, true);

                DstModifier init;
                if (numInstance < 2)
                {
                    m_encoder->SetSecondHalf(false);
                    I->m_pattern->Emit(this, init);
                    ++I;
                }
                else
                {
                    m_encoder->SetSecondHalf(false);
                    I->m_pattern->Emit(this, init);
                    m_encoder->SetSecondHalf(true);
                    I->m_pattern->Emit(this, init);
                    ++I;
                }
                m_pDebugEmitter->EndInstruction(llvmInst);
            }
        }
    }

    if (llvmtoVISADump)
    {
        F.print(llvmtoVISADump->stream(), &VidAnnotator);
        delete llvmtoVISADump;
    }

    if (m_FGA && !m_FGA->useStackCall(&F))
    {
        BasicBlock* exitBB = &*(F.getBasicBlockList().rbegin());
        if (IGC_IS_FLAG_ENABLED(ForceSubReturn) &&
            !isa_and_nonnull<ReturnInst>(exitBB->getTerminator()))
        {
            // No return, generate dummy return for each subroutine to meet visa requirement.
            m_encoder->SubroutineRet(nullptr, &F);
            m_encoder->Push();
        }
    }

    if (isFuncGroupHead)
    {
        if (!needKernelArgOverrideWA)
        {
            // Cache the arguments list into a vector for faster access
            m_currShader->CacheArgumentsList();
            // Associates values pushed to CVariable
            m_currShader->MapPushedInputs();
            // Allocate the thread payload
            m_currShader->AllocatePayload();
        }

        if (m_currShader->ProgramOutput()->m_scratchSpaceUsedBySpills)
        {
            if (IGC_GET_FLAG_VALUE(CodePatchExperiments))
            {
                errs() << "Skip Prologue : " << m_encoder->GetShaderName() << "\n";
            }
            return false;
        }
        if (m_encoder->IsCodePatchCandidate())
        {
            if (IGC_GET_FLAG_VALUE(CodePatchLimit) >= 2)
            {
                IGC_SET_FLAG_VALUE(CodePatchLimit, IGC_GET_FLAG_VALUE(CodePatchLimit) - 1);
            }
            if (IGC_GET_FLAG_VALUE(CodePatchExperiments))
            {
                errs() << IGC_GET_FLAG_VALUE(CodePatchLimit) << " Prologue/CodePatch : " << m_encoder->GetShaderName() << "\n";
            }
        }
        else
        {
            if (IGC_GET_FLAG_VALUE(CodePatchExperiments))
            {
                errs() << IGC_GET_FLAG_VALUE(CodePatchLimit) << " not : " << m_encoder->GetShaderName() << "\n";
            }
        }
    }

    if (m_currShader->GetDebugInfoData().m_pDebugEmitter)
    {
        if (IGC_IS_FLAG_ENABLED(UseOffsetInLocation))
        {
            if (IGC::ForceAlwaysInline() ||
                ((OpenCLProgramContext*)(m_currShader->GetContext()))->m_InternalOptions.KernelDebugEnable)
            {
                DebugInfoData::markOutput(F, m_currShader, m_pDebugEmitter);
            }
            ScalarVisaModule* scVISAMod = (ScalarVisaModule*)(m_pDebugEmitter->getCurrentVISA());
            if (!scVISAMod->getPerThreadOffset() && m_currShader->hasFP())
            {
                // Stack calls are in use. Nothing needs to be marked as Output.
                // Just setting the frame pointer is required for debug info when stack calls are in use.
                scVISAMod->setFramePtr(m_currShader->GetFP());
            }
        }
        else
        {
            m_currShader->GetDebugInfoData().markOutput(F, m_currShader);
        }

        m_currShader->GetDebugInfoData().addVISAModule(&F, m_pDebugEmitter->getCurrentVISA());
        m_currShader->GetDebugInfoData().transferMappings(F);
    }

    // Compile only when this is the last function for this kernel.
    bool finalize = (!m_FGA || m_FGA->isGroupTail(&F));
    bool destroyVISABuilder = false;
    if (finalize)
    {
        destroyVISABuilder = true;
        // We only need one symbol table per module. If there are multiple entry functions, only create a symbol
        // for the dummy kernel with indirect functions attached.
        bool compileWithSymbolTable = false;
        Function* currHead = m_FGA ? m_FGA->getGroupHead(&F) : &F;
        if (IGC::isIntelSymbolTableVoidProgram(currHead))
        {
            compileWithSymbolTable = true;
        }
        m_encoder->Compile(compileWithSymbolTable);
        m_pCtx->m_prevShader = m_currShader;

        if (hasStackCall)
        {
            // Disable retry when stackcalls are present
            m_pCtx->m_retryManager.Disable();
        }
    }

    if (destroyVISABuilder)
    {
        if (!m_currShader->GetDebugInfoData().m_pDebugEmitter)
        {
            IDebugEmitter::Release(m_pDebugEmitter);
        }

        if (!m_encoder->IsCodePatchCandidate() ||
            m_encoder->HasPrevKernel() ||
            !m_currShader->ProgramOutput()->m_programBin ||
            m_currShader->ProgramOutput()->m_scratchSpaceUsedBySpills)
        {
            m_pCtx->m_prevShader = nullptr;
            // Postpone destroying the VISA builder until after emitting
            // debug info and passing the context for code patching
            m_encoder->DestroyVISABuilder();
        }
        if (m_encoder->IsCodePatchCandidate() && m_encoder->HasPrevKernel())
        {
            prevShader->GetEncoder().DestroyVISABuilder();
        }
    }

    if ((m_currShader->GetShaderType() == ShaderType::COMPUTE_SHADER ||
        m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER) &&
        m_currShader->m_Platform->supportDisableMidThreadPreemptionSwitch() &&
        IGC_IS_FLAG_ENABLED(EnableDisableMidThreadPreemptionOpt) &&
        (m_currShader->GetContext()->m_instrTypes.numLoopInsts == 0) &&
        (m_currShader->ProgramOutput()->m_InstructionCount < IGC_GET_FLAG_VALUE(MidThreadPreemptionDisableThreshold)))
    {
        if (m_currShader->GetShaderType() == ShaderType::COMPUTE_SHADER)
        {
            CComputeShader* csProgram = static_cast<CComputeShader*>(m_currShader);
            csProgram->SetDisableMidthreadPreemption();
        }
        else
        {
            COpenCLKernel* kernel = static_cast<COpenCLKernel*>(m_currShader);
            kernel->SetDisableMidthreadPreemption();
        }
    }

    if (IGC_IS_FLAG_ENABLED(ForceBestSIMD))
    {
        return false;
    }

    if (m_SimdMode == SIMDMode::SIMD16 &&
        this->m_ShaderDispatchMode == ShaderDispatchMode::NOT_APPLICABLE &&
        IsStage1BestPerf(m_pCtx->m_CgFlag, m_pCtx->m_StagingCtx))
    {
        m_pCtx->m_doSimd32Stage2 = m_currShader->CompileSIMDSize(SIMDMode::SIMD32, *this, F);
    }

    if (m_SimdMode == SIMDMode::SIMD8 &&
        IsStage1FastCompile(m_pCtx->m_CgFlag, m_pCtx->m_StagingCtx))
    {
        m_pCtx->m_doSimd16Stage2 = m_currShader->CompileSIMDSize(SIMDMode::SIMD16, *this, F);
        m_pCtx->m_doSimd32Stage2 = m_currShader->CompileSIMDSize(SIMDMode::SIMD32, *this, F);
    }

    return false;
}

// Emit code in slice starting from (reverse) iterator I. Return the iterator to
// the next pattern to emit.
SBasicBlock::reverse_iterator
EmitPass::emitInSlice(SBasicBlock& block, SBasicBlock::reverse_iterator I)
{
    auto sliceBegin = I;
    auto sliceIter = I;
    auto E = block.m_dags.rend();
    DstModifier init;

    bool slicing = true;
    m_encoder->SetSecondHalf(false); // the 1st-half slice for simd32
    while (slicing)
    {
        emitLifetimeStart(m_destination, block.bb, (*sliceIter).m_root, false);

        (*sliceIter).m_pattern->Emit(this, init);
        ++sliceIter;
        slicing = false;
        if (sliceIter != E)
        {
            unsigned numInstance = DecideInstanceAndSlice(*(block.bb), (*sliceIter), slicing);
            IGC_ASSERT(numInstance == 1 || numInstance == 2);
        }
    }

    // Store the point slicing stops at.
    auto sliceEnd = sliceIter;

    m_encoder->SetSecondHalf(true); // the 2nd-half slice for simd32
    for (sliceIter = sliceBegin; sliceIter != sliceEnd; ++sliceIter)
    {
        unsigned numInstance = DecideInstanceAndSlice(*(block.bb), (*sliceIter), slicing);
        // uniform op only emit once
        if (numInstance > 1)
        {
            emitLifetimeStart(m_destination, block.bb, (*sliceIter).m_root, false);

            (*sliceIter).m_pattern->Emit(this, init);
        }
    }

    return sliceEnd;
}

/// Insert moves at the end of the basic block to replace the phi node of the successors
// This is a special case in which we want to relocate the phi-movs
// unconditionally. Two functions, isCandidateIfStmt() and
// canRelocatePhiMov(), are used to check if this is the special
// case as below:
//
//    x.1 = ...
//    ...
// H: br i1 %cond, OtherBB, phiMovBB  // target BBs interchangeable
// OtherBB:
//    x.0 = ...
//    br phiBB
// phiMovBB:
//    <empty BB>
//    br phiBB
// phiBB:
//    phi x = [x.0, OtherBB] [x.1, phiMovBB]
//
// Normally, a phi-mov is inserted into phiMovBB. This optimization
// relocates the phi-mov to H so that we have if-then-endif rather than
// if-then-else-endif. To make it simple and correct, the following
// conditions are required:
//   1. The 'if' branch isn't uniform. (If uniform, it is probably not
//      beneficial to move the phi-mov to H.)
//   2. Either x.0 is defined in otherBB, or a phi-mov must be inserted
//      in otherBB.
// With this, the phi-mov can be relocated to H without using a predicate.
//

// canRelocatePhiMov() checks if all phi-movs to phiMovBB can be relocated.
bool EmitPass::canRelocatePhiMov(
    llvm::BasicBlock* otherBB,
    llvm::BasicBlock* phiMovBB,
    llvm::BasicBlock* phiBB)
{
    // Threshold for phi-mov relocation
    const int CMAX_PHI_COUNT = 6;

    int n = 0;
    for (auto I = phiBB->begin(), E = phiBB->end(); I != E; ++I)
    {
        llvm::PHINode* PN = llvm::dyn_cast<llvm::PHINode>(I);
        if (!PN)
        {
            break;
        }

        CVariable* dst = m_currShader->GetSymbol(PN);
        for (uint i = 0, e = PN->getNumOperands(); i != e; ++i)
        {
            Value* V = PN->getOperand(i);
            CVariable* src = m_currShader->GetSymbol(V);
            if (PN->getIncomingBlock(i) == phiMovBB)
            {
                if (dst != src)
                {
                    int numElt = 1;
                    if (IGCLLVM::FixedVectorType* vTy = dyn_cast<IGCLLVM::FixedVectorType>(PN->getType()))
                    {
                        numElt = int_cast<int>(vTy->getNumElements());
                    }
                    // Conservatively assume the number of mov's is 'numElt'.
                    n += numElt;
                }
            }
            else
            {
                // For case with PN->getIncomingBlock(i) == otherBB
                Instruction* Inst = dyn_cast<Instruction>(V);
                if (Inst && Inst->getParent() != otherBB && (dst == src))
                {
                    // This is the case that x and x.1 are coalesced, in which
                    // we cannot move phi-mov from emptyBB to H, as doing so
                    // will clobber x.1 (x.1 and x are the same virtual reg).
                    // [Can move it up with predicate always, but need to check
                    //  doing so would give us perf benefit.]
                    //    x.1 = ...
                    //    ...
                    // H: br c, B0, B1
                    // otherBB:
                    //    <...>
                    //    br phiBB
                    // emptyBB:
                    //    br phiBB
                    // phiBB:
                    //    phi x = [x.0 emptyBB] [x.1 otherBB]
                    return false;
                }
            }
        }
    }
    if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
    {
        n = (2 * n);
    }
    return (n > 0) && (n < CMAX_PHI_COUNT);
}

// Check if 'ifBB' is the If BB of an if-then-else pattern in which both then &
// else are single BBs and one of them is empty. It also makes sure the branch
// is not uniform. If it is such a BB, it returns true with emptyBB and otherBB
// set to then & else.
bool EmitPass::isCandidateIfStmt(
    llvm::BasicBlock* ifBB, llvm::BasicBlock*& otherBB, llvm::BasicBlock*& emptyBB)
{
    llvm::BranchInst* Br = dyn_cast<llvm::BranchInst>(ifBB->getTerminator());
    if (!Br || Br->getNumSuccessors() != 2 ||
        m_currShader->GetIsUniform(Br->getCondition()))
    {
        return false;
    }

    llvm::BasicBlock* S0 = Br->getSuccessor(0), * S1 = Br->getSuccessor(1);
    IGCLLVM::TerminatorInst* T0 = S0->getTerminator(), * T1 = S1->getTerminator();
    IGC_ASSERT_MESSAGE(nullptr != T1, "BB is missing a terminator!");
    IGC_ASSERT_MESSAGE(nullptr != T0, "BB is missing a terminator!");
    bool isMatch =
        S0->getSinglePredecessor() == ifBB && S1->getSinglePredecessor() == ifBB &&
        T0->getNumSuccessors() == 1 && T1->getNumSuccessors() == 1 &&
        T0->getSuccessor(0) == T1->getSuccessor(0) &&
        (S0->size() > 1 || S1->size() > 1) && // only one empty block
        (S0->size() == 1 || S1->size() == 1);
    if (isMatch)
    {
        if (S0->size() == 1)
        {
            emptyBB = S0;
            otherBB = S1;
        }
        else
        {
            emptyBB = S1;
            otherBB = S0;
        }
    }
    return isMatch;
}

/// Insert moves at the end of the basic block to replace the phi node of the successors
void EmitPass::MovPhiSources(llvm::BasicBlock* aBB)
{
    // collect all the src-side phi-moves, then find a good order for emission
    struct PhiSrcMoveInfo {
        CVariable* dstCVar;
        CVariable* srcCVar;
        Value* dstRootV; // root value of dst (dessa)
        Value* srcRootV; // root value of src (dessa)
    };
    BumpPtrAllocator phiAllocator;
    std::list<PhiSrcMoveInfo*> phiSrcDstList;
    std::vector<std::pair<CVariable*, CVariable*>> emitList;
    std::map<CVariable*, unsigned int> dstVTyMap;
    llvm::BasicBlock* bb = aBB;
    IGCLLVM::TerminatorInst* TI = aBB->getTerminator();
    IGC_ASSERT(nullptr != TI);

    // main code to generate phi-mov
    for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ)
    {
        llvm::BasicBlock* Succ = TI->getSuccessor(succ);
        for (auto II = Succ->begin(), IE = Succ->end(); II != IE; ++II)
        {
            llvm::PHINode* PN = llvm::dyn_cast<llvm::PHINode>(II);
            if (!PN)
            {
                break;
            }
            if (PN->use_empty())
            {
                continue;
            }
            for (uint i = 0, e = PN->getNumOperands(); i != e; ++i)
            {
                if (PN->getIncomingBlock(i) == bb)
                {
                    Value* Src = PN->getOperand(i);

                    Value* dstRootV = m_deSSA ? m_deSSA->getRootValue(PN) : PN;
                    Value* srcRootV = m_deSSA ? m_deSSA->getRootValue(Src) : Src;
                    dstRootV = dstRootV ? dstRootV : PN;
                    srcRootV = srcRootV ? srcRootV : Src;
                    // To check if a src-side phi mov is needed, we must use the dessa
                    // root value instead of the CVariable, as value aliasing in dessa
                    // might give two different CVariables for the same variable.
                    if (dstRootV != srcRootV)
                    {
                        PhiSrcMoveInfo* phiInfo = new (phiAllocator) PhiSrcMoveInfo();
                        phiInfo->dstCVar = m_currShader->GetSymbol(PN);
                        phiInfo->srcCVar = m_currShader->GetSymbol(Src);
                        phiInfo->dstRootV = dstRootV;
                        phiInfo->srcRootV = srcRootV;
                        phiSrcDstList.push_back(phiInfo);

                        int numElt = 0;
                        if (IGCLLVM::FixedVectorType* vTy = dyn_cast<IGCLLVM::FixedVectorType>(PN->getType()))
                        {
                            numElt = int_cast<int>(vTy->getNumElements());
                        }
                        dstVTyMap.insert(std::pair<CVariable*, unsigned int>(phiInfo->dstCVar, numElt));
                    }
                }
            }
        }
    }

    // Find a good order for the src-side phi-moves.
    //
    // PHI copies are parallel copies. Here, we need to serialize those copies
    // in a way such that a dst will not be overwritten by a previous copy.
    //     For example,
    //        (phi_1, phi_2) = (a, phi_1)
    //     ==>
    //        phi_2 = phi_1
    //        phi_1 = a
    // If there is a cycle, we have to insert a temp copy to break the cycle (see below)
    while (!phiSrcDstList.empty())
    {
        // The search should not get into a deadlock, i.e. it should be able to find one to emit every iteration,
        auto It = phiSrcDstList.begin();
        auto Et = phiSrcDstList.end();
        for (; It != Et; ++It)
        {
            auto Cmp = [&](const PhiSrcMoveInfo* Val)
            {
                return Val->srcRootV == (*It)->dstRootV;
            };

            if (0 == std::count_if(phiSrcDstList.begin(), phiSrcDstList.end(), Cmp))
            {
                break;
            }
        }
        if (It == Et)
        {
            // Found cyclic phi-move dependency. Pick the first one (anyone
            // should be good) and create a temp to break the dependence cycle.
            // (Note that there is no self-cycle.)
            // For example,
            //    (phi_1, phi_2) = (phi_2, phi_1)
            // ==>
            //    t = phi_1
            //    phi_1 = phi_2
            //    phi_2 = t

            // After the temp copy of the 1st entry's dst is inserted,
            // the entry becomes the one to be added into emitList.
            It = phiSrcDstList.begin();

            Value* dRootV = (*It)->dstRootV;
            CVariable* D1 = (*It)->dstCVar;
            CVariable* T = m_currShader->GetNewVariable(D1);
            dstVTyMap[T] = dstVTyMap[D1];
            emitList.push_back(std::pair<CVariable*, CVariable*>(D1, T));

            // Replace with T all src that is equal to D1 (start from It+1)
            auto LI = It, LE = phiSrcDstList.end();
            for (++LI; LI != LE; ++LI)
            {
                PhiSrcMoveInfo* phiinfo = *LI;
                if (phiinfo->srcRootV == dRootV) {
                    CVariable* sVar = phiinfo->srcCVar;
                    CVariable* nVar;
                    if (sVar->GetType() != T->GetType()) {
                        nVar = m_currShader->GetNewAlias(
                            T, sVar->GetType(), 0, sVar->GetNumberElement());
                    }
                    else {
                        nVar = T;
                    }
                    phiinfo->srcCVar = nVar;
                }
            }
        }
        IGC_ASSERT(It != Et);
        emitList.push_back(std::pair<CVariable*, CVariable*>((*It)->srcCVar, (*It)->dstCVar));
        phiSrcDstList.erase(It);
    }
    // emit the src-side phi-moves
    for (unsigned i = 0, e = int_cast<unsigned>(emitList.size()); i != e; ++i)
    {
        CVariable* dst = emitList[i].second;
        CVariable* src = emitList[i].first;

        for (uint instance = 0; instance < dst->GetNumberInstance(); instance++)
        {
            m_encoder->SetSecondHalf(instance == 1 ? true : false);
            unsigned int numVTyElt = dstVTyMap[dst];
            if (numVTyElt > 0)
            {
                emitVectorCopy(dst, src, numVTyElt);
            }
            else
            {
                m_encoder->Copy(dst, src);
                m_encoder->Push();
            }
        }
    }
}

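// Materialize the constants that the pattern match placed in this basic block:
// each constant is added to the constant pool once, and a scalar immediate is
// broadcast into a vector variable so that later uses can share it.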
void EmitPass::InitConstant(llvm::BasicBlock* BB)
{
    for (auto& I : m_pattern->ConstantPlacement)
    {
        if (I.second != BB)
            continue;
        Constant* C = I.first;
        CVariable* Dst = m_currShader->lookupConstantInPool(C);
        if (Dst)
            continue;
        Dst = m_currShader->GetConstant(C);
        if (!C->getType()->isVectorTy()) {
            CVariable* Imm = Dst;
            Dst = m_currShader->GetNewVector(C);
            m_encoder->Copy(Dst, Imm);
            m_encoder->Push();
        }
        m_currShader->addConstantInPool(C, Dst);
    }
}

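// Emit vISA lifetime-start markers for the root values that VariableReuseAnalysis
// recorded as starting at the end of this basic block.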
void EmitPass::emitLifetimeStartAtEndOfBB(BasicBlock* BB)
{
    if (m_pCtx->getVectorCoalescingControl() == 0) {
        return;
    }

    auto II = m_VRA->m_LifetimeAtEndOfBB.find(BB);
    if (II != m_VRA->m_LifetimeAtEndOfBB.end())
    {
        TinyPtrVector<Value*>& ARVs = II->second;
        for (int i = 0, sz = (int)ARVs.size(); i < sz; ++i)
        {
            Value* RootVal = ARVs[i];
            CVariable* Var = GetSymbol(RootVal);

            // vISA info inst, no m_encoder->Push() needed.
            m_encoder->Lifetime(LIFETIME_START, Var);
        }
    }
}

std::pair<Value*, Value*> EmitPass::getPairOutput(Value* V) const {
    auto I = m_pattern->PairOutputMap.find(V);
    IGC_ASSERT(I != m_pattern->PairOutputMap.end());
    return std::make_pair(I->second.first, I->second.second);
}

void EmitPass::emitGradientX(const SSource& source, const DstModifier& modifier)
{
    CVariable* src = GetSrcVariable(source);
    if (src->IsUniform())
    {
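        // A uniform value is constant across the quad, so its screen-space
        // derivative is zero; src + (-src) produces the expected 0.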
        m_encoder->SetSrcModifier(1, EMOD_NEG);
        m_encoder->Add(m_destination, src, src);
        m_encoder->Push();
    }
    else
    {
        // we need to combine negation with the existing source modifiers
        // to implement subtraction of values correctly, also for neg, abs, negabs
        const e_modifier src_mod0 = source.mod;
        const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
        m_encoder->SetSrcModifier(0, src_mod0);
        m_encoder->SetSrcModifier(1, src_mod1);
        m_encoder->SetDstModifier(modifier);
        // set the regioning to get isa instruction
        //     add dst0.0<1>:f src0.1<4;4,0>:f -src0.0<4;4,0>:f
        m_encoder->SetSrcRegion(0, 4, 4, 0);
        m_encoder->SetSrcRegion(1, 4, 4, 0);
        m_encoder->SetSrcSubReg(0, 1);
        m_encoder->SetSrcSubReg(1, 0);
        m_encoder->Add(m_destination, src, src);
        m_encoder->Push();
    }
}

void EmitPass::emitGradientY(const SSource& source, const DstModifier& modifier)
{
    CVariable* src = GetSrcVariable(source);
    if (src->IsUniform())
    {
        m_encoder->SetSrcModifier(1, EMOD_NEG);
        m_encoder->Add(m_destination, src, src);
        m_encoder->Push();
    }
    else
    {
        const e_modifier src_mod0 = source.mod;
        const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
        m_encoder->SetSrcModifier(0, src_mod0);
        m_encoder->SetSrcModifier(1, src_mod1);
        m_encoder->SetDstModifier(modifier);
        // set the regioning to get isa instruction
        //     add dst0.0<1>:f src0.2<4;4,0>:f -src0.0<4;4,0>:f
        m_encoder->SetSrcRegion(0, 4, 4, 0);
        m_encoder->SetSrcRegion(1, 4, 4, 0);
        m_encoder->SetSrcSubReg(0, 2);
        m_encoder->SetSrcSubReg(1, 0);
        m_encoder->Add(m_destination, src, src);
        m_encoder->Push();
    }
}

void EmitPass::emitGradientXFine(const SSource& source, const DstModifier& modifier)
{
    CVariable* src = GetSrcVariable(source);
    if (src->IsUniform())
    {
        m_encoder->SetSrcModifier(1, EMOD_NEG);
        m_encoder->Add(m_destination, src, src);
        m_encoder->Push();
    }
    else
    {
        const e_modifier src_mod0 = source.mod;
        const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
        m_encoder->SetSrcModifier(0, src_mod0);
        m_encoder->SetSrcModifier(1, src_mod1);
        m_encoder->SetDstModifier(modifier);
        // set the regioning to get isa instruction
        //     add dst0.0<1>:f src0.1<2;2,0>:f -src0.0<2;2,0>:f
        m_encoder->SetSrcRegion(0, 2, 2, 0);
        m_encoder->SetSrcRegion(1, 2, 2, 0);
        m_encoder->SetSrcSubReg(0, 1);
        m_encoder->SetSrcSubReg(1, 0);
        m_encoder->Add(m_destination, src, src);
        m_encoder->Push();
    }
}

1667 /// Computes derivatives with respect to screen space by subtracting values for
1668 /// adjacent pixels in vertical direction.
1669 /// Consider the following four pixels:
1670 /// +----+----+
1671 /// | P0 | P1 |
1672 /// +----+----+
1673 /// | P2 | P3 |
1674 /// +----+----+
1675 ///
1676 /// then gradient_y_fine for scalar attribute A of pixel P0 will be P0.A - P2.A
1677 /// The same value will be used for P2 since the spec leaves the freedom of
1678 /// choosing the quad alignment. The same goes for P1 and P3.
1679 ///
1680 /// Now, if we look at the attribute A as laid out in a SIMD register, we have
1681 ///
1682 /// src0 = A : | | | | | P3.A | P2.A | P1.A | P0.A |
1683 ///
1684 /// and the result register should contain
1685 ///
1686 /// dst0 = dy : | | | | | q | t | q | t |
1687 ///
1688 /// where t = P0.A - P2.A and q = P1.A - P3.A
1689 ///
1690 /// The upper half of the GRF also contains data for another, separate set of four pixels.
1691 ///
1692 /// We compute the result by the following sequence of instructions
1693 ///
1694 /// add (4) dst0.0<1>:f src0.0<0; 2, 1>:f -src0.2<0; 2, 1>:f // lower half
1695 /// add (4) dst0.4<1>:f src0.4<0; 2, 1>:f -src0.6<0; 2, 1>:f // upper half
1696 ///
1697 /// and if we are in simd16 mode, we need two more instructions
1698 /// if (simd16)
1699 /// {
1700 /// add (4) dst0.8<1>:f src0.8<0; 2, 1>:f -src0.10<0; 2, 1>:f
1701 /// add (4) dst0.12<1>:f src0.12<0; 2, 1>:f -src0.14<0; 2, 1>:f
1702 /// }
1703 ///
1704 /// Note: Since the source llvm instruction may contain source modifier (abs, neg, negabs)
1705 /// we need to read them and flip the sign of the second isa source accordingly.
1706 ///////////////////////////////////////////////////////////////////////////////
1707 void EmitPass::emitGradientYFine(const SSource& source, const DstModifier& modifier)
1708 {
1709 CVariable* src = GetSrcVariable(source);
1710 if (src->IsUniform())
1711 {
1712 m_encoder->SetSrcModifier(1, EMOD_NEG);
1713 m_encoder->Add(m_destination, src, src);
1714 m_encoder->Push();
1715 }
1716 else
1717 {
1718 CVariable* temp = m_currShader->GetNewVariable(m_destination);
1719 const e_modifier src_mod0 = source.mod;
1720 const e_modifier src_mod1 = CombineModifier(EMOD_NEG, src_mod0);
1721
1722 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1723 m_encoder->SetSrcModifier(0, src_mod0);
1724 m_encoder->SetSrcRegion(0, 0, 2, 1);
1725 m_encoder->SetSrcSubReg(0, 2);
1726
1727 m_encoder->SetSrcModifier(1, src_mod1);
1728 m_encoder->SetSrcRegion(1, 0, 2, 1);
1729 m_encoder->SetSrcSubReg(1, 0);
1730 m_encoder->SetNoMask();
1731
1732 m_encoder->SetDstModifier(modifier);
1733 m_encoder->SetDstSubReg(0);
1734 m_encoder->Add(temp, src, src);
1735 m_encoder->Push();
1736
1737 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1738 m_encoder->SetSrcModifier(0, src_mod0);
1739 m_encoder->SetSrcRegion(0, 0, 2, 1);
1740 m_encoder->SetSrcSubReg(0, 6);
1741
1742 m_encoder->SetSrcModifier(1, src_mod1);
1743 m_encoder->SetSrcRegion(1, 0, 2, 1);
1744 m_encoder->SetSrcSubReg(1, 4);
1745 m_encoder->SetNoMask();
1746
1747
1748 m_encoder->SetDstModifier(modifier);
1749 m_encoder->SetDstSubReg(4);
1750 m_encoder->Add(temp, src, src);
1751 m_encoder->Push();
1752
1753 if (m_currShader->m_SIMDSize == SIMDMode::SIMD16 || m_currShader->m_SIMDSize == SIMDMode::SIMD32)
1754 {
1755 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1756 m_encoder->SetSrcModifier(0, src_mod0);
1757 m_encoder->SetSrcRegion(0, 0, 2, 1);
1758 m_encoder->SetSrcSubReg(0, 10);
1759
1760 m_encoder->SetSrcModifier(1, src_mod1);
1761 m_encoder->SetSrcRegion(1, 0, 2, 1);
1762 m_encoder->SetSrcSubReg(1, 8);
1763 m_encoder->SetNoMask();
1764
1765 m_encoder->SetDstModifier(modifier);
1766 m_encoder->SetDstSubReg(8);
1767 m_encoder->Add(temp, src, src);
1768 m_encoder->Push();
1769
1770 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1771 m_encoder->SetSrcModifier(0, src_mod0);
1772 m_encoder->SetSrcRegion(0, 0, 2, 1);
1773 m_encoder->SetSrcSubReg(0, 14);
1774
1775 m_encoder->SetSrcModifier(1, src_mod1);
1776 m_encoder->SetSrcRegion(1, 0, 2, 1);
1777 m_encoder->SetSrcSubReg(1, 12);
1778
1779 m_encoder->SetNoMask();
1780 m_encoder->SetDstModifier(modifier);
1781 m_encoder->SetDstSubReg(12);
1782 m_encoder->Add(temp, src, src);
1783 m_encoder->Push();
1784 }
1785
1786 if (m_currShader->m_SIMDSize == SIMDMode::SIMD32)
1787 {
1788 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1789 m_encoder->SetSrcModifier(0, src_mod0);
1790 m_encoder->SetSrcRegion(0, 0, 2, 1);
1791 m_encoder->SetSrcSubReg(0, 18);
1792
1793 m_encoder->SetSrcModifier(1, src_mod1);
1794 m_encoder->SetSrcRegion(1, 0, 2, 1);
1795 m_encoder->SetSrcSubReg(1, 16);
1796 m_encoder->SetNoMask();
1797
1798 m_encoder->SetDstModifier(modifier);
1799 m_encoder->SetDstSubReg(16);
1800 m_encoder->Add(temp, src, src);
1801 m_encoder->Push();
1802
1803 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1804 m_encoder->SetSrcModifier(0, src_mod0);
1805 m_encoder->SetSrcRegion(0, 0, 2, 1);
1806 m_encoder->SetSrcSubReg(0, 22);
1807
1808 m_encoder->SetSrcModifier(1, src_mod1);
1809 m_encoder->SetSrcRegion(1, 0, 2, 1);
1810 m_encoder->SetSrcSubReg(1, 20);
1811 m_encoder->SetNoMask();
1812
1813 m_encoder->SetDstModifier(modifier);
1814 m_encoder->SetDstSubReg(20);
1815 m_encoder->Add(temp, src, src);
1816 m_encoder->Push();
1817
1818
1819 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1820 m_encoder->SetSrcModifier(0, src_mod0);
1821 m_encoder->SetSrcRegion(0, 0, 2, 1);
1822 m_encoder->SetSrcSubReg(0, 26);
1823
1824 m_encoder->SetSrcModifier(1, src_mod1);
1825 m_encoder->SetSrcRegion(1, 0, 2, 1);
1826 m_encoder->SetSrcSubReg(1, 24);
1827 m_encoder->SetNoMask();
1828
1829 m_encoder->SetDstModifier(modifier);
1830 m_encoder->SetDstSubReg(24);
1831 m_encoder->Add(temp, src, src);
1832 m_encoder->Push();
1833
1834 m_encoder->SetSimdSize(SIMDMode::SIMD4);
1835 m_encoder->SetSrcModifier(0, src_mod0);
1836 m_encoder->SetSrcRegion(0, 0, 2, 1);
1837 m_encoder->SetSrcSubReg(0, 30);
1838
1839 m_encoder->SetSrcModifier(1, src_mod1);
1840 m_encoder->SetSrcRegion(1, 0, 2, 1);
1841 m_encoder->SetSrcSubReg(1, 28);
1842
1843 m_encoder->SetNoMask();
1844 m_encoder->SetDstModifier(modifier);
1845 m_encoder->SetDstSubReg(28);
1846 m_encoder->Add(temp, src, src);
1847 m_encoder->Push();
1848 }
1849
1850 m_encoder->Copy(m_destination, temp);
1851 m_encoder->Push();
1852 }
1853 }
1854
1855 void EmitPass::EmitAluIntrinsic(llvm::CallInst* I, const SSource source[2], const DstModifier& modifier)
1856 {
1857 if (GenIntrinsicInst * CI = dyn_cast<GenIntrinsicInst>(I))
1858 {
1859 switch (CI->getIntrinsicID())
1860 {
1861 case GenISAIntrinsic::GenISA_GradientX:
1862 emitGradientX(source[0], modifier);
1863 break;
1864 case GenISAIntrinsic::GenISA_GradientXfine:
1865 emitGradientXFine(source[0], modifier);
1866 break;
1867 case GenISAIntrinsic::GenISA_GradientY:
1868 emitGradientY(source[0], modifier);
1869 break;
1870 case GenISAIntrinsic::GenISA_GradientYfine:
1871 emitGradientYFine(source[0], modifier);
1872 break;
1873 default:
1874 // no special handling
1875 EmitSimpleAlu(I, source, modifier);
1876 break;
1877 }
1878 }
1879 else if (IntrinsicInst * CI = dyn_cast<IntrinsicInst>(I))
1880 {
1881 switch (CI->getIntrinsicID())
1882 {
1883 case Intrinsic::ctlz:
1884 // Throw away source[1]; for ctlz it is a flag we don't care about.
1885 emitCtlz(source[0]);
1886 break;
1887 default:
1888 // no special handling
1889 EmitSimpleAlu(I, source, modifier);
1890 break;
1891 }
1892 }
1893 }
1894
1895 // These helper functions are used only by this file. If other files need them,
1896 // they should be moved to helper.cpp.
1897 static e_predicate GetPredicate(llvm::CmpInst::Predicate predicate)
1898 {
1899 switch (predicate)
1900 {
1901 case llvm::CmpInst::ICMP_UGT:
1902 case llvm::CmpInst::ICMP_SGT:
1903 case llvm::CmpInst::FCMP_UGT:
1904 case llvm::CmpInst::FCMP_OGT:
1905 return EPREDICATE_GT;
1906 case llvm::CmpInst::ICMP_UGE:
1907 case llvm::CmpInst::ICMP_SGE:
1908 case llvm::CmpInst::FCMP_UGE:
1909 case llvm::CmpInst::FCMP_OGE:
1910 return EPREDICATE_GE;
1911 case llvm::CmpInst::ICMP_ULT:
1912 case llvm::CmpInst::ICMP_SLT:
1913 case llvm::CmpInst::FCMP_ULT:
1914 case llvm::CmpInst::FCMP_OLT:
1915 return EPREDICATE_LT;
1916 case llvm::CmpInst::ICMP_ULE:
1917 case llvm::CmpInst::ICMP_SLE:
1918 case llvm::CmpInst::FCMP_ULE:
1919 case llvm::CmpInst::FCMP_OLE:
1920 return EPREDICATE_LE;
1921 case llvm::CmpInst::ICMP_EQ:
1922 case llvm::CmpInst::FCMP_UEQ:
1923 case llvm::CmpInst::FCMP_OEQ:
1924 return EPREDICATE_EQ;
1925 case llvm::CmpInst::ICMP_NE:
1926 case llvm::CmpInst::FCMP_UNE:
1927 return EPREDICATE_NE;
1928 default:
1929 break;
1930 }
1931 IGC_ASSERT(0);
1932 return EPREDICATE_EQ;
1933 }
1934
1935 static VISA_Type GetUnsignedType(VISA_Type type)
1936 {
1937 switch (type)
1938 {
1939 case ISA_TYPE_Q:
1940 case ISA_TYPE_UQ:
1941 return ISA_TYPE_UQ;
1942 case ISA_TYPE_D:
1943 case ISA_TYPE_UD:
1944 return ISA_TYPE_UD;
1945 case ISA_TYPE_W:
1946 case ISA_TYPE_UW:
1947 return ISA_TYPE_UW;
1948 case ISA_TYPE_B:
1949 case ISA_TYPE_UB:
1950 return ISA_TYPE_UB;
1951 default:
1952 IGC_ASSERT(0);
1953 break;
1954 }
1955 return ISA_TYPE_UD;
1956 }
1957
1958 static VISA_Type GetSignedType(VISA_Type type)
1959 {
1960 switch (type)
1961 {
1962 case ISA_TYPE_Q:
1963 case ISA_TYPE_UQ:
1964 return ISA_TYPE_Q;
1965 case ISA_TYPE_D:
1966 case ISA_TYPE_UD:
1967 return ISA_TYPE_D;
1968 case ISA_TYPE_W:
1969 case ISA_TYPE_UW:
1970 return ISA_TYPE_W;
1971 case ISA_TYPE_B:
1972 case ISA_TYPE_UB:
1973 return ISA_TYPE_B;
1974 default:
1975 IGC_ASSERT(0);
1976 break;
1977 }
1978 return ISA_TYPE_D;
1979 }
1980
1981 static VISA_Type GetUnsignedIntegerType(VISA_Type type)
1982 {
1983 switch (type)
1984 {
1985 case ISA_TYPE_Q:
1986 case ISA_TYPE_UQ:
1987 return ISA_TYPE_UQ;
1988 case ISA_TYPE_D:
1989 case ISA_TYPE_UD:
1990 return ISA_TYPE_UD;
1991 case ISA_TYPE_W:
1992 case ISA_TYPE_UW:
1993 return ISA_TYPE_UW;
1994 case ISA_TYPE_B:
1995 case ISA_TYPE_UB:
1996 return ISA_TYPE_UB;
1997 case ISA_TYPE_DF:
1998 return ISA_TYPE_UQ;
1999 case ISA_TYPE_F:
2000 return ISA_TYPE_UD;
2001 case ISA_TYPE_HF:
2002 return ISA_TYPE_UW;
2003 default:
2004 IGC_ASSERT(0);
2005 break;
2006 }
2007 return ISA_TYPE_UD;
2008 }
2009
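// Returns the raw bit pattern of +1.0 for the given floating-point type, e.g.
// 0x3F800000 is 1.0f in IEEE-754 single precision and 0x3C00 is 1.0 in half
// precision.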
2010 static uint64_t getFPOne(VISA_Type Ty)
2011 {
2012 switch (Ty)
2013 {
2014 case ISA_TYPE_DF: return 0x3FF0000000000000;
2015 case ISA_TYPE_F: return 0x3F800000;
2016 case ISA_TYPE_BF: return 0x3F80;
2017 case ISA_TYPE_HF: return 0x3C00;
2018 default: break;
2019 }
2020 IGC_ASSERT_MESSAGE(0, "unknown floating type!");
2021 return ~0U;
2022 }
2023
2024 CVariable* EmitPass::GetSrcVariable(const SSource& source, bool fromConstPool)
2025 {
2026 CVariable* src = m_currShader->GetSymbol(source.value, fromConstPool);
2027 // Change the type of source if needed.
2028 if (source.type != ISA_TYPE_NUM && source.type != src->GetType())
2029 {
2030 if (src->IsImmediate()) {
2031 src = m_currShader->ImmToVariable(src->GetImmediateValue(), source.type);
2032 }
2033 else {
2034 src = m_currShader->GetNewAlias(src, source.type, 0, src->GetNumberElement());
2035 }
2036 }
2037 return src;
2038 }
2039
2040 void EmitPass::SetSourceModifiers(unsigned int sourceIndex, const SSource& source)
2041 {
2042 if (source.mod != EMOD_NONE)
2043 {
2044 m_encoder->SetSrcModifier(sourceIndex, source.mod);
2045 }
2046
2047 int numberOfLanes = 0;
2048 if (m_currShader->GetIsUniform(source.value))
2049 {
2050 numberOfLanes = 1;
2051 }
2052 else
2053 {
2054 numberOfLanes = numLanes(m_currShader->m_SIMDSize);
2055 }
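// The subregister offset is counted in elements: SIMDOffset selects a whole
// SIMD-width chunk (collapsing to a single element for uniform sources) and
// elementOffset selects within it. Illustrative example: in SIMD8, SIMDOffset = 1
// and elementOffset = 2 give subregister 1 * 8 + 2 = 10.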
2056 int calculated_offset = source.SIMDOffset * numberOfLanes + source.elementOffset;
2057 m_encoder->SetSrcSubReg(sourceIndex, calculated_offset);
2058
2059 if (source.region_set)
2060 {
2061 m_encoder->SetSrcRegion(sourceIndex, source.region[0], source.region[1], source.region[2], source.instance);
2062 }
2063 }
2064
2065 void EmitPass::EmitSimpleAlu(Instruction* inst, const SSource sources[2], const DstModifier& modifier)
2066 {
2067 EmitSimpleAlu(GetOpCode(inst), sources, modifier);
2068 }
2069
2070 void EmitPass::EmitSimpleAlu(Instruction* inst, CVariable* dst, CVariable* src0, CVariable* src1)
2071 {
2072 EmitSimpleAlu(GetOpCode(inst), dst, src0, src1);
2073 }
2074
2075 void EmitPass::EmitSimpleAlu(EOPCODE opCode, const SSource sources[2], const DstModifier& modifier)
2076 {
2077 CVariable* srcs[2] = { nullptr, nullptr };
2078
2079 srcs[0] = GetSrcVariable(sources[0], sources[0].fromConstantPool);
2080 SetSourceModifiers(0, sources[0]);
2081
2082 if (sources[1].value)
2083 {
2084 srcs[1] = GetSrcVariable(sources[1], sources[1].fromConstantPool);
2085 SetSourceModifiers(1, sources[1]);
2086 }
2087 m_encoder->SetDstModifier(modifier);
2088 EmitSimpleAlu(opCode, m_destination, srcs[0], srcs[1]);
2089 }
2090
2091 void EmitPass::EmitSimpleAlu(EOPCODE opCode, CVariable* dst, CVariable* src0, CVariable* src1)
2092 {
2093 switch (opCode)
2094 {
2095 case llvm_fmul:
2096 case llvm_mul:
2097 m_encoder->Mul(dst, src0, src1);
2098 break;
2099 case llvm_fdiv:
2100 m_encoder->Div(dst, src0, src1);
2101 break;
2102 case llvm_fadd:
2103 case llvm_add:
2104 m_encoder->Add(dst, src0, src1);
2105 break;
2106 case llvm_cos:
2107 m_encoder->Cos(dst, src0);
2108 break;
2109 case llvm_sin:
2110 m_encoder->Sin(dst, src0);
2111 break;
2112 case llvm_log:
2113 m_encoder->Log(dst, src0);
2114 break;
2115 case llvm_exp:
2116 m_encoder->Exp(dst, src0);
2117 break;
2118 case llvm_pow:
2119 m_encoder->Pow(dst, src0, src1);
2120 break;
2121 case llvm_sqrt:
2122 m_encoder->Sqrt(dst, src0);
2123 break;
2124 case llvm_rsq:
2125 m_encoder->Rsqrt(dst, src0);
2126 break;
2127 case llvm_floor:
2128 m_encoder->Floor(dst, src0);
2129 break;
2130 case llvm_ceil:
2131 m_encoder->Ceil(dst, src0);
2132 break;
2133 case llvm_round_z:
2134 m_encoder->Truncate(dst, src0);
2135 break;
2136 case llvm_roundne:
2137 m_encoder->RoundNE(dst, src0);
2138 break;
2139 case llvm_imulh:
2140 m_encoder->MulH(dst, src0, src1);
2141 break;
2142 case llvm_umulh:
2143 {
2144 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2145 src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2146 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2147 m_encoder->MulH(dst, src0, src1);
2148 }
2149 break;
2150 case llvm_sext:
2151 {
2152 if (src0->GetType() == ISA_TYPE_BOOL)
2153 {
2154 CVariable* minusone = m_currShader->ImmToVariable(-1, dst->GetType());
2155 CVariable* zero = m_currShader->ImmToVariable(0, dst->GetType());
2156 m_encoder->Select(src0, dst, minusone, zero);
2157 }
2158 else
2159 {
2160 m_encoder->Cast(dst, src0);
2161 }
2162 }
2163 break;
2164 case llvm_zext:
2165 {
2166 if (src0->GetType() == ISA_TYPE_BOOL)
2167 {
2168 CVariable* one = m_currShader->ImmToVariable(1, dst->GetType());
2169 CVariable* zero = m_currShader->ImmToVariable(0, dst->GetType());
2170 m_encoder->Select(src0, dst, one, zero);
2171 }
2172 else
2173 {
2174 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2175 m_encoder->Cast(dst, src0);
2176 }
2177 }
2178 break;
2179 case llvm_trunc:
2180 case llvm_fptrunc:
2181 case llvm_fpext:
2182 case llvm_fptosi:
2183 case llvm_fptoui:
2184 if (dst->GetType() == ISA_TYPE_BOOL)
2185 {
2186 m_encoder->Cmp(EPREDICATE_NE, dst, src0, m_currShader->ImmToVariable(0, src0->GetType()));
2187 }
2188 else
2189 {
2190 if (opCode == llvm_fptoui)
2191 {
2192 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2193 }
2194 m_encoder->Cast(dst, src0);
2195 }
2196 break;
2197 case llvm_sitofp:
2198 case llvm_uitofp:
2199 if (src0->GetType() == ISA_TYPE_BOOL)
2200 {
2201 CVariable* one = m_currShader->ImmToVariable(getFPOne(dst->GetType()), dst->GetType());
2202 CVariable* zero = m_currShader->ImmToVariable(0, dst->GetType());
2203 m_encoder->Select(src0, dst, one, zero);
2204 }
2205 else
2206 {
2207 if (opCode == llvm_uitofp)
2208 {
2209 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2210 }
2211 m_encoder->Cast(dst, src0);
2212 }
2213 break;
2214 case llvm_xor:
2215 m_encoder->Xor(dst, src0, src1);
2216 break;
2217 case llvm_or:
2218 m_encoder->Or(dst, src0, src1);
2219 break;
2220 case llvm_and:
2221 m_encoder->And(dst, src0, src1);
2222 break;
2223 case llvm_udiv:
2224 {
2225 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2226 src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2227 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2228 m_encoder->Div(dst, src0, src1);
2229 }
2230 break;
2231 case llvm_sdiv:
2232 m_encoder->Div(dst, src0, src1);
2233 break;
2234 case llvm_urem:
2235 {
2236 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2237 src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2238 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2239 m_encoder->Mod(dst, src0, src1);
2240 }
2241 break;
2242 case llvm_srem:
2243 m_encoder->Mod(dst, src0, src1);
2244 break;
2245 case llvm_shl:
2246 m_encoder->Shl(dst, src0, src1);
2247 break;
2248 case llvm_ishr:
2249 m_encoder->IShr(dst, src0, src1);
2250 break;
2251 case llvm_ushr:
2252 {
2253 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2254 m_encoder->Shr(dst, src0, src1);
2255 }
2256 break;
2257 case llvm_min:
2258 m_encoder->Min(dst, src0, src1);
2259 break;
2260 case llvm_max:
2261 m_encoder->Max(dst, src0, src1);
2262 break;
2263 case llvm_uaddc:
2264 {
2265 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2266 src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2267 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2268 m_encoder->UAddC(dst, src0, src1);
2269 }
2270 break;
2271 case llvm_usubb:
2272 {
2273 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2274 src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
2275 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2276 m_encoder->USubB(dst, src0, src1);
2277 }
2278 break;
2279 case llvm_bfrev:
2280 m_encoder->Bfrev(dst, src0);
2281 break;
2282 case llvm_cbit: {
2283 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
2284 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
2285 if (dst->GetType() == ISA_TYPE_UD) {
2286 m_encoder->CBit(dst, src0);
2287 break;
2288 }
2289 if (dst->GetType() == ISA_TYPE_UW || dst->GetType() == ISA_TYPE_UB) {
2290 // vISA only supports a UD destination. Use a temporary and truncate
2291 // from it.
2292 CVariable* tmp
2293 = m_currShader->GetNewVariable(
2294 dst->GetNumberElement(),
2295 ISA_TYPE_UD,
2296 dst->IsUniform() ? EALIGN_DWORD : EALIGN_GRF,
2297 dst->IsUniform(),
2298 dst->getName());
2299 m_encoder->CBit(tmp, src0);
2300 m_encoder->Push();
2301 m_encoder->Cast(dst, tmp);
2302 break;
2303 }
2304 IGC_ASSERT(dst->GetType() == ISA_TYPE_UQ);
2305 // TODO: So far, 64-bit popcnt is handled in LLVM IR as follows:
2306 // dst = popcnt.32(src & 0xFFFFFFFF);
2307 // dst += popcnt.32(src >> 32);
2308 // We could do the same thing here if the original sequence in LLVM IR
2309 // cannot be translated efficiently.
2310 IGC_ASSERT_MESSAGE(0, "NOT IMPLEMENTED YET!");
2311 break;
2312 }
2313 case llvm_ieee_sqrt:
2314 m_encoder->IEEESqrt(dst, src0);
2315 break;
2316 case llvm_ieee_divide:
2317 m_encoder->IEEEDivide(dst, src0, src1);
2318 break;
2319 default:
2320 //need support
2321 IGC_ASSERT(0);
2322 break;
2323 }
2324 m_encoder->Push();
2325 }
2326
2327 void EmitPass::EmitMinMax(bool isMin, bool isUnsigned, const SSource sources[2], const DstModifier& modifier) {
2328 EOPCODE opCode = isMin ? llvm_min : llvm_max;
2329 CVariable* srcs[2] = { nullptr, nullptr };
2330 CVariable* dst = m_destination;
2331 srcs[0] = GetSrcVariable(sources[0]);
2332 srcs[1] = GetSrcVariable(sources[1]);
2333 SetSourceModifiers(0, sources[0]);
2334 SetSourceModifiers(1, sources[1]);
2335 m_encoder->SetDstModifier(modifier);
2336 if (isUnsigned) {
2337 srcs[0] = m_currShader->BitCast(srcs[0], GetUnsignedType(srcs[0]->GetType()));
2338 srcs[1] = m_currShader->BitCast(srcs[1], GetUnsignedType(srcs[1]->GetType()));
2339 dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
2340 }
2341 EmitSimpleAlu(opCode, dst, srcs[0], srcs[1]);
2342 }
2343
2344 void IGC::EmitPass::EmitUAdd(llvm::BinaryOperator* inst, const DstModifier& modifier)
2345 {
2346 // This emit function should be called only if saturation is enabled. Otherwise the
2347 // signedness of the operands plays no role in computing the instruction.
2348 IGC_ASSERT(modifier.sat == true);
2349 CVariable* srcs[2] = { GetSymbol(inst->getOperand(0)), GetSymbol(inst->getOperand(1)) };
2350
2351 // create new aliases for the operands and the destination
2352 srcs[0] = m_currShader->BitCast(srcs[0], GetUnsignedType(srcs[0]->GetType()));
2353 srcs[1] = m_currShader->BitCast(srcs[1], GetUnsignedType(srcs[1]->GetType()));
2354 CVariable* dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
2355 m_encoder->SetDstModifier(modifier);
2356
2357 EmitSimpleAlu(EOPCODE::llvm_add, dst, srcs[0], srcs[1]);
2358 }
2359
2360 void EmitPass::EmitFullMul32(bool isUnsigned, const SSource sources[2], const DstModifier& dstMod) {
2361 CVariable* srcs[2] = { nullptr, nullptr };
2362 srcs[0] = GetSrcVariable(sources[0]);
2363 srcs[1] = GetSrcVariable(sources[1]);
2364 SetSourceModifiers(0, sources[0]);
2365 SetSourceModifiers(1, sources[1]);
2366 m_encoder->SetDstModifier(dstMod);
2367 if (isUnsigned) {
2368 srcs[0] = m_currShader->BitCast(srcs[0], GetUnsignedType(srcs[0]->GetType()));
2369 srcs[1] = m_currShader->BitCast(srcs[1], GetUnsignedType(srcs[1]->GetType()));
2370 }
2371 // Emit *D x *D -> *Q supported by Gen
2372 EmitSimpleAlu(llvm_mul, m_destination, srcs[0], srcs[1]);
2373 }
2374
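// Emits a float-to-integer conversion with saturation: values outside the
// destination range clamp to its min/max instead of wrapping (illustrative
// example of the assumed semantics: 300.0f to unsigned 8-bit yields 255, not
// 44). If the requested type differs from the destination type, the conversion
// goes through a temporary of that type and is then truncated into the
// destination.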
2375 void EmitPass::EmitFPToIntWithSat(bool isUnsigned, bool needBitCast, VISA_Type type, const SSource& source, const DstModifier& dstMod) {
2376 EOPCODE op = isUnsigned ? llvm_fptoui : llvm_fptosi;
2377
2378 CVariable* dst = m_destination;
2379 if (type != m_destination->GetType()) {
2380 dst = m_currShader->GetNewVariable(
2381 dst->GetNumberElement(), type,
2382 m_currShader->getGRFAlignment(),
2383 dst->IsUniform(), m_destination->getName());
2384 }
2385 else if (needBitCast) {
2386 dst = m_currShader->BitCast(dst, GetUnsignedIntegerType(dst->GetType()));
2387 }
2388 DstModifier satDstMod = dstMod;
2389 satDstMod.sat = true;
2390 m_encoder->SetDstModifier(satDstMod);
2391
2392 CVariable* src = GetSrcVariable(source);
2393 SetSourceModifiers(0, source);
2394 EmitSimpleAlu(op, dst, src, nullptr);
2395 if (type != m_destination->GetType()) {
2396 CVariable* tmp = m_currShader->BitCast(dst, GetUnsignedType(type));
2397 dst = m_destination;
2398 if (needBitCast) {
2399 dst = m_currShader->BitCast(dst, GetUnsignedIntegerType(dst->GetType()));
2400 }
2401 m_encoder->Cast(dst, tmp);
2402 }
2403 }
2404
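// Emits an integer truncation with saturation via a saturated mov with the
// proper source/destination signedness. Illustrative example of the assumed
// semantics: truncating the i32 value 70000 to u16 yields 65535 here, whereas a
// plain trunc would yield 4464.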
2405 void EmitPass::EmitIntegerTruncWithSat(bool isSignedDst, bool isSignedSrc, const SSource& source, const DstModifier& dstMod) {
2406 CVariable* dst = m_destination;
2407 if (!isSignedDst) {
2408 dst = m_currShader->BitCast(dst, GetUnsignedIntegerType(dst->GetType()));
2409 }
2410 DstModifier satDstMod = dstMod;
2411 satDstMod.sat = true;
2412 m_encoder->SetDstModifier(satDstMod);
2413
2414 CVariable* src = GetSrcVariable(source);
2415 if (!isSignedSrc) {
2416 src = m_currShader->BitCast(src, GetUnsignedIntegerType(src->GetType()));
2417 }
2418 m_encoder->SetSrcModifier(0, source.mod);
2419
2420 m_encoder->Cast(dst, src);
2421 m_encoder->Push();
2422 }
2423
2424 void EmitPass::EmitInsertValueToStruct(llvm::InsertValueInst* II, bool forceVectorInit, const DstModifier& DstMod)
2425 {
2426 Value* structOp = II->getOperand(0);
2427 StructType* sTy = dyn_cast<StructType>(structOp->getType());
2428 auto& DL = II->getParent()->getParent()->getParent()->getDataLayout();
2429 const StructLayout* SL = DL.getStructLayout(sTy);
2430
2431 // Get the source operand to insert
2432 CVariable* SrcV = GetSymbol(II->getOperand(1));
2433
2434 if (forceVectorInit)
2435 {
2436 IGC_ASSERT(isa<Constant>(structOp) || structOp->getValueID() == Value::UndefValueVal);
2437 }
2438 // Get the dst struct variable, or create one with constant values initialized if it does not exist
2439 CVariable* DstV = m_currShader->GetStructVariable(II, forceVectorInit);
2440
2441 IGC_ASSERT_MESSAGE((!SrcV->IsUniform() && DstV->IsUniform()) == false, "Can't insert vector value into a scalar struct!");
2442
2443 // Copy source value into the struct offset
2444 unsigned idx = *II->idx_begin();
2445 unsigned elementOffset = (unsigned)SL->getElementOffset(idx);
2446 unsigned nLanes = DstV->IsUniform() ? 1 : numLanes(m_currShader->m_dispatchSize);
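// The struct is laid out member-by-member in SoA form: each member's byte
// offset from the StructLayout is scaled by the lane count, so a uniform source
// aliases a region wide enough to broadcast across all lanes, while a vector
// source is copied lane-for-lane (a note added for clarity, based on the
// aliasing math below).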
2447 CVariable* elementDst = nullptr;
2448 if (SrcV->IsUniform())
2449 elementDst = m_currShader->GetNewAlias(DstV, SrcV->GetType(), elementOffset * nLanes, SrcV->GetNumberElement() * nLanes);
2450 else
2451 elementDst = m_currShader->GetNewAlias(DstV, SrcV->GetType(), elementOffset * nLanes, SrcV->GetNumberElement());
2452
2453 emitCopyAll(elementDst, SrcV, sTy->getStructElementType(idx));
2454 }
2455
2456 void EmitPass::EmitExtractValueFromStruct(llvm::ExtractValueInst* EI, const DstModifier& DstMod)
2457 {
2458 CVariable* SrcV = GetSymbol(EI->getOperand(0));
2459 unsigned idx = *EI->idx_begin();
2460 StructType* sTy = dyn_cast<StructType>(EI->getOperand(0)->getType());
2461 auto& DL = m_currShader->entry->getParent()->getDataLayout();
2462 const StructLayout* SL = DL.getStructLayout(sTy);
2463
2464 // For extract value, src and dest should share uniformity
2465 IGC_ASSERT(nullptr != m_destination);
2466 IGC_ASSERT(nullptr != SrcV);
2467 IGC_ASSERT(m_destination->IsUniform() == SrcV->IsUniform());
2468
2469 bool isUniform = SrcV->IsUniform();
2470 unsigned nLanes = isUniform ? 1 : numLanes(m_currShader->m_dispatchSize);
2471 unsigned elementOffset = (unsigned)SL->getElementOffset(idx) * nLanes;
2472 SrcV = m_currShader->GetNewAlias(SrcV, m_destination->GetType(), elementOffset, m_destination->GetNumberElement(), isUniform);
2473
2474 // Copy from struct to dest
2475 emitCopyAll(m_destination, SrcV, sTy->getStructElementType(idx));
2476 }
2477
2478 void EmitPass::EmitAddPair(GenIntrinsicInst* GII, const SSource Sources[4], const DstModifier& DstMod) {
2479 Value* L=nullptr, * H=nullptr;
2480 std::tie(L, H) = getPairOutput(GII);
2481 CVariable* Lo = L ? GetSymbol(L) : nullptr;
2482 CVariable* Hi = H ? GetSymbol(H) : nullptr;
2483 IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2484
2485 CVariable* L0 = GetSrcVariable(Sources[0]);
2486 CVariable* H0 = GetSrcVariable(Sources[1]);
2487 CVariable* L1 = GetSrcVariable(Sources[2]);
2488 CVariable* H1 = GetSrcVariable(Sources[3]);
2489 for (unsigned srcId = 0; srcId < 4; ++srcId) {
2490 SetSourceModifiers(srcId, Sources[srcId]);
2491 }
2492
2493 m_encoder->AddPair(Lo, Hi, L0, H0, L1, H1);
2494 m_encoder->Push();
2495 }
2496
2497 void EmitPass::EmitSubPair(GenIntrinsicInst* GII, const SSource Sources[4], const DstModifier& DstMod) {
2498 Value* L = nullptr, * H = nullptr;
2499 std::tie(L, H) = getPairOutput(GII);
2500 CVariable* Lo = L ? GetSymbol(L) : nullptr;
2501 CVariable* Hi = H ? GetSymbol(H) : nullptr;
2502 IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2503
2504 CVariable* L0 = GetSrcVariable(Sources[0]);
2505 CVariable* H0 = GetSrcVariable(Sources[1]);
2506 CVariable* L1 = GetSrcVariable(Sources[2]);
2507 CVariable* H1 = GetSrcVariable(Sources[3]);
2508
2509 m_encoder->SubPair(Lo, Hi, L0, H0, L1, H1);
2510 m_encoder->Push();
2511 }
2512
2513 void EmitPass::EmitMulPair(GenIntrinsicInst* GII, const SSource Sources[4], const DstModifier& DstMod) {
2514 Value* L = nullptr, * H = nullptr;
2515 std::tie(L, H) = getPairOutput(GII);
2516 CVariable* Lo = L ? GetSymbol(L) : nullptr;
2517 CVariable* Hi = H ? GetSymbol(H) : nullptr;
2518 IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2519
2520 CVariable* L0 = GetSrcVariable(Sources[0]);
2521 CVariable* H0 = GetSrcVariable(Sources[1]);
2522 CVariable* L1 = GetSrcVariable(Sources[2]);
2523 CVariable* H1 = GetSrcVariable(Sources[3]);
2524
2525 // Use `UD` for Lo(s).
2526 if (Lo && Lo->GetType() != ISA_TYPE_UD) Lo = m_currShader->BitCast(Lo, ISA_TYPE_UD);
2527 if (L0->GetType() != ISA_TYPE_UD) L0 = m_currShader->BitCast(L0, ISA_TYPE_UD);
2528 if (L1->GetType() != ISA_TYPE_UD) L1 = m_currShader->BitCast(L1, ISA_TYPE_UD);
2529
2530 if (Lo == nullptr && Hi == nullptr)
2531 {
2532 return;
2533 }
2534
2535 if (Lo != nullptr && Hi == nullptr)
2536 {
2537 // Lo = A * B
2538 m_encoder->Mul(Lo, L0, L1);
2539 m_encoder->Push();
2540 return;
2541 }
2542
2543 // Algorithm:
2544 // AB - L0, L1
2545 // CD - H0, H1
2546 // ----
2547 // E
2548 // F
2549 // G
2550 // H - 'H' spills into bit 65 - only needed if overflow detection is required
2551 // --------
2552 // dstLow = E
2553 // dstHigh = F + G + carry
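//
// Equivalently, with 32-bit halves (an illustrative worked example):
//   (H0:L0) * (H1:L1) mod 2^64 = L0*L1 + ((L0*H1 + L1*H0) << 32)
// e.g. 0x0000000100000002 * 0x0000000300000004 gives low dword E = 2*4 = 8 and
// high dword F + G + carry = 2*3 + 4*1 + 0 = 10.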
2554
2555 CVariable* dstHiTmp = m_currShader->GetNewVariable(
2556 Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(), Hi->getName());
2557
2558 if (Lo == nullptr && Hi != nullptr)
2559 {
2560 // Cr = carry(A * B)
2561 m_encoder->MulH(dstHiTmp, L0, L1);
2562 m_encoder->Push();
2563 }
2564 else
2565 {
2566 // For platforms with no native DW x DW multiply support, use the vISA madw instruction instead of mul/mulh to get better performance.
2567 if (m_currShader->m_Platform->noNativeDwordMulSupport())
2568 {
2569 // (Cr, E) = A * B
2570 // The dst size should be GRF-aligned and doubled, since it holds both the low and high results.
2571 // The dst element count must be aligned to numDWPerGRF. For example, if the madw is SIMD1,
2572 // the dst has only 1 DW of low result in one GRF and only 1 DW of high result in another GRF, so
2573 // the dst must be allocated as (numDWPerGRF * 2) elements, not as 2 DW elements. This is required by madw.
2574 auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
2575 auto numElements = iSTD::Align(Lo->GetNumberElement(), numDWPerGRF);
2576 CVariable* DstTmp = m_currShader->GetNewVariable(
2577 numElements * 2, ISA_TYPE_UD, EALIGN_GRF, Lo->IsUniform(),
2578 CName(Lo->getName(), "int64Tmp"));
2579 CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
2580 m_encoder->Madw(DstTmp, L0, L1, zero);
2581
2582 // dstLow = E
2583 m_encoder->SetSrcRegion(0, 1, 1, 0);
2584 m_encoder->Copy(Lo, DstTmp);
2585 m_encoder->Push();
2586
2587 // dstHigh = Cr
2588 uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
2589 m_encoder->SetSrcSubVar(0, regOffset);
2590 m_encoder->SetSrcRegion(0, 1, 1, 0);
2591 m_encoder->Copy(dstHiTmp, DstTmp);
2592 m_encoder->Push();
2593 }
2594 else
2595 {
2596 // E = A * B
2597 m_encoder->Mul(Lo, L0, L1);
2598 m_encoder->Push();
2599
2600 // Cr = carry(A * B)
2601 m_encoder->MulH(dstHiTmp, L0, L1);
2602 m_encoder->Push();
2603 }
2604 }
2605
2606 // F = A * D
2607 CVariable* T0 = m_currShader->GetNewVariable(
2608 Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(),
2609 CName(Hi->getName(), "int64HiTmp"));
2610 m_encoder->Mul(T0, L0, H1);
2611 m_encoder->Push();
2612
2613 // dstHigh = Cr + F
2614 m_encoder->Add(dstHiTmp, dstHiTmp, T0);
2615 m_encoder->Push();
2616
2617 // G = B * C
2618 m_encoder->Mul(T0, L1, H0);
2619 m_encoder->Push();
2620
2621 // dstHigh = Cr + F + G
2622 m_encoder->Add(Hi, dstHiTmp, T0);
2623 m_encoder->Push();
2624 }
2625
2626 void EmitPass::EmitPtrToPair(GenIntrinsicInst* GII, const SSource Sources[1], const DstModifier& DstMod) {
2627 Value* L = nullptr, * H = nullptr;
2628 std::tie(L, H) = getPairOutput(GII);
2629 CVariable* Lo = L ? GetSymbol(L) : nullptr;
2630 CVariable* Hi = H ? GetSymbol(H) : nullptr;
2631 IGC_ASSERT(Lo == m_destination || Hi == m_destination);
2632
2633 CVariable* Src = GetSrcVariable(Sources[0]);
2634 Src = m_currShader->BitCast(Src, m_destination->GetType());
2635
2636 unsigned AS = Sources[0].value->getType()->getPointerAddressSpace();
2637 bool isPtr32 = m_currShader->GetContext()->getRegisterPointerSizeInBits(AS) == 32;
2638
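// For 64-bit pointers, the source is viewed as interleaved dword pairs: a
// <2;1,0> region with subregister 0 gathers each lane's low dword and
// subregister 1 its high dword.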
2639 if (Lo) {
2640 if (isPtr32) {
2641 m_encoder->Cast(Lo, Src);
2642 m_encoder->Push();
2643 }
2644 else {
2645 if (!Src->IsUniform())
2646 m_encoder->SetSrcRegion(0, 2, 1, 0);
2647 m_encoder->SetSrcSubReg(0, 0);
2648 m_encoder->Copy(Lo, Src);
2649 m_encoder->Push();
2650 }
2651 }
2652
2653 if (Hi) {
2654 if (isPtr32) {
2655 Src = m_currShader->ImmToVariable(0, m_destination->GetType());
2656 m_encoder->Cast(Hi, Src);
2657 m_encoder->Push();
2658 }
2659 else {
2660 if (!Src->IsUniform())
2661 m_encoder->SetSrcRegion(0, 2, 1, 0);
2662 m_encoder->SetSrcSubReg(0, 1);
2663 m_encoder->Copy(Hi, Src);
2664 m_encoder->Push();
2665 }
2666 }
2667 }
2668
2669
2670 void EmitPass::EmitSIToFPZExt(const SSource& source, const DstModifier& dstMod) {
2671 CVariable* flag = GetSrcVariable(source);
2672 CVariable* one = m_currShader->ImmToVariable(getFPOne(m_destination->GetType()), m_destination->GetType());
2673 CVariable* zero = m_currShader->ImmToVariable(0, m_destination->GetType());
2674 m_encoder->SetDstModifier(dstMod);
2675 m_encoder->Select(flag, m_destination, one, zero);
2676 m_encoder->Push();
2677 }
2678
2679 void EmitPass::emitCtlz(const SSource& source)
2680 {
2681 // This does not go through the standard EmitAluIntrinsic pass because
2682 // that creates a redundant SetP due to an unused i1 literal.
2683 CVariable* src = GetSrcVariable(source);
2684 src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
2685 CVariable* dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
2686 SetSourceModifiers(0, source);
2687 m_encoder->Ctlz(dst, src);
2688 m_encoder->Push();
2689 }
2690
2691 void EmitPass::emitVMESendIME2(GenIntrinsicInst* inst) {
2692 CVariable* inputVar = GetSymbol(inst->getArgOperand(0));
2693 CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(1));
2694 CVariable* refImgBTI = GetSymbol(inst->getArgOperand(2));
2695 CVariable* bwdRefImgBTI = GetSymbol(inst->getArgOperand(3));
2696 const COMMON_ISA_VME_STREAM_MODE streamMode = (COMMON_ISA_VME_STREAM_MODE)(cast<ConstantInt>(inst->getArgOperand(4))->getZExtValue());
2697
2698 const bool isDualRef = refImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue();
2699 // If the BTIs aren't consecutive then we can't do VME.
2700 if (isDualRef)
2701 {
2702 IGC_ASSERT_MESSAGE(refImgBTI->GetImmediateValue() + 1 == bwdRefImgBTI->GetImmediateValue(), "refImg BTI and bwdRefImg BTI are not consecutive!");
2703 }
2704
2705 uint32_t regs2snd = 4 + 2;
2706 uint32_t regs2rcv = CShader::GetIMEReturnPayloadSize(inst);
2707
2708 if ((streamMode == VME_STREAM_IN) || (streamMode == VME_STREAM_IN_OUT))
2709 {
2710 regs2snd += 2;
2711 if (isDualRef)
2712 {
2713 regs2snd += 2;
2714 }
2715 }
2716
2717 // TODO: this may waste registers. We could allocate the payload during the
2718 // evaluation stage, but that would require initializing and copying the payload.
2719 // Revisit once initial VME support is done.
2720 if (inputVar->GetSize() > (regs2snd * getGRFSize()))
2721 {
2722 inputVar = m_currShader->GetNewAlias(inputVar, ISA_TYPE_UD, 0, regs2snd * 8);
2723 }
2724
2725 CVariable* outputVar = m_destination;
2726
2727 if (outputVar->GetSize() > (regs2rcv * getGRFSize()))
2728 {
2729 outputVar = m_currShader->GetNewAlias(outputVar, ISA_TYPE_UD, 0, regs2rcv * 8);
2730 }
2731
2732 const uint32_t desc = VMEDescriptor(streamMode, (uint32_t)(srcImgBTI->GetImmediateValue()),
2733 EU_GEN7_5_VME_MESSAGE_IME, regs2snd, regs2rcv);
2734
2735 CVariable* messDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
2736
2737 m_encoder->Send(outputVar, inputVar, EU_MESSAGE_TARGET_SFID_VME, messDesc, false);
2738 m_encoder->Push();
2739 }
2740
2741 void EmitPass::emitVMESendIME(GenIntrinsicInst* inst) {
2742 const bool has_bwd_ref_image = inst->getIntrinsicID() == GenISAIntrinsic::GenISA_vmeSendIME2;
2743 CVariable* outputVar = GetSymbol(inst->getArgOperand(0));
2744
2745 CVariable* uniInputVar = GetSymbol(inst->getArgOperand(1));
2746 CVariable* imeInputVar = GetSymbol(inst->getArgOperand(2));
2747
2748 CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(3));
2749 CVariable* refImgBTI = GetSymbol(inst->getArgOperand(4));
2750 CVariable* bwdRefImgBTI = has_bwd_ref_image ? GetSymbol(inst->getArgOperand(5)) : nullptr;
2751 // If the BTIs aren't consecutive then we can't do VME.
2752 IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == refImgBTI->GetImmediateValue(), "srcImg BTI and refImg BTI are not consecutive!");
2753 if (bwdRefImgBTI != nullptr) {
2754 IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 2 == bwdRefImgBTI->GetImmediateValue(), "srcImg BTI and bwdRefImg BTI are not consecutive!");
2755 }
2756
2757 uint rest_opnd_idx_base = has_bwd_ref_image ? 6 : 5;
2758
2759 CVariable* ref0Var = GetSymbol(inst->getArgOperand(rest_opnd_idx_base));
2760 CVariable* ref1Var = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 1));
2761 CVariable* costCenterVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 2));
2762
2763 // These are raw operands, so make sure they are GRF-aligned.
2764 ref0Var = ReAlignUniformVariable(ref0Var, EALIGN_GRF);
2765 ref1Var = ReAlignUniformVariable(ref1Var, EALIGN_GRF);
2766
2767 // costCenterVar needs to be 1 GRF. If it is uniform, extend it to 1 GRF [bdw+]
2768 if (costCenterVar->IsUniform())
2769 {
2770 VISA_Type costVisaTy = costCenterVar->GetType();
2771 IGC_ASSERT_MESSAGE(SIZE_DWORD == CEncoder::GetCISADataTypeSize(costVisaTy),
2772 "VME IME's cost center var has wrong type!");
2773 CVariable* newVar = m_currShader->GetNewVariable(8, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
2774
2775 m_encoder->SetNoMask();
2776 m_encoder->SetSimdSize(SIMDMode::SIMD8);
2777 m_encoder->Copy(newVar, costCenterVar);
2778 m_encoder->Push();
2779
2780 costCenterVar = newVar;
2781 }
2782
2783 unsigned char streamMode = VME_STREAM_DISABLE;
2784 unsigned char searchControlMode = VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START;
2785
2786 // Force-write the costCenter here. Ideally uniInputVar would be set up before calling
2787 // emitVMESendIME so we don't burn movs on each call, but CM relies on this for now.
2788 // Fix later.
2789 {
2790 CVariable* uniAlias = m_currShader->GetNewAlias(uniInputVar, ISA_TYPE_UD, 3 * getGRFSize(), 8);
2791 m_encoder->SetNoMask();
2792 m_encoder->SetSrcRegion(0, 0, 1, 0);
2793 m_encoder->SetSimdSize(SIMDMode::SIMD8);
2794 m_encoder->Copy(uniAlias, costCenterVar);
2795 m_encoder->Push();
2796 }
2797
2798 m_encoder->SetNoMask();
2799 m_encoder->SendVmeIme(srcImgBTI,
2800 streamMode,
2801 searchControlMode,
2802 uniInputVar,
2803 imeInputVar,
2804 ref0Var,
2805 ref1Var,
2806 costCenterVar,
2807 outputVar);
2808 m_encoder->Push();
2809 return;
2810 }
2811
2812 void EmitPass::emitVMESendFBR(GenIntrinsicInst* inst) {
2813 CVariable* outputVar = GetSymbol(inst->getArgOperand(0));
2814
2815 CVariable* uniInputVar = GetSymbol(inst->getArgOperand(1));
2816 CVariable* fbrInputVar = GetSymbol(inst->getArgOperand(2));
2817
2818 CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(3));
2819 CVariable* refImgBTI = GetSymbol(inst->getArgOperand(4));
2820 // If the BTIs aren't consecutive then we can't do VME.
2821 IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == refImgBTI->GetImmediateValue(), "srcImg BTI and refImg BTI are not consecutive!");
2822
2823 const uint rest_opnd_idx_base = 5;
2824 CVariable* FBRMbModeVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base));
2825 CVariable* FBRSubMbShapeVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 1));
2826 CVariable* FBRSubPredModeVar = GetSymbol(inst->getArgOperand(rest_opnd_idx_base + 2));
2827
2828 m_encoder->SendVmeFbr(srcImgBTI, uniInputVar, fbrInputVar, FBRMbModeVar, FBRSubMbShapeVar, FBRSubPredModeVar, outputVar);
2829 m_encoder->Push();
2830 return;
2831 }
2832
2833 void EmitPass::emitVMESendFBR2(GenIntrinsicInst* inst) {
2834 CVariable* inputVar = GetSymbol(inst->getArgOperand(0));
2835 CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(1));
2836 CVariable* refImgBTI = GetSymbol(inst->getArgOperand(2));
2837 CVariable* bwdRefImgBTI = GetSymbol(inst->getArgOperand(3));
2838
2839 const bool isDualRef = refImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue();
2840 // If the BTIs aren't consecutive then we can't do VME.
2841 if (isDualRef)
2842 {
2843 IGC_ASSERT_MESSAGE(refImgBTI->GetImmediateValue() + 1 == bwdRefImgBTI->GetImmediateValue(), "refImg BTI and bwdRefImg BTI are not consecutive!");
2844 }
2845
2846 const uint32_t regs2rcv = (7 + 0), regs2snd = (4 + 4);
2847 const uint32_t desc = VMEDescriptor(VME_STREAM_DISABLE, (uint32_t)(srcImgBTI->GetImmediateValue()),
2848 EU_GEN7_5_VME_MESSAGE_FBR, regs2snd, regs2rcv);
2849
2850 CVariable* messDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
2851
2852 CVariable* outputVar = m_destination;
2853
2854 if (outputVar->GetSize() > (regs2rcv * getGRFSize()))
2855 {
2856 outputVar = m_currShader->GetNewAlias(outputVar, ISA_TYPE_UD, 0, regs2rcv * 8);
2857 }
2858
2859 m_encoder->Send(outputVar, inputVar, EU_MESSAGE_TARGET_SFID_CRE, messDesc, false);
2860 m_encoder->Push();
2861
2862 return;
2863 }
2864
2865 void EmitPass::emitVMESendSIC(GenIntrinsicInst* inst)
2866 {
2867 CVariable* outputVar = GetSymbol(inst->getArgOperand(0));
2868 CVariable* uniInputVar = GetSymbol(inst->getArgOperand(1));
2869 CVariable* sicInputVar = GetSymbol(inst->getArgOperand(2));
2870 CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(3));
2871 CVariable* ref0ImgBTI = GetSymbol(inst->getArgOperand(4));
2872 CVariable* ref1ImgBTI = GetSymbol(inst->getArgOperand(5));
2873 // If the BTIs aren't consecutive then we can't do VME.
2874 IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == ref0ImgBTI->GetImmediateValue(), "srcImg BTI and ref0Img BTI are not consecutive!");
2875 // In the non-bidirectional case, we just pass the same reference image into the
2876 // forward and backward slots.
2877 if (ref0ImgBTI->GetImmediateValue() != ref1ImgBTI->GetImmediateValue())
2878 {
2879 IGC_ASSERT_MESSAGE(ref0ImgBTI->GetImmediateValue() + 1 == ref1ImgBTI->GetImmediateValue(), "ref0Img BTI and ref1Img BTI are not consecutive!");
2880 }
2881
2882 m_encoder->SendVmeSic(srcImgBTI, uniInputVar, sicInputVar, outputVar);
2883 m_encoder->Push();
2884 }
2885
2886 void EmitPass::emitVMESendSIC2(GenIntrinsicInst* inst)
2887 {
2888 CVariable* inputVar = GetSymbol(inst->getArgOperand(0));
2889 CVariable* srcImgBTI = GetSymbol(inst->getArgOperand(1));
2890 CVariable* fwdRefImgBTI = GetSymbol(inst->getArgOperand(2));
2891 CVariable* bwdRefImgBTI = GetSymbol(inst->getArgOperand(3));
2892
2893 const bool isDualRef = fwdRefImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue();
2894 // If the BTIs aren't consecutive then we can't do VME.
2895 if (isDualRef)
2896 {
2897 IGC_ASSERT_MESSAGE(fwdRefImgBTI->GetImmediateValue() + 1 == bwdRefImgBTI->GetImmediateValue(), "refImg BTI and bwdRefImg BTI are not consecutive!");
2898 }
2899
2900 // If the BTIs aren't consecutive then we can't do VME. This check only applies
2901 // when either fwdRefImg or bwdRefImg is present.
2902 if (srcImgBTI->GetImmediateValue() != fwdRefImgBTI->GetImmediateValue())
2903 {
2904 IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 1 == fwdRefImgBTI->GetImmediateValue(), "srcImg BTI and refImg BTI are not consecutive!");
2905
2906 if (fwdRefImgBTI->GetImmediateValue() != bwdRefImgBTI->GetImmediateValue())
2907 {
2908 IGC_ASSERT_MESSAGE(srcImgBTI->GetImmediateValue() + 2 == bwdRefImgBTI->GetImmediateValue(), "srcImg BTI and bwdRefImg BTI are not consecutive!");
2909 }
2910 }
2911
2912 const uint32_t regs2rcv = (7 + 0), regs2snd = (4 + 4);
2913 const uint32_t desc = VMEDescriptor(VME_STREAM_DISABLE, (uint32_t)(srcImgBTI->GetImmediateValue()),
2914 EU_GEN7_5_VME_MESSAGE_SIC, regs2snd, regs2rcv);
2915
2916 CVariable* messDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
2917
2918 CVariable* outputVar = m_destination;
2919
2920 if (outputVar->GetSize() > (regs2rcv * getGRFSize()))
2921 {
2922 outputVar = m_currShader->GetNewAlias(outputVar, ISA_TYPE_UD, 0, regs2rcv * 8);
2923 }
2924
2925 m_encoder->Send(outputVar, inputVar, EU_MESSAGE_TARGET_SFID_CRE, messDesc, false);
2926 m_encoder->Push();
2927
2928 return;
2929 }
2930
2931 void EmitPass::emitCreateMessagePhases(GenIntrinsicInst* inst) {
2932 IGC_ASSERT_MESSAGE((m_destination->GetType() == ISA_TYPE_UD || m_destination->GetType() == ISA_TYPE_D), "Destination type is expected to be UD or D!");
2933 IGC_ASSERT_MESSAGE(isa<ConstantInt>(inst->getArgOperand(0)), "Num phases expected to be const!");
2934 unsigned int numPhases = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(0))->getZExtValue());
2935
2936 const uint16_t numSimdLanes = numLanes(m_SimdMode);
2937 IGC_ASSERT(0 < numSimdLanes);
2938 unsigned int numWideSimdIters = numPhases * 8 / numSimdLanes;
2939 unsigned int remSimd8Iters = (numPhases * 8 % numSimdLanes) / 8;
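// Each message phase is 8 dwords, so numPhases * 8 dwords get zeroed in total.
// Illustrative example: 5 phases in SIMD16 give numWideSimdIters = 40 / 16 = 2
// and remSimd8Iters = (40 % 16) / 8 = 1.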
2940
2941 // Zero as many message phases as possible using the widest SIMD
2942 for (unsigned int i = 0; i < numWideSimdIters; ++i) {
2943 CVariable* messagePhase = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UD, i * numSimdLanes * SIZE_DWORD, numSimdLanes);
2944
2945 m_encoder->SetNoMask();
2946 m_encoder->SetSimdSize(m_SimdMode);
2947 m_encoder->Copy(messagePhase, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
2948 m_encoder->Push();
2949 }
2950
2951 // Zero the remaining message phases using SIMD8
2952 for (unsigned int i = 0; i < remSimd8Iters; ++i) {
2953 CVariable* messagePhase = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UD, (i * 8 + numWideSimdIters * numSimdLanes) * SIZE_DWORD, numLanes(SIMDMode::SIMD8));
2954
2955 m_encoder->SetNoMask();
2956 m_encoder->SetSimdSize(SIMDMode::SIMD8);
2957 m_encoder->Copy(messagePhase, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
2958 m_encoder->Push();
2959 }
2960 }
2961
2962 static VISA_Type GetTypeFromSize(unsigned size)
2963 {
2964 switch (size)
2965 {
2966 case 1:
2967 return ISA_TYPE_UB;
2968 case 2:
2969 return ISA_TYPE_UW;
2970 case 4:
2971 return ISA_TYPE_UD;
2972 case 8:
2973 return ISA_TYPE_UQ;
2974 default:
2975 IGC_ASSERT_MESSAGE(0, "unknown size");
2976 return ISA_TYPE_UD;
2977 }
2978 }
2979
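// Copies a strided region between two raw variables, roughly equivalent to the
// following sketch (illustrative, using vISA region syntax):
//   mov (execsize) dst.dbyteoffset<dstride> src.sbyteoffset<vstride;width,hstride>
// where offsets, strides, the element size and the execution size all come from
// the intrinsic's constant operands; a non-constant source offset is handled
// through an address register instead.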
2980 void EmitPass::emitSimdMediaRegionCopy(llvm::GenIntrinsicInst* inst)
2981 {
2982 CVariable* pDst = GetSymbol(inst->getArgOperand(0));
2983 unsigned dbyteoffset = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
2984 unsigned dstride = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
2985 unsigned dnumelem = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(3))->getZExtValue());
2986 CVariable* pSrc = GetSymbol(inst->getArgOperand(4));
2987 Value* sbyteoffset = inst->getArgOperand(5);
2988 unsigned vstride = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(6))->getZExtValue());
2989 unsigned width = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(7))->getZExtValue());
2990 unsigned hstride = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(8))->getZExtValue());
2991 unsigned typesize = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(9))->getZExtValue());
2992 unsigned execsize = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(10))->getZExtValue());
2993 unsigned snumelem = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(11))->getZExtValue());
2994
2995 CVariable* pDstOffset = m_currShader->GetNewAlias(pDst, GetTypeFromSize(typesize), (uint16_t)dbyteoffset, (uint16_t)dnumelem);
2996
2997 auto setup = [&]()
2998 {
2999 m_encoder->SetSimdSize(lanesToSIMDMode(execsize));
3000 m_encoder->SetNoMask();
3001 m_encoder->SetDstRegion(dstride);
3002 m_encoder->SetSrcRegion(0, vstride, width, hstride);
3003 };
3004
3005 if (isa<ConstantInt>(sbyteoffset))
3006 {
3007 CVariable* pSrcOffset = m_currShader->GetNewAlias(
3008 pSrc,
3009 GetTypeFromSize(typesize),
3010 int_cast<uint16_t>(cast<ConstantInt>(sbyteoffset)->getZExtValue()),
3011 (uint16_t)snumelem);
3012
3013 setup();
3014 m_encoder->Copy(pDstOffset, pSrcOffset);
3015 m_encoder->Push();
3016 }
3017 else
3018 {
3019 CVariable* pSrcOffset = m_currShader->GetNewAddressVariable(
3020 1,
3021 GetTypeFromSize(typesize),
3022 true,
3023 false,
3024 inst->getName());
3025
3026 m_encoder->AddrAdd(pSrcOffset, pSrc, m_currShader->BitCast(GetSymbol(sbyteoffset), ISA_TYPE_UW));
3027 setup();
3028 m_encoder->Copy(pDstOffset, pSrcOffset);
3029 m_encoder->Push();
3030 }
3031 }
3032
3033 void EmitPass::emitExtractMVAndSAD(llvm::GenIntrinsicInst* inst)
3034 {
3035 CVariable* pMV = GetSymbol(inst->getArgOperand(0));
3036 CVariable* pSAD = GetSymbol(inst->getArgOperand(1));
3037 CVariable* pResult = GetSymbol(inst->getArgOperand(2));
3038 CVariable* pBlockType = GetSymbol(inst->getArgOperand(3));
3039
3040 // W5.0 - W5.7 from Return Data Message Phases (InterDistortion)
3041 CVariable* pDist = m_currShader->GetNewAlias(pResult, ISA_TYPE_UW, 5 * getGRFSize(), 16);
3042 CVariable* pSADAlias = m_currShader->GetNewAlias(pSAD, ISA_TYPE_UW, 0, 16);
3043
3044 CVariable* pFlag = m_currShader->GetNewVariable(
3045 16,
3046 ISA_TYPE_BOOL,
3047 EALIGN_GRF,
3048 CName::NONE);
3049
3050 auto EmitCmp = [&](unsigned imm)
3051 {
3052 m_encoder->SetSimdSize(SIMDMode::SIMD16);
3053 m_encoder->SetNoMask();
3054 m_encoder->Cmp(EPREDICATE_EQ, pFlag, pBlockType, m_currShader->ImmToVariable(imm, ISA_TYPE_UD));
3055 m_encoder->Push();
3056 };
3057
3058 // block type == 0 (16x16)
3059 EmitCmp(0);
3060
3061
3062 // Only one SAD, replicate it across.
3063 // (+f1.1) mov (16) r16.0<1>:uw r73.0<0;1,0>:uw { Align1, H1, NoMask }
3064 m_encoder->SetPredicate(pFlag);
3065 m_encoder->SetNoMask();
3066 m_encoder->SetSrcRegion(0, 0, 1, 0);
3067 m_encoder->SetSimdSize(SIMDMode::SIMD16);
3068 m_encoder->Copy(pSADAlias, pDist);
3069 m_encoder->Push();
3070
3071 // block type == 1 (8x8)
3072 EmitCmp(1);
3073
3074 // 4 SADs, copy each one 4 times.
3075 // (+f1.1) mov(4) r16.12<1>:uw r73.12<0;1,0>:uw { Align1, Q1, NoMask }
3076 // (+f1.1) mov(4) r16.8<1>:uw r73.8<0;1,0>:uw { Align1, Q1, NoMask }
3077 // (+f1.1) mov(4) r16.4<1>:uw r73.4<0;1,0>:uw { Align1, Q1, NoMask }
3078 // (+f1.1) mov(4) r16.0<1>:uw r73.0<0;1,0>:uw { Align1, Q1, NoMask }
3079 for (int i = 0; i < 4; i++)
3080 {
3081 m_encoder->SetPredicate(pFlag);
3082 m_encoder->SetNoMask();
3083 m_encoder->SetSrcRegion(0, 0, 1, 0);
3084 m_encoder->SetSimdSize(SIMDMode::SIMD4);
3085 CVariable* pDistOffset = m_currShader->GetNewAlias(pDist, ISA_TYPE_UW, i * 8, 4);
3086 CVariable* pSADOffset = m_currShader->GetNewAlias(pSADAlias, ISA_TYPE_UW, i * 8, 4);
3087 m_encoder->Copy(pSADOffset, pDistOffset);
3088 m_encoder->Push();
3089 }
3090
3091 // block type == 2 (4x4)
3092 EmitCmp(2);
3093
3094 // All 16 SADs are present, copy them over.
3095 // (+f1.1) mov (16) r16.0<1>:uw r73.0<8;8,1>:uw {Align1, H1, NoMask}
3096 m_encoder->SetPredicate(pFlag);
3097 m_encoder->SetNoMask();
3098 m_encoder->SetSimdSize(SIMDMode::SIMD16);
3099 m_encoder->Copy(pSADAlias, pDist);
3100 m_encoder->Push();
3101
3102 // Copy over MVs
3103 for (int i = 0; i < 2; i++)
3104 {
3105 CVariable* pResultOffset = m_currShader->GetNewAlias(pResult, ISA_TYPE_UD,
3106 (1 * getGRFSize()) + (2 * i * getGRFSize()),
3107 16);
3108 CVariable* pMVOffset = m_currShader->GetNewAlias(pMV, ISA_TYPE_UD,
3109 2 * i * getGRFSize(),
3110 16);
3111 m_encoder->SetNoMask();
3112 m_encoder->SetSimdSize(SIMDMode::SIMD16);
3113 m_encoder->Copy(pMVOffset, pResultOffset);
3114 m_encoder->Push();
3115 }
3116 }
3117
3118 void EmitPass::emitCmpSADs(llvm::GenIntrinsicInst* inst)
3119 {
3120 // When called, this builtin compares two SAD values
3121 // and takes the minimum of the two. The MV associated
3122 // with the minimum SAD is selected as well.
3123 CVariable* pMVCurr = GetSymbol(inst->getArgOperand(0));
3124 CVariable* pSADCurr = GetSymbol(inst->getArgOperand(1));
3125 CVariable* pMVMin = GetSymbol(inst->getArgOperand(2));
3126 CVariable* pSADMin = GetSymbol(inst->getArgOperand(3));
3127
3128 CVariable* pFlag = m_currShader->GetNewVariable(
3129 16,
3130 ISA_TYPE_BOOL,
3131 EALIGN_GRF,
3132 CName::NONE);
3133
3134 CVariable* pSADCurrAlias = m_currShader->GetNewAlias(pSADCurr, ISA_TYPE_UW, 0, 16);
3135 CVariable* pSADMinAlias = m_currShader->GetNewAlias(pSADMin, ISA_TYPE_UW, 0, 16);
3136
3137 m_encoder->SetNoMask();
3138 m_encoder->SetSimdSize(SIMDMode::SIMD16);
3139 m_encoder->Cmp(EPREDICATE_LT, pFlag, pSADCurrAlias, pSADMinAlias);
3140 m_encoder->Push();
3141
3142 // Collect the SADs
3143 m_encoder->SetNoMask();
3144 m_encoder->SetSimdSize(SIMDMode::SIMD16);
3145 m_encoder->Select(pFlag, pSADMinAlias, pSADCurrAlias, pSADMinAlias);
3146 m_encoder->Push();
3147
3148 // Collect the MVs
3149 if (m_currShader->m_Platform->hasNoFullI64Support()) {
3150 CVariable* pMVMinAlias = m_currShader->GetNewAlias(pMVMin, ISA_TYPE_UD, 0, 32);
3151 CVariable* pMVCurrAlias = m_currShader->GetNewAlias(pMVCurr, ISA_TYPE_UD, 0, 32);
3152
3153 //(W&fX.X) mov(8|M0) r(DST).0<1>:f r(SRC).0<2;1,0>:f
3154 m_encoder->SetNoMask();
3155 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3156 m_encoder->SetSrcRegion(0, 2, 1, 0);
3157 m_encoder->SetSrcRegion(1, 2, 1, 0);
3158 m_encoder->SetDstRegion(2);
3159 m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3160 m_encoder->Push();
3161
3162 //(W&fX.X) mov(8|M0) r(DST).1<1>:f r(SRC).1<2;1,0>:f
3163 m_encoder->SetNoMask();
3164 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3165 m_encoder->SetSrcRegion(0, 2, 1, 0);
3166 m_encoder->SetSrcRegion(1, 2, 1, 0);
3167 m_encoder->SetDstRegion(2);
3168 m_encoder->SetSrcSubReg(0, 1);
3169 m_encoder->SetSrcSubReg(1, 1);
3170 m_encoder->SetDstSubReg(1);
3171 m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3172 m_encoder->Push();
3173
3174 //(W&fX.X) mov(8|M8) r(DST+2).0<2>:f r(SRC+2).0<2;1,0>:f
3175 m_encoder->SetNoMask();
3176 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3177 m_encoder->SetMask(EMASK_Q2);
3178 m_encoder->SetSrcSubVar(0, 2);
3179 m_encoder->SetSrcSubVar(1, 2);
3180 m_encoder->SetDstSubVar(2);
3181 m_encoder->SetSrcRegion(0, 2, 1, 0);
3182 m_encoder->SetSrcRegion(1, 2, 1, 0);
3183 m_encoder->SetDstRegion(2);
3184 m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3185 m_encoder->Push();
3186
3187 //(W&fX.X) mov(8|M8) r(DST+2).1<2>:f r(SRC+2).1<2;1,0>:f
3188 m_encoder->SetNoMask();
3189 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3190 m_encoder->SetMask(EMASK_Q2);
3191 m_encoder->SetSrcSubVar(0, 2);
3192 m_encoder->SetSrcSubVar(1, 2);
3193 m_encoder->SetDstSubVar(2);
3194 m_encoder->SetSrcRegion(0, 2, 1, 0);
3195 m_encoder->SetSrcRegion(1, 2, 1, 0);
3196 m_encoder->SetDstRegion(2);
3197 m_encoder->SetSrcSubReg(0, 1);
3198 m_encoder->SetSrcSubReg(1, 1);
3199 m_encoder->SetDstSubReg(1);
3200 m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3201 m_encoder->Push();
3202 }
3203 else {
3204 CVariable* pMVCurrAlias = m_currShader->GetNewAlias(pMVCurr, ISA_TYPE_UQ, 0, 16);
3205 CVariable* pMVMinAlias = m_currShader->GetNewAlias(pMVMin, ISA_TYPE_UQ, 0, 16);
3206
3207 m_encoder->SetNoMask();
3208 m_encoder->SetSimdSize(SIMDMode::SIMD16);
3209 m_encoder->Select(pFlag, pMVMinAlias, pMVCurrAlias, pMVMinAlias);
3210 m_encoder->Push();
3211 }
3212 }
3213
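// Returns true if A and B name the same underlying storage, looking
// through zero-offset aliases on either side.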
3214 static bool SameVar(CVariable* A, CVariable* B)
3215 {
3216 A = (A->GetAlias() && A->GetAliasOffset() == 0) ? A->GetAlias() : A;
3217 B = (B->GetAlias() && B->GetAliasOffset() == 0) ? B->GetAlias() : B;
3218
3219 return A == B;
3220 }
3221
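// Copies the message-phase variable into m_destination (unless they already
// share storage), then overwrites 'numPhases' GRF-sized phases starting at
// 'phaseIndex' with the payload value, one NoMask copy per phase.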
3222 void EmitPass::emitSimdSetMessagePhase(llvm::GenIntrinsicInst* inst) {
3223 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3224 const uint32_t phaseIndex = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3225 const uint32_t numPhases = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3226 const uint32_t dstSubReg = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(3))->getZExtValue());
3227 const uint32_t numLanesPerPhase = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(4))->getZExtValue());
3228 const SIMDMode simdMode = lanesToSIMDMode(numLanesPerPhase);
3229 Value* value = inst->getArgOperand(5);
3230 const uint16_t eltSizeInBytes = (uint16_t)m_DL->getTypeSizeInBits(value->getType()) / 8;
3231 const uint16_t numEltsPerPhase = getGRFSize() / eltSizeInBytes;
3232 const VISA_Type type = GetTypeFromSize(eltSizeInBytes);
3233
3234 CVariable* val = GetSymbol(value);
3235
3236 if (!SameVar(m_destination, messagePhases))
3237 {
3238 emitCopyAll(m_destination, messagePhases, inst->getArgOperand(0)->getType());
3239 }
3240
3241 for (uint32_t i = 0; i < numPhases; ++i) {
3242 CVariable* src = val->IsUniform() ? val : m_currShader->GetNewAlias(val, type, i * getGRFSize(), numEltsPerPhase);
3243 CVariable* dst = m_currShader->GetNewAlias(m_destination, type, (i + phaseIndex) * getGRFSize(), numEltsPerPhase);
3244
3245 m_encoder->SetNoMask();
3246 m_encoder->SetSimdSize(simdMode);
3247 m_encoder->SetDstSubReg(dstSubReg);
3248 if (!val->IsUniform())
3249 m_encoder->SetSrcRegion(0, 0, numEltsPerPhase, 1);
3250 m_encoder->Copy(dst, src);
3251 m_encoder->Push();
3252 }
3253
3254 return;
3255 }
3256
3257 void EmitPass::emitBroadcastMessagePhase(llvm::GenIntrinsicInst* inst) {
3258 const uint16_t eltSizeInBytes = (uint16_t)m_DL->getTypeSizeInBits(inst->getType()) / 8;
3259 const uint32_t width = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(3))->getZExtValue());
3260 emitGetMessagePhaseType(inst, GetTypeFromSize(eltSizeInBytes), width);
3261 }
3262
3263 void EmitPass::emitSimdGetMessagePhase(llvm::GenIntrinsicInst* inst) {
3264 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3265 const uint32_t phaseIndex = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3266 const uint32_t numPhases = int_cast<uint32_t>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3267 const uint16_t eltSizeInBytes = (uint16_t)m_DL->getTypeSizeInBits(inst->getType()) / 8;
3268 const uint16_t numEltsPerPhase = getGRFSize() / eltSizeInBytes;
3269 const VISA_Type type = GetTypeFromSize(eltSizeInBytes);
3270 SIMDMode simdMode = SIMDMode::UNKNOWN;
3271
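// Choose the SIMD width to match the element size so that, on legacy
// 32-byte-GRF platforms, each copy below moves exactly one phase
// (4 x 8B = 8 x 4B = 16 x 2B = 32 bytes).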
3272 if (eltSizeInBytes == 8) {
3273 simdMode = SIMDMode::SIMD4;
3274 }
3275 else if (eltSizeInBytes == 4) {
3276 simdMode = SIMDMode::SIMD8;
3277 }
3278 else if (eltSizeInBytes == 2) {
3279 simdMode = SIMDMode::SIMD16;
3280 }
3281 else {
3282 IGC_ASSERT_MESSAGE(0, "Unhandled data type");
3283 }
3284
3285 for (uint32_t i = 0; i < numPhases; ++i) {
3286 CVariable* src = m_currShader->GetNewAlias(messagePhases, type, (i + phaseIndex) * getGRFSize(), numEltsPerPhase);
3287 CVariable* dst = m_currShader->GetNewAlias(m_destination, type, i * getGRFSize(), numEltsPerPhase);
3288
3289 m_encoder->SetNoMask();
3290 m_encoder->SetSimdSize(simdMode);
3291 m_encoder->SetSrcRegion(0, 0, numEltsPerPhase, 1);
3292 m_encoder->Copy(dst, src);
3293 m_encoder->Push();
3294 }
3295
3296 return;
3297 }
3298
3299 void EmitPass::emitGetMessagePhaseType(llvm::GenIntrinsicInst* inst, VISA_Type type, uint32_t width) {
3300 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3301 unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3302 unsigned int phaseSubindex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3303
3304 IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3305
3306 CVariable* messagePhaseElem = m_currShader->GetNewAlias(messagePhases, type, phaseIndex * getGRFSize(), 1);
3307
3308 m_encoder->SetNoMask();
3309 m_encoder->SetSrcRegion(0, 0, width, 1);
3310 m_encoder->SetSrcSubReg(0, phaseSubindex);
3311
3312 m_encoder->Copy(m_destination, messagePhaseElem);
3313 m_encoder->Push();
3314 }
3315
3316 void EmitPass::emitGetMessagePhaseX(llvm::GenIntrinsicInst* inst) {
3317 unsigned size = inst->getType()->getScalarSizeInBits() / 8;
3318 emitGetMessagePhaseType(inst, GetTypeFromSize(size), /* width */ 1);
3319 }
3320
3321 void EmitPass::emitSetMessagePhaseType_legacy(GenIntrinsicInst* inst, VISA_Type type)
3322 {
3323 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3324 unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3325 unsigned int phaseSubindex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3326 CVariable* val = GetSymbol(inst->getArgOperand(3));
3327
3328 CVariable* messagePhaseElem = m_currShader->GetNewAlias(messagePhases, type, phaseIndex * getGRFSize(), 1);
3329 m_encoder->SetSimdSize(SIMDMode::SIMD1);
3330 m_encoder->SetNoMask();
3331 m_encoder->SetSrcRegion(0, 0, 1, 0);
3332 m_encoder->SetDstSubReg(phaseSubindex);
3333 m_encoder->Copy(messagePhaseElem, val);
3334 m_encoder->Push();
3335 }
3336
3337 void EmitPass::emitSetMessagePhaseType(GenIntrinsicInst* inst, VISA_Type type) {
3338 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3339 unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3340 unsigned int phaseSubindex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue());
3341 CVariable* val = GetSymbol(inst->getArgOperand(3));
3342
3343 IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3344
3345 if (!SameVar(m_destination, messagePhases))
3346 {
3347 emitCopyAll(m_destination, messagePhases, inst->getArgOperand(0)->getType());
3348 }
3349
3350 CVariable* messagePhaseElem = m_currShader->GetNewAlias(m_destination, type, phaseIndex * getGRFSize(), 1);
3351 m_encoder->SetSimdSize(SIMDMode::SIMD1);
3352 m_encoder->SetNoMask();
3353 m_encoder->SetSrcRegion(0, 0, 1, 0);
3354 m_encoder->SetDstSubReg(phaseSubindex);
3355 m_encoder->Copy(messagePhaseElem, val);
3356 m_encoder->Push();
3357 }
3358
3359 void EmitPass::emitSetMessagePhaseX_legacy(GenIntrinsicInst* inst)
3360 {
3361 Type* pTy = inst->getArgOperand(inst->getNumArgOperands() - 1)->getType();
3362 unsigned size = pTy->getScalarSizeInBits() / 8;
3363 emitSetMessagePhaseType_legacy(inst, GetTypeFromSize(size));
3364 }
3365
3366 void EmitPass::emitSetMessagePhaseX(GenIntrinsicInst* inst) {
3367 Type* pTy = inst->getArgOperand(inst->getNumArgOperands() - 1)->getType();
3368 unsigned size = pTy->getScalarSizeInBits() / 8;
3369 emitSetMessagePhaseType(inst, GetTypeFromSize(size));
3370 }
3371
3372 void EmitPass::emitGetMessagePhase(llvm::GenIntrinsicInst* inst) {
3373 if (isa<UndefValue>(inst->getArgOperand(0)))
3374 return;
3375
3376 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3377 unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3378
3379 IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3380
3381 CVariable* messagePhase = m_currShader->GetNewAlias(messagePhases, ISA_TYPE_UD, phaseIndex * getGRFSize(), 8);
3382 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3383 m_encoder->SetNoMask();
3384 m_encoder->Copy(m_destination, messagePhase);
3385 m_encoder->Push();
3386 }
3387
3388 void EmitPass::emitSetMessagePhase_legacy(llvm::GenIntrinsicInst* inst)
3389 {
3390 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3391 unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3392 CVariable* val = GetSymbol(inst->getArgOperand(2));
3393
3394 CVariable* messagePhase = m_currShader->GetNewAlias(messagePhases, ISA_TYPE_UD, phaseIndex * getGRFSize(), 8);
3395 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3396 m_encoder->SetNoMask();
3397 m_encoder->Copy(messagePhase, val);
3398 m_encoder->Push();
3399 }
3400
3401 void EmitPass::emitSetMessagePhase(llvm::GenIntrinsicInst* inst) {
3402 CVariable* messagePhases = GetSymbol(inst->getArgOperand(0));
3403 unsigned int phaseIndex = int_cast<unsigned>(cast<ConstantInt>(inst->getArgOperand(1))->getZExtValue());
3404 CVariable* val = GetSymbol(inst->getArgOperand(2));
3405
3406 IGC_ASSERT_MESSAGE(phaseIndex * getGRFSize() < messagePhases->GetSize(), "out of bounds!");
3407
3408 if (!SameVar(m_destination, messagePhases))
3409 {
3410 emitCopyAll(m_destination, messagePhases, inst->getArgOperand(0)->getType());
3411 }
3412
3413 CVariable* messagePhase = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UD, phaseIndex * getGRFSize(), 8);
3414 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3415 m_encoder->SetNoMask();
3416 m_encoder->Copy(messagePhase, val);
3417 m_encoder->Push();
3418 }
3419
3420 // VA
3421 void EmitPass::emitVideoAnalyticSLM(llvm::GenIntrinsicInst* inst, const DWORD responseLen)
3422 {
3423 int argNum = 0;
3424 CVariable* outputVar = GetSymbol(inst->getArgOperand(argNum++));
3425 CVariable* coords = GetSymbol(inst->getArgOperand(argNum++));
3426 CVariable* size = NULL;
3427
3428 IGC_ASSERT_MESSAGE(!(m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_encoder->IsSecondHalf()), "VA Intrinsics are simd independent");
3429 GenISAIntrinsic::ID id = inst->getIntrinsicID();
3430 if (id == GenISAIntrinsic::GenISA_vaCentroid ||
3431 id == GenISAIntrinsic::GenISA_vaBoolCentroid ||
3432 id == GenISAIntrinsic::GenISA_vaBoolSum)
3433 {
3434 size = GetSymbol(inst->getArgOperand(argNum++));
3435 }
3436
3437 CVariable* srcImg = GetSymbol(inst->getArgOperand(argNum++));
3438
3439 // So far we support only one VA function per kernel, and other sample
3440 // messages are not supported when there is a VA function within the kernel.
3441 // So, for now it should be fine to always use sampler 0 for VA functions.
3442 DWORD samplerIndex = 0;
3443 CVariable* sampler = m_currShader->ImmToVariable(samplerIndex, ISA_TYPE_UD);
3444
3445 uint16_t newNumElems = int_cast<uint16_t>(responseLen * getGRFSize() / SIZE_DWORD);
3446
3447 CVariable* vaResult = m_currShader->GetNewVariable(
3448 newNumElems,
3449 ISA_TYPE_UD,
3450 outputVar->GetAlign(),
3451 false,
3452 CName::NONE);
3453
3454 if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_vaConvolve)
3455 {
3456 CVariable* convResult = m_currShader->GetNewAlias(
3457 vaResult,
3458 ISA_TYPE_UW,
3459 0,
3460 newNumElems * 2);
3461
3462 m_encoder->SendVideoAnalytic(inst, convResult, coords, size, srcImg, sampler);
3463 }
3464 else
3465 {
3466 m_encoder->SendVideoAnalytic(inst, vaResult, coords, size, srcImg, sampler);
3467 }
3468 m_encoder->Push();
3469
3470 // Data port write msg header:
3471 DWORD msgLen = 2;
3472 DWORD resLen = 0;
3473 bool headerPresent = false;
3474 bool endOfThread = false;
3475 DWORD messageSpecificControl = encodeMessageSpecificControlForReadWrite(
3476 EU_DATA_PORT_WRITE_MESSAGE_TYPE_UNTYPED_SURFACE_WRITE,
3477 CHANNEL_MASK_R,
3478 SIMDMode::SIMD8);
3479 bool invalidateAfterReadEnable = false;
3480 DWORD btiIndex = SLM_BTI;
3481
3482 DWORD descValue = DataPortWrite(
3483 msgLen,
3484 resLen,
3485 headerPresent,
3486 endOfThread,
3487 EU_DATA_PORT_WRITE_MESSAGE_TYPE_UNTYPED_SURFACE_WRITE,
3488 messageSpecificControl,
3489 invalidateAfterReadEnable,
3490 btiIndex);
3491
3492 DWORD exDescValue = EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1;
3493
3494 CVariable* desc = m_currShader->ImmToVariable(descValue, ISA_TYPE_UD);
3495 CVariable* exdesc = m_currShader->ImmToVariable(exDescValue, ISA_TYPE_UD);
3496
3497 CVariable* storeMessage = m_currShader->GetNewVariable(
3498 2 * getGRFSize() / SIZE_DWORD,
3499 ISA_TYPE_UD,
3500 outputVar->GetAlign(),
3501 false,
3502 CName::NONE);
3503
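// Build per-lane SLM dword offsets: casting the packed vector immediate
// 0x76543210 (type V) yields lane indices 0..7, and the shift left by 2
// scales each lane index to a byte offset (lane * sizeof(DWORD)).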
3504 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3505 m_encoder->SetNoMask();
3506 m_encoder->Cast(storeMessage, m_currShader->ImmToVariable(0x76543210, ISA_TYPE_V));
3507 m_encoder->Shl(storeMessage, storeMessage, m_currShader->ImmToVariable(2, ISA_TYPE_UD));
3508 m_encoder->Push();
3509
3510 for (DWORD i = 0; i < responseLen; i++)
3511 {
3512 if (i > 0)
3513 {
3514 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3515 m_encoder->SetNoMask();
3516 m_encoder->Add(storeMessage, storeMessage, m_currShader->ImmToVariable(0x20, ISA_TYPE_UD));
3517 m_encoder->Push();
3518 }
3519
3520 m_encoder->SetSimdSize(SIMDMode::SIMD8);
3521 m_encoder->SetNoMask();
3522 m_encoder->SetDstSubVar(1);
3523 m_encoder->SetSrcSubVar(0, i);
3524 m_encoder->Copy(storeMessage, vaResult);
3525
3526 m_encoder->Send(NULL, storeMessage,
3527 EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, exdesc, desc, false);
3528 m_encoder->Push();
3529 }
3530
3531 return;
3532 }
3533
3534 void EmitPass::emitVideoAnalyticGRF(llvm::GenIntrinsicInst* inst, const DWORD responseLen)
3535 {
3536 CVariable* dst = m_destination;
3537 int argNum = 0;
3538 CVariable* coords = GetSymbol(inst->getArgOperand(argNum++));
3539
3540 // So far we support only one VA function per kernel, and other sample
3541 // messages are not supported when there is a VA function within the kernel.
3542 // So, for now it should be fine to always use sampler 0 for VA functions.
3543 CVariable* sampler = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
3544 CVariable* srcImg = GetSymbol(inst->getArgOperand(argNum++));
3545
3546 m_encoder->SendVideoAnalytic(inst, dst, coords, nullptr, srcImg, sampler);
3547 m_encoder->Push();
3548 }
3549
3550 void EmitPass::BinaryUnary(llvm::Instruction* inst, const SSource source[2], const DstModifier& modifier)
3551 {
3552 switch (inst->getOpcode())
3553 {
3554 case Instruction::FCmp:
3555 case Instruction::ICmp:
3556 Cmp(cast<CmpInst>(inst)->getPredicate(), source, modifier);
3557 break;
3558 case Instruction::Sub:
3559 case Instruction::FSub:
3560 Sub(source, modifier);
3561 break;
3562 case Instruction::FDiv:
3563 FDiv(source, modifier);
3564 break;
3565 case Instruction::Xor:
3566 Xor(source, modifier);
3567 break;
3568 case Instruction::Mul:
3569 Mul(source, modifier);
3570 break;
3571 case Instruction::Call:
3572 EmitAluIntrinsic(cast<CallInst>(inst), source, modifier);
3573 break;
3574 default:
3575 // other instructions don't need special handling
3576 EmitSimpleAlu(inst, source, modifier);
3577 break;
3578 }
3579 }
3580
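// Emit src0 - src1 as an add, folding a negate modifier into source 1.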
3581 void EmitPass::Sub(const SSource sources[2], const DstModifier& modifier)
3582 {
3583 CVariable* src0 = GetSrcVariable(sources[0]);
3584 CVariable* src1 = GetSrcVariable(sources[1]);
3585 e_modifier mod1 = CombineModifier(EMOD_NEG, sources[1].mod);
3586
3587 m_encoder->SetDstModifier(modifier);
3588 SetSourceModifiers(0, sources[0]);
3589 SetSourceModifiers(1, sources[1]);
3590 // override modifier of source 1
3591 m_encoder->SetSrcModifier(1, mod1);
3592 m_encoder->Add(m_destination, src0, src1);
3593 m_encoder->Push();
3594
3595 }
3596
3597 void EmitPass::Mul64(CVariable* dst, CVariable* src[2], SIMDMode simdMode, bool noMask) const
3598 {
3599 auto EncoderInit = [this, simdMode, noMask]()->void
3600 {
3601 m_encoder->SetSimdSize(simdMode);
3602 if (noMask)
3603 {
3604 m_encoder->SetNoMask();
3605 }
3606 };
3607
3608 // Mul64 does not write to m_destination!
3609
3610 IGC_ASSERT_MESSAGE((src[1]->GetType() == ISA_TYPE_Q) || (src[1]->GetType() == ISA_TYPE_UQ),
3611 "Cannot multiply a qword by a non-qword type");
3612
3613 // The signedness of the hi-part type should be the same as that
3614 // of the original destination type.
3615 VISA_Type hiType;
3616 if (dst->GetType() == ISA_TYPE_Q)
3617 hiType = ISA_TYPE_D;
3618 else
3619 hiType = ISA_TYPE_UD;
3620
3621 // Figure out what the hi and what the lo part of each source is.
3622 // For non-uniforms, this requires an unpack.
3623 CVariable* srcLo[2], * srcHi[2];
3624 for (int i = 0; i < 2; ++i)
3625 {
3626 CVariable* srcAsUD;
3627 if (src[i]->IsUniform())
3628 {
3629 if (src[i]->IsImmediate())
3630 {
3631 srcLo[i] = m_currShader->ImmToVariable((uint)src[i]->GetImmediateValue(), ISA_TYPE_UD);
3632 srcHi[i] = m_currShader->ImmToVariable(src[i]->GetImmediateValue() >> 32, hiType);
3633 }
3634 else
3635 {
3636 srcAsUD = m_currShader->BitCast(src[i], ISA_TYPE_UD);
3637 srcLo[i] = m_currShader->GetNewAlias(srcAsUD, ISA_TYPE_UD, 0, 1);
3638 srcHi[i] = m_currShader->GetNewAlias(srcAsUD, hiType, SIZE_DWORD, 1);
3639 }
3640 }
3641 else
3642 {
3643 srcAsUD = m_currShader->BitCast(src[i], ISA_TYPE_UD);
3644 //TODO: Would it be better for these two to be consecutive?
3645 srcLo[i] = m_currShader->GetNewVariable(
3646 src[i]->GetNumberElement(),
3647 ISA_TYPE_UD, EALIGN_GRF, false,
3648 CName(src[i]->getName(), i == 0 ? "Lo0" : "Lo1"));
3649 srcHi[i] = m_currShader->GetNewVariable(src[i]->GetNumberElement(),
3650 hiType, EALIGN_GRF, false,
3651 CName(src[i]->getName(), i == 0 ? "Hi0" : "Hi1"));
3652 EncoderInit();
3653 m_encoder->SetSrcRegion(0, 2, 1, 0);
3654 m_encoder->Copy(srcLo[i], srcAsUD);
3655 m_encoder->Push();
3656
3657 EncoderInit();
3658 m_encoder->SetSrcSubReg(0, 1);
3659 m_encoder->SetSrcRegion(0, 2, 1, 0);
3660 m_encoder->Copy(srcHi[i], srcAsUD);
3661 m_encoder->Push();
3662
3663 }
3664 }
3665
3666 //Now, generate the required sequence of multiplies and adds
3667 TODO("Do not generate intermediate multiplies by constant 0 or 1.");
3668 TODO("Do smarter pattern matching to look for non-constant zexted/sexted sources.");
3669
3670 CVariable* dstLo, * dstHi, * dstHiTemp;
3671 dstLo = m_currShader->GetNewVariable(dst->GetNumberElement(),
3672 ISA_TYPE_UD, m_destination->GetAlign(), dst->IsUniform(),
3673 CName(m_destination->getName(), "int64Lo"));
3674 dstHi = m_currShader->GetNewVariable(dst->GetNumberElement(),
3675 hiType, m_destination->GetAlign(), dst->IsUniform(),
3676 CName(m_destination->getName(), "int64Hi"));
3677 dstHiTemp = m_currShader->GetNewVariable(dst->GetNumberElement(),
3678 hiType, m_destination->GetAlign(), dst->IsUniform(),
3679 CName(m_destination->getName(), "int64HiTmp"));
3680
3681
3682 //
3683 // Algorithm:
3684 // - Break the 64 bit sources into 32bit low/high halves.
3685 // - Perform multiplication "by hand"
3686 //
3687 // AB - srcLo[0], srcLo[1]
3688 // CD - srcHi[0], srcHi[1]
3689 // ----
3690 // E
3691 // F
3692 // G
3693 // H - 'H' spills into bit 65 - only needed if overflow detection is required
3694 // --------
3695 // dstLow = E
3696 // dstHigh = F + G + carry
3697
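// Conceptually, with A = srcLo[0], C = srcHi[0], B = srcLo[1], D = srcHi[1]:
//   src0 * src1 = (C*2^32 + A) * (D*2^32 + B)
//               = A*B + (A*D + C*B)*2^32 + C*D*2^64
// The C*D term only affects bits >= 64 and is dropped; E and Cr are the low
// and high dwords of A*B, and F = A*D, G = C*B as labeled above.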
3698 // For platforms that do not natively support DW-DW multiply, use the vISA madw instruction instead of mul/mulh to get better performance.
3699 if (m_currShader->m_Platform->noNativeDwordMulSupport())
3700 {
3701 // (Cr, E) = A * B
3702 EncoderInit();
3703 // dst size should be GRF-aligned and doubled, since it holds both the low and the high results.
3704 // We must make sure the dst element count is aligned to numDWPerGRF. For example, if the madw is SIMD1,
3705 // the dst has only 1 DW of low result in one GRF and only 1 DW of high result in another GRF, so the
3706 // dst must be sized as (numDWPerGRF * 2) elements, not 2 DW elements. This is required by madw.
3707 auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
3708 auto numElements = iSTD::Align(dst->GetNumberElement(), numDWPerGRF);
3709 CVariable* dstTmp = m_currShader->GetNewVariable(
3710 numElements * 2, ISA_TYPE_UD, EALIGN_GRF, dst->IsUniform(),
3711 CName(m_destination->getName(), "int64Tmp"));
3712 CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
3713 m_encoder->Madw(dstTmp, srcLo[0], srcLo[1], zero);
3714
3715 // copy low of A*B to dstLo
3716 EncoderInit();
3717 m_encoder->SetSrcRegion(0, 1, 1, 0);
3718 m_encoder->Copy(dstLo, dstTmp);
3719 m_encoder->Push();
3720
3721 // copy high of A*B to dstHi
3722 EncoderInit();
3723 uint regOffset = (uint)std::ceil((float)(numElements * CEncoder::GetCISADataTypeSize(ISA_TYPE_UD)) / getGRFSize());
3724 m_encoder->SetSrcSubVar(0, regOffset);
3725 m_encoder->SetSrcRegion(0, 1, 1, 0);
3726 m_encoder->Copy(dstHi, dstTmp);
3727 m_encoder->Push();
3728 }
3729 else
3730 {
3731 // E = A * B
3732 EncoderInit();
3733 m_encoder->Mul(dstLo, srcLo[0], srcLo[1]);
3734 m_encoder->Push();
3735
3736 // Cr = carry(A * B)
3737 EncoderInit();
3738 m_encoder->MulH(dstHi, srcLo[0], srcLo[1]);
3739 m_encoder->Push();
3740 }
3741
3742 // F = A * D
3743 EncoderInit();
3744 m_encoder->Mul(dstHiTemp, srcLo[0], srcHi[1]);
3745 m_encoder->Push();
3746
3747 // dstHigh = Cr + F
3748 EncoderInit();
3749 m_encoder->Add(dstHi, dstHi, dstHiTemp);
3750 m_encoder->Push();
3751
3752 // G = C * B
3753 EncoderInit();
3754 m_encoder->Mul(dstHiTemp, srcHi[0], srcLo[1]);
3755 m_encoder->Push();
3756
3757 // dstHigh = (Cr + F) + G
3758 EncoderInit();
3759 m_encoder->Add(dstHi, dstHi, dstHiTemp);
3760 m_encoder->Push();
3761
3762 //And now, pack the result
3763 CVariable* dstAsUD = m_currShader->BitCast(dst, ISA_TYPE_UD);
3764 EncoderInit();
3765 m_encoder->SetDstRegion(2);
3766 m_encoder->Copy(dstAsUD, dstLo);
3767 m_encoder->Push();
3768
3769 EncoderInit();
3770 m_encoder->SetDstRegion(2);
3771 m_encoder->SetDstSubReg(1);
3772 m_encoder->Copy(dstAsUD, dstHi);
3773 m_encoder->Push();
3774 }
3775
3776 void EmitPass::Mul(const SSource sources[2], const DstModifier& modifier)
3777 {
3778 CVariable* src[2];
3779 for (int i = 0; i < 2; ++i)
3780 {
3781 src[i] = GetSrcVariable(sources[i]);
3782 }
3783
3784 // Only i64 muls need special handling, otherwise go back to standard flow
3785 VISA_Type srcType = src[0]->GetType();
3786 if (srcType != ISA_TYPE_Q && srcType != ISA_TYPE_UQ)
3787 {
3788 Binary(EOPCODE_MUL, sources, modifier);
3789 }
3790 else
3791 {
3792 Mul64(m_destination, src, m_currShader->m_SIMDSize);
3793 }
3794 }
3795
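// 1.0 / x is strength-reduced to the reciprocal math op instead of a divide.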
3796 void EmitPass::FDiv(const SSource sources[2], const DstModifier& modifier)
3797 {
3798 if (isOne(sources[0].value))
3799 {
3800 Unary(EOPCODE_INV, &sources[1], modifier);
3801 }
3802 else
3803 {
3804 Binary(EOPCODE_DIV, sources, modifier);
3805 }
3806 }
3807
3808 static inline bool isConstantAllOnes(const Value* V)
3809 {
3810 if (const Constant * C = dyn_cast<Constant>(V))
3811 return C->isAllOnesValue();
3812 return false;
3813 }
3814
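// xor with an all-ones constant is emitted as a single NOT of the other operand.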
3815 void EmitPass::Xor(const SSource sources[2], const DstModifier& modifier)
3816 {
3817 if (isConstantAllOnes(sources[0].value))
3818 {
3819 Unary(EOPCODE_NOT, &sources[1], modifier);
3820 }
3821 else if (isConstantAllOnes(sources[1].value))
3822 {
3823 Unary(EOPCODE_NOT, &sources[0], modifier);
3824 }
3825 else
3826 {
3827 Binary(EOPCODE_XOR, sources, modifier);
3828 }
3829 }
3830
3831 void EmitPass::Cmp(llvm::CmpInst::Predicate pred, const SSource sources[2], const DstModifier& modifier)
3832 {
3833 IGC_ASSERT(modifier.sat == false);
3834 IGC_ASSERT(modifier.flag == nullptr);
3835 IGC_ASSERT(nullptr != m_destination);
3836
3837 e_predicate predicate = GetPredicate(pred);
3838
3839 CVariable* src0 = GetSrcVariable(sources[0], sources[0].fromConstantPool);
3840 CVariable* src1 = GetSrcVariable(sources[1], sources[1].fromConstantPool);
3841
3842 if (IsUnsignedCmp(pred))
3843 {
3844 src0 = m_currShader->BitCast(src0, GetUnsignedType(src0->GetType()));
3845 src1 = m_currShader->BitCast(src1, GetUnsignedType(src1->GetType()));
3846 }
3847 else if (IsSignedCmp(pred))
3848 {
3849 src0 = m_currShader->BitCast(src0, GetSignedType(src0->GetType()));
3850 src1 = m_currShader->BitCast(src1, GetSignedType(src1->GetType()));
3851 }
3852
3853 CVariable* dst = m_destination;
3854 if (m_destination->GetType() != ISA_TYPE_BOOL && dst->GetType() != src0->GetType())
3855 {
3856 IGC_ASSERT_MESSAGE(CEncoder::GetCISADataTypeSize(dst->GetType()) == CEncoder::GetCISADataTypeSize(src0->GetType()),
3857 "Cmp to GRF must have the same size for source and destination");
3858 dst = m_currShader->BitCast(m_destination, src0->GetType());
3859 }
3860
3861 SetSourceModifiers(0, sources[0]);
3862 SetSourceModifiers(1, sources[1]);
3863 m_encoder->Cmp(predicate, dst, src0, src1);
3864 m_encoder->Push();
3865 }
3866
3867 void EmitPass::Frc(const SSource& source, const DstModifier& modifier)
3868 {
3869 Unary(EOPCODE_FRC, &source, modifier);
3870 }
3871
3872 void EmitPass::Floor(const SSource& source, const DstModifier& modifier)
3873 {
3874 Unary(EOPCODE_RNDD, &source, modifier);
3875 }
3876
3877 void EmitPass::Mov(const SSource& source, const DstModifier& modifier)
3878 {
3879 Unary(EOPCODE_MOV, &source, modifier);
3880 }
3881
3882 void EmitPass::Rsqrt(const SSource& source, const DstModifier& modifier)
3883 {
3884 Unary(EOPCODE_RSQRT, &source, modifier);
3885 }
3886
3887 void EmitPass::Sqrt(const SSource& source, const DstModifier& modifier)
3888 {
3889 Unary(EOPCODE_SQRT, &source, modifier);
3890 }
3891
3892 void EmitPass::Mad(const SSource sources[3], const DstModifier& modifier)
3893 {
3894 Tenary(EOPCODE_MAD, sources, modifier);
3895 }
3896
3897 void EmitPass::Lrp(const SSource sources[3], const DstModifier& modifier)
3898 {
3899 Tenary(EOPCODE_LRP, sources, modifier);
3900 }
3901
3902 void EmitPass::Pow(const SSource sources[2], const DstModifier& modifier)
3903 {
3904 Binary(EOPCODE_POW, sources, modifier);
3905 }
3906
3907 void EmitPass::Avg(const SSource sources[2], const DstModifier& modifier)
3908 {
3909 Binary(EOPCODE_AVG, sources, modifier);
3910 }
3911
3912 void EmitPass::Tenary(e_opcode opCode, const SSource sources[3], const DstModifier& modifier)
3913 {
3914 Alu<3>(opCode, sources, modifier);
3915 }
3916
3917 void EmitPass::Binary(e_opcode opCode, const SSource sources[2], const DstModifier& modifier)
3918 {
3919 Alu<2>(opCode, sources, modifier);
3920 }
3921
3922 void EmitPass::Unary(e_opcode opCode, const SSource sources[1], const DstModifier& modifier)
3923 {
3924 Alu<1>(opCode, sources, modifier);
3925 }
3926
3927 template<int N>
3928 void EmitPass::Alu(e_opcode opCode, const SSource sources[N], const DstModifier& modifier)
3929 {
3930
3931 CVariable* srcs[3] = { nullptr, nullptr, nullptr };
3932 for (uint i = 0; i < N; i++)
3933 {
3934 bool fromConstantPool = sources[i].fromConstantPool;
3935 srcs[i] = GetSrcVariable(sources[i], fromConstantPool);
3936 SetSourceModifiers(i, sources[i]);
3937 }
3938 m_encoder->SetDstModifier(modifier);
3939 m_encoder->GenericAlu(opCode, m_destination, srcs[0], srcs[1], srcs[2]);
3940 m_encoder->Push();
3941 }
3942
3943 void EmitPass::Bfn(uint8_t booleanFuncCtrl, const SSource sources[3], const DstModifier& modifier)
3944 {
3945 CVariable* srcs[3] = { nullptr, nullptr, nullptr };
3946 // Currently we generate BFN only when all 3 sources are present, even though
3947 // BFN itself also allows a 2-source form.
3948 for (uint i = 0; i < 3; i++)
3949 {
3950 bool fromConstantPool = sources[i].fromConstantPool;
3951 srcs[i] = GetSrcVariable(sources[i], fromConstantPool);
3952 }
3953 m_encoder->SetDstModifier(modifier);
3954 m_encoder->Bfn(booleanFuncCtrl, m_destination, srcs[0], srcs[1], srcs[2]);
3955 m_encoder->Push();
3956 }
3957
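// Fuses a compare with a boolean-function op: the compare result is written
// to a temporary GRF variable (not a flag), bitcast to the BFN operand type,
// and consumed as BFN source 0.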
3958 void EmitPass::CmpBfn(llvm::CmpInst::Predicate predicate, const SSource cmpSources[2], uint8_t booleanFuncCtrl,
3959 const SSource bfnSources[3], const DstModifier& modifier)
3960 {
3961 // Cmp
3962 e_predicate pred = GetPredicate(predicate);
3963 CVariable* cmpSrc0 = GetSrcVariable(cmpSources[0]);
3964 CVariable* cmpSrc1 = GetSrcVariable(cmpSources[1]);
3965 CVariable* cmpDst = m_currShader->GetNewVariable(m_destination);
3966
3967 if (IsUnsignedCmp(predicate))
3968 {
3969 cmpSrc0 = m_currShader->BitCast(cmpSrc0, GetUnsignedType(cmpSrc0->GetType()));
3970 cmpSrc1 = m_currShader->BitCast(cmpSrc1, GetUnsignedType(cmpSrc1->GetType()));
3971 }
3972 else if (IsSignedCmp(predicate))
3973 {
3974 cmpSrc0 = m_currShader->BitCast(cmpSrc0, GetSignedType(cmpSrc0->GetType()));
3975 cmpSrc1 = m_currShader->BitCast(cmpSrc1, GetSignedType(cmpSrc1->GetType()));
3976 }
3977
3978 if (cmpDst->GetType() != cmpSrc0->GetType())
3979 {
3980 cmpDst = m_currShader->BitCast(cmpDst, cmpSrc0->GetType());
3981 }
3982
3983 SetSourceModifiers(0, cmpSources[0]);
3984 SetSourceModifiers(1, cmpSources[1]);
3985 m_encoder->Cmp(pred, cmpDst, cmpSrc0, cmpSrc1);
3986 m_encoder->Push();
3987
3988 // BFN
3989 CVariable* bfnSrc1 = GetSrcVariable(bfnSources[1], bfnSources[1].fromConstantPool);
3990 CVariable* bfnSrc2 = GetSrcVariable(bfnSources[2], bfnSources[2].fromConstantPool);
3991 if (cmpDst->GetType() != bfnSrc1->GetType())
3992 {
3993 cmpDst = m_currShader->BitCast(cmpDst, bfnSrc1->GetType());
3994 }
3995 m_encoder->Bfn(booleanFuncCtrl, m_destination, cmpDst, bfnSrc1, bfnSrc2);
3996 m_encoder->Push();
3997 }
3998
3999 void EmitPass::Select(const SSource sources[3], const DstModifier& modifier)
4000 {
4001 IGC_ASSERT(modifier.flag == nullptr);
4002 IGC_ASSERT(sources[0].mod == EMOD_NONE);
4003
4004 CVariable* flag = GetSrcVariable(sources[0]);
4005
4006 bool fromConstantPool = sources[1].fromConstantPool;
4007 CVariable* src0 = GetSrcVariable(sources[1], fromConstantPool);
4008
4009 fromConstantPool = sources[2].fromConstantPool;
4010 CVariable* src1 = GetSrcVariable(sources[2], fromConstantPool);
4011
4012 SetSourceModifiers(0, sources[1]);
4013 SetSourceModifiers(1, sources[2]);
4014 m_encoder->SetDstModifier(modifier);
4015 m_encoder->SetPredicateMode(modifier.predMode);
4016
4017 m_encoder->Select(flag, m_destination, src0, src1);
4018 m_encoder->Push();
4019
4020 }
4021
4022 void EmitPass::PredAdd(const SSource& pred, bool invert, const SSource sources[2], const DstModifier& modifier)
4023 {
4024 IGC_ASSERT(modifier.flag == nullptr);
4025 CVariable* flag = GetSrcVariable(pred);
4026 CVariable* src0 = GetSrcVariable(sources[0]);
4027 CVariable* src1 = GetSrcVariable(sources[1]);
4028
4029 // base condition
4030 SetSourceModifiers(0, sources[0]);
4031 m_encoder->Copy(m_destination, src0);
4032 m_encoder->Push();
4033
4034 // predicate add
4035 SetSourceModifiers(1, sources[1]);
4036 m_encoder->SetDstModifier(modifier);
4037 m_encoder->SetPredicateMode(modifier.predMode);
4038 m_encoder->SetInversePredicate(invert);
4039 m_encoder->PredAdd(flag, m_destination, m_destination, src1);
4040 m_encoder->Push();
4041 }
4042
4043 void EmitPass::emitOutput(llvm::GenIntrinsicInst* inst)
4044 {
4045 ShaderOutputType outputType =
4046 (ShaderOutputType)llvm::cast<llvm::ConstantInt>(inst->getOperand(4))->getZExtValue();
4047 if (outputType == SHADER_OUTPUT_TYPE_OMASK)
4048 {
4049 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4050 IGC_ASSERT_MESSAGE(psProgram->GetPhase() == PSPHASE_COARSE,
4051 "oMask intrinsics should be left only for coarse phase");
4052 if (!psProgram->IsLastPhase())
4053 {
4054 CVariable* oMask = GetSymbol(inst->getOperand(0));
4055 CVariable* temp =
4056 m_currShader->GetNewVariable(numLanes(m_SimdMode), oMask->GetType(), EALIGN_GRF, inst->getName());
4057 m_encoder->Copy(temp, oMask);
4058 oMask = temp;
4059 psProgram->SetCoarseoMask(oMask);
4060 }
4061 }
4062 else
4063 {
4064 IGC_ASSERT_MESSAGE(0, "output not supported");
4065 }
4066 }
4067
4068
4069 void EmitPass::emitPSInputMADHalf(llvm::Instruction* inst)
4070 {
4071 //create the payload and do interpolation
4072 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4073 uint setupIndex = 0;
4074 e_interpolation mode;
4075
4076 setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4077 mode = (e_interpolation)llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
4078 CVariable* baryVar = nullptr;
4079
4080
4081 // mov SIMD4 deltas into tmp
4082 // mov (4) r0.0<1>:hf r15.0<4;4,1>:f {Align1, Q1, NoMask} // #??:$26:%29
4083 CVariable* tmpDeltaDst = nullptr;
4084
4085 //inputVar
4086 /*
4087 For SIMD8 mode we generate mixed-mode instructions,
4088 so we won't generate a down conversion
4089 for the deltas.
4090 */
4091 if (psProgram->LowerPSInput())
4092 {
4093 tmpDeltaDst = psProgram->GetInputDeltaLowered(setupIndex);
4094 baryVar = psProgram->GetBaryRegLoweredHalf(mode);
4095 }
4096 else
4097 {
4098 tmpDeltaDst = psProgram->GetInputDelta(setupIndex);
4099 baryVar = psProgram->GetBaryReg(mode);
4100 }
4101 ContextSwitchPayloadSection();
4102 //dst:hf = src1 * src0 + src3
4103 //dst = p * u + r
4104 //mad (16) r20.0.xyzw:hf r0.3.r:hf r0.0.r:hf r12.0.xyzw:hf {Align16, H1} // #??:$31:%209
4105 m_encoder->SetSrcSubReg(1, 0);
4106 m_encoder->SetSrcSubReg(2, 3);
4107 m_encoder->Mad(m_destination, baryVar, tmpDeltaDst, tmpDeltaDst);
4108 m_encoder->Push();
4109
4110 //dst:hf = src1 * src0 + src3
4111 //dst = q * v + dst
4112 //mad(16) r20.0.xyzw:hf r20.0.xyzw : hf r0.1.r : hf r18.0.xyzw : hf{ Align16, H1 } // #??:$32:%210
4113 //if we are down-converting, the bary coordinate values will be packed
4114 m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize));
4115 m_encoder->SetSrcSubReg(1, 1);
4116
4117 m_encoder->Mad(m_destination, baryVar, tmpDeltaDst, m_destination);
4118 m_encoder->Push();
4119 ContextSwitchShaderBody();
4120 }
4121
4122 void EmitPass::emitPSInputCst(llvm::Instruction* inst)
4123 {
4124 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4125 unsigned int inputIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4126 psProgram->MarkConstantInterpolation(inputIndex);
4127 unsigned int setupIndex = psProgram->getSetupIndex(inputIndex);
4128 CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4129 // temp variable should be the same type as the destination
4130 // This is where we have MOV for payload
4131 ContextSwitchPayloadSection();
4132 {
4133 // A0 vertex data are in Rp.{3 + 4*n}
4134 m_encoder->SetSrcRegion(0, 0, 1, 0);
4135 m_encoder->SetSrcSubReg(0, 3);
4136 m_encoder->Cast(m_destination, inputVar);
4137 m_encoder->Push();
4138 }
4139
4140 ContextSwitchShaderBody();
4141 }
4142
4143
4144 void EmitPass::emitPSInput(llvm::Instruction* inst)
4145 {
4146 e_interpolation mode = (e_interpolation)llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
4147 if (mode == EINTERPOLATION_CONSTANT)
4148 {
4149 emitPSInputCst(inst);
4150 }
4151 else if (inst->getType()->isHalfTy()
4152 )
4153 {
4154 emitPSInputMADHalf(inst);
4155 }
4156 else
4157 {
4158 emitPSInputPln(inst);
4159 }
4160 }
4161
4162 void EmitPass::emitPlnInterpolation(CVariable* baryVar, CVariable* inputvar)
4163 {
4164 unsigned int numPln = 1;
4165
4166 for (unsigned int i = 0; i < numPln; i++)
4167 {
4168 // plane will access 4 operands
4169 m_encoder->SetSrcRegion(0, 0, 4, 1);
4170 m_encoder->Pln(m_destination, inputvar, baryVar);
4171 m_encoder->Push();
4172 }
4173 }
4174
4175 void EmitPass::emitPSInputPln(llvm::Instruction* inst)
4176 {
4177 //create the payload and do interpolation
4178 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4179 uint setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4180 // temp variable should be the same type as the destination
4181 CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4182 e_interpolation mode = (e_interpolation)llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
4183 // need to do interpolation unless we do constant interpolation
4184 CVariable* baryVar = psProgram->GetBaryReg(mode);
4185
4186 ContextSwitchPayloadSection();
4187 emitPlnInterpolation(baryVar, inputVar);
4188 ContextSwitchShaderBody();
4189 }
4190
4191 void EmitPass::emitEvalAttribute(llvm::GenIntrinsicInst* inst)
4192 {
4193 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4194 // temp variable should be the same type as the destination
4195 bool perspective = cast<ConstantInt>(inst->getOperand(inst->getNumArgOperands() - 1))->getZExtValue() != 0;
4196 EU_PIXEL_INTERPOLATOR_INTERPOLATION_MODE interpolationMode =
4197 perspective ? EU_PI_MESSAGE_PERSPECTIVE_INTERPOLATION : EU_PI_MESSAGE_LINEAR_INTERPOLATION;
4198 if (interpolationMode == EU_PI_MESSAGE_LINEAR_INTERPOLATION)
4199 {
4200 // workaround driver interface; tell the driver we use noperspective barys to turn on noperspective interpolation
4201 psProgram->GetBaryReg(EINTERPOLATION_LINEARNOPERSPECTIVE);
4202 }
4203 uint exDesc = EU_GEN7_MESSAGE_TARGET_PIXEL_INTERPOLATOR;
4204 EU_PIXEL_INTERPOLATOR_SIMD_MODE executionMode = pixelInterpolatorSimDMode(m_currShader->m_SIMDSize);
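// The interpolator writeback takes 2 GRFs of packed deltas per SIMD8 message
// and 4 per SIMD16 (executionMode is presumably nonzero only for SIMD16 here);
// with GRFs wider than 32 bytes the same data fits in half the registers.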
4205 uint responseLength = executionMode ? 4 : 2;
4206 if (getGRFSize() != 32)
4207 {
4208 responseLength /= 2;
4209 }
4210 uint messageLength = 1;
4211 CVariable* payload = nullptr;
4212 uint desc = 0;
4213 CVariable* messDesc = nullptr;
4214 switch (inst->getIntrinsicID())
4215 {
4216 case GenISAIntrinsic::GenISA_PullSampleIndexBarys:
4217 {
4218 payload = m_currShader->GetNewVariable(
4219 messageLength * (getGRFSize() >> 2),
4220 ISA_TYPE_D, EALIGN_GRF, inst->getName());
4221 uint sampleindex = 0;
4222 desc = PixelInterpolator(
4223 messageLength,
4224 responseLength,
4225 m_encoder->IsSecondHalf() ? 1 : 0,
4226 executionMode,
4227 EU_PI_MESSAGE_EVAL_SAMPLE_POSITION,
4228 interpolationMode,
4229 sampleindex);
4230
4231 if (ConstantInt * index = dyn_cast<ConstantInt>(inst->getOperand(0)))
4232 {
4233 sampleindex = (uint)index->getZExtValue();
4234 desc = desc | (sampleindex << 4);
4235 messDesc = psProgram->ImmToVariable(desc, ISA_TYPE_UD);
4236
4237 m_encoder->Send(m_destination, payload, exDesc, messDesc, false);
4238 m_encoder->Push();
4239 }
4240 else
4241 {
4242 ResourceDescriptor resource;
4243 CVariable* flag = nullptr;
4244 uint label;
4245 bool needLoop;
4246 CVariable* uniformId;
4247
4248 SamplerDescriptor sampler = getSampleIDVariable(inst->getOperand(0));
4249 needLoop = ResourceLoopHeader(resource, sampler, flag, label);
4250 uniformId = sampler.m_sampler;
4251
4252 messDesc = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
4253
4254 CVariable* idxShift = m_currShader->GetNewVariable(uniformId);
4255 m_encoder->Shl(idxShift, uniformId, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
4256 m_encoder->Or(messDesc, m_currShader->ImmToVariable(desc, ISA_TYPE_UD), idxShift);
4257 m_encoder->Push();
4258
4259 m_encoder->SetPredicate(flag);
4260 m_encoder->Send(m_destination, payload, exDesc, messDesc, false);
4261 m_encoder->Push();
4262
4263 ResourceLoopBackEdge(needLoop, flag, label);
4264 }
4265 }
4266 break;
4267
4268 case GenISAIntrinsic::GenISA_PullSnappedBarys:
4269 case GenISAIntrinsic::GenISA_PullCentroidBarys:
4270 {
4271 uint offsetX = 0;
4272 uint offsetY = 0;
4273 bool offsetIsConst = true;
4274 auto messageType = EU_PI_MESSAGE_EVAL_CENTROID_POSITION;
4275 auto numDWPerGRF = getGRFSize() / SIZE_DWORD;
4276 if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_PullSnappedBarys)
4277 {
4278 offsetIsConst = false;
4279 auto xCstOffset = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0));
4280 auto yCstOffset = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(1));
4281 if (xCstOffset && yCstOffset)
4282 {
4283 offsetIsConst = true;
4284 offsetX = (uint) xCstOffset->getZExtValue();
4285 offsetY = (uint) yCstOffset->getZExtValue();
4286 }
4287
4288 messageType = offsetIsConst && psProgram->GetPhase() != PSPHASE_COARSE ?
4289 EU_PI_MESSAGE_EVAL_PER_MESSAGE_OFFSET :
4290 EU_PI_MESSAGE_EVAL_PER_SLOT_OFFSET;
4291 }
4292 if (offsetIsConst && psProgram->GetPhase() != PSPHASE_COARSE)
4293 {
4294 payload = m_currShader->GetNewVariable(
4295 messageLength * numDWPerGRF, ISA_TYPE_D, EALIGN_GRF, inst->getName());
4296 desc = PixelInterpolator(
4297 messageLength,
4298 responseLength,
4299 m_encoder->IsSecondHalf() ? 1 : 0,
4300 executionMode,
4301 messageType,
4302 interpolationMode,
4303 offsetX,
4304 offsetY);
4305 }
4306 else
4307 {
4308 IGC_ASSERT(messageType != EU_PI_MESSAGE_EVAL_CENTROID_POSITION);
4309 IGC_ASSERT(numDWPerGRF);
4310
4311 messageLength = 2 * numLanes(m_currShader->m_SIMDSize) / numDWPerGRF;
4312 payload = m_currShader->GetNewVariable(
4313 messageLength * (getGRFSize() >> 2), ISA_TYPE_D, EALIGN_GRF, inst->getName());
4314 desc = PixelInterpolator(
4315 messageLength,
4316 responseLength,
4317 m_encoder->IsSecondHalf() ? 1 : 0,
4318 psProgram->GetPhase() == PSPHASE_COARSE,
4319 executionMode,
4320 messageType,
4321 interpolationMode);
4322 CVariable* XOffset = GetSymbol(inst->getOperand(0));
4323 CVariable* YOffset = GetSymbol(inst->getOperand(1));
4324 m_encoder->Copy(payload, XOffset);
4325 m_encoder->Push();
4326
4327 m_encoder->SetDstSubVar(numLanes(m_currShader->m_SIMDSize) / numDWPerGRF);
4328 m_encoder->Copy(payload, YOffset);
4329 m_encoder->Push();
4330 }
4331 messDesc = psProgram->ImmToVariable(desc, ISA_TYPE_UD);
4332 }
4333
4334 m_encoder->Send(m_destination, payload, exDesc, messDesc, false);
4335 m_encoder->Push();
4336 break;
4337
4338 default:
4339 IGC_ASSERT(0);
4340 break;
4341 }
4342 }
4343
4344 void EmitPass::emitInterpolate(llvm::GenIntrinsicInst* inst)
4345 {
4346 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4347 CVariable* barys = GetSymbol(inst->getOperand(1));
4348 uint setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4349 // temp variable should be the same type as the destination
4350 CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4351
4352 {
4353 ContextSwitchPayloadSection();
4354 emitPlnInterpolation(barys, inputVar);
4355 ContextSwitchShaderBody();
4356 }
4357 }
4358
4359 void EmitPass::emitInterpolate2(llvm::GenIntrinsicInst* inst)
4360 {
4361 CVariable* inputVar = GetSymbol(inst->getOperand(0));
4362 CVariable* barys = GetSymbol(inst->getOperand(1));
4363 emitPlnInterpolation(barys, inputVar);
4364 }
4365
4366 void EmitPass::emitInterpolant(llvm::GenIntrinsicInst* inst)
4367 {
4368 uint setupIndex = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
4369 auto psProgram = static_cast<CPixelShader*>(m_currShader);
4370 CVariable* inputVar = psProgram->GetInputDelta(setupIndex);
4371 m_encoder->SetSrcRegion(0, 4, 4, 1);
4372 m_encoder->SetSimdSize(SIMDMode::SIMD4);
4373 m_encoder->SetNoMask();
4374 m_encoder->Copy(m_destination, inputVar);
4375 m_encoder->Push();
4376 }
4377
4378 void EmitPass::emitDSInput(llvm::Instruction* pInst)
4379 {
4380 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::DOMAIN_SHADER);
4381 CVariable* dst = m_destination;
4382
4383 CDomainShader* dsProgram = static_cast<CDomainShader*>(m_currShader);
4384 // only pulled inputs reach here
4385 QuadEltUnit globalOffset(0);
4386 llvm::Value* pPayloadInputIdx = pInst->getOperand(0);
4387 llvm::ConstantInt* pConstIntPayloadVar = llvm::dyn_cast_or_null<llvm::ConstantInt>(pPayloadInputIdx);
4388 uint32_t elmIdx = 0;
4389
4390 if (pConstIntPayloadVar != nullptr)
4391 {
4392 elmIdx = int_cast<uint32_t>(cast<llvm::ConstantInt>(pConstIntPayloadVar)->getZExtValue());
4393
4394 CVariable* inputVar = dsProgram->GetInputDelta(elmIdx);
4395 if (dsProgram->GetShaderDispatchMode() == ShaderDispatchMode::DUAL_PATCH)
4396 {
4397 m_encoder->SetSrcSubReg(0, elmIdx % 4);
4398 m_encoder->SetSrcRegion(0, 4, 4, 0);
4399 }
4400 m_encoder->Copy(dst, inputVar);
4401 m_encoder->Push();
4402 }
4403 else
4404 {
4405 IGC_ASSERT_MESSAGE(0, "Only constant payload input variable index handled");
4406 }
4407 }
4408
4409 void EmitPass::emitInput(llvm::Instruction* inst)
4410 {
4411 switch (m_currShader->GetShaderType())
4412 {
4413 case ShaderType::PIXEL_SHADER:
4414 emitPSInput(inst);
4415 break;
4416 case ShaderType::DOMAIN_SHADER:
4417 emitDSInput(inst);
4418 break;
4419 default:
4420 IGC_ASSERT(0);
4421 break;
4422 }
4423 }
4424
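// Reads the 64-bit timestamp: the first copy moves the low dword of the TSC
// register into dst.0, the second moves the high dword into dst.1.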
4425 void EmitPass::emitcycleCounter(llvm::Instruction* inst)
4426 {
4427 CVariable* dst = m_destination;
4428 m_encoder->Copy(dst, m_currShader->GetTSC());
4429 m_encoder->Push();
4430 m_encoder->SetSrcSubReg(0, 1);
4431 m_encoder->SetDstSubReg(1);
4432 m_encoder->Copy(dst, m_currShader->GetTSC());
4433 m_encoder->Push();
4434 }
4435
4436 void EmitPass::emitSetDebugReg(llvm::Instruction* inst)
4437 {
4438 Value* src0 = inst->getOperand(0);
4439 if (!isa<UndefValue>(src0))
4440 {
4441 // write dbg0.0
4442 CVariable* src = GetSymbol(src0);
4443 IGC_ASSERT(nullptr != src);
4444 IGC_ASSERT(src->IsUniform());
4445 m_encoder->SetDstSubReg(0);
4446 m_encoder->Copy(m_currShader->GetDBG(), src);
4447 m_encoder->Push();
4448 }
4449
4450 // read dbg0.1
4451 m_encoder->SetSrcSubReg(0, 1);
4452 m_encoder->SetSrcRegion(0, 0, 1, 0);
4453 m_encoder->Copy(m_destination, m_currShader->GetDBG());
4454 m_encoder->Push();
4455 }
4456
4457 CVariable* EmitPass::ComputeSampleIntOffset(llvm::Instruction* sample, uint sourceIndex)
4458 {
4459 // The (u,v,r) offsets are encoded in SamplerMessageHeader::DW2
4460 // as [11:8], [7:4], [3:0] bitfields, respectively. Format: S3.
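// For example, immediate offsets (u,v,r) = (1,-2,0) pack to
//   ((1 & 0xF) << 8) | ((-2 & 0xF) << 4) | (0 & 0xF) = 0x1E0,
// each channel being a 4-bit two's-complement field.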
4461 uint offset = 0;
4462 bool dynamicOffset = false;
4463 for (uint i = 0; i < 3; i++)
4464 {
4465 if (ConstantInt * immOffset = dyn_cast<ConstantInt>(sample->getOperand(sourceIndex + i)))
4466 {
4467 uint channelOffset = static_cast<uint>(immOffset->getZExtValue());
4468 offset = (offset << 4) | (channelOffset & 0xf);
4469 }
4470 else
4471 {
4472 dynamicOffset = true;
4473 }
4474 }
4475 CVariable* packedOffset = m_currShader->ImmToVariable(offset, ISA_TYPE_UW);
4476 if (dynamicOffset)
4477 {
4478 CVariable* tempPackedOffset = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, "PackedOffset");
4479 for (uint i = 0; i < 3; i++)
4480 {
4481 if (!isa<ConstantInt>(sample->getOperand(sourceIndex + i)))
4482 {
4483 CVariable* offsetV = GetSymbol(sample->getOperand(sourceIndex + i));
4484 if (!offsetV->IsUniform())
4485 {
4486 offsetV = UniformCopy(offsetV);
4487 }
4488
4489 // Offset is only 4 bits, mask off remaining bits
4490 CVariable* offsetBits = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, "PackedOffset");
4491 m_encoder->And(offsetBits, offsetV, m_currShader->ImmToVariable(0xF, ISA_TYPE_UW));
4492 if (i != 2)
4493 {
4494 m_encoder->Shl(offsetBits, offsetBits, m_currShader->ImmToVariable(4 * (2 - i), ISA_TYPE_UW));
4495 }
4496 if (packedOffset->IsImmediate() && packedOffset->GetImmediateValue() == 0)
4497 {
4498 packedOffset = offsetBits;
4499 }
4500 else
4501 {
4502 m_encoder->Or(tempPackedOffset, packedOffset, offsetBits);
4503 packedOffset = tempPackedOffset;
4504 }
4505 }
4506 }
4507 }
4508 return packedOffset;
4509 }
4510
4511 // simple helper to reorder the ld coordinate operands depending on the HW generation
4512 uint CorrectLdIndex(uint i, bool oldLoad)
4513 {
4514 uint index = i;
4515 if (oldLoad)
4516 {
4517 if (i == 1)
4518 {
4519 index = 2;
4520 }
4521 else if (i == 2)
4522 {
4523 index = 1;
4524 }
4525 }
4526 return index;
4527 }
4528
4529 CVariable* EmitPass::IndexableResourceIndex(CVariable* indexVar, uint btiIndex)
4530 {
4531 CVariable* bti = m_currShader->ImmToVariable(btiIndex, ISA_TYPE_UD);
4532 CVariable* dst = m_currShader->GetNewVariable(indexVar);
4533 m_encoder->Add(dst, indexVar, bti);
4534 m_encoder->Push();
4535 return dst;
4536 }
4537
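// Repacks a half-float sampler writeback in which each channel occupies a
// full-width slot: channel n starts at sub-register n * numLanePerChannel * 2
// in the raw payload and is copied to the packed position
// n * numLanePerChannel in m_destination.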
4538 void EmitPass::PackSIMD8HFRet(CVariable* dst)
4539 {
4540 // the extra moves will be cleaned up by vISA
4541 auto numLanePerChannel = numLanes(m_currShader->m_Platform->getMinDispatchMode());
4542 for (uint16_t n = 0; n < m_destination->GetNumberElement() / numLanePerChannel; n++)
4543 {
4544 m_encoder->SetDstSubReg(n * numLanePerChannel);
4545 m_encoder->SetSrcSubReg(0, n * numLanePerChannel * 2);
4546 m_encoder->Copy(m_destination, dst);
4547 m_encoder->Push();
4548 }
4549 }
4550
4551
4552 void EmitPass::emitLdInstruction(llvm::Instruction* inst)
4553 {
4554 uint numOperands = inst->getNumOperands();
4555 IGC_ASSERT_MESSAGE(7 < numOperands, "Wrong number of operands");
4556 IGC_ASSERT_MESSAGE(numOperands < 10, "Wrong number of operands");
4557
4558 const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
4559 IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
4560
4561 EOPCODE opCode = GetOpCode(inst);
4562 //Subtract the offset and resource operands to get
4563 //the number of texture coordinates and the index of the texture operand
4564 uint numSources = numOperands - 5;
4565 uint textureArgIdx = numOperands - 5;
4566
4567 ResourceDescriptor resource;
4568 Value* ptr = inst->getOperand(textureArgIdx);
4569 resource = GetResourceVariable(ptr);
4570 uint offsetSourceIndex = numSources + 1;
4571 CVariable* offset = ComputeSampleIntOffset(inst, offsetSourceIndex);
4572
4573 SmallVector<CVariable*, 4> payload;
4574
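// Trim trailing zero-immediate coordinates (in generation-corrected order)
// so the send payload only carries the sources the message actually needs.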
4575 for (uint i = numSources - 1; i > 0; i--)
4576 {
4577 uint index = CorrectLdIndex(i, m_currShader->m_Platform->hasOldLdOrder());
4578 CVariable* src = GetSymbol(inst->getOperand(index));
4579 if (!(src->IsImmediate() && src->GetImmediateValue() == 0))
4580 {
4581 break;
4582 }
4583 numSources--;
4584 }
4585
4586 bool zeroLOD = false;
4587 //SKL+ new message ld_lz
4588 if (numSources > 2 &&
4589 m_currShader->m_Platform->supportSampleAndLd_lz())
4590 {
4591 // Check if lod is 0
4592 CVariable* src = GetSymbol(inst->getOperand(2));
4593 if (src->IsImmediate() && src->GetImmediateValue() == 0)
4594 {
4595 zeroLOD = true;
4596 numSources--;
4597 }
4598 }
4599
4600 //create send payload for numSources
4601 for (uint i = 0; i < numSources; i++)
4602 {
4603 uint index = i;
4604 //no difference in ld_lz between SKL+ and BDW
4605 if (!zeroLOD)
4606 {
4607 index = CorrectLdIndex(i, m_currShader->m_Platform->hasOldLdOrder());
4608 }
4609 if (zeroLOD && index == 2)
4610 {
4611 //3D resources skip lod and read z coordinate
4612 index = 3;
4613 }
4614 CVariable* src = GetSymbol(inst->getOperand(index));
4615 if (src->IsUniform())
4616 {
4617 auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
4618 uint16_t size = m_destination->IsUniform() ? numLanes(uniformSIMDMode) :
4619 numLanes(m_currShader->m_SIMDSize);
4620 CVariable* newSource = m_currShader->GetNewVariable(
4621 size,
4622 src->GetType(),
4623 EALIGN_GRF,
4624 m_destination->IsUniform(),
4625 src->getName());
4626 m_encoder->SetUniformSIMDSize(uniformSIMDMode);
4627 m_encoder->Copy(newSource, src);
4628 m_encoder->Push();
4629 src = newSource;
4630 }
4631 payload.push_back(src);
4632
4633 }
4634
4635 //When the sampler output is 16-bit float, hardware doesn't pack the output in SIMD8 mode.
4636 //Hence the movs below to repack this layout in SIMD8 mode.
4637 bool needPacking = false;
4638 CVariable* dst = m_destination;
4639 SIMDMode simdSize = m_currShader->m_SIMDSize;
4640 {
4641 if (dst->IsUniform())
4642 {
4643 simdSize = m_currShader->m_Platform->getMinDispatchMode();
4644 unsigned short numberOfElement = dst->GetNumberElement() * numLanes(simdSize);
4645 numberOfElement = CEncoder::GetCISADataTypeSize(dst->GetType()) == 2 ? numberOfElement * 2 : numberOfElement;
4646 dst = m_currShader->GetNewVariable(
4647 numberOfElement, dst->GetType(), EALIGN_GRF, dst->IsUniform(), dst->getName());
4648 }
4649 else
4650 {
4651 needPacking = isHalfGRFReturn(m_destination, m_SimdMode);
4652 if (needPacking)
4653 {
4654 dst = m_currShader->GetNewVariable(
4655 m_destination->GetNumberElement() * 2, m_destination->GetType(), EALIGN_GRF, dst->IsUniform(), dst->getName());
4656 }
4657 }
4658 }
4659
4660 bool feedbackEnable = writeMask.isSet(4);
4661 uint label = 0;
4662 CVariable* flag = nullptr;
4663 bool needLoop = ResourceLoopHeader(resource, flag, label);
4664 m_encoder->SetPredicate(flag);
4665 if (m_destination->IsUniform())
4666 {
4667 m_encoder->SetUniformSIMDSize(m_currShader->m_Platform->getMinDispatchMode());
4668 }
4669 m_encoder->Load(
4670 opCode,
4671 writeMask.getEM(),
4672 offset,
4673 resource,
4674 numSources,
4675 dst,
4676 payload,
4677 zeroLOD,
4678 feedbackEnable);
4679 m_encoder->Push();
4680 if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
4681 {
4682 CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
4683 m_encoder->Cast(m_currShader->GetNULL(), tempdest);
4684 m_encoder->Push();
4685 m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
4686 m_encoder->Push();
4687 }
4688 ResourceLoopBackEdge(needLoop, flag, label);
4689
4690 {
4691 if (m_destination->IsUniform())
4692 {
4693 // if dst is uniform, we simply copy the first lane of each channel (including feedback enable if present)
4694 // to the packed m_destination.
4695 // Note that there's no need to handle feedback enable specially
4696 for (unsigned int i = 0; i < m_destination->GetNumberElement(); i++)
4697 {
4698 m_encoder->SetSrcRegion(0, 0, 1, 0);
4699 m_encoder->SetSrcSubVar(0, i);
4700 m_encoder->SetDstSubReg(i);
4701 m_encoder->Copy(m_destination, dst);
4702 m_encoder->Push();
4703 }
4704 }
4705 else
4706 {
4707 if (needPacking)
4708 {
4709 PackSIMD8HFRet(dst);
4710 }
4711
4712 if (feedbackEnable)
4713 {
4714 emitFeedbackEnable();
4715 }
4716 }
4717 }
4718 }
4719
4720 /// \brief Returns the offset increment in bytes, given the value's type.
4721 static int GetOffsetIncrement(const DataLayout* m_DL, SIMDMode simdMode, Value* val)
4722 {
4723 int inc;
4724 inc = int_cast<int>(numLanes(simdMode) * (unsigned int)m_DL->getTypeAllocSize(val->getType()));
4725 if (val->getType()->isHalfTy() && simdMode == SIMDMode::SIMD8)
4726 {
4727 //The alloc size for half float is 2, so in SIMD8 mode we get an increment of 16,
4728 //but the payload slot must be padded with an extra 16 bytes.
4729 IGC_ASSERT(inc <= 16);
4730 inc *= 2;
4731 }
4732 return inc;
4733 }
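// Illustrative walkthrough of the increments above (comment only; assumes a
// 32-byte GRF):
//   SIMD8,  float (4 bytes): inc = 8  * 4 = 32
//   SIMD16, half  (2 bytes): inc = 16 * 2 = 32
//   SIMD8,  half  (2 bytes): inc = 8  * 2 = 16, doubled to 32 so the unpacked
//                            SIMD8 half payload still occupies a full slot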
4734
4735 /// \brief Tries to build the render target write payload from an already coalesced CCTuple; returns false if interception is not possible.
4736 template <typename T>
4737 bool EmitPass::interceptRenderTargetWritePayloadCoalescing(
4738 T* inst,
4739 CVariable** src,
4740 CVariable*& source0Alpha,
4741 CVariable*& oMaskOpnd,
4742 CVariable*& outputDepthOpnd,
4743 CVariable*& vStencilOpnd,
4744 DenseMap<Value*, CVariable**>& valueToVariableMap)
4745 {
4746 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4747
4748 //check coalescing
4749 CoalescingEngine::CCTuple* ccTuple = nullptr;
4750 m_CE->SetCurrentPart(inst, 0);
4751 const uint numOperands = m_CE->GetNumPayloadElements(inst);
4752 Value* dummyValPtr = nullptr;
4753 int payloadToCCTupleRelativeOffset = 0;
4754
4755 ccTuple = m_CE->IsAnyValueCoalescedInCCTuple(inst,
4756 numOperands,
4757 //out:
4758 payloadToCCTupleRelativeOffset,
4759 dummyValPtr);
4760 bool payloadCovered = m_CE->IsPayloadCovered(inst, ccTuple, numOperands, payloadToCCTupleRelativeOffset);
4761 if (!payloadCovered) {
4762 return false;
4763 }
4764
4765 //This check is necessary, since IsPayloadCovered does not check the non-homogeneous part.
4766 if (m_CE->HasNonHomogeneousPayloadElements(inst) &&
4767 !ccTuple->HasNonHomogeneousElements())
4768 {
4769 return false;
4770 }
4771
4772
4773 if (ccTuple->HasNonHomogeneousElements())
4774 {
4775 if (m_CE->GetLeftReservedOffset(ccTuple->GetRoot(), m_currShader->m_SIMDSize) <
4776 m_CE->GetLeftReservedOffset(inst, m_currShader->m_SIMDSize))
4777 {
4778 return false;
4779 }
4780 if (payloadToCCTupleRelativeOffset)
4781 {
4782 return false;
4783 }
4784 }
4785
4786 IGC_ASSERT(ccTuple);
4787 CVariable* rootPayloadVar = m_currShader->LazyCreateCCTupleBackingVariable(ccTuple);
4788
4789 //Elements are processed in the payload slot order.
4790 //Homogeneous part is looked-up through payload coalescing methods.
4791 //Payload layout for RT writer: s0Alpha oM [R G B A] sZ oS
4792 //Payload layout for dual source RT writer: oM [R0 G0 B0 A0 R1 G1 B1 A1] sZ oS
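//An illustrative example (assumed: SIMD8, 32-byte GRF, float channels): s0Alpha
//would sit at byte offset 0, oM in the next slot, then R,G,B,A at consecutive
//32-byte increments, followed by sZ and oS; absent components simply skip their slot.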
4793 int offset = 0;
4794 if (RTWriteHasSource0Alpha(inst, m_moduleMD))
4795 {
4796 IGC_ASSERT(ccTuple->HasNonHomogeneousElements());
4797
4798 VISA_Type vType = m_currShader->GetType(inst->getSource0Alpha()->getType());
4799
4800 IGC_ASSERT(source0Alpha == nullptr);
4801 CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, vType, (uint16_t)offset, 0);
4802 m_encoder->Copy(temp, GetSymbol(inst->getSource0Alpha()));
4803 m_encoder->Push();
4804 source0Alpha = temp;
4805 }
4806
4807 if (ccTuple->HasNonHomogeneousElements())
4808 {
4809 IGC_ASSERT_MESSAGE(ccTuple->GetRoot(), "in other words, there is a 'supremum' element");
4810 IGC_ASSERT(llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::isa<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()));
4811 if (llvm::RTWritIntrinsic * rtwi = llvm::dyn_cast<llvm::RTWritIntrinsic>(ccTuple->GetRoot()))
4812 {
4813 if (RTWriteHasSource0Alpha(rtwi, m_moduleMD))
4814 {
4815 //This is a stronger condition than querying 'inst' only, since root represents
4816 //the whole group of 'non-homogeneous' parts. E.g. it might turn out, that this
4817 //instruction does not have src0 alpha, but it was coalesced in a group that has
4818 //at least one src0 alpha. Thus, we need to take that src0 alpha into account
4819 //when computing 'left' reserved offset.
4820 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, rtwi->getSource0Alpha());
4821 }
4822 }
4823 else if (llvm::RTDualBlendSourceIntrinsic * dsrtwi = llvm::dyn_cast<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()))
4824 {
4825 IGC_ASSERT_MESSAGE(!RTWriteHasSource0Alpha(dsrtwi, m_moduleMD), "dual-source doesn't support Source0Alpha");
4826 }
4827 }
4828
4829 if (inst->hasMask())
4830 {
4831 IGC_ASSERT(!DoesRTWriteSrc0AlphaBelongToHomogeneousPart(inst, m_moduleMD));
4832 IGC_ASSERT(oMaskOpnd == nullptr);
4833
4834 CVariable* oLocalMaskOpnd = GetSymbol(inst->getOMask());
4835 oLocalMaskOpnd = psProgram->BitCast(oLocalMaskOpnd, ISA_TYPE_UW);
4836
4837 CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, ISA_TYPE_D, (uint16_t)offset, 0);
4838 psProgram->PackAndCopyVariable(temp, oLocalMaskOpnd);
4839 oMaskOpnd = temp;
4840 }
4841
4842 if (ccTuple->HasNonHomogeneousElements())
4843 {
4844 IGC_ASSERT_MESSAGE(ccTuple->GetRoot(), "in other words, there is a 'supremum' element");
4845 IGC_ASSERT(llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::isa<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()));
4846 if (llvm::dyn_cast<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::dyn_cast<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()))
4847 {
4848 //Take left reserved offset from 'root' of the group, not from this instruction.
4849 offset = m_CE->GetLeftReservedOffset(ccTuple->GetRoot(), m_currShader->m_SIMDSize);
4850 }
4851 }
4852
4853 IGC_ASSERT(dummyValPtr);
4854
4855 offset += payloadToCCTupleRelativeOffset *
4856 m_CE->GetSingleElementWidth(m_currShader->m_SIMDSize, m_DL, dummyValPtr);
4857
4858
4859 SmallPtrSet<Value*, 8> touchedValuesSet;
4860 IGC_ASSERT(numOperands == 4 || numOperands == 8);
4861 for (uint index = 0; index < numOperands; index++)
4862 {
4863 Value* val = m_CE->GetPayloadElementToValueMapping(inst, index);
4864 IGC_ASSERT_MESSAGE(nullptr != val, "Val cannot be NULL");
4865 VISA_Type type = m_currShader->GetType(val->getType());
4866
4867 if (touchedValuesSet.count(val)) {
4868 src[index] = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)(offset), 0);
4869 m_encoder->Copy(src[index], GetSymbol(val));
4870 m_encoder->Push();
4871 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
4872 continue;
4873 }
4874 else {
4875 touchedValuesSet.insert(val);
4876 }
4877
4878 bool needsCopy = false;
4879 if (m_CE->IsValConstOrIsolated(val)) {
4880 needsCopy = true;
4881 }
4882 else
4883 {
4884 if (m_CE->GetValueCCTupleMapping(val))
4885 {
4886 src[index] = GetSymbol(val);
4887 }
4888 else
4889 {
4890 //this one actually encompasses the case for !getRegRoot(val)
4891 needsCopy = true;
4892 }
4893 }//if constant
4894
4895 if (needsCopy)
4896 {
4897 src[index] = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)offset, 0);
4898 m_encoder->Copy(src[index], GetSymbol(val));
4899 m_encoder->Push();
4900 }
4901 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
4902 }//for
4903
4904 if (inst->hasDepth())
4905 {
4906 IGC_ASSERT(outputDepthOpnd == nullptr);
4907 CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, ISA_TYPE_F, (uint16_t)offset, 0);
4908 m_encoder->Copy(temp, GetSymbol(inst->getDepth()));
4909 m_encoder->Push();
4910 outputDepthOpnd = temp;
4911
4912 IGC_ASSERT(inst->getDepth()->getType()->isFloatTy());
4913 }
4914
4915 if (ccTuple->HasNonHomogeneousElements())
4916 {
4917 IGC_ASSERT_MESSAGE(ccTuple->GetRoot(), "in other words, there is a 'supremum' element");
4918 IGC_ASSERT(llvm::isa<llvm::RTWritIntrinsic>(ccTuple->GetRoot()) || llvm::isa<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()));
4919
4920 if (llvm::RTWritIntrinsic * rtwi = llvm::dyn_cast<llvm::RTWritIntrinsic>(ccTuple->GetRoot()))
4921 {
4922 if (rtwi->hasDepth())
4923 {
4924 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, inst->getDepth());
4925 }
4926 }
4927 else if (llvm::RTDualBlendSourceIntrinsic * dsrtwi = llvm::dyn_cast<llvm::RTDualBlendSourceIntrinsic>(ccTuple->GetRoot()))
4928 {
4929 if (dsrtwi->hasDepth())
4930 {
4931 offset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, inst->getDepth());
4932 }
4933 }
4934 }
4935
4936
4937 //Stencil is only supported in SIMD8 mode
4938 if (inst->hasStencil())
4939 {
4940 IGC_ASSERT(m_currShader->m_Platform->supportsStencil(m_currShader->m_SIMDSize));
4941 IGC_ASSERT(vStencilOpnd == nullptr);
4942
4943 CVariable* temp = m_currShader->GetNewAlias(rootPayloadVar, ISA_TYPE_UB, (uint16_t)offset, 0);
4944 CVariable* ubSrc = m_currShader->BitCast(GetSymbol(inst->getStencil()), ISA_TYPE_UB);
4945 if (ubSrc->IsUniform())
4946 {
4947 m_encoder->SetSrcRegion(0, 0, 1, 0);
4948 }
4949 else
4950 {
4951 m_encoder->SetSrcRegion(0, 32, 8, 4);
4952 }
4953 m_currShader->CopyVariable(temp, ubSrc, 0);
4954
4955 vStencilOpnd = temp;
4956 }
4957 return true;
4958 }
4959
4960 /// \brief Prepares the render target write payload, reusing coalesced payload variables when possible.
4961 template <typename T>
4962 void EmitPass::prepareRenderTargetWritePayload(
4963 T* inst,
4964 DenseMap<Value*, CVariable**>& valueToVariableMap,
4965 Value* color[],
4966 uint8_t colorCnt,
4967 //output:
4968 CVariable** src,
4969 bool* isUndefined,
4970 CVariable*& varSource0Alpha,
4971 CVariable*& varMaskOpnd,
4972 CVariable*& varDepthOpnd,
4973 CVariable*& varStencilOpnd)
4974 {
4975 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
4976
4977 VISA_Type vType = ISA_TYPE_F;
4978
4979 if (color[0]->getType()->isHalfTy())
4980 {
4981 vType = ISA_TYPE_HF;
4982 }
4983
4984 for (uint i = 0; i < colorCnt; ++i)
4985 {
4986 if (isa<UndefValue>(color[i]))
4987 {
4988 isUndefined[i] = true;
4989 }
4990 }
4991
4992 if (interceptRenderTargetWritePayloadCoalescing(
4993 inst,
4994 src,
4995 varSource0Alpha,
4996 varMaskOpnd,
4997 varDepthOpnd,
4998 varStencilOpnd,
4999 valueToVariableMap))
5000 {
5001 return;
5002 }
5003
5004 for (uint i = 0; i < colorCnt; ++i)
5005 {
5006 CVariable* var = GetSymbol(color[i]);
5007
5008 if (!isa<UndefValue>(color[i]))
5009 {
5010 if (var->IsUniform())
5011 {
5012 //if the color is uniform, create a move into a non-uniform payload variable
5013 src[i] = m_currShader->GetNewVariable(numLanes(m_currShader->m_SIMDSize), vType, EALIGN_GRF, CName::NONE);
5014 m_encoder->Copy(src[i], var);
5015 m_encoder->Push();
5016 }
5017 else
5018 {
5019 src[i] = var;
5020 }
5021 }
5022 }
5023
5024 if (RTWriteHasSource0Alpha(inst, m_moduleMD))
5025 {
5026 varSource0Alpha = GetSymbol(inst->getSource0Alpha());
5027 if (varSource0Alpha->IsUniform())
5028 {
5029 CVariable* temp = m_currShader->GetNewVariable(
5030 numLanes(m_currShader->m_SIMDSize), vType, EALIGN_GRF, CName::NONE);
5031 m_encoder->Copy(temp, varSource0Alpha);
5032 m_encoder->Push();
5033 varSource0Alpha = temp;
5034 }
5035 }
5036
5037 if (inst->hasMask())
5038 {
5039 varMaskOpnd = GetSymbol(inst->getOMask());
5040 //oMask has to be packed since the hardware ignores the upper half
5041 CVariable* temp = m_currShader->GetNewVariable(
5042 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_D, EALIGN_GRF, CName::NONE);
5043 varMaskOpnd = psProgram->BitCast(varMaskOpnd, ISA_TYPE_UW);
5044 psProgram->PackAndCopyVariable(temp, varMaskOpnd);
5045 varMaskOpnd = temp;
5046 }
5047
5048 if (inst->hasDepth())
5049 {
5050 varDepthOpnd = GetSymbol(inst->getDepth());
5051 if (varDepthOpnd->IsUniform())
5052 {
5053 CVariable* temp = m_currShader->GetNewVariable(
5054 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F, EALIGN_GRF, CName::NONE);
5055 m_encoder->Copy(temp, varDepthOpnd);
5056 m_encoder->Push();
5057 varDepthOpnd = temp;
5058 }
5059 }
5060
5061 if (inst->hasStencil())
5062 {
5063 varStencilOpnd = GetSymbol(inst->getStencil());
5064 /*4 bytes are needed for the final destination per element*/
5065 CVariable* temp = m_currShader->GetNewVariable(
5066 numLanes(m_currShader->m_SIMDSize) * 4, ISA_TYPE_UB, EALIGN_GRF, CName::NONE);
5067 CVariable* ubSrc = m_currShader->BitCast(varStencilOpnd, ISA_TYPE_UB);
5068 if (varStencilOpnd->IsUniform())
5069 {
5070 m_encoder->SetSrcRegion(0, 0, 1, 0);
5071 }
5072 else
5073 {
5074 m_encoder->SetSrcRegion(0, 32, 8, 4);
5075 }
5076 m_currShader->CopyVariable(temp, ubSrc, 0);
5077 varStencilOpnd = temp;
5078 }
5079
5080 }
5081
5082 // Generate a predicate based on the currently active channels. The 'alias' is
5083 // an existing variable in context, reused only for generating the mask,
5084 // to avoid allocating a new variable.
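// A sketch of the emitted compare (assuming SIMD16):
//   cmp.eq (16) P  any<0;1,0>  any<0;1,0>
// Comparing a value against itself is true wherever the instruction executes,
// so the flag ends up set exactly in the currently active lanes.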
5085
5086 void EmitPass::emitPredicateFromChannelIP(CVariable* dst, CVariable* alias)
5087 {
5088 CVariable* any;
5089
5090 if (alias)
5091 {
5092 any = m_currShader->GetNewAlias(alias, ISA_TYPE_UD, 0, 1);
5093 }
5094 else
5095 {
5096 any = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, CName::NONE);
5097 }
5098
5099 m_encoder->SetSrcRegion(0, 0, 1, 0);
5100 m_encoder->SetSrcRegion(1, 0, 1, 0);
5101 m_encoder->Cmp(EPREDICATE_EQ, dst, any, any);
5102 m_encoder->Push();
5103 }
5104
5105 void EmitPass::emitRenderTargetWrite(llvm::RTWritIntrinsic* inst, bool fromRet)
5106 {
5107 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
5108 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
5109
5110 bool lastRenderTarget = psProgram->IsLastRTWrite(inst);
5111 bool EOT = lastRenderTarget && (m_encoder->IsSecondHalf() || m_currShader->m_numberInstance == 1);
5112 bool isNullRT = false;
5113 int RTIndex = inst->getRTIndexImm();
5114 bool oMask = inst->hasMask();
5115 bool outputDepth = inst->hasDepth();
5116 bool outputStencil = inst->hasStencil();
5117 bool perSample = inst->perSample();
5118 Value* vSrc0Alpha = inst->getSource0Alpha();
5119 Value* vMask = inst->getOMask();
5120 Value* pMask = inst->getPMask();
5121 Value* vDepth = inst->getDepth();
5122 Value* vStencil = inst->getStencil();
5123 Value* vSample = inst->getSampleIndex();
5124 Value* vColor[4] = { inst->getRed(), inst->getGreen(), inst->getBlue(), inst->getAlpha() };
5125
5126 if (outputDepth)
5127 {
5128 psProgram->OutputDepth();
5129 }
5130 if (outputStencil)
5131 {
5132 psProgram->OutputStencil();
5133 }
5134 if (oMask)
5135 {
5136 psProgram->OutputMask();
5137 }
5138 uint bindingTableIndex = 0;
5139 if (RTIndex != -1)
5140 {
5141 bindingTableIndex = m_currShader->m_pBtiLayout->GetRenderTargetIndex(RTIndex);
5142 }
5143 else
5144 {
5145 if (!psProgram->IsLastPhase())
5146 {
5147 return;
5148 }
5149 bindingTableIndex = m_currShader->m_pBtiLayout->GetNullSurfaceIdx();
5150
5151 isNullRT = true;
5152 }
5153
5154 bool directIdx = inst->isImmRTIndex();
5155 m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, RENDER_TARGET, RTIndex, bindingTableIndex);
5156
5157 if (EOT)
5158 {
5159 IGC_ASSERT(psProgram->m_hasEOT == false);
5160 psProgram->m_hasEOT = true;
5161 }
5162
5163 //Following variables will receive output from a call
5164 CVariable* src[4] = { nullptr, nullptr, nullptr, nullptr };
5165 bool isUndefined[4] = { false, false, false, false };
5166 CVariable* source0Alpha = nullptr;
5167 CVariable* oMaskOpnd = nullptr;
5168 CVariable* outputDepthOpnd = nullptr;
5169 CVariable* stencilOpnd = nullptr;
5170 CVariable* pMaskOpnd = nullptr;
5171
5172 DenseMap<Value*, CVariable**> valueToVariableMap;
5173 if (!isa<UndefValue>(vSrc0Alpha)) {
5174 valueToVariableMap[vSrc0Alpha] = &source0Alpha;
5175 }
5176 if (oMask) {
5177 valueToVariableMap[vMask] = &oMaskOpnd;
5178 }
5179 if (outputDepth) {
5180 valueToVariableMap[vDepth] = &outputDepthOpnd;
5181 }
5182 if (outputStencil) {
5183 valueToVariableMap[vStencil] = &stencilOpnd;
5184 }
5185
5186 valueToVariableMap[vColor[0]] = &src[0];
5187 valueToVariableMap[vColor[1]] = &src[1];
5188 valueToVariableMap[vColor[2]] = &src[2];
5189 valueToVariableMap[vColor[3]] = &src[3];
5190
5191 prepareRenderTargetWritePayload(
5192 //in:
5193 inst,
5194 valueToVariableMap,
5195 vColor,
5196 4,
5197 //out:
5198 src,
5199 isUndefined,
5200 source0Alpha,
5201 oMaskOpnd,
5202 outputDepthOpnd,
5203 stencilOpnd);
5204
5205 CVariable* cpsCounter = nullptr;
5206 if (psProgram->GetPhase() == PSPHASE_PIXEL)
5207 {
5208 cpsCounter = psProgram->GetCurrentPhaseCounter();
5209 }
5210
5211 bool coarseMode = false;
5212 if (psProgram->GetPhase() == PSPHASE_COARSE)
5213 {
5214 coarseMode = true;
5215 }
5216
5217 CVariable* bti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_D);
5218
5219 CVariable* sampleIndex = nullptr;
5220 if (m_currShader->m_Platform->supportHeaderRTW() && perSample)
5221 {
5222 sampleIndex = GetSymbol(vSample);
5223 if (!sampleIndex->IsUniform())
5224 {
5225 sampleIndex = UniformCopy(sampleIndex);
5226
5227 }
5228 }
5229
5230 if (psProgram->HasDiscard())
5231 {
5232 ConstantInt* cv = dyn_cast<ConstantInt>(pMask);
5233 if (!cv || cv->getZExtValue() == 0)
5234 {
5235 pMaskOpnd = GetSymbol(pMask);
5236 }
5237 }
5238
5239
5240 if (pMaskOpnd)
5241 {
5242 m_encoder->SetPredicate(pMaskOpnd);
5243 }
5244
5245 bool isHeaderMaskFromCe0 =
5246 !isa<ReturnInst>(inst->getParent()->getTerminator()) &&
5247 pMaskOpnd == nullptr;
5248
5249 CVariable* rtIndexOpnd;
5250 if (RTIndex < 0 || (m_moduleMD->psInfo.BlendStateDisabledMask & BIT(RTIndex)))
5251 {
5252 // if blending is disabled no need to set the RTIndex in the header
5253 rtIndexOpnd = m_currShader->ImmToVariable(0, ISA_TYPE_D);
5254 }
5255 else
5256 {
5257 if (psProgram->IsPerSample())
5258 {
5259 rtIndexOpnd = GetSymbol(inst->getBlendStateIndex());
5260 IGC_ASSERT(rtIndexOpnd->IsUniform());
5261 }
5262 else
5263 {
5264 rtIndexOpnd = m_currShader->ImmToVariable(RTIndex, ISA_TYPE_D);
5265 }
5266 }
5267
5268 m_encoder->RenderTargetWrite(
5269 src,
5270 isUndefined,
5271 lastRenderTarget,
5272 isNullRT,
5273 perSample,
5274 coarseMode,
5275 isHeaderMaskFromCe0,
5276 bti,
5277 rtIndexOpnd,
5278 source0Alpha,
5279 oMaskOpnd,
5280 outputDepthOpnd,
5281 stencilOpnd,
5282 cpsCounter /*cpscounter*/,
5283 sampleIndex,
5284 psProgram->GetR1());
5285 m_encoder->Push();
5286 }
5287
5288 void EmitPass::emitSimdLaneId(llvm::Instruction* inst)
5289 {
5290 m_currShader->GetSimdOffsetBase(m_destination);
5291 }
5292
5293 void EmitPass::emitPatchInstanceId(llvm::Instruction* inst)
5294 {
5295 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::HULL_SHADER);
5296 CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
5297
5298 // Set barrier encountered to true so we can program the instance count accordingly
5299 hsProgram->SetBarrierEncountered();
5300
5301 /*
5302 ** R0.2 23:17 Instance Number. A patch-relative instance number between 0 and InstanceCount-1. BDW, SKL.
5303 ** -----------------
5304 ** R0.2 22:16 Instance Number. A patch-relative instance number between 0 and InstanceCount-1. CNL+.
5305 */
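// E.g., on BDW/SKL instanceIdStartBit is 17, so the code below computes
// instanceId = (R0.2 >> 17) & 0x7f.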
5306 unsigned int instanceIdStartBit = m_currShader->m_Platform->getHullShaderThreadInstanceIdBitFieldPosition();
5307 CVariable* mask7bit = m_currShader->ImmToVariable(0x7f, ISA_TYPE_UD);
5308 m_encoder->SetSrcRegion(0, 0, 1, 0);
5309 m_encoder->SetSrcSubReg(0, 2);
5310 m_encoder->Shr(m_destination, hsProgram->GetR0(), m_currShader->ImmToVariable(instanceIdStartBit, ISA_TYPE_UD));
5311 m_encoder->SetSrcSubReg(0, 0);
5312 m_encoder->And(m_destination, m_destination, mask7bit);
5313 m_encoder->Push();
5314 }
5315
5316 void EmitPass::emitSimdSize(llvm::Instruction* inst)
5317 {
5318 //CVariable* simdSize = m_currShader->ImmToVariable(numLanes(m_SimdMode), ISA_TYPE_UD);
5319 //m_encoder->Cast(m_destination, simdSize);
5320 //m_encoder->Push();
5321 }
5322
5323 /// Emits VISA instructions for SIMD_SHUFFLE.
5324 void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
5325 {
5326 CVariable* data = GetSymbol(inst->getOperand(0));
5327 CVariable* simdChannel = GetSymbol(inst->getOperand(1));
5328
5329 const bool isSimd32 = (m_currShader->m_dispatchSize == SIMDMode::SIMD32);
5330
5331 if (data->IsUniform())
5332 {
5333 m_encoder->Copy(m_destination, data);
5334 m_encoder->Push();
5335 if (isSimd32 && !m_destination->IsUniform())
5336 {
5337 m_encoder->SetSecondHalf(true);
5338 m_encoder->Copy(m_destination, data);
5339 m_encoder->Push();
5340 m_encoder->SetSecondHalf(false);
5341 }
5342 }
5343 else if (simdChannel->IsImmediate())
5344 {
5345 uint dataIndex = int_cast<uint>(simdChannel->GetImmediateValue());
5346 // prevent out-of-bounds access
5347 dataIndex = dataIndex % numLanes(m_currShader->m_dispatchSize);
5348 if (isSimd32)
5349 {
5350 const bool isSrcInSecondHalf = dataIndex >= 16;
5351 dataIndex = dataIndex % numLanes(m_encoder->GetSimdSize());
5352
5353 if (m_destination->IsUniform())
5354 {
5355 m_encoder->SetSecondHalf(isSrcInSecondHalf);
5356 m_encoder->SetSrcRegion(0, 0, 1, 0);
5357 m_encoder->SetSrcSubReg(0, dataIndex);
5358 m_encoder->Copy(m_destination, data);
5359 m_encoder->Push();
5360 m_encoder->SetSecondHalf(false);
5361 }
5362 else
5363 {
5364 // Use an intermediate uniform variable
5365 CVariable* uniformTemp = m_currShader->GetNewVariable(
5366 1,
5367 data->GetType(),
5368 m_encoder->GetCISADataTypeAlignment(data->GetType()),
5369 true, // isUniform
5370 "ShuffleTmp");
5371
5372 // Copy from source to the uniform temp...
5373 m_encoder->SetSecondHalf(isSrcInSecondHalf);
5374 m_encoder->SetSrcRegion(0, 0, 1, 0);
5375 m_encoder->SetSrcSubReg(0, dataIndex);
5376 m_encoder->SetNoMask();
5377 m_encoder->Copy(uniformTemp, data);
5378 m_encoder->Push();
5379 m_encoder->SetSecondHalf(false);
5380
5381 // ...and broadcast.
5382 m_encoder->Copy(m_destination, uniformTemp);
5383 m_encoder->Push();
5384 m_encoder->SetSecondHalf(true);
5385 m_encoder->Copy(m_destination, uniformTemp);
m_encoder->Push();
5386 m_encoder->SetSecondHalf(false);
5387
5388 }
5389 }
5390 else
5391 {
5392 m_encoder->SetSrcRegion(0, 0, 1, 0);
5393 m_encoder->SetSrcSubReg(0, dataIndex);
5394 m_encoder->Copy(m_destination, data);
5395 m_encoder->Push();
5396 }
5397 }
5398 else
5399 {
5400 // Emits the instructions below when simdChannel isn't an immediate.
5401 //shl (16) r8.0<1>:ud r6.0<0;1,0>:d 0x2:uw {Align1, H1, NoMask}
5402 //add (16) a0.0<1>:uw r8.0<16;8,2>:uw 0x80:uw {Align1, H1, NoMask}
5403 //mov (16) r10.0<1>:d r[a0.0, 0]<1,0>:d {Align1, H1}
5404 // For SIMD32:
5405 // shl(M1, 32) V465(0, 0)<1> V464(0, 0)<16; 8, 2> 0x2:uw /// $592
5406 // mov(M1, 32) V466(0, 0)<1> V70(0, 0)<1; 1, 0> /// $593
5407 // addr_add(M1, 16) A0(0)<1> &V466 + 0 V465(0, 0)<1; 1, 0> /// $594
5408 // mov(M1, 16) V463(0, 0)<1> r[A0(0), 0]<1, 0> : f /// $595
5409 // addr_add(M5, 16) A0(0)<1> &V466 + 0 V465(0, 16)<1; 1, 0> /// $596
5410 // mov(M5, 16) V463(1, 0)<1> r[A0(0), 0]<1, 0> : f /// $597
5411
5412 bool channelUniform = simdChannel->IsUniform();
5413
5414 IGC_ASSERT_MESSAGE(m_encoder->GetCISADataTypeSize(simdChannel->GetType()) == 4,
5415 "simdChannel size of simdShuffle should be 4 bytes!");
5416
5417 // Choose the shift factor.
5418 int shtAmt = 0;
5419 switch (m_encoder->GetCISADataTypeSize(m_destination->GetType()))
5420 {
5421 case 1: shtAmt = 0; break;
5422 case 2: shtAmt = 1; break;
5423 case 4: shtAmt = 2; break;
5424 case 8: shtAmt = 3; break;
5425 default: IGC_ASSERT_MESSAGE(0, "Unexpected data type size.");
5426 }
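// E.g., for a 4-byte destination type shtAmt is 2, so each lane's byte offset
// used by the indirect mov below is simdChannel << 2.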
5427
5428 CVariable* simdChannelUW = m_currShader->BitCast(simdChannel, ISA_TYPE_UW);
5429 CVariable* pSrcElm = m_currShader->GetNewVariable(
5430 simdChannel->GetNumberElement(),
5431 ISA_TYPE_UW,
5432 EALIGN_GRF,
5433 channelUniform,
5434 simdChannel->GetNumberInstance(),
5435 "ShuffleTmp");
5436 if (!channelUniform)
5437 {
5438 m_encoder->SetSrcRegion(0, 16, 8, 2);
5439 }
5440 m_encoder->Shl(pSrcElm, simdChannelUW,
5441 m_currShader->ImmToVariable(shtAmt, ISA_TYPE_UW));
5442 m_encoder->Push();
5443
5444 CVariable* src = data;
5445
5446 if (isSimd32)
5447 {
5448 CVariable* contiguousData = nullptr;
5449 CVariable* upperHalfOfContiguousData = nullptr;
5450
5451 const uint16_t numElements = data->GetNumberElement();
5452 const VISA_Type dataType = data->GetType();
5453
5454 IGC_ASSERT(numElements == 16);
5455 IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5456
5457 // Create a 32 element variable and copy both instances of data into it.
5458 contiguousData = m_currShader->GetNewVariable(
5459 numElements * 2,
5460 dataType,
5461 data->GetAlign(),
5462 false, // isUniform
5463 1,
5464 "ShuffleTmp"); // numberInstance
5465
5466 upperHalfOfContiguousData = m_currShader->GetNewAlias(
5467 contiguousData,
5468 dataType,
5469 numElements * m_encoder->GetCISADataTypeSize(dataType),
5470 numElements);
5471
5472 IGC_ASSERT(contiguousData);
5473 IGC_ASSERT(upperHalfOfContiguousData);
5474
5475 m_encoder->SetSecondHalf(false);
5476 m_encoder->Copy(contiguousData, data);
5477 m_encoder->Push();
5478
5479 m_encoder->SetSecondHalf(true);
5480 m_encoder->Copy(upperHalfOfContiguousData, data);
5481 m_encoder->Push();
5482
5483 if (!channelUniform)
5484 {
5485 // also calculate the second half of address
5486 m_encoder->SetSrcRegion(0, 16, 8, 2);
5487 m_encoder->Shl(pSrcElm, simdChannelUW,
5488 m_currShader->ImmToVariable(shtAmt, ISA_TYPE_UW));
5489 m_encoder->Push();
5490 }
5491
5492 m_encoder->SetSecondHalf(false);
5493
5494 src = contiguousData;
5495 }
5496
5497 uint16_t addrSize = channelUniform ? 1 :
5498 (m_SimdMode == SIMDMode::SIMD32 ? numLanes(SIMDMode::SIMD16) : numLanes(m_SimdMode));
5499
5500 // VectorUniform is true for the shuffle, as all SIMD lanes
5501 // take the same data as lane 0.
5502 CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
5503 addrSize,
5504 m_destination->GetType(),
5505 channelUniform,
5506 true,
5507 m_destination->getName());
5508
5509 m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5510 m_encoder->Push();
5511
5512 m_encoder->Copy(m_destination, pDstArrElm);
5513 m_encoder->Push();
5514
5515 if (isSimd32)
5516 {
5517 m_encoder->SetSecondHalf(true);
5518 m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5519 m_encoder->Push();
5520 m_encoder->Copy(m_destination, pDstArrElm);
5521 m_encoder->Push();
5522 m_encoder->SetSecondHalf(false);
5523 }
5524 }
5525 }
5526
5527 void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
5528 {
5529 CVariable* pCurrentData = GetSymbol(inst->getOperand(0));
5530 CVariable* pNextData = GetSymbol(inst->getOperand(1));
5531 CVariable* pDelta = m_currShader->GetSymbol(inst->getOperand(2));
5532
5533 // temp size is the sum of src0 and src1
5534 uint16_t nbElements = numLanes(m_SimdMode) * 2;
5535
5536 // Join current and Next Data
5537 CVariable* pCombinedData = m_currShader->GetNewVariable(
5538 nbElements,
5539 m_destination->GetType(),
5540 m_destination->GetAlign(),
5541 "ShuffleTmp");
5542
5543 auto CopyData = [this](CVariable* pDestinationData, CVariable* pSourceData, uint32_t offset)
5544 {
5545 for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5546 {
5547 IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5548 uint32_t currentOffset = offset + numLanes(m_encoder->GetSimdSize()) * i;
5549 bool isSecondHalf = i == 1;
5550
5551 if (isSecondHalf)
5552 {
5553 m_encoder->SetSecondHalf(true);
5554 }
5555
5556 m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5557 m_encoder->SetDstSubReg(currentOffset);
5558 m_encoder->SetNoMask();
5559 m_encoder->Copy(pDestinationData, pSourceData);
5560 m_encoder->Push();
5561
5562 if (isSecondHalf)
5563 {
5564 m_encoder->SetSecondHalf(false);
5565 }
5566 }
5567 };
5568
5569 CopyData(pCombinedData, pCurrentData, 0);
5570 CopyData(pCombinedData, pNextData, numLanes(m_encoder->GetSimdSize()) * m_currShader->m_numberInstance);
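// pCombinedData now holds both operands back to back. E.g., for SIMD32 split
// into two SIMD16 instances (assumed): lanes 0-31 of the current data followed
// by lanes 0-31 of the next data, so a lane's shuffle-down source is simply
// laneId + delta into the combined block.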
5571
5572 // Emit mov with direct addressing when delta is a compile-time constant.
5573 const bool useDirectAddressing = pDelta->IsImmediate()
5574 && m_currShader->m_Platform->GetPlatformFamily() != IGFX_GEN8_CORE;
5575
5576 auto nativeExecSize = numLanes(m_currShader->m_Platform->getMinDispatchMode());
5577 auto width = numLanes(m_SimdMode);
5578 if (useDirectAddressing && nativeExecSize * 2 >= width)
5579 {
5580 const uint dataIndex = pDelta->GetImmediateValue() % nbElements;
5581 int tripCount = width <= nativeExecSize ? 1 : 2;
5582 for (int i = 0; i < tripCount; ++i)
5583 {
5584 m_encoder->SetSimdSize(m_currShader->m_Platform->getMinDispatchMode());
5585 m_encoder->SetSrcRegion(0, 1, 1, 0);
5586 m_encoder->SetSrcSubReg(0, dataIndex + nativeExecSize * i);
5587 m_encoder->SetDstSubReg(nativeExecSize * i);
5588 m_encoder->Copy(m_destination, pCombinedData);
5589 m_encoder->Push();
5590 }
5591 return;
5592 }
5593
5594 // Emits below instructions:
5595 // mov (8) r12.0<1>:w 0x76543210:v {Align1, Q1, NoMask}
5596 // mov (8) r38.0<1>:ud r12.0<8;8,1>:w {Align1, Q1, NoMask}
5597 // add (8) r39.0<1>:ud r38.0<8;8,1>:ud 0x8:uw {Align1, Q1, NoMask}
5598 // add (16) r40.0<1>:ud r14.0<8;8,1>:d r38.0<8;8,1>:ud {Align1, H1, NoMask}
5599 // shl (16) r42.0<1>:ud r40.0<8;8,1>:ud 0x2:uw {Align1, H1, NoMask}
5600 // add (16) a0.0<1>:uw r42.0<16;8,2>:uw 0x440:uw {Align1, H1, NoMask}
5601 // mov (16) r49.0<1>:d r[a0.0, 0]<1,0>:d {Align1, H1}
5602
5603 CVariable* pLaneId = m_currShader->GetNewVariable(
5604 numLanes(m_SimdMode),
5605 ISA_TYPE_UD,
5606 EALIGN_GRF,
5607 "LaneId");
5608
5609 m_encoder->SetSimdSize(SIMDMode::SIMD8);
5610 m_encoder->SetNoMask();
5611 CVariable* imm0 = m_currShader->ImmToVariable(0x76543210, ISA_TYPE_V);
5612 m_encoder->Cast(pLaneId, imm0);
5613 m_encoder->Push();
5614
5615 if (m_SimdMode == SIMDMode::SIMD16 || m_SimdMode == SIMDMode::SIMD32)
5616 {
5617 m_encoder->SetDstSubVar(0);
5618 m_encoder->SetDstSubReg(8);
5619 m_encoder->SetSimdSize(SIMDMode::SIMD8);
5620 m_encoder->SetNoMask();
5621 CVariable* imm1 = m_currShader->ImmToVariable(0x8, ISA_TYPE_UD);
5622 m_encoder->Add(pLaneId, pLaneId, imm1);
5623 m_encoder->Push();
5624 }
5625
5626 if (m_SimdMode == SIMDMode::SIMD32)
5627 {
5628 m_encoder->SetSimdSize(SIMDMode::SIMD16);
5629 m_encoder->SetDstSubReg(16);
5630 m_encoder->SetNoMask();
5631 CVariable* imm1 = m_currShader->ImmToVariable(0x10, ISA_TYPE_UD);
5632 m_encoder->Add(pLaneId, pLaneId, imm1);
5633 m_encoder->Push();
5634 }
5635
5636 CVariable* pShuffleIdx = m_currShader->GetNewVariable(
5637 numLanes(m_SimdMode),
5638 ISA_TYPE_UD,
5639 EALIGN_GRF,
5640 "ShuffleIdx");
5641
5642 for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5643 {
5644 IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5645 uint32_t offset = numLanes(m_encoder->GetSimdSize()) * i;
5646 bool isSecondHalf = i == 1;
5647
5648 if (isSecondHalf)
5649 {
5650 m_encoder->SetSecondHalf(true);
5651 }
5652
5653 CVariable* pCurrentLaneId = m_currShader->GetNewAlias(
5654 pLaneId,
5655 pLaneId->GetType(),
5656 offset * m_encoder->GetCISADataTypeSize(pLaneId->GetType()),
5657 numLanes(m_encoder->GetSimdSize()));
5658
5659 m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5660 m_encoder->SetDstSubReg(offset);
5661 m_encoder->SetNoMask();
5662 m_encoder->Add(pShuffleIdx, pCurrentLaneId, pDelta);
5663 m_encoder->Push();
5664
5665 if (isSecondHalf)
5666 {
5667 m_encoder->SetSecondHalf(false);
5668 }
5669 }
5670
5671 CVariable* pByteOffset = m_currShader->GetNewVariable(
5672 numLanes(m_SimdMode),
5673 ISA_TYPE_UD,
5674 EALIGN_GRF,
5675 "ByteOffset");
5676
5677 uint32_t shift = m_destination->GetElemSize() / 2;
5678
5679 for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5680 {
5681 uint32_t offset = numLanes(m_encoder->GetSimdSize()) * i;
5682
5683 CVariable* pCurrentShuffleIdx = m_currShader->GetNewAlias(
5684 pShuffleIdx,
5685 pShuffleIdx->GetType(),
5686 offset * m_encoder->GetCISADataTypeSize(pShuffleIdx->GetType()),
5687 numLanes(m_encoder->GetSimdSize()));
5688
5689 m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5690 m_encoder->SetDstSubReg(offset);
5691 m_encoder->SetNoMask();
5692 m_encoder->Shl(pByteOffset, pCurrentShuffleIdx, m_currShader->ImmToVariable(shift, ISA_TYPE_UD));
5693 m_encoder->Push();
5694 }
5695
5696
5697 uint16_t addrSize = m_SimdMode == SIMDMode::SIMD32 ? numLanes(SIMDMode::SIMD16) : numLanes(m_SimdMode);
5698
5699 CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
5700 addrSize,
5701 m_destination->GetType(),
5702 false,
5703 false,
5704 m_destination->getName());
5705
5706 for (uint32_t i = 0; i < m_currShader->m_numberInstance; i++)
5707 {
5708 IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5709 uint32_t offset = numLanes(m_encoder->GetSimdSize()) * i;
5710 bool isSecondHalf = i == 1;
5711
5712 CVariable* pCurrentByteOffset = m_currShader->GetNewAlias(
5713 pByteOffset,
5714 pByteOffset->GetType(),
5715 offset * m_encoder->GetCISADataTypeSize(pByteOffset->GetType()),
5716 numLanes(m_encoder->GetSimdSize()));
5717
5718 m_encoder->SetNoMask();
5719 m_encoder->SetSrcRegion(1, 16, 8, 2);
5720 m_encoder->AddrAdd(pDstArrElm, pCombinedData, m_currShader->BitCast(pCurrentByteOffset, ISA_TYPE_UW));
5721 m_encoder->Push();
5722
5723 if (isSecondHalf)
5724 {
5725 m_encoder->SetSecondHalf(true);
5726 }
5727
5728 m_encoder->Copy(m_destination, pDstArrElm);
5729 m_encoder->Push();
5730
5731 if (isSecondHalf)
5732 {
5733 m_encoder->SetSecondHalf(false);
5734 }
5735 }
5736 }
5737
5738 static uint32_t getBlockMsgSize(uint32_t bytesRemaining, uint32_t maxSize)
5739 {
5740 uint32_t size = 0;
5741 if (bytesRemaining >= 256)
5742 {
5743 size = 256;
5744 }
5745 else if (bytesRemaining >= 128)
5746 {
5747 size = 128;
5748 }
5749 else if (bytesRemaining >= 64)
5750 {
5751 size = 64;
5752 }
5753 else if (bytesRemaining >= 32)
5754 {
5755 size = 32;
5756 }
5757 else
5758 {
5759 size = 16;
5760 }
5761 return std::min(size, maxSize);
5762 }
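// Illustrative splitting (assuming maxSize = 128): a 160-byte region is
// emitted as one 128-byte block followed by one 32-byte block; the final
// 16-byte fallback is only reached for sub-32-byte tails.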
5763
5764
5765 void EmitPass::emitSimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal)
5766 {
5767 emitLegacySimdBlockWrite(inst, ptrVal);
5768
5769 }
5770
5771 void EmitPass::emitSimdBlockRead(llvm::Instruction* inst, llvm::Value* ptrVal)
5772 {
5773 emitLegacySimdBlockRead(inst, ptrVal);
5774 }
5775
5776 void EmitPass::emitLegacySimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal)
5777 {
5778 Value* llPtr = inst->getOperand(0);
5779 Value* dataPtr = inst->getOperand(1);
5780
5781 PointerType* ptrType = cast<PointerType>(llPtr->getType());
5782 ResourceDescriptor resource = GetResourceVariable(llPtr);
5783
5784 CVariable* src = nullptr;
5785 if (ptrVal)
5786 {
5787 src = GetSymbol(ptrVal);
5788 src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
5789 }
5790 else
5791 {
5792 src = GetSymbol(llPtr);
5793 }
5794
5795 CVariable* data = GetSymbol(dataPtr);
5796 bool useA64 = isA64Ptr(ptrType, m_currShader->GetContext());
5797
5798 Type* Ty = dataPtr->getType();
5799 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
5800 uint32_t nbElements = 0;
5801 nbElements = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
5802
5803 uint32_t typeSizeInBytes = Ty->getScalarSizeInBits() / 8;
5804 uint32_t totalBytes = nbElements * typeSizeInBytes * numLanes(m_SimdMode);
5805
5806 bool isSeparated = m_SimdMode == SIMDMode::SIMD32 &&
5807 m_encoder->GetSimdSize() == SIMDMode::SIMD16;
5808
5809 // With multiple instances, the data layout differs from the one expected by block write instructions.
5810 // The expected layout:
5811 // |0th component of data from thread 0-15 |0th component of data from thread 16-31|
5812 // |1st component of data from thread 0-15 |1st component of data from thread 16-31|
5813 // The current layout:
5814 // |0th component of data from thread 0-15 |1st component of data from thread 0-15 |
5815 // |0th component of data from thread 16-31|1st component of data from thread 16-31|
5816 if (isSeparated)
5817 {
5818 IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
5819 const uint32_t numVectorElementsPerSimd = numLanes(m_encoder->GetSimdSize());
5820 CVariable* copiedData = m_currShader->GetNewVariable(
5821 data->GetNumberElement() * data->GetNumberInstance(),
5822 data->GetType(),
5823 data->GetAlign(),
5824 "");
5825
5826 for (uint32_t i = 0; i < 2; i++)
5827 {
5828 if (i == 1)
5829 {
5830 m_encoder->SetSecondHalf(true);
5831 }
5832
5833 for (uint32_t elementIndex = 0; elementIndex < nbElements; elementIndex++)
5834 {
5835 // Offsets can be deduced from the upper comment.
5836 CVariable* destinationAlias = m_currShader->GetNewAlias(
5837 copiedData,
5838 copiedData->GetType(),
5839 numVectorElementsPerSimd * (nbElements * elementIndex + i) * m_encoder->GetCISADataTypeSize(copiedData->GetType()),
5840 numVectorElementsPerSimd);
5841 CVariable* sourceAlias = data;
5842 if (!data->IsUniform())
5843 {
5844 sourceAlias = m_currShader->GetNewAlias(
5845 data,
5846 data->GetType(),
5847 numVectorElementsPerSimd * elementIndex * m_encoder->GetCISADataTypeSize(data->GetType()),
5848 numVectorElementsPerSimd);
5849 }
5850
5851 m_encoder->SetSimdSize(m_encoder->GetSimdSize());
5852 m_encoder->SetNoMask();
5853 m_encoder->Copy(destinationAlias, sourceAlias);
5854 m_encoder->Push();
5855 }
5856 }
5857
5858 m_encoder->SetSecondHalf(false);
5859 data = copiedData;
5860 }
5861 else
5862 {
5863 // Special case for uniform data: block writes expect non-uniform data, so broadcast it.
5864 data = BroadcastIfUniform(data);
5865 }
5866
5867
5868 // Special case for SIMD8 char block write, in which the total bytes = 8.
5869 // (In all other cases, the total byte count is a multiple of 16, i.e. an OW.)
5870 if (totalBytes == 8)
5871 {
5872 // Use byte scattered write. If the address is aligned to at least QW,
5873 // we should use a QW-aligned QW write!
5874 // Byte scattered write: use (blksizeInBits, nblk) = (8, 4) and two lanes
5875 // QW write           : use (blksizeInBits, nblk) = (64, 1) [todo]
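// E.g., SIMD8 with char data: 8 lanes * 1 byte = 8 bytes total, written as
// two lanes of four 8-bit blocks each; the second lane's address is offset
// by 4 bytes via the 0x40:uv immediate below.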
5876 bool useQW = false;
5877 uint32_t blkBits = useQW ? 64 : 8;
5878 uint32_t nBlks = useQW ? 1 : 4;
5879
5880 uint16_t activelanes = useQW ? 1 : 2;
5881 // lanesToSIMDMode(activelanes);
5882 SIMDMode simdmode = useQW ? SIMDMode::SIMD1 : SIMDMode::SIMD2;
5883
5884 CVariable* eOffset = src;
5885 eOffset = ReAlignUniformVariable(src, m_currShader->getGRFAlignment());
5886 CVariable* ScatterOff = eOffset;
5887 if (activelanes > 1)
5888 {
5889 IGC_ASSERT_MESSAGE(!useQW, "Only one lane is active when using QW!");
5890
5891 ScatterOff = m_currShader->GetNewVariable(
5892 activelanes, eOffset->GetType(), eOffset->GetAlign(), true, "ScatterOff");
5893
5894 CVariable* immVar = m_currShader->ImmToVariable(0x40, ISA_TYPE_UV);
5895 if (useA64 && m_currShader->m_Platform->hasNoInt64AddInst()) {
5896 emitAddPair(ScatterOff, eOffset, immVar);
5897 }
5898 else {
5899 m_encoder->SetNoMask();
5900 m_encoder->SetUniformSIMDSize(simdmode);
5901 m_encoder->SetSrcRegion(0, 0, 1, 0);
5902 m_encoder->Add(ScatterOff, eOffset, immVar);
5903 m_encoder->Push();
5904 }
5905 }
5906
5907 m_encoder->SetNoMask();
5908 m_encoder->SetUniformSIMDSize(simdmode);
5909 if (useA64)
5910 {
5911 emitScatterA64(data, ScatterOff, blkBits, nBlks, true);
5912 }
5913 else
5914 {
5915 m_encoder->ByteScatter(data, resource, ScatterOff, blkBits, nBlks);
5916 }
5917 m_encoder->Push();
5918
5919 return;
5920 }
5921
5922 if (useA64)
5923 {
5924 uint32_t bytesRemaining = totalBytes;
5925 uint32_t srcOffset = 0;
5926 uint32_t bytesToRead = 0;
5927
5928 // Emits instructions generating one or more A64 OWORD block write instructions
5929 // The amount of data we need to write is n * Component Size OWORDs.
5930 // We can write 8, 4, or 2 OWORDs at a time. We can also write 1 OWORD,
5931 // but since this is a SIMD opcode and we're compiling SIMD8, SIMD16,
5932 // we don't expect to see a 1 OWORD write.
5933
5934 m_encoder->SetSimdSize(SIMDMode::SIMD1);
5935 m_encoder->SetNoMask();
5936 m_encoder->SetSrcRegion(0, 0, 1, 0);
5937
5938 CVariable* pTempVar = m_currShader->GetNewVariable(
5939 numLanes(SIMDMode::SIMD1),
5940 ISA_TYPE_UQ,
5941 EALIGN_QWORD, true, CName::NONE);
5942
5943 m_encoder->Copy(pTempVar, m_currShader->BitCast(src, ISA_TYPE_UQ));
5944 m_encoder->Push();
5945
5946 while (bytesRemaining)
5947 {
5948 bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(false));
5949 bytesRemaining -= bytesToRead;
5950 m_encoder->OWStoreA64(data, pTempVar, bytesToRead, srcOffset);
5951
5952 srcOffset = srcOffset + bytesToRead;
5953 m_encoder->Push();
5954
5955 if (bytesRemaining)
5956 {
5957 if (m_currShader->m_Platform->hasNoInt64AddInst()) {
5958 CVariable* ImmVar = m_currShader->ImmToVariable(bytesToRead, ISA_TYPE_UD);
5959 emitAddPair(pTempVar, pTempVar, ImmVar);
5960 }
5961 else {
5962 m_encoder->SetSimdSize(SIMDMode::SIMD1);
5963 m_encoder->SetNoMask();
5964 m_encoder->SetSrcRegion(0, 0, 1, 0);
5965 m_encoder->Add(pTempVar, pTempVar, m_currShader->ImmToVariable((bytesToRead), ISA_TYPE_UQ));
5966 m_encoder->Push();
5967 }
5968 }
5969 }
5970 }
5971 else
5972 {
5973 uint32_t bytesRemaining = totalBytes;
5974
5975 // Emits instructions generating one or more OWORD block write instructions
5976 // The amount of data we need to write is n * Component Size OWORDs.
5977 // We can write 8, 4, or 2 OWORDs at a time. We can also write 1 OWORD,
5978 // but since this is a SIMD opcode and we're compiling SIMD8, SIMD16,
5979 // we don't expect to see a 1 OWORD write.
5980
5981 // shr (1) r64.2<1>:ud r60.0<0; 1, 0>:ud 0x4:uw{ Align1, H1, NoMask }
5982 // mov (16) r65.0<1>:ud r54.0<8; 8, 1>:ud{ Align1, NoMask, Compacted }
5983 // and (1) r64.5<1>:ud r0.5<0; 1, 0>:ud 0x3ff:ud{ Align1, NoMask }
5984 // send (16) null<1>:uw r64 0xa 0x60a03ff:ud{ Align1, NoMask } oword block write
5985
5986 CVariable* src0shifted = m_currShader->GetNewVariable(
5987 numLanes(SIMDMode::SIMD1),
5988 ISA_TYPE_UD,
5989 EALIGN_DWORD,
5990 "Src0Shifted");
5991
5992 m_encoder->SetSimdSize(SIMDMode::SIMD1);
5993 m_encoder->SetNoMask();
5994 m_encoder->SetSrcRegion(0, 0, 1, 0);
5995 m_encoder->Shr(src0shifted, src, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
5996 m_encoder->Push();
5997
5998 uint32_t srcOffset = 0;
5999 uint32_t bytesToRead = 0;
6000 while (bytesRemaining)
6001 {
6002 bool isToSLM = ptrType->getPointerAddressSpace() == ADDRESS_SPACE_LOCAL;
6003 bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(isToSLM));
6004 bytesRemaining -= bytesToRead;
6005
6006 m_encoder->OWStore(data, resource.m_surfaceType, resource.m_resource, src0shifted, bytesToRead, srcOffset);
6007
6008 srcOffset = srcOffset + bytesToRead;
6009 m_encoder->Push();
6010
6011 if (bytesRemaining)
6012 {
6013 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6014 m_encoder->SetNoMask();
6015 m_encoder->SetSrcRegion(0, 0, 1, 0);
6016 m_encoder->Add(src0shifted, src0shifted, m_currShader->ImmToVariable((bytesToRead / 16), ISA_TYPE_UD)); // (bytesToRead / 16) is units of OWORDS
6017 m_encoder->Push();
6018 }
6019 }
6020 }
6021 }
6022
6023 void EmitPass::emitLegacySimdBlockRead(llvm::Instruction* inst, llvm::Value* ptrVal)
6024 {
6025 Value* llPtr = inst->getOperand(0);
6026 PointerType* ptrType = cast<PointerType>(llPtr->getType());
6027 ResourceDescriptor resource = GetResourceVariable(llPtr);
6028
6029 CVariable* src = nullptr;
6030 if (ptrVal)
6031 {
6032 src = GetSymbol(ptrVal);
6033 src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
6034 }
6035 else
6036 {
6037 src = GetSymbol(llPtr);
6038 }
6039
6040 // If it is SLM, use an OW-aligned OW address. The byte address (default)
6041 // must be right-shifted by 4 bits to become an OW address!
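// E.g., a byte address of 0x40 becomes OW address 0x4 (0x40 >> 4).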
6042 bool isToSLM = (ptrType->getPointerAddressSpace() == ADDRESS_SPACE_LOCAL);
6043 bool useA64 = isA64Ptr(ptrType, m_currShader->GetContext());
6044
6045 Type* Ty = inst->getType();
6046 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
6047 uint32_t nbElements = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
6048
6049 uint32_t typeSizeInBytes = Ty->getScalarSizeInBits() / 8;
6050 uint32_t totalBytes = nbElements * typeSizeInBytes * numLanes(m_SimdMode);
6051
6052
6053 bool needsTempDst = m_SimdMode == SIMDMode::SIMD32 &&
6054 m_encoder->GetSimdSize() == SIMDMode::SIMD16;
6055 CVariable* dest = needsTempDst ?
6056 m_currShader->GetNewVariable(
6057 m_destination->GetNumberElement() * m_destination->GetNumberInstance(),
6058 m_destination->GetType(),
6059 m_destination->GetAlign(),
6060 "") :
6061 m_destination;
6062
6063 // Special case for SIMD8 char block read, in which the total bytes = 8.
6064 // (In all other cases, the total byte count is a multiple of 16, i.e. an OW.)
6065 if (totalBytes == 8)
6066 {
6067 // Use byte scattered read. If the address is aligned to at least QW,
6068 // we should use a QW-aligned QW read!
6069 // Byte scattered read: use (blksizeInBits, nblk) = (8, 4) and two lanes
6070 // QW read            : use (blksizeInBits, nblk) = (64, 1) [todo]
6071 bool useQW = false;
6072 uint32_t blkBits = useQW ? 64 : 8;
6073 uint32_t nBlks = useQW ? 1 : 4;
6074 CVariable* gatherDst = dest;
6075
6076 uint16_t activelanes = useQW ? 1 : 2;
6077 // lanesToSIMDMode(activelanes);
6078 SIMDMode simdmode = useQW ? SIMDMode::SIMD1 : SIMDMode::SIMD2;
6079
6080 CVariable* eOffset = src;
6081 eOffset = ReAlignUniformVariable(src, m_currShader->getGRFAlignment());
6082 CVariable* gatherOff = eOffset;
6083 if (activelanes > 1)
6084 {
6085 IGC_ASSERT_MESSAGE(!useQW, "Only one lane is active when using QW!");
6086
6087 gatherOff = m_currShader->GetNewVariable(
6088 activelanes, eOffset->GetType(), eOffset->GetAlign(), true, "GatherOff");
6089
6090 CVariable* immVar = m_currShader->ImmToVariable(0x40, ISA_TYPE_UV);
6091 if (useA64 && m_currShader->m_Platform->hasNoInt64AddInst()) {
6092 emitAddPair(gatherOff, eOffset, immVar);
6093 }
6094 else {
6095 m_encoder->SetNoMask();
6096 m_encoder->SetUniformSIMDSize(simdmode);
6097 m_encoder->SetSrcRegion(0, 0, 1, 0);
6098 m_encoder->Add(gatherOff, eOffset, immVar);
6099 m_encoder->Push();
6100 }
6101 }
6102
6103 m_encoder->SetNoMask();
6104 m_encoder->SetUniformSIMDSize(simdmode);
6105 if (useA64)
6106 {
6107 emitGatherA64(inst, gatherDst, gatherOff, blkBits, nBlks, true);
6108 }
6109 else
6110 {
6111 m_encoder->SetNoMask();
6112 m_encoder->SetUniformSIMDSize(simdmode);
6113 m_encoder->ByteGather(gatherDst, resource, gatherOff, blkBits, nBlks);
6114 }
6115 m_encoder->Push();
6116
6117 return;
6118 }
6119
6120 if (useA64)
6121 {
6122 IGC_ASSERT_MESSAGE(!isToSLM, "SLM's ptr size should be 32!");
6123
6124 uint32_t dstOffset = 0;
6125 uint32_t bytesRemaining = totalBytes;
6126 uint32_t bytesToRead = 0;
6127
6128 // Emits instructions generating one or more A64 OWORD block read instructions
6129 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6130 m_encoder->SetNoMask();
6131 m_encoder->SetSrcRegion(0, 0, 1, 0);
6132
6133 CVariable* pTempVar = m_currShader->GetNewVariable(
6134 numLanes(SIMDMode::SIMD1),
6135 ISA_TYPE_UQ,
6136 EALIGN_QWORD, true,
6137 CName::NONE);
6138
6139 m_encoder->Copy(pTempVar, src);
6140 m_encoder->Push();
6141
6142 while (bytesRemaining)
6143 {
6144 bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(false));
6145 bytesRemaining -= bytesToRead;
6146 m_encoder->OWLoadA64(dest, pTempVar, bytesToRead, dstOffset);
6147 m_encoder->Push();
6148 dstOffset += bytesToRead;
6149
6150 if (bytesRemaining)
6151 {
6152 if (m_currShader->m_Platform->hasNoInt64AddInst()) {
6153 CVariable* ImmVar = m_currShader->ImmToVariable(bytesToRead, ISA_TYPE_UD);
6154 emitAddPair(pTempVar, pTempVar, ImmVar);
6155 }
6156 else {
6157 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6158 m_encoder->SetNoMask();
6159 m_encoder->SetSrcRegion(0, 0, 1, 0);
6160 m_encoder->Add(pTempVar, pTempVar, m_currShader->ImmToVariable(bytesToRead, ISA_TYPE_UQ));
6161 m_encoder->Push();
6162 }
6163 }
6164 }
6165 }
6166 else
6167 {
6168 // Emits below instructions generating one or more OWORD block read instructions:
6169 // mov (1) r20.0<1>:ud r5.1<0;1,0>:ud {Align1, Q1, NoMask, Compacted}
6170 // and (1) r21.5<1>:ud r0.5<0;1,0>:ud 0x3ff:ud {Align1, NoMask}
6171 // mov (1) r21.2<1>:ud r20.0<0;1,0>:ud {Align1, NoMask, Compacted}
6172 // send (16) r12.0<1>:w r21 0xa 0x24844ff:ud{Align1, NoMask}// unaligned oword block read
6173
6174 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6175 m_encoder->SetNoMask();
6176 m_encoder->SetSrcRegion(0, 0, 1, 0);
6177
6178 CVariable* pTempVar = m_currShader->GetNewVariable(
6179 numLanes(SIMDMode::SIMD1),
6180 ISA_TYPE_UD,
6181 EALIGN_DWORD,
6182 CName::NONE);
6183
6184 if (isToSLM)
6185 {
6186 // It is OW-aligned OW address
6187 m_encoder->Shr(pTempVar, src, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
6188 }
6189
6190 m_encoder->Push();
6191
6192 uint32_t dstOffset = 0;
6193 uint32_t bytesToRead = 0;
6194 uint32_t bytesRemaining = totalBytes;
6195 bool isFirstIter = true;
6196 while (bytesRemaining)
6197 {
6198
6199 bytesToRead = getBlockMsgSize(bytesRemaining, m_currShader->m_Platform->getMaxBlockMsgSize(isToSLM));
6200 bytesRemaining -= bytesToRead;
6201
6202 bool useSrc = isFirstIter && !isToSLM;
6203 m_encoder->OWLoad(dest, resource, useSrc ? src : pTempVar, isToSLM, bytesToRead, dstOffset);
6204 m_encoder->Push();
6205 dstOffset += bytesToRead;
6206
6207 if (bytesRemaining)
6208 {
6209 uint32_t offset = (isToSLM ? bytesToRead / 16 : bytesToRead);
6210 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6211 m_encoder->SetNoMask();
6212 m_encoder->SetSrcRegion(0, 0, 1, 0);
6213 m_encoder->Add(pTempVar, useSrc ? src : pTempVar, m_currShader->ImmToVariable(offset, ISA_TYPE_UD));
6214 m_encoder->Push();
6215 }
6216 isFirstIter = false;
6217 }
6218 }
6219
6220 // With multiple instances, the block read result has a different layout than the destination expects.
6221 // The expected layout:
6222 // |0th component of data from thread 0-15 |1st component of data from thread 0-15 |
6223 // |0th component of data from thread 16-31|1st component of data from thread 16-31|
6224 // The current layout:
6225 // |0th component of data from thread 0-15 |0th component of data from thread 16-31|
6226 // |1st component of data from thread 0-15 |1st component of data from thread 16-31|
6227 if (needsTempDst)
6228 {
6229 IGC_ASSERT_MESSAGE(!m_encoder->IsSecondHalf(), "This emitter must be called only once for simd32!");
6230 const uint32_t numVectorElementsPerSimd = numLanes(m_encoder->GetSimdSize());
6231
6232 for (uint32_t i = 0; i < 2; i++)
6233 {
6234 if (i == 1)
6235 {
6236 m_encoder->SetSecondHalf(true);
6237 }
6238
6239 for (uint32_t elementIndex = 0; elementIndex < nbElements; elementIndex++)
6240 {
6241 // Offsets can be deduced from the upper comment.
6242 CVariable* destinationAlias = m_currShader->GetNewAlias(
6243 m_destination,
6244 m_destination->GetType(),
6245 numVectorElementsPerSimd * elementIndex * m_encoder->GetCISADataTypeSize(m_destination->GetType()),
6246 numVectorElementsPerSimd);
6247 CVariable* sourceAlias = m_currShader->GetNewAlias(
6248 dest,
6249 dest->GetType(),
6250 numVectorElementsPerSimd * (nbElements * elementIndex + i) * m_encoder->GetCISADataTypeSize(dest->GetType()),
6251 numVectorElementsPerSimd);
6252
6253 m_encoder->SetSimdSize(m_encoder->GetSimdSize());
6254 m_encoder->SetNoMask();
6255 m_encoder->Copy(destinationAlias, sourceAlias);
6256 m_encoder->Push();
6257 }
6258 }
6259
6260 m_encoder->SetSecondHalf(false);
6261 }
6262 }
6263
6264
6265 void EmitPass::emitMediaBlockIO(const llvm::GenIntrinsicInst* inst, bool isRead)
6266 {
6267 uint ImgArgIndex = (uint)GetImmediateVal(inst->getOperand(0));
6268 uint isImageTypeUAV = (uint)GetImmediateVal(inst->getOperand(3));
6269
6270 uint32_t BTI = isImageTypeUAV ?
6271 m_currShader->m_pBtiLayout->GetUavIndex(ImgArgIndex) :
6272 m_currShader->m_pBtiLayout->GetTextureIndex(ImgArgIndex);
6273
6274 bool directIdx = (llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0))) ? true : false;
6275 m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, isImageTypeUAV ? UAV : RESOURCE, ImgArgIndex, BTI);
6276
6277 CVariable* pImgBTI = m_currShader->ImmToVariable(BTI, ISA_TYPE_UD);
6278
6279 // Width and height must be supplied as compile-time constants.
6280 uint blockWidth = (uint)cast<ConstantInt>(inst->getOperand(4))->getZExtValue();
6281 uint blockHeight = (uint)cast<ConstantInt>(inst->getOperand(5))->getZExtValue();
6282
6283 auto* pFunc = inst->getCalledFunction();
6284 auto* pDataType = isRead ? pFunc->getReturnType() : inst->getOperand(6)->getType();
6285
6286 uint typeSize = isa<VectorType>(pDataType) ?
6287 (uint)m_DL->getTypeSizeInBits(cast<VectorType>(pDataType)->getElementType()) / 8 :
6288 (uint)m_DL->getTypeSizeInBits(pDataType) / 8;
6289
6290 uint widthInBytes = blockWidth * typeSize;
6291
6292 CVariable* pXOffset = GetSymbol(inst->getOperand(1));
6293 CVariable* pYOffset = GetSymbol(inst->getOperand(2));
6294
6295 CVariable* pDst = nullptr;
6296
6297 auto* pData = isRead ? m_destination : BroadcastIfUniform(GetSymbol(inst->getOperand(6)));
6298
6299 // For SIMD32, we need to rearrange the data from both halves
6300 // into a contiguous block to treat it as one SIMD32 write and
6301 // we need to split a read back into its two instances after
6302 // doing the read.
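// E.g., a SIMD32 write of one dword per lane (assumed): pDst is filled with
// lanes 0-15 followed by lanes 16-31, so the media block message sees one
// contiguous 128-byte block.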
6303 bool mergeBlock = (m_SimdMode == SIMDMode::SIMD32);
6304 uint16_t numElements = pData->GetNumberElement();
6305 VISA_Type dataType = pData->GetType();
6306
6307 if (mergeBlock)
6308 {
6309 // Make a block twice the size to hold data for both halves
6310 pDst = m_currShader->GetNewVariable(numElements * 2,
6311 dataType, pData->GetAlign(), false, 1, CName::NONE);
6312 }
6313 else
6314 {
6315 pDst = pData;
6316 }
6317
6318 auto BlockCopy = [&](
6319 CVariable* pDst1,
6320 CVariable* pSrc1,
6321 CVariable* pDst2,
6322 CVariable* pSrc2,
6323 uint srcStride,
6324 uint dstStride)
6325 {
6326 auto VecCopy = [&](CVariable* pDst, CVariable* pSrc, uint nElts)
6327 {
6328 for (uint32_t i = 0; i < nElts; ++i)
6329 {
6330 m_encoder->SetSrcSubReg(0, srcStride * 16 * i);
6331 m_encoder->SetDstSubReg(dstStride * 16 * i);
6332 m_encoder->Copy(pDst, pSrc);
6333 m_encoder->Push();
6334 }
6335 };
6336
6337 uint nElts = isa<VectorType>(pDataType) ?
6338 (uint)cast<IGCLLVM::FixedVectorType>(pDataType)->getNumElements() :
6339 1;
6340
6341 // Now, do the copies.
6342 bool isSecondHalf = m_encoder->IsSecondHalf();
6343
6344 m_encoder->SetSecondHalf(false);
6345 VecCopy(pDst1, pSrc1, nElts);
6346
6347 m_encoder->SetSecondHalf(true);
6348 VecCopy(pDst2, pSrc2, nElts);
6349
6350 m_encoder->SetSecondHalf(isSecondHalf);
6351 };
6352
6353 CVariable* pSecondHalf = m_currShader->GetNewAlias(pDst, dataType,
6354 16 * m_encoder->GetCISADataTypeSize(dataType), numElements);
6355
6356 if (!isRead && mergeBlock)
6357 {
6358 BlockCopy(pDst, pData, pSecondHalf, pData, 1, 2);
6359 }
6360
6361 {
6362 m_encoder->MediaBlockMessage(
6363 isRead ? ISA_Opcode::ISA_MEDIA_LD : ISA_Opcode::ISA_MEDIA_ST,
6364 pDst,
6365 ESURFACE_NORMAL,
6366 pImgBTI,
6367 pXOffset,
6368 pYOffset,
6369 0,
6370 (unsigned char)widthInBytes,
6371 (unsigned char)blockHeight,
6372 0);
6373 }
6374
6375 if (isRead && mergeBlock)
6376 {
6377 BlockCopy(m_destination, pDst, m_destination, pSecondHalf, 2, 1);
6378 }
6379 }
6380
6381 void EmitPass::emitMediaBlockRectangleRead(llvm::Instruction* inst)
6382 {
6383 int SrcImgBTI = int_cast<int>(GetImmediateVal(inst->getOperand(0)));
6384 int isImageTypeUAV = int_cast<int>(GetImmediateVal(inst->getOperand(3)));
6385
6386 CVariable* xOffset = GetSymbol(inst->getOperand(1));
6387 CVariable* yOffset = GetSymbol(inst->getOperand(2));
6388
6389 uint32_t bindingTableIndex = isImageTypeUAV ?
6390 m_currShader->m_pBtiLayout->GetUavIndex(SrcImgBTI) :
6391 m_currShader->m_pBtiLayout->GetTextureIndex(SrcImgBTI);
6392
6393 bool directIdx = (llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0))) ? true : false;
6394 m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, isImageTypeUAV ? UAV : RESOURCE, SrcImgBTI, bindingTableIndex);
6395
6396 CVariable* srcbti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_UD);
6397
6398 CVariable* pDst = GetSymbol(inst->getOperand(6));
6399
6400 // Width and height must be supplied as compile-time constants.
6401 uint64_t blockWidth = cast<ConstantInt>(inst->getOperand(4))->getZExtValue();
6402 uint64_t blockHeight = cast<ConstantInt>(inst->getOperand(5))->getZExtValue();
6403
6404 IGC_ASSERT(blockWidth * blockHeight == pDst->GetSize());
6405
6406 {
6407 m_encoder->MediaBlockMessage(
6408 ISA_Opcode::ISA_MEDIA_LD,
6409 pDst,
6410 ESURFACE_NORMAL,
6411 srcbti,
6412 xOffset,
6413 yOffset,
6414 0,
6415 (unsigned char)blockWidth,
6416 (unsigned char)blockHeight,
6417 0);
6418 }
6419
6420 m_encoder->Push();
6421 }
6422
6423 void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
6424 {
6425 uint32_t nbElements = 1;
6426 if (inst->getType()->isVectorTy())
6427 {
6428 nbElements = (uint32_t)cast<IGCLLVM::FixedVectorType>(inst->getType())->getNumElements();
6429 }
6430 IGC_ASSERT_MESSAGE(nbElements <= 8, "Invalid vector size");
6431
6432 int SrcImgBTI = int_cast<int>(GetImmediateVal(inst->getOperand(0)));
6433 int isImageTypeUAV = int_cast<int>(GetImmediateVal(inst->getOperand(3)));
6434
6435 Value* xOffset = inst->getOperand(1);
6436 Value* yOffset = inst->getOperand(2);
6437
6438 uint32_t typeSizeInBytes = inst->getType()->getScalarType()->getScalarSizeInBits() / 8;
6439 uint32_t totalWidth = typeSizeInBytes * numLanes(m_SimdMode);
6440
6441 uint32_t pass = 0;
6442 uint32_t numPasses = 0;
6443 uint32_t bindingTableIndex = 0;
6444
6445 uint32_t dstSubReg = 0;
6446 uint32_t blockWidth = 0;
6447 uint32_t blockHeight = nbElements;
6448
6449 if (isImageTypeUAV)
6450 {
6451 bindingTableIndex = m_currShader->m_pBtiLayout->GetUavIndex(SrcImgBTI);
6452 }
6453 else // image type is Resource
6454 {
6455 bindingTableIndex = m_currShader->m_pBtiLayout->GetTextureIndex(SrcImgBTI);
6456 }
6457
6458 m_currShader->SetBindingTableEntryCountAndBitmap(true, isImageTypeUAV ? UAV : RESOURCE, SrcImgBTI, bindingTableIndex);
6459
6460 CVariable* srcbti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_UD);
6461 uint32_t maxWidth = 32;
6462
6463 if (totalWidth < maxWidth)
6464 {
6465 numPasses = 1;
6466 blockWidth = totalWidth;
6467 }
6468 else
6469 {
6470 IGC_ASSERT(maxWidth);
6471 IGC_ASSERT_MESSAGE(totalWidth % maxWidth == 0, "Total width must be divisible by 32!");
6472 numPasses = totalWidth / maxWidth;
6473 blockWidth = maxWidth;
6474 }
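// A worked example (illustrative values): dword elements in SIMD16 give
// totalWidth = 4 * 16 = 64 bytes, i.e. numPasses = 2 passes of 32 bytes
// each; word elements in SIMD16 (or dwords in SIMD8) give exactly 32
// bytes and a single pass.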
6475
6476
6477 CVariable* pTempVar0 = nullptr;
6478 CVariable* pTempVar = nullptr;
6479
6480 uint32_t blockRegSize = 0;
6481
6482 // The following variable declaration is SIMD8-based; since UD is used, blockRegSize is the total number of registers required.
6483 auto simdMode = lanesToSIMDMode(blockWidth / typeSizeInBytes);
6484 blockRegSize = numPasses * blockHeight * numLanes(simdMode);
6485
6486 CVariable* pTempDest = m_currShader->GetNewVariable(
6487 blockRegSize,
6488 m_destination->GetType(),
6489 m_currShader->getGRFAlignment(),
6490 CName::NONE);
6491
6492 CVariable* xVar = GetSymbol(xOffset);
6493 CVariable* yVar = GetSymbol(yOffset);
6494
6495 // Emits one MEDIA_BLOCK_READ instruction per pass.
6496 // Considering block width as the x-axis and block height as the y-axis:
6497 // Pass 0 reads from (xOffset, yOffset) to (xOffset+31, yOffset+blockHeight-1)
6498 // Pass 1 reads from (xOffset+32, yOffset) to (xOffset+63, yOffset+blockHeight-1)
6499 // Instructions generated:
6500 // mov(1) r36.1<1>:d r16.0<0; 1, 0>:d{ Align1, NoMask }
6501 // mov(1) r36.2<1>:ud 0x3001f:ud{ Align1, NoMask }
6502 // mov(1) r36.0<1>:ud r15.0<0; 1, 0>:ud{ Align1, NoMask, Compacted }
6503 // send(8) r28.0<1>:ud r36 0xc 0x2490000:ud{ Align1, NoMask } // media block read
6504 // add(1) r36.0<1>:ud r15.0<0; 1, 0>:ud 0x20:uw{ Align1, NoMask }
6505 // mov(1) r36.1<1>:d r13.1<0; 1, 0>:d{ Align1, NoMask }
6506 // send(8) r32.0<1>:ud r36 0xc 0x2490000:ud{ Align1, NoMask } // media block read
6507 // -----------------
6508 // | | |
6509 // | | |
6510 // -----------------
6511 // --------- r28 output
6512 // | |
6513 // | |
6514 // --------- r32
6515 // | |
6516 // | |
6517 // ---------
6518 // Each block row is at most 32 or 64 bytes; that is why SIMD8 is used.
6519
6520 int scale = blockWidth / getGRFSize();
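// Note: scale is the number of whole GRFs per block row. On platforms
// whose GRF is wider than blockWidth (e.g. a 64-byte GRF with a 32-byte
// row), integer division makes scale 0, and the "scale > 0 ? scale : 1"
// guard below advances by one GRF instead.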
6521
6522 for (pass = 0; pass < numPasses; pass++)
6523 {
6524 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6525 m_encoder->SetNoMask();
6526 m_encoder->SetSrcRegion(0, 0, 1, 0);
6527
6528 if (pass == 0)
6529 {
6530 pTempVar0 = m_currShader->GetNewVariable(
6531 numLanes(m_SimdMode),
6532 ISA_TYPE_UD,
6533 EALIGN_DWORD,
6534 CName::NONE);
6535
6536 m_encoder->Copy(pTempVar0, xVar);
6537 }
6538 else
6539 {
6540 m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD));
6541 uint32_t subOffset = blockWidth * blockHeight;
6542 subOffset /= getGRFSize();
6543 dstSubReg = dstSubReg + subOffset;
6544 }
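// e.g. with blockWidth = 32, blockHeight = 4 and a 32-byte GRF, each pass
// fills 4 GRFs of the temporary destination, so dstSubReg advances by 4.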
6545 m_encoder->Push();
6546
6547 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6548 m_encoder->SetNoMask();
6549 m_encoder->SetSrcRegion(0, 0, 1, 0);
6550
6551 pTempVar = m_currShader->GetNewVariable(
6552 numLanes(m_SimdMode),
6553 ISA_TYPE_UD,
6554 EALIGN_DWORD,
6555 CName::NONE);
6556
6557 m_encoder->Copy(pTempVar, yVar);
6558 m_encoder->Push();
6559
6560 m_encoder->SetDstSubVar(dstSubReg);
6561
6562 CVariable* dstVar = numPasses == 1 ? m_destination : pTempDest;
6563
6564 {
6565 m_encoder->MediaBlockMessage(
6566 ISA_Opcode::ISA_MEDIA_LD,
6567 dstVar,
6568 ESURFACE_NORMAL,
6569 srcbti,
6570 pTempVar0,
6571 pTempVar,
6572 0,
6573 (unsigned char)blockWidth,
6574 (unsigned char)blockHeight,
6575 0);
6576 }
6577 m_encoder->Push();
6578 }
6579
6580 if (numPasses > 1)
6581 {
6582 dstSubReg = 0;
6583
6584 uint32_t srcSubReg = 0;
6585
6586 // Join the data obtained from pass 0 and pass 1 to make the
6587 // xOffset range contiguous from 0 to 63 bytes (forming SIMD16):
6588 // mov (8) r20.0<1>:ud r28.0<8;8,1>:ud {Align1, Q1}
6589 // mov (8) r21.0<1>:ud r32.0<8;8,1>:ud {Align1, Q2}
6590 // mov (8) r22.0<1>:ud r29.0<8;8,1>:ud {Align1, Q1}
6591 // mov (8) r23.0<1>:ud r33.0<8;8,1>:ud {Align1, Q2}
6592 // mov (8) r24.0<1>:ud r30.0<8;8,1>:ud {Align1, Q1}
6593 // mov (8) r25.0<1>:ud r34.0<8;8,1>:ud {Align1, Q2}
6594 // mov (8) r26.0<1>:ud r31.0<8;8,1>:ud {Align1, Q1}
6595 // mov (8) r27.0<1>:ud r35.0<8;8,1>:ud {Align1, Q2}
6596
6597
6598 // For a 64-byte GRF, the 32-byte rows are extended as follows:
6599 //.....
6600 // A0....A1
6601 // B0....B1
6602 // C0....C1
6603 // D0....D1
6604 // E0....E1
6605 // F0....F1
6606 // G0....G1
6607 // H0....H1
6608 //
6609 // r20....A0....B0........r30....A1....B1
6610 // r21....C0....D0........r31....C1....D1
6611 // r22....E0....F0........r32....E1....F1
6612 // r23....G0....H0........r33....G1....H1
6613 //
6614 // r40<--r20,....r30
6615 // r41<--r20.8,r30.8
6616 // r42<--r21,....r31
6617 // r43<--r21.8,r31.8
6618 // r44<--r22,....r32
6619 // r45<--r22.8,r32.8
6620 // r46<--r23,....r33
6621 // r47<--r23.8,r33.8
6622 //
6623 //mov (8) r40.0<1>:ud r20.0<8;8,1>:ud {Align1, Q1}
6624 //mov (8) r40.8<1>:ud r30.0<8;8,1>:ud {Align1, Q1}
6625 //mov (8) r41<1>:ud r20.8<8;8,1>:ud {Align1, Q1}
6626 //mov (8) r41.8<1>:ud r30.8<8;8,1>:ud {Align1, Q1}
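// In other words (a sketch for a 32-byte GRF): row i of pass p sits at
// source GRF (i + blockHeight * p), and the copies below emit the rows in
// row-major order so that all passes of row i become contiguous in the
// destination.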
6627
6628 for (uint32_t i = 0; i < blockHeight; i++) //Height
6629 {
6630 uint32_t dstSubRegOffset = 0;
6631 uint32_t srcSubRegOffset = 0;
6632
6633 for (uint32_t pass = 0; pass < numPasses; pass++) //Width
6634 {
6635 m_encoder->SetSimdSize(simdMode);
6636 m_encoder->SetNoMask();
6637
6638 srcSubReg = ((i + blockHeight * pass) * blockWidth) / getGRFSize();
6639 srcSubRegOffset = (i * blockWidth) % getGRFSize();
6640
6641 m_encoder->SetSrcSubVar(0, srcSubReg);
6642 m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
6643
6644 m_encoder->SetDstSubVar(dstSubReg);
6645 m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
6646
6647 dstSubRegOffset = ((pass + 1) * blockWidth) % getGRFSize();
6648 if (dstSubRegOffset == 0)
6649 {
6650 dstSubReg += (scale > 0 ? scale : 1);
6651 }
6652
6653 m_encoder->Copy(m_destination, pTempDest);
6654 m_encoder->Push();
6655 }
6656 }
6657 }
6658 }
6659
6660 void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst)
6661 {
6662 int SrcImgBTI = int_cast<int>(GetImmediateVal(inst->getOperand(0)));
6663 int isImageTypeUAV = int_cast<int>(GetImmediateVal(inst->getOperand(3)));
6664
6665 Value* xOffset = inst->getOperand(1);
6666 Value* yOffset = inst->getOperand(2);
6667 Value* dataPtr = inst->getOperand(4);
6668
6669 uint32_t nbElements = 1;
6670 if (dataPtr->getType()->isVectorTy())
6671 {
6672 nbElements = (uint32_t)cast<IGCLLVM::FixedVectorType>(dataPtr->getType())->getNumElements();
6673 }
6674 IGC_ASSERT_MESSAGE(nbElements <= 8, "Invalid vector size");
6675
6676 CVariable* data = GetSymbol(dataPtr);
6677 data = BroadcastIfUniform(data);
6678
6679 uint32_t typeSizeInBytes = dataPtr->getType()->getScalarType()->getScalarSizeInBits() / 8;
6680 uint32_t totalWidth = typeSizeInBytes * numLanes(m_SimdMode);
6681
6682 uint32_t pass = 0;
6683 uint32_t numPasses = 0;
6684
6685 uint32_t blockWidth = 0;
6686 uint32_t blockHeight = nbElements;
6687 uint32_t bindingTableIndex = 0;
6688
6689 if (isImageTypeUAV)
6690 {
6691 bindingTableIndex = m_currShader->m_pBtiLayout->GetUavIndex(SrcImgBTI);
6692 }
6693 else // image type is Resource
6694 {
6695 bindingTableIndex = m_currShader->m_pBtiLayout->GetTextureIndex(SrcImgBTI);
6696 }
6697
6698 m_currShader->SetBindingTableEntryCountAndBitmap(true, isImageTypeUAV ? UAV : RESOURCE, SrcImgBTI, bindingTableIndex);
6699
6700 CVariable* srcbti = m_currShader->ImmToVariable(bindingTableIndex, ISA_TYPE_UD);
6701 uint32_t maxWidth = 32;
6702
6703 if (totalWidth < maxWidth)
6704 {
6705 numPasses = 1;
6706 blockWidth = totalWidth;
6707 }
6708 else
6709 {
6710 IGC_ASSERT(maxWidth);
6711 IGC_ASSERT_MESSAGE(totalWidth % maxWidth == 0, "Total width must be divisible by 32!");
6712 numPasses = totalWidth / maxWidth;
6713 blockWidth = maxWidth;
6714 }
6715
6716
6717 CVariable* pTempVar0 = nullptr;
6718 CVariable* pTempVar = nullptr;
6719
6720 uint32_t dstSubReg = 0;
6721
6722 int scale = blockWidth / getGRFSize();
6723 auto simdMode = lanesToSIMDMode(blockWidth / typeSizeInBytes);
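// As in emitSimdMediaBlockRead, scale can be 0 on 64-byte-GRF platforms;
// the "scale > 0 ? scale : 1" guard below then advances by one GRF.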
6724 for (pass = 0; pass < numPasses; pass++)
6725 {
6726 uint32_t srcSubVar = pass * blockWidth / getGRFSize();
6727 uint32_t dstSubVar = 0;
6728 uint32_t srcSubRegOffset = (pass * blockWidth) % getGRFSize();
6729 uint32_t dstSubRegOffset = 0;
6730
6731 CVariable* tempdst = nullptr;
6732 tempdst = m_currShader->GetNewVariable(
6733 nbElements * numLanes(simdMode),
6734 data->GetType(),
6735 m_currShader->getGRFAlignment(),
6736 CName::NONE);
6737
6738 // Split the data.
6739 // mov (8) r22.0<1>:d r14.0<8;8,1>:d {Align1, Q1, Compacted}
6740 // mov (8) r23.0<1>:d r16.0<8;8,1>:d {Align1, Q1, Compacted}
6741 // mov (8) r24.0<1>:d r18.0<8;8,1>:d {Align1, Q1, Compacted}
6742 // mov (8) r25.0<1>:d r20.0<8;8,1>:d {Align1, Q1, Compacted}
6743
6744 // For a 64-byte GRF:
6745 // A0....A1....A2....A3........r60....r60.8....r61....r61.8
6746 // B0....B1....B2....B3........r62....r62.8....r63....r63.8
6747 // C0....C1....C2....C3........r64....r64.8....r65....r65.8
6748 // D0....D1....D2....D3........r66....r66.8....r67....r67.8
6749 // E0....E1....E2....E3........r68....r68.8....r69....r69.8
6750 // F0....F1....F2....F3........r70....r70.8....r71....r71.8
6751 // G0....G1....G2....G3........r72....r72.8....r73....r73.8
6752 // H0....H1....H2....H3........r74....r74.8....r75....r75.8
6753 //
6754 // block 0
6755 // mov (8) r20.0<1>:d r60.0<8;8,1>:d {Align1, Q1, Compacted}
6756 // mov (8) r20.8<1>:d r62.0<8;8,1>:d {Align1, Q1, Compacted}
6757 // mov (8) r21.0<1>:d r64.0<8;8,1>:d {Align1, Q1, Compacted}
6758 // mov (8) r21.8<1>:d r66.0<8;8,1>:d {Align1, Q1, Compacted}
6759 // ...
6760 //block 1
6761 // mov (8) r30.0<1>:d r60.8<8;8,1>:d {Align1, Q1, Compacted}
6762 // mov (8) r30.8<1>:d r62.8<8;8,1>:d {Align1, Q1, Compacted}
6763 // mov (8) r31.0<1>:d r64.8<8;8,1>:d {Align1, Q1, Compacted}
6764 // mov (8) r31.8<1>:d r66.8<8;8,1>:d {Align1, Q1, Compacted}
6765 //...
6766
6767 if (numPasses > 1)
6768 {
6769 for (uint i = 0; i < nbElements; ++i)
6770 {
6771 m_encoder->SetSimdSize(simdMode);
6772 m_encoder->SetNoMask();
6773
6774 //Src
6775 m_encoder->SetSrcSubVar(0, srcSubVar);
6776 m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
6777 //Dst
6778 m_encoder->SetDstSubVar(dstSubVar);
6779 m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
6780 //Strides for dst and src
6781 dstSubRegOffset = ((i + 1) * blockWidth) % getGRFSize();
6782 if (dstSubRegOffset == 0)
6783 {
6784 dstSubVar += scale > 0 ? scale : 1;
6785 }
6786 srcSubVar = srcSubVar + (numPasses * blockWidth / getGRFSize());
6787
6788 m_encoder->Copy(tempdst, data);
6789 m_encoder->Push();
6790 }
6791 }
6792 else
6793 {
6794 tempdst = data;
6795 }
6796 // Emits one MEDIA_BLOCK_WRITE instruction per pass.
6797 // Considering block width as the x-axis and block height as the y-axis:
6798 // Pass 0 writes from (xOffset, yOffset) to (xOffset+31, yOffset+blockHeight-1)
6799 // Pass 1 writes from (xOffset+32, yOffset) to (xOffset+63, yOffset+blockHeight-1)
6800 // mov (8) r28.0<1>:ud r0.0<8;8,1>:ud {Align1, NoMask, Compacted}
6801 // mov (1) r28.2<1>:ud 0x3001f:ud {Align1, NoMask}
6802 // mov (1) r28.0<1>:ud r6.0<0;1,0>:d {Align1, NoMask}
6803 // mov (1) r28.1<1>:ud r7.0<0;1,0>:d {Align1, NoMask}
6804 // mov (16) r29.0<1>:ud r22.0<8;8,1>:ud {Align1, NoMask, Compacted}
6805 // mov (16) r31.0<1>:ud r24.0<8;8,1>:ud {Align1, NoMask, Compacted}
6806 // send (8) null<1>:ud r28 0xc 0xa0a8002:ud{Align1, NoMask} // media block write
6807 if (pass == 0)
6808 {
6809 CVariable* xVar = GetSymbol(xOffset);
6810 CVariable* yVar = GetSymbol(yOffset);
6811 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6812 m_encoder->SetNoMask();
6813 m_encoder->SetSrcRegion(0, 0, 1, 0);
6814
6815 pTempVar0 = m_currShader->GetNewVariable(
6816 numLanes(m_SimdMode),
6817 ISA_TYPE_D,
6818 EALIGN_DWORD,
6819 CName::NONE);
6820
6821 m_encoder->Cast(pTempVar0, xVar);
6822 m_encoder->Push();
6823 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6824 m_encoder->SetNoMask();
6825 m_encoder->SetSrcRegion(0, 0, 1, 0);
6826
6827 pTempVar = m_currShader->GetNewVariable(
6828 numLanes(m_SimdMode),
6829 ISA_TYPE_D,
6830 EALIGN_DWORD,
6831 CName::NONE);
6832
6833 m_encoder->Cast(pTempVar, yVar);
6834 m_encoder->Push();
6835 }
6836 else
6837 {
6838 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6839 m_encoder->SetNoMask();
6840 m_encoder->SetSrcRegion(0, 0, 1, 0);
6841 m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD));
6842 m_encoder->Push();
6843 dstSubReg = dstSubReg + scale * blockHeight;
6844 }
6845
6846 m_encoder->SetDstSubVar(dstSubReg);
6847
6848 {
6849 m_encoder->MediaBlockMessage(
6850 ISA_Opcode::ISA_MEDIA_ST,
6851 tempdst, ESURFACE_NORMAL,
6852 srcbti,
6853 pTempVar0,
6854 pTempVar,
6855 0,
6856 (unsigned char)blockWidth,
6857 (unsigned char)blockHeight,
6858 0);
6859 }
6860 m_encoder->Push();
6861 }
6862 }
6863
6864 void EmitPass::emitDualBlendRT(llvm::RTDualBlendSourceIntrinsic* inst, bool fromRet)
6865 {
6866 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
6867 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
6868
6869 uint RTIndex = inst->getRTIndexImm();
6870 bool oMask = inst->hasMask();
6871 bool outputDepth = inst->hasDepth();
6872 bool outputStencil = inst->hasStencil();
6873 Value* vMask = inst->getOMask();
6874 bool perSample = inst->perSample();
6875 Value* vSample = inst->getSampleIndex();
6876
6877 uint bindingTableIndex = m_currShader->m_pBtiLayout->GetRenderTargetIndex(RTIndex);
6878 bool directIdx = llvm::isa<llvm::ConstantInt>(inst->getRTIndex());
6879 m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, RENDER_TARGET, RTIndex, bindingTableIndex);
6880
6881 CVariable* pMaskOpnd = nullptr;
6882
6883 if (psProgram->HasDiscard())
6884 {
6885 ConstantInt* cv = dyn_cast<ConstantInt>(inst->getPMask());
6886 if (!cv || cv->getZExtValue() == 0)
6887 {
6888 pMaskOpnd = GetSymbol(inst->getPMask());
6889 }
6890 }
6891
6892 bool isHF = false;
6893 uint messageLength = 8;
6894
6895 if (inst->getRed0()->getType()->isHalfTy())
6896 {
6897 isHF = true;
6898 messageLength = 4;
6899 }
6900
6901 uint responseLength = 0;
6902 if (outputDepth)
6903 {
6904 messageLength += 1;
6905 }
6906 if (outputStencil)
6907 {
6908 messageLength += 1;
6909 }
6910 if (oMask)
6911 {
6912 messageLength += 1;
6913 }
6914 // Need a header in case we write per sample
6915 bool needHeader = perSample;
6916 if (needHeader)
6917 {
6918 messageLength += 2;
6919 }
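// A worked example (hypothetical case): a half-float dual-source write
// with oMask, depth and a per-sample header takes 4 + 1 + 1 + 2 = 8 GRFs.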
6920 int nbMessage = m_SimdMode == SIMDMode::SIMD8 ? 1 : 2;
6921 for (int i = 0; i < nbMessage; i++)
6922 {
6923 uint payloadOffset = 0;
6924 bool lastRenderTarget = psProgram->IsLastRTWrite(inst);
6925
6926 bool EOT = lastRenderTarget &&
6927 i == nbMessage - 1 &&
6928 (m_encoder->IsSecondHalf() || m_currShader->m_numberInstance == 1);
6929
6930 if (EOT)
6931 {
6932 IGC_ASSERT(psProgram->m_hasEOT == false);
6933 psProgram->m_hasEOT = true;
6934 }
6935
6936 CVariable* payload =
6937 m_currShader->GetNewVariable(
6938 messageLength * (getGRFSize() >> 2),
6939 ISA_TYPE_D, EALIGN_GRF, CName::NONE);
6940
6941 if (needHeader)
6942 {
6943 m_encoder->SetNoMask();
6944 m_encoder->SetSimdSize(SIMDMode::SIMD8);
6945 m_encoder->Copy(payload, psProgram->GetR0());
6946 m_encoder->Push();
6947
6948 m_encoder->SetDstSubVar(1);
6949 m_encoder->SetNoMask();
6950 m_encoder->SetSimdSize(SIMDMode::SIMD8);
6951 m_encoder->Copy(payload, psProgram->GetR1());
6952 m_encoder->Push();
6953 if (perSample)
6954 {
6955 CVariable* sampleIndex = GetSymbol(vSample);
6956 if (!sampleIndex->IsUniform())
6957 {
6958 sampleIndex = UniformCopy(sampleIndex);
6959
6960 }
6961 CVariable* sampleIndexShifted = m_currShader->GetNewVariable(sampleIndex);
6962 m_encoder->Shl(sampleIndexShifted, sampleIndex, m_currShader->ImmToVariable(6, ISA_TYPE_D));
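// The uniform sample index is shifted into its bit-field (starting at
// bit 6) of the header's first dword and OR'ed in below.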
6963
6964 m_encoder->SetNoMask();
6965 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6966 m_encoder->SetSrcRegion(0, 0, 1, 0);
6967 m_encoder->Or(payload, payload, sampleIndexShifted);
6968 m_encoder->Push();
6969 }
6970
6971 CVariable* pixelEnable = m_currShader->GetNewAlias(
6972 payload, ISA_TYPE_UW, getGRFSize() + 14 * 2, 1);
6973
6974 if (pMaskOpnd)
6975 {
6976 m_encoder->SetNoMask();
6977 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6978 m_encoder->Copy(pixelEnable, pMaskOpnd);
6979 m_encoder->Push();
6980 }
6981 else
6982 if (!isa<ReturnInst>(inst->getParent()->getTerminator()))
6983 {
6984 m_encoder->SetNoMask();
6985 m_encoder->SetSimdSize(SIMDMode::SIMD1);
6986 m_encoder->Cast(pixelEnable, GetExecutionMask());
6987 m_encoder->Push();
6988 }
6989
6990 payloadOffset += 2;
6991 }
6992
6993 if (oMask)
6994 {
6995 //oMask has to be packed since the hardware ignores the upper half
6996 CVariable* src = GetSymbol(vMask);
6997 CVariable* payloadUW = psProgram->BitCast(payload, ISA_TYPE_UW);
6998 src = psProgram->BitCast(src, ISA_TYPE_UW);
6999 m_encoder->SetSimdSize(SIMDMode::SIMD8);
7000 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7001 m_encoder->SetSrcSubVar(0, i);
7002 if (src->IsUniform())
7003 {
7004 m_encoder->SetSrcRegion(0, 0, 1, 0);
7005 }
7006 else
7007 {
7008 m_encoder->SetSrcRegion(0, 2, 1, 0);
7009 }
7010 m_encoder->SetDstSubVar(payloadOffset++);
7011 m_encoder->SetDstSubReg(i * 8);
7012 m_encoder->Copy(payloadUW, src);
7013 m_encoder->Push();
7014 }
7015
7016 CVariable* srcPayload = payload;
7017
7018 if (isHF)
7019 {
7020 srcPayload = m_currShader->GetNewAlias(payload, ISA_TYPE_HF, 0, 4 * getGRFSize());
7021 }
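// For half-float messages the payload is viewed as HF so that the src0
// and src1 channels can be interleaved within the same GRFs (see the
// dstSubReg/dstSubVar logic in the color loop below).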
7022
7023 Value* colors[] = {
7024 inst->getRed0(), inst->getGreen0(), inst->getBlue0(), inst->getAlpha0(),
7025 inst->getRed1(), inst->getGreen1(), inst->getBlue1(), inst->getAlpha1()
7026 };
7027
7028 for (uint srcIdx = 0; srcIdx < 8; srcIdx++)
7029 {
7030 m_encoder->SetSimdSize(SIMDMode::SIMD8);
7031 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7032 CVariable* src = GetSymbol(colors[srcIdx]);
7033
7034 if (!src->IsUniform())
7035 {
7036 m_encoder->SetSrcSubReg(0, i * 8);
7037 }
7038
7039 if (isHF)
7040 {
7041 // half message has src0 and src1 interleaved
7042 if (srcIdx / 4 != 0)
7043 {
7044 m_encoder->SetDstSubReg(8);
7045 }
7046 m_encoder->SetDstSubVar(payloadOffset + (srcIdx % 4));
7047 }
7048 else
7049 {
7050 m_encoder->SetDstSubVar(payloadOffset + srcIdx);
7051 }
7052
7053 m_encoder->Copy(srcPayload, src);
7054 m_encoder->Push();
7055 }
7056 payloadOffset += isHF ? 4 : 8;
7057
7058 if (outputDepth)
7059 {
7060 m_encoder->SetSimdSize(SIMDMode::SIMD8);
7061 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7062 CVariable* src = GetSymbol(inst->getDepth());
7063 if (!src->IsUniform())
7064 {
7065 m_encoder->SetSrcSubVar(0, i);
7066 }
7067 m_encoder->SetDstSubVar(payloadOffset++);
7068 m_encoder->Copy(payload, src);
7069 m_encoder->Push();
7070 }
7071
7072 if (outputStencil)
7073 {
7074 m_encoder->SetSimdSize(SIMDMode::SIMD8);
7075 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7076 CVariable* src = GetSymbol(inst->getStencil());
7077 CVariable* ubSrc = m_currShader->BitCast(src, ISA_TYPE_UB);
7078 m_encoder->SetSrcRegion(0, 32, 8, 4);
7079 if (!ubSrc->IsUniform())
7080 {
7081 m_encoder->SetSrcSubVar(0, i);
7082 }
7083 m_encoder->SetDstSubVar(payloadOffset++);
7084 m_encoder->Copy(payload, ubSrc);
7085 m_encoder->Push();
7086 }
7087
7088 EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL messageType =
7089 (i == 0)
7090 ? EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_LOW :
7091 EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_CONTROL_SIMD8_DUAL_SOURCE_HIGH;
7092
7093 uint Desc = PixelDataPort(
7094 isHF,
7095 messageLength,
7096 responseLength,
7097 needHeader,
7098 psProgram->GetPhase() == PSPHASE_COARSE,
7099 perSample,
7100 lastRenderTarget,
7101 m_encoder->IsSecondHalf(),
7102 messageType,
7103 bindingTableIndex);
7104
7105
7106 // TODO create a function to encode extended message
7107 CVariable* exDesc =
7108 psProgram->ImmToVariable(EU_MESSAGE_TARGET_DATA_PORT_WRITE | (EOT ? 1 << 5 : 0), ISA_TYPE_UD);
7109 CVariable* messDesc = psProgram->ImmToVariable(Desc, ISA_TYPE_UD);
7110 if (psProgram->GetPhase() == PSPHASE_PIXEL)
7111 {
7112 CVariable* temp = psProgram->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
7113 m_encoder->Or(temp, psProgram->GetCurrentPhaseCounter(), exDesc);
7114 m_encoder->Push();
7115 exDesc = temp;
7116 }
7117
7118 //sendc
7119 if (pMaskOpnd)
7120 m_encoder->SetPredicate(pMaskOpnd);
7121 m_encoder->SetSimdSize(SIMDMode::SIMD8);
7122 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
7123 m_encoder->SendC(NULL, payload, EU_MESSAGE_TARGET_DATA_PORT_WRITE, exDesc, messDesc);
7124 m_encoder->Push();
7125 }
7126 }
7127
7128 // Common emitter for URBRead and URBReadOutput, used also in associated pattern match pass.
7129 // The offsets are calculated in the caller.
7130 void EmitPass::emitURBReadCommon(llvm::GenIntrinsicInst* inst, const QuadEltUnit globalOffset,
7131 llvm::Value* const perSlotOffset)
7132 {
7133 TODO("Have VISA define the URBRead interface instead of using a raw send");
7134
7135
7136 auto GetURBInputHandle = [&]()->CVariable*
7137 {
7138 CVariable* urbInputHandle = nullptr;
7139 switch (inst->getIntrinsicID())
7140 {
7141 case GenISAIntrinsic::GenISA_URBRead:
7142 {
7143 CVariable* const pVertexIndex = GetSymbol(inst->getOperand(0));
7144 urbInputHandle = m_currShader->GetURBInputHandle(pVertexIndex);
7145 // Mark input to be pulled.
7146 m_currShader->isInputsPulled = true;
7147 break;
7148 }
7149 case GenISAIntrinsic::GenISA_URBReadOutput:
7150 {
7151 urbInputHandle = m_currShader->GetURBOutputHandle();
7152 break;
7153 }
7154 default:
7155 IGC_ASSERT(0);
7156 }
7157 IGC_ASSERT(urbInputHandle);
7158 return urbInputHandle;
7159 };
7160
7161 const EltUnit payloadSize(perSlotOffset ? 2 : 1);
7162 const Unit<Element> messageLength = payloadSize;
7163 CVariable* const payload = m_currShader->GetNewVariable(payloadSize.Count() * numLanes(SIMDMode::SIMD8),
7164 ISA_TYPE_UD, EALIGN_GRF, "URBPayload");
7165 IGC_ASSERT(numLanes(SIMDMode::SIMD8));
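// The payload is one SIMD8 GRF of URB handles, plus a second GRF of
// per-slot offsets when perSlotOffset is present; the response length is
// the destination size in SIMD8 units.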
7166
7167
7168 Unit<Element> responseLength(m_destination->GetNumberElement() / numLanes(SIMDMode::SIMD8));
7169 {
7170 // Get the register with URBHandles and update certain per-opcode data.
7171 CVariable* urbInputHandle = GetURBInputHandle();
7172 m_encoder->Copy(payload, urbInputHandle);
7173 m_encoder->Push();
7174
7175 if (perSlotOffset)
7176 {
7177 m_encoder->SetDstSubVar(1);
7178 CVariable* offset = m_currShader->GetSymbol(perSlotOffset);
7179 m_encoder->Copy(payload, offset);
7180 m_encoder->Push();
7181 }
7182
7183 constexpr bool eot = false;
7184 constexpr bool channelMaskPresent = false;
7185 const uint desc = UrbMessage(
7186 messageLength.Count(),
7187 responseLength.Count(),
7188 eot,
7189 perSlotOffset != nullptr,
7190 channelMaskPresent,
7191 globalOffset.Count(),
7192 EU_URB_OPCODE_SIMD8_READ);
7193
7194 constexpr uint exDesc = EU_MESSAGE_TARGET_URB;
7195 CVariable* const pMessDesc = m_currShader->ImmToVariable(desc, ISA_TYPE_UD);
7196 m_encoder->Send(m_destination, payload, exDesc, pMessDesc);
7197 m_encoder->Push();
7198 }
7199
7200 }
7201
7202 // Emitter for URBRead and URBReadOutput.
7203 void EmitPass::emitURBRead(llvm::GenIntrinsicInst* inst)
7204 {
7205 llvm::Value* offset = nullptr;
7206 switch (inst->getIntrinsicID())
7207 {
7208 case GenISAIntrinsic::GenISA_URBRead:
7209 offset = inst->getOperand(1);
7210 break;
7211 case GenISAIntrinsic::GenISA_URBReadOutput:
7212 offset = inst->getOperand(0);
7213 break;
7214 default:
7215 IGC_ASSERT(0);
7216 }
7217 IGC_ASSERT_MESSAGE(!isa<ConstantInt>(offset), "Constant offsets are expected to be handled elsewhere.");
7218 emitURBReadCommon(inst, QuadEltUnit(0), offset);
7219 }
7220
7221 void EmitPass::emitURBWrite(llvm::GenIntrinsicInst* inst)
7222 {
7223 // input: GenISA_URBWrite(%offset, %mask, %data0, ..., %data7)
7224 CVariable* offset = m_currShader->GetSymbol(inst->getOperand(0));
7225 CVariable* channelMask = m_currShader->GetSymbol(inst->getOperand(1));
7226 CVariable* URBHandle = m_currShader->GetURBOutputHandle();
7227
7228 {
7229 // If the offset or the channel mask is not an immediate value, the per-slot
7230 // offsets and/or the channel mask must carry data in all channels. However,
7231 // if the variable is uniform, uniform analysis makes it a scalar value, so broadcast it to SIMD form.
7232 if (!channelMask->IsImmediate())
7233 {
7234 channelMask = BroadcastIfUniform(channelMask);
7235 }
7236
7237 if (!offset->IsImmediate())
7238 {
7239 offset = BroadcastIfUniform(offset);
7240 }
7241 }
7242
7243 {
7244 CVariable* payload = nullptr;
7245 int payloadElementOffset = 0;
7246 {
7247 payload = m_CE->PrepareExplicitPayload(
7248 m_currShader,
7249 m_encoder,
7250 m_SimdMode,
7251 m_DL,
7252 inst,
7253 payloadElementOffset);
7254 }
7255
7256 m_encoder->URBWrite(payload, payloadElementOffset, offset, URBHandle, channelMask);
7257 m_encoder->Push();
7258 }
7259 }
7260
7261 void EmitPass::interceptSamplePayloadCoalescing(
7262 llvm::SampleIntrinsic* inst,
7263 uint numPart,
7264 //out:
7265 SmallVector<CVariable*, 4> & payload,
7266 bool& payloadCovered)
7267 {
7268 m_CE->SetCurrentPart(inst, numPart);
7269
7270 const uint numPayloadOperands = m_CE->GetNumPayloadElements(inst);
7271 CoalescingEngine::CCTuple* ccTuple = nullptr;
7272 int payloadToCCTupleRelativeOffset = 0;
7273 Value* representativeValPtr = nullptr;
7274
7275 ccTuple = m_CE->IsAnyValueCoalescedInCCTuple(inst,
7276 numPayloadOperands,
7277 //out:
7278 payloadToCCTupleRelativeOffset,
7279 representativeValPtr
7280 );
7281
7282 payloadCovered = m_CE->IsPayloadCovered(inst,
7283 ccTuple,
7284 numPayloadOperands,
7285 payloadToCCTupleRelativeOffset);
7286
7287 if (payloadToCCTupleRelativeOffset < 0)
7288 {
7289 payloadCovered = false;
7290 }
7291
7292 // If the payload is not covered, bail out now. Otherwise there is no
7293 // rolling back - all the conditions for preparing a coalesced load/sample
7294 // are satisfied at this point, so just proceed with preparing one.
7295 if (!payloadCovered)
7296 {
7297 return;
7298 }
7299 else
7300 {
7301 IGC_ASSERT(ccTuple);
7302 CVariable* rootPayloadVar = m_currShader->LazyCreateCCTupleBackingVariable(ccTuple);
7303
7304 SmallPtrSet<Value*, 8> touchedValuesSet;
7305
7306 IGC_ASSERT(representativeValPtr);
7307 IGC_ASSERT(payloadToCCTupleRelativeOffset >= 0);
7308 int byteOffset = payloadToCCTupleRelativeOffset *
7309 m_CE->GetSingleElementWidth(m_currShader->m_SIMDSize, m_DL, representativeValPtr);
7310
7311 if (ccTuple->HasNonHomogeneousElements())
7312 {
7313 byteOffset += m_CE->GetLeftReservedOffset(ccTuple->GetRoot(), m_currShader->m_SIMDSize);
7314 }
7315
7316 for (uint index = 0; index < numPayloadOperands; index++)
7317 {
7318 CVariable* src = nullptr;
7319
7320 Value* val = m_CE->GetPayloadElementToValueMapping(inst, index);
7321 VISA_Type type = m_currShader->GetType(val->getType());
7322
7323 bool needsAlias = false;
7324 if (touchedValuesSet.count(val))
7325 {
7326 //We have a copy of an element used at least twice in a payload.
7327 src = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)byteOffset, 0);
7328 if (inst->IsDerivative())
7329 {
7330 m_encoder->SetNoMask();
7331 }
7332 m_encoder->Copy(src, GetSymbol(val));
7333 m_encoder->Push();
7334
7335 byteOffset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
7336
7337 IGC_ASSERT(src);
7338 payload.push_back(src);
7339 continue;
7340
7341 }
7342 else
7343 {
7344 touchedValuesSet.insert(val);
7345 }
7346
7347 if (m_CE->IsValConstOrIsolated(val))
7348 {
7349 needsAlias = true;
7350 }
7351 else
7352 {
7353 if (m_CE->GetValueCCTupleMapping(val))
7354 {
7355 src = GetSymbol(val);
7356 }
7357 else
7358 {
7359 //this one actually encompasses the case for !getRegRoot(val)
7360 needsAlias = true;
7361 }
7362 } //if constant
7363
7364 if (needsAlias)
7365 {
7366 src = m_currShader->GetNewAlias(rootPayloadVar, type, (uint16_t)byteOffset, 0);
7367 //TODO:WARNING: workaround
7368 if (inst->IsDerivative() /*&& GetSymbol(val)->IsUniform()*/)
7369 {
7370 m_encoder->SetNoMask();
7371 }
7372 m_encoder->Copy(src, GetSymbol(val));
7373 m_encoder->Push();
7374 }
7375 IGC_ASSERT(src);
7376 payload.push_back(src);
7377
7378 byteOffset += GetOffsetIncrement(m_DL, m_currShader->m_SIMDSize, val);
7379
7380
7381 }
7382
7383 }
7384
7385 }
7386
7387
7388 ResourceDescriptor EmitPass::GetSampleResourceHelper(SampleIntrinsic* inst)
7389 {
7390 llvm::Value* texOp = inst->getTextureValue();
7391 ResourceDescriptor resource = GetResourceVariable(texOp);
7392 return resource;
7393 }
7394
7395 void EmitPass::emitSampleInstruction(SampleIntrinsic* inst)
7396 {
7397 EOPCODE opCode = GetOpCode(inst);
7398
7399 ResourceDescriptor resource = GetSampleResourceHelper(inst);
7400
7401
7402 //Get sampler index in the array of operands
7403 llvm::Value* samplerOp = inst->getSamplerValue();
7404 SamplerDescriptor sampler = GetSamplerVariable(samplerOp);
7405
7406 const uint numOperands = inst->getNumOperands();
7407 // offset
7408 CVariable* immOffset = m_currShader->ImmToVariable(0, ISA_TYPE_UW);
7409 if (!inst->IsLODInst())
7410 {
7411 uint offsetSourceIndex = numOperands - 4;
7412 immOffset = ComputeSampleIntOffset(inst, offsetSourceIndex);
7413 }
7414
7415 const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
7416 IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
7417
7418 bool derivativeSample = inst->IsDerivative();
7419
7420 bool cpsEnable = derivativeSample &&
7421 m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER &&
7422 static_cast<CPixelShader*>(m_currShader)->GetPhase() == PSPHASE_COARSE;
7423
7424 SmallVector<CVariable*, 4> payload;
7425
7426 bool doIntercept = true;
7427 // Skip sample_d* instructions in SIMD16 and SIMD32.
7428 if ((m_currShader->m_SIMDSize > SIMDMode::SIMD8 &&
7429 (opCode == llvm_sample_dptr ||
7430 opCode == llvm_sample_dcptr)))
7431 {
7432 doIntercept = false;
7433 }
7434 uint numSources = 0;
7435 const uint numParts = m_CE->GetNumSplitParts(inst);
7436 for (uint part = 0; part < numParts; part++)
7437 {
7438 bool payloadCovered = false;
7439 m_CE->SetCurrentPart(inst, part);
7440 const unsigned int numPartSources = m_CE->GetNumPayloadElements(inst);
7441 numSources += numPartSources;
7442 if (doIntercept)
7443 {
7444 interceptSamplePayloadCoalescing(inst, part, payload, payloadCovered);
7445 }
7446
7447 if (!payloadCovered)
7448 {
7449 m_CE->SetCurrentPart(inst, part);
7450
7451 //create send payload for numSources
7452 for (uint i = 0; i < numPartSources; i++)
7453 {
7454 Value* v = m_CE->GetPayloadElementToValueMapping(inst, i);
7455 CVariable* src = GetSymbol(v);
7456 if (src->IsUniform())
7457 {
7458 CVariable* srcReg = m_currShader->GetNewVariable(
7459 numLanes(m_currShader->m_SIMDSize), src->GetType(), EALIGN_GRF, CName::NONE);
7460 if (derivativeSample)
7461 {
7462 m_encoder->SetNoMask();
7463 }
7464 m_encoder->Copy(srcReg, src);
7465 m_encoder->Push();
7466 src = srcReg;
7467 }
7468 payload.push_back(src);
7469 }
7470 }
7471 }
7472
7473 // the responses to the sample + killpix and feedback messages have an extra register that contains a mask.
7474 bool hasMaskResponse = writeMask.isSet(4);
7475
7476 CVariable* dst = m_destination;
7477 // When the sampler output is a 16-bit float, the hardware doesn't pack the output in SIMD8 mode.
7478 // Hence the movs to handle this layout in SIMD8 mode.
7479 bool simd8HFRet = isHalfGRFReturn(m_destination, m_SimdMode);
7480
7481 if (simd8HFRet)
7482 {
7483 dst = m_currShader->GetNewVariable(
7484 m_destination->GetNumberElement() * 2, ISA_TYPE_HF, EALIGN_GRF, false, CName::NONE);
7485 }
7486 uint label = 0;
7487 CVariable* flag = nullptr;
7488 bool zeroLOD = m_currShader->m_Platform->supportSampleAndLd_lz() && inst->ZeroLOD() &&
7489 !m_currShader->m_Platform->WaDisableSampleLz();
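// When the platform supports sample_lz/ld_lz and the LOD is known to be
// zero, the message can be encoded as the _lz variant (unless the
// workaround above disables it).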
7490 bool needLoop = ResourceLoopHeader(resource, sampler, flag, label);
7491
7492 if (m_currShader->m_Platform->getWATable().Wa_22011157800 && !IGC_IS_FLAG_DISABLED(DiableWaSamplerNoMask))
7493 {
7494 m_encoder->SetNoMask();
7495 }
7496 else
7497 {
7498 m_encoder->SetPredicate(flag);
7499 }
7500 m_encoder->Sample(
7501 opCode,
7502 writeMask.getEM(),
7503 immOffset,
7504 resource,
7505 sampler,
7506 numSources,
7507 dst,
7508 payload,
7509 zeroLOD,
7510 cpsEnable,
7511 hasMaskResponse,
7512 needLoop);
7513 m_encoder->Push();
7514
7515 if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
7516 {
7517 CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
7518 m_encoder->Cast(m_currShader->GetNULL(), tempdest);
7519 m_encoder->Push();
7520 m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
7521 m_encoder->Push();
7522 }
7523 ResourceLoopBackEdge(needLoop, flag, label);
7524
7525 {
7526 if (simd8HFRet)
7527 {
7528 PackSIMD8HFRet(dst);
7529 }
7530
7531 if (hasMaskResponse)
7532 {
7533 CVariable* flag = m_currShader->GetNewVariable(
7534 numLanes(m_currShader->m_dispatchSize), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
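// subvar below addresses the GRF just past the four return channels,
// where the mask register lives; e.g. for SIMD8 dwords on a 32-byte GRF:
// 8 lanes * 4 channels / 8 dwords-per-GRF = 4.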
7535 uint subvar = numLanes(m_currShader->m_SIMDSize) * 4 / (getGRFSize() >> 2);
7536 m_encoder->SetSrcSubVar(0, subvar);
7537 m_encoder->SetSrcRegion(0, 0, 1, 0);
7538 CVariable* newdestination = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
7539 m_encoder->SetP(flag, newdestination);
7540 m_encoder->Push();
7541
7542 // Use integer types for select in case driver uses alt mode
7543 // (0xFFFFFFFF is a NaN value, so the result is always 0).
7544 VISA_Type dstIntType = GetUnsignedIntegerType(m_destination->GetType());
7545 CVariable* pred = m_currShader->ImmToVariable(0xFFFFFFFF, dstIntType);
7546 CVariable* zero = m_currShader->ImmToVariable(0x0, dstIntType);
7547 CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, dstIntType, 0, m_destination->GetNumberElement());
7548 m_encoder->SetDstSubVar(subvar);
7549 m_encoder->Select(flag, dstAlias, pred, zero);
7550 m_encoder->Push();
7551 }
7552 }
7553 }
7554
7555 // Initialize global discard mask as ~dmask.
7556 void EmitPass::emitInitDiscardMask(llvm::GenIntrinsicInst* inst)
7557 {
7558 if (m_encoder->IsSecondHalf())
7559 return;
7560
7561 // (W) not (1|M0) f0.0:uw sr0.2<0;1,0>:ud
7562 CVariable* t = m_currShader->GetNewVariable(
7563 numLanes(m_currShader->m_dispatchSize),
7564 ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
7565 m_encoder->SetNoMask();
7566 m_encoder->SetSrcSubReg(0, 2);
7567 m_encoder->SetP(t, m_currShader->GetSR0());
7568 m_encoder->Push();
7569
7570 m_encoder->SetNoMask();
7571 m_encoder->SetSimdSize(m_currShader->m_dispatchSize);
7572 m_encoder->GenericAlu(EOPCODE_NOT, m_destination, t, nullptr);
7573 m_encoder->Push();
7574
7575 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
7576 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
7577 psProgram->SetDiscardPixelMask(m_destination);
7578 }
7579
7580 // update global discard mask with discard condition
7581 void EmitPass::emitUpdateDiscardMask(llvm::GenIntrinsicInst* inst)
7582 {
7583 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
7584 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
7585 CVariable* discardMask;
7586 CVariable* maskp;
7587
7588 discardMask = GetSymbol(inst->getArgOperand(0));
7589 IGC_ASSERT(discardMask == psProgram->GetDiscardPixelMask());
7590
7591 if (ConstantInt * ci = dyn_cast<ConstantInt>(inst->getArgOperand(1)))
7592 {
7593 if (ci->getZExtValue() == 1)
7594 {
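// An always-true compare (x == x on a dummy variable) sets the discard
// mask for every lane in the current execution mask.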
7595 CVariable* dummyVar = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, CName::NONE);
7596 m_encoder->Cmp(EPREDICATE_EQ, discardMask, dummyVar, dummyVar);
7597 m_encoder->Push();
7598 }
7599 else
7600 {
7601 return;
7602 }
7603 }
7604 else
7605 {
7606 maskp = GetSymbol(inst->getArgOperand(1));
7607 m_encoder->Or(discardMask, discardMask, maskp);
7608 m_encoder->Push();
7609 }
7610 }
7611
7612 // get live pixel mask for RTWrite from global discard mask
7613 void EmitPass::emitGetPixelMask(llvm::GenIntrinsicInst* inst)
7614 {
7615 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
7616 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
7617 CVariable* globalMask;
7618
7619 globalMask = GetSymbol(inst->getArgOperand(0));
7620 IGC_ASSERT(globalMask == psProgram->GetDiscardPixelMask());
7621
7622 CVariable* dst = m_destination;
7623 m_encoder->SetNoMask();
7624 m_encoder->GenericAlu(EOPCODE_NOT, dst, globalMask, nullptr);
7625 m_encoder->Push();
7626 }
7627
7628 void EmitPass::emitDiscard(llvm::Instruction* inst)
7629 {
7630 IGC_ASSERT_MESSAGE(0, "No codegen for discard intrinsic");
7631 }
7632
7633 void EmitPass::emitInfoInstruction(InfoIntrinsic* inst)
7634 {
7635 EOPCODE opCode = GetOpCode(inst);
7636 llvm::Value* texOp = inst->getOperand(0);
7637
7638 ResourceDescriptor resource = GetResourceVariable(texOp);
7639
7640
7641 CVariable* lod = nullptr;
7642 if (opCode != llvm_sampleinfoptr)
7643 {
7644 lod = GetSymbol(inst->getOperand(1));
7645 }
7646 if (lod && lod->IsUniform())
7647 {
7648 auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
7649 CVariable* srcReg = m_currShader->GetNewVariable(
7650 m_destination->IsUniform() ? numLanes(uniformSIMDMode) : numLanes(m_currShader->m_SIMDSize),
7651 ISA_TYPE_F,
7652 EALIGN_GRF,
7653 m_destination->IsUniform(),
7654 lod->getName());
7655 m_encoder->SetUniformSIMDSize(uniformSIMDMode);
7656 m_encoder->Copy(srcReg, lod);
7657 m_encoder->Push();
7658 lod = srcReg;
7659 }
7660
7661 CVariable* tempDest = m_destination;
7662 if (m_destination->IsUniform())
7663 {
7664 auto uniformSIMDMode = m_currShader->m_Platform->getMinDispatchMode();
7665 tempDest = m_currShader->GetNewVariable(
7666 m_destination->GetNumberElement() * numLanes(uniformSIMDMode),
7667 ISA_TYPE_UD, EALIGN_GRF, true, m_destination->getName());
7668 m_encoder->SetUniformSIMDSize(uniformSIMDMode);
7669 }
7670
7671 uint label = 0;
7672 CVariable* flag = nullptr;
7673 bool needLoop = ResourceLoopHeader(resource, flag, label);
7674 m_encoder->SetPredicate(flag);
7675
7676 const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
7677 IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
7678
7679 m_encoder->Info(opCode, writeMask.getEM(), resource, lod, tempDest);
7680 m_encoder->Push();
7681
7682 ResourceLoopBackEdge(needLoop, flag, label);
7683
7684 if (tempDest != m_destination)
7685 {
7686 unsigned int writemask = 0;
7687 for (auto I = inst->user_begin(), E = inst->user_end(); I != E; ++I)
7688 {
7689 if (llvm::ExtractElementInst * extract = llvm::dyn_cast<llvm::ExtractElementInst>(*I))
7690 {
7691 if (llvm::ConstantInt * index = llvm::dyn_cast<ConstantInt>(extract->getIndexOperand()))
7692 {
7693 writemask |= BIT(static_cast<uint>(index->getZExtValue()));
7694 continue;
7695 }
7696 }
7697 writemask = 0xF;
7698 break;
7699 }
7700 for (uint i = 0; i < 4; i++)
7701 {
7702 if (BIT(i) & writemask)
7703 {
7704 m_encoder->SetSrcSubVar(0, i);
7705 m_encoder->SetDstSubReg(i);
7706 m_encoder->Copy(m_destination, tempDest);
7707 m_encoder->Push();
7708 }
7709 }
7710 }
7711 }
7712
7713 void EmitPass::emitSurfaceInfo(GenIntrinsicInst* inst)
7714 {
7715 ResourceDescriptor resource = GetResourceVariable(inst->getOperand(0));
7716 ForceDMask(false);
7717
7718 DATA_PORT_TARGET_CACHE targetCache = DATA_PORT_TARGET_CONSTANT_CACHE;
7719 EU_MESSAGE_TARGET messageTarget = EU_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_READ_ONLY;
7720 if (m_currShader->m_Platform->supportSamplerCacheResinfo())
7721 {
7722 targetCache = DATA_PORT_TARGET_SAMPLER_CACHE;
7723 messageTarget = EU_MESSAGE_TARGET_DATA_PORT_READ;
7724 }
7725
7726 uint messageSpecificControl = DataPortRead(
7727 1,
7728 2,
7729 false,
7730 EU_DATA_PORT_READ_MESSAGE_TYPE_SURFACE_INFO_READ,
7731 0,
7732 false,
7733 targetCache,
7734 resource.m_surfaceType == ESURFACE_BINDLESS ? BINDLESS_BTI : (uint)resource.m_resource->GetImmediateValue());
7735
7736 CVariable* pMessDesc = m_currShader->ImmToVariable(messageSpecificControl, ISA_TYPE_D);
7737
7738 CVariable* exDesc =
7739 m_currShader->ImmToVariable(messageTarget, ISA_TYPE_D);
7740 if (resource.m_surfaceType == ESURFACE_BINDLESS)
7741 {
7742 CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
7743 m_encoder->Add(temp, resource.m_resource, exDesc);
7744 m_encoder->Push();
7745 exDesc = temp;
7746 }
7747 uint label = 0;
7748 CVariable* flag = nullptr;
7749 bool needLoop = ResourceLoopHeader(resource, flag, label);
7750 CVariable* payload = m_currShader->GetNewVariable(8, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
7751
7752 m_encoder->SetSimdSize(SIMDMode::SIMD8);
7753 m_encoder->SetNoMask();
7754 m_encoder->Copy(payload, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
7755 m_encoder->Push();
7756
7757 m_encoder->SetUniformSIMDSize(SIMDMode::SIMD8);
7758 m_encoder->SetNoMask();
7759 m_encoder->Send(m_destination, payload,
7760 messageTarget, exDesc, pMessDesc);
7761 m_encoder->Push();
7762
7763 IGC_ASSERT(m_destination->IsUniform());
7764 ResourceLoopBackEdge(needLoop, flag, label);
7765 ResetVMask(false);
7766 }
7767
7768 void EmitPass::emitFeedbackEnable()
7769 {
7770 // if feedback is enabled we always return all 4 channels
7771 CVariable* flag = m_currShader->GetNewVariable(
7772 numLanes(m_currShader->m_dispatchSize), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
7773 uint typeSize = CEncoder::GetCISADataTypeSize(m_destination->GetType());
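// subvar below is the number of GRFs occupied by the four return
// channels; the feedback mask register follows immediately after them.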
7774 uint subvar = (numLanes(m_currShader->m_SIMDSize) * typeSize * 4) / getGRFSize();
7775
7776 m_encoder->SetSrcSubVar(0, subvar);
7777 m_encoder->SetSrcRegion(0, 0, 1, 0);
7778 CVariable* newdestination = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
7779 m_encoder->SetP(flag, newdestination);
7780 m_encoder->Push();
7781
7782 CVariable* pred = m_currShader->ImmToVariable(0xFFFFFFFF, m_destination->GetType());
7783 CVariable* zero = m_currShader->ImmToVariable(0x0, m_destination->GetType());
7784 m_encoder->SetDstSubVar(subvar);
7785 m_encoder->Select(flag, m_destination, pred, zero);
7786 m_encoder->Push();
7787 }
7788
7789 void EmitPass::emitGather4Instruction(SamplerGatherIntrinsic* inst)
7790 {
7791 EOPCODE opCode = GetOpCode(inst);
7792 uint numOperands = inst->getNumOperands();
7793
7794 //Subtract the offsets, resource and sampler sources to get
7795 //the number of texture coordinates, src channel select and index to texture source
7796 uint numSources = numOperands - 7;
7797
7798 Value* textureValue = inst->getTextureValue();
7799 ResourceDescriptor resource = GetResourceVariable(textureValue);
7800
7801 SamplerDescriptor sampler;
7802 Value* samplerValue = inst->getSamplerValue();
7803
7804 sampler = GetSamplerVariable(samplerValue);
7805
7806 //Check for valid number of sources from the end of the list
7807 for (uint i = (numSources - 1); i >= 1; i--)
7808 {
7809 CVariable* validSrc = GetSymbol(inst->getOperand(i));
7810 if (validSrc->IsImmediate() &&
7811 validSrc->GetImmediateValue() == 0)
7812 {
7813 numSources--;
7814 }
7815 else
7816 {
7817 break;
7818 }
7819 }
7820
7821 // offset
7822 uint offsetSourceIndex = numOperands - 5;
7823 CVariable* offset = ComputeSampleIntOffset(inst, offsetSourceIndex);
7824
7825 uint channelIndx = numOperands - 2;
7826 uint channel = int_cast<uint>(GetImmediateVal(inst->getOperand(channelIndx)));
7827 SmallVector<CVariable*, 4> payload;
7828
7829
7830 //create send payload for numSources
7831 for (uint i = 0; i < numSources; i++)
7832 {
7833 CVariable* src = GetSymbol(inst->getOperand(i));
7834 if (src->IsUniform())
7835 {
7836 CVariable* srcReg = m_currShader->GetNewVariable(
7837 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F,
7838 m_currShader->getGRFAlignment(),
7839 src->getName());
7840 m_encoder->Copy(srcReg, src);
7841 m_encoder->Push();
7842 src = srcReg;
7843 }
7844 payload.push_back(src);
7845 }
7846
7847 CVariable* dst = m_destination;
7848 // When the sampler output is a 16-bit float, the hardware doesn't pack the output in SIMD8 mode.
7849 // Hence the movs to handle this layout in SIMD8 mode.
7850 bool simd8HFRet = isHalfGRFReturn(m_destination, m_SimdMode);
7851 if (simd8HFRet)
7852 {
7853 dst = m_currShader->GetNewVariable(
7854 m_destination->GetNumberElement() * 2, ISA_TYPE_HF, EALIGN_GRF, false, CName::NONE);
7855 }
7856
7857 bool feedbackEnable = (m_destination->GetNumberElement() / numLanes(m_currShader->m_SIMDSize) == 5);
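// (Five SIMD-sized elements in the destination means the four gather
// channels plus the extra feedback/mask register.)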
7858 uint label = 0;
7859 CVariable* flag = nullptr;
7860 bool needLoop = ResourceLoopHeader(resource, sampler, flag, label);
7861 m_encoder->SetPredicate(flag);
7862 m_encoder->Gather4Inst(
7863 opCode,
7864 offset,
7865 resource,
7866 sampler,
7867 numSources,
7868 dst,
7869 payload,
7870 channel,
7871 feedbackEnable);
7872 m_encoder->Push();
7873 if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
7874 {
7875 CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
7876 m_encoder->Cast(m_currShader->GetNULL(), tempdest);
7877 m_encoder->Push();
7878 m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
7879 m_encoder->Push();
7880 }
7881 ResourceLoopBackEdge(needLoop, flag, label);
7882
7883 {
7884 if (simd8HFRet)
7885 {
7886 PackSIMD8HFRet(dst);
7887 }
7888
7889 if (feedbackEnable)
7890 {
7891 emitFeedbackEnable();
7892 }
7893 }
7894 }
7895
7896 void EmitPass::emitLdmsInstruction(llvm::Instruction* inst)
7897 {
7898 uint numOperands = inst->getNumOperands();
7899 EOPCODE opCode = GetOpCode(inst);
7900 // Subtract the offsets, texture resource, and LOD to get
7901 // the number of texture coordinates and the index of the texture source.
7902 uint numSources = numOperands - 5;
7903 uint textureArgIdx = numOperands - 5;
7904
7905 for (uint i = numSources - 1; i > 0; i--)
7906 {
7907 CVariable* validSrc = GetSymbol(inst->getOperand(i));
7908 if (!(validSrc->IsImmediate() && validSrc->GetImmediateValue() == 0))
7909 {
7910 break;
7911 }
7912 numSources--;
7913 }
7914
7915 // Figure out the write mask from the size of the destination we want to write
7916 const CShader::ExtractMaskWrapper writeMask(m_currShader, inst);
7917 IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");
7918
7919 Value* texOperand = inst->getOperand(textureArgIdx);
7920 ResourceDescriptor resource = GetResourceVariable(texOperand);
7921
7922 uint offsetSourceIndex = numOperands - 4;
7923 CVariable* offset = ComputeSampleIntOffset(inst, offsetSourceIndex);
7924
7925 SmallVector<CVariable*, 4> payload;
7926
7927 //create send payload for numSources
7928 for (uint i = 0; i < numSources; i++)
7929 {
7930 CVariable* src = GetSymbol(inst->getOperand(i));
7931 src = BroadcastIfUniform(src);
7932 IGC_ASSERT(src->GetAliasOffset() % getGRFSize() == 0);
7933 payload.push_back(src);
7934 }
7935
7936 CVariable* dst = m_destination;
7937 // When the sampler output is a 16-bit float, the hardware doesn't pack the output in SIMD8 mode.
7938 // Hence the movs to handle this layout in SIMD8 mode.
7939 bool simd8HFRet = isHalfGRFReturn(m_destination, m_SimdMode);
7940 if (simd8HFRet)
7941 {
7942 dst = m_currShader->GetNewVariable(
7943 m_destination->GetNumberElement() * 2, ISA_TYPE_HF, EALIGN_GRF, false, CName::NONE);
7944 }
7945
7946 bool feedbackEnable = writeMask.isSet(4);
7947 uint label = 0;
7948 CVariable* flag = nullptr;
7949 bool needLoop = ResourceLoopHeader(resource, flag, label);
7950 m_encoder->SetPredicate(flag);
7951 m_encoder->LoadMS(opCode, writeMask.getEM(), offset, resource, numSources, dst, payload, feedbackEnable);
7952 m_encoder->Push();
7953 if (m_currShader->hasReadWriteImage(*(inst->getParent()->getParent())))
7954 {
7955 CVariable* tempdest = m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType()));
7956 m_encoder->Cast(m_currShader->GetNULL(), tempdest);
7957 m_encoder->Push();
7958 m_encoder->Copy(m_currShader->GetNULL(), m_currShader->GetTSC());
7959 m_encoder->Push();
7960 }
7961 ResourceLoopBackEdge(needLoop, flag, label);
7962
7963 if (simd8HFRet)
7964 {
7965 PackSIMD8HFRet(dst);
7966 }
7967
7968 if (feedbackEnable)
7969 {
7970 emitFeedbackEnable();
7971 }
7972 }
7973
7974 void EmitPass::emitCSSGV(GenIntrinsicInst* inst)
7975 {
7976 CComputeShader* csProgram = static_cast<CComputeShader*>(m_currShader);
7977 SGVUsage usage =
7978 static_cast<SGVUsage>(llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue());
7979 CVariable* pThreadIdInGroup = nullptr;
7980 switch (usage)
7981 {
7982 case THREAD_GROUP_ID_X:
7983 {
7984 m_encoder->SetSrcRegion(0, 0, 1, 0);
7985 if (csProgram->GetDispatchAlongY())
7986 {
7987 m_encoder->SetSrcSubReg(0, 6);
7988 }
7989 else
7990 {
7991 m_encoder->SetSrcSubReg(0, 1);
7992 }
7993 m_encoder->Copy(m_destination, csProgram->GetR0());
7994 m_encoder->Push();
7995 break;
7996 }
7997 case THREAD_GROUP_ID_Y:
7998 {
7999 m_encoder->SetSrcRegion(0, 0, 1, 0);
8000 if (csProgram->GetDispatchAlongY())
8001 {
8002 m_encoder->SetSrcSubReg(0, 1);
8003 }
8004 else
8005 {
8006 m_encoder->SetSrcSubReg(0, 6);
8007 }
8008 m_encoder->Copy(m_destination, csProgram->GetR0());
8009 m_encoder->Push();
8010 break;
8011 }
8012 case THREAD_GROUP_ID_Z:
8013 {
8014 m_encoder->SetSrcRegion(0, 0, 1, 0);
8015 m_encoder->SetSrcSubReg(0, 7);
8016 m_encoder->Copy(m_destination, csProgram->GetR0());
8017 m_encoder->Push();
8018 break;
8019 }
8020 case THREAD_ID_IN_GROUP_X:
8021 {
8022 IGC_ASSERT_MESSAGE(inst->getType() == Type::getInt16Ty(inst->getContext()), "only 16-bit ThreadID is supported now.");
8023 pThreadIdInGroup = csProgram->CreateThreadIDsinGroup(THREAD_ID_IN_GROUP_X);
8024 m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
8025 break;
8026 }
8027 case THREAD_ID_IN_GROUP_Y:
8028 {
8029 IGC_ASSERT_MESSAGE(inst->getType() == Type::getInt16Ty(inst->getContext()), "only 16-bit ThreadID is supported now.");
8030 pThreadIdInGroup = csProgram->CreateThreadIDsinGroup(THREAD_ID_IN_GROUP_Y);
8031 m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
8032 break;
8033 }
8034 case THREAD_ID_IN_GROUP_Z:
8035 {
8036 IGC_ASSERT_MESSAGE(inst->getType() == Type::getInt16Ty(inst->getContext()), "only 16-bit ThreadID is supported now.");
8037 pThreadIdInGroup = csProgram->CreateThreadIDsinGroup(THREAD_ID_IN_GROUP_Z);
8038 m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
8039 break;
8040 }
8041 default:
8042 break;
8043 }
8044 }
8045
8046 // Store Coarse Pixel (Actual) size in the destination variable
8047 void EmitPass::getCoarsePixelSize(CVariable* destination, const uint component, bool isCodePatchCandidate)
8048 {
8049 IGC_ASSERT(component < 2);
8050
8051 CPixelShader* const psProgram = static_cast<CPixelShader*>(m_currShader);
8052 CVariable* r;
8053 bool isR1Lo = false;
8054 // Coarse pixel sizes are in R1 for both simd32 halves.
8055 {
8056 r = psProgram->GetPhase() == PSPHASE_PIXEL ? psProgram->GetCoarseR1() : psProgram->GetR1();
8057 isR1Lo = true;
8058 }
8059 r = m_currShader->GetVarHalf(r, 0);
8060 CVariable* const coarsePixelSize = m_currShader->BitCast(r, ISA_TYPE_UB);
8061 if (isR1Lo && isCodePatchCandidate)
8062 {
8063 psProgram->AppendR1Lo(coarsePixelSize);
8064 }
8065 m_encoder->SetSrcRegion(0, 0, 1, 0);
8066 uint subReg;
8067 {
8068 subReg = (component == 0) ? 0 : 1;
8069 }
8070 m_encoder->SetSrcSubReg(0, subReg);
8071 if (isCodePatchCandidate)
8072 {
8073 m_encoder->SetPayloadSectionAsPrimary();
8074 }
8075 m_encoder->Cast(destination, coarsePixelSize);
8076 m_encoder->Push();
8077 if (isCodePatchCandidate)
8078 {
8079 m_encoder->SetPayloadSectionAsSecondary();
8080 }
8081 }
8082
8083 void EmitPass::emitPSSGV(GenIntrinsicInst* inst)
8084 {
8085 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
8086 CVariable* dst = m_destination;
8087 const SGVUsage usage = (SGVUsage)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
8088
8089 // Helper lambda to copy SGV data from thread payload when no data
8090 // processing/calculation is needed.
8091 auto CopySGV = [this, &psProgram, usage](
8092 CVariable* dst, CVariable* src)->void
8093 {
8094 IGC_ASSERT(usage == POSITION_Z ||
8095 usage == POSITION_W ||
8096 usage == INPUT_COVERAGE_MASK);
8097 {
8098 m_encoder->Copy(dst, src);
8099 m_encoder->Push();
8100 }
8101 };
8102
8103 switch (usage)
8104 {
8105 case POSITION_Z:
8106 {
8107 if (psProgram->GetPhase() == PSPHASE_PIXEL || psProgram->GetPhase() == PSPHASE_COARSE)
8108 {
8109 // source depth:
8110 // src_z = (x - xstart)*z_cx + (y - ystart)*z_cy + z_c0
8111 CVariable* delta = psProgram->GetZWDelta();
8112 CVariable* floatR1 = nullptr;
8113 {
8114 floatR1 = psProgram->BitCast(psProgram->GetR1(), ISA_TYPE_F);
8115 if (m_encoder->IsCodePatchCandidate())
8116 {
8117 psProgram->AppendR1Lo(floatR1);
8118 }
8119 }
8120
8121 // Returns (x - xstart) or (y - ystart) in float.
8122 auto getPixelPositionDelta = [this, psProgram, delta, floatR1](const uint component)->CVariable*
8123 {
8124 IGC_ASSERT(component < 2);
8125 CVariable* uintPixelPosition =
8126 m_currShader->GetNewVariable(
8127 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW, EALIGN_GRF, CName::NONE);
8128 getPixelPosition(uintPixelPosition, component, m_encoder->IsCodePatchCandidate());
8129
8130 CVariable* floatPixelPosition =
8131 m_currShader->GetNewVariable(
8132 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F, EALIGN_GRF, CName::NONE);
8133 if (m_encoder->IsCodePatchCandidate())
8134 {
8135 m_encoder->SetPayloadSectionAsPrimary();
8136 }
8137 m_encoder->Cast(floatPixelPosition, uintPixelPosition);
8138 m_encoder->Push();
8139 if (m_encoder->IsCodePatchCandidate())
8140 {
8141 m_encoder->SetPayloadSectionAsSecondary();
8142 }
8143
8144 // Pixel location is center in all APIs that use CPS.
8145 {
8146 CVariable* pixelCenter = m_currShader->ImmToVariable(0x3f000000, ISA_TYPE_F, m_encoder->IsCodePatchCandidate()); // 0.5f
8147 if (psProgram->GetPhase() == PSPHASE_COARSE)
8148 {
8149 CVariable* coarsePixelSize = m_currShader->GetNewVariable(
8150 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_F, EALIGN_GRF, CName::NONE);
8151 getCoarsePixelSize(coarsePixelSize, component, m_encoder->IsCodePatchCandidate());
8152 if (m_encoder->IsCodePatchCandidate())
8153 {
8154 m_encoder->SetPayloadSectionAsPrimary();
8155 m_currShader->AddPatchTempSetup(coarsePixelSize);
8156 }
8157 m_encoder->Mul(coarsePixelSize, coarsePixelSize, pixelCenter);
8158 m_encoder->Push();
8159 if (m_encoder->IsCodePatchCandidate())
8160 {
8161 m_encoder->SetPayloadSectionAsSecondary();
8162 }
8163 pixelCenter = coarsePixelSize;
8164 }
8165 if (m_encoder->IsCodePatchCandidate())
8166 {
8167 m_encoder->SetPayloadSectionAsPrimary();
8168 m_currShader->AddPatchTempSetup(floatPixelPosition);
8169 }
8170 m_encoder->Add(floatPixelPosition, floatPixelPosition, pixelCenter);
8171 m_encoder->Push();
8172 if (m_encoder->IsCodePatchCandidate())
8173 {
8174 m_encoder->SetPayloadSectionAsSecondary();
8175 }
8176 }
8177
8178 CVariable* floatPixelPositionDelta = floatPixelPosition; //reuse the same variable for the final delta
8179
8180 m_encoder->SetSrcRegion(1, 0, 1, 0);
8181 CVariable* startCoordinate = floatR1;
8182 uint topLeftVertexStartSubReg = (component == 0 ? 1 : 6); // R1.1 for XStart and R1.6 for YStart
8183
8184 {
8185 if (psProgram->m_Platform->hasStartCoordinatesDeliveredWithDeltas())
8186 {
8187 startCoordinate = delta;
8188 topLeftVertexStartSubReg = (component == 0 ? 2 : 6);
8189 }
8190 m_encoder->SetSrcSubReg(1, topLeftVertexStartSubReg);
8191 m_encoder->SetSrcModifier(1, EMOD_NEG);
8192 if (m_encoder->IsCodePatchCandidate())
8193 {
8194 m_encoder->SetPayloadSectionAsPrimary();
8195 }
8196 m_encoder->Add(floatPixelPositionDelta, floatPixelPosition, startCoordinate);
8197 m_encoder->Push();
8198 if (m_encoder->IsCodePatchCandidate())
8199 {
8200 m_encoder->SetPayloadSectionAsSecondary();
8201 }
8202 }
8203 return floatPixelPositionDelta;
8204 };
8205 const uint componentX = 0;
8206 const uint componentY = 1;
8207 // (x - xstart)
8208 CVariable* floatPixelPositionDeltaX = getPixelPositionDelta(componentX);
8209 // (y - ystart)
8210 CVariable* floatPixelPositionDeltaY = getPixelPositionDelta(componentY);
8211
8212 // (y - ystart)*z_cy + z_c0
8213 {
8214 {
8215 m_encoder->SetSrcRegion(1, 0, 1, 0);
8216 m_encoder->SetSrcRegion(2, 0, 1, 0);
8217 }
8218 m_encoder->SetSrcSubReg(1, 0);
8219 m_encoder->SetSrcSubReg(2, 3);
8220 ContextSwitchPayloadSection();
8221 m_encoder->Mad(floatPixelPositionDeltaY, floatPixelPositionDeltaY, delta, delta);
8222 m_encoder->Push();
8223 }
8224 // (x - xstart)*z_cx + (y - ystart)*z_cy + z_c0
8225 {
8226 {
8227 m_encoder->SetSrcRegion(1, 0, 1, 0);
8228 }
8229 m_encoder->SetSrcSubReg(1, 1);
8230 m_encoder->Mad(m_destination, floatPixelPositionDeltaX, delta, floatPixelPositionDeltaY);
8231 m_encoder->Push();
8232 }
8233 ContextSwitchShaderBody();
8234 }
8235 else
8236 {
8237 CopySGV(dst, psProgram->GetPositionZ());
8238 }
8239 break;
8240 }
8241 case POSITION_W:
8242 {
8243 CopySGV(dst, psProgram->GetPositionW());
8244 break;
8245 }
8246 case POSITION_X_OFFSET:
8247 case POSITION_Y_OFFSET:
8248 {
8249 // Returns the payload register containing the PS XY position offsets.
8250 CVariable* pPositionXYOffset = psProgram->GetPositionXYOffset();
8251 // Access the correct subregion for the interleaved X/Y values, per the spec.
8252 m_encoder->SetSrcRegion(0, 16, 8, 2);
8253 m_encoder->SetSrcSubReg(0, usage == POSITION_X_OFFSET ? 0 : 1);
8254
8255 // U4.4 encoding: the upper 4 bits hold the integer part and the lower 4 bits the fractional part.
8256 // Extract the integer part by ANDing with 11110000b and shifting right by 4 bits.
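// Worked example (illustrative value): a raw byte of 0x28 decodes as 2 + 8/16 = 2.5.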
8257 CVariable* intVal_B = m_currShader->GetNewVariable(
8258 numLanes(m_currShader->m_SIMDSize),
8259 ISA_TYPE_B,
8260 EALIGN_GRF,
8261 CName::NONE);
8262 m_encoder->And(intVal_B, pPositionXYOffset, m_currShader->ImmToVariable(0xf0, ISA_TYPE_B));
8263 m_encoder->Push();
8264
8265 CVariable* intVal_F = m_currShader->GetNewVariable(
8266 numLanes(m_currShader->m_SIMDSize),
8267 ISA_TYPE_F,
8268 EALIGN_GRF,
8269 CName::NONE);
8270 m_encoder->Shr(intVal_B, intVal_B, m_currShader->ImmToVariable(0x04, ISA_TYPE_B));
8271 m_encoder->Cast(intVal_F, intVal_B);
8272 m_encoder->Push();
8273
8274 // Extract the fractional part by ANDing with 00001111b and dividing by 16.
8275 CVariable* deciVal_B = m_currShader->GetNewVariable(
8276 numLanes(m_currShader->m_SIMDSize),
8277 ISA_TYPE_B,
8278 EALIGN_GRF,
8279 CName::NONE);
8280
8281 m_encoder->SetSrcRegion(0, 16, 8, 2);
8282 m_encoder->SetSrcSubReg(0, usage == POSITION_X_OFFSET ? 0 : 1);
8283 m_encoder->And(deciVal_B, pPositionXYOffset, m_currShader->ImmToVariable(0x0f, ISA_TYPE_B));
8284 m_encoder->Push();
8285
8286 CVariable* deciVal_F = m_currShader->GetNewVariable(
8287 numLanes(m_currShader->m_SIMDSize),
8288 ISA_TYPE_F,
8289 EALIGN_GRF,
8290 CName::NONE);
8291 m_encoder->Cast(deciVal_F, deciVal_B);
8292 // Divide the lower 4-bit fractional value by 16, implemented as a cheaper Mul by 0.0625f.
8293 CVariable* temp = m_currShader->GetNewVariable(
8294 numLanes(m_currShader->m_SIMDSize),
8295 ISA_TYPE_F,
8296 EALIGN_GRF,
8297 CName::NONE);
8298 m_encoder->Mul(temp, deciVal_F, m_currShader->ImmToVariable(0x3d800000, ISA_TYPE_F));
8299 m_encoder->Push();
8300
8301 // Add the fractional and integer parts to compute the PS XY offset value.
8302 m_encoder->Add(dst, intVal_F, temp);
8303 m_encoder->Push();
8304 }
8305 break;
8306 case RENDER_TARGET_ARRAY_INDEX:
8307 case VIEWPORT_INDEX:
8308 case VFACE:
8309 {
8310 // VFACE in the shader's payload is one bit: 0/1 for front/back facing, respectively.
8311 // As it is the sign bit of the R1.2:w value in the payload, the value may simply be
8312 // converted to float and written to dst: float(VFACE) >= 0 (< 0) means front (back) facing.
8313 {
8314 unsigned int numTri = 1;
8315 SIMDMode simdSize = psProgram->m_SIMDSize;
8316 CVariable* reg = psProgram->GetR0();
8317 unsigned int subReg = 0;
8318 if (usage == VFACE)
8319 {
8320 for (unsigned int i = 0; i < numTri; i++)
8321 {
8322 CVariable* src = m_currShader->BitCast(reg, ISA_TYPE_W);
8323 m_encoder->SetSrcSubReg(0, 2 * (subReg + i * 5));
8324 m_encoder->SetSrcRegion(0, 0, 1, 0);
8325 m_encoder->SetSimdSize(simdSize);
8326 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
8327 m_encoder->SetDstSubVar(i);
8328 if (m_encoder->IsCodePatchCandidate())
8329 {
8330 psProgram->AppendR1Lo(src);
8331 }
8332 ContextSwitchPayloadSection(i == 0);
8333 m_encoder->Cast(dst, src);
8334 m_encoder->Push();
8335 ContextSwitchShaderBody(i == numTri);
8336 }
8337 }
8338 else if (usage == RENDER_TARGET_ARRAY_INDEX)
8339 {
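// Per the shift/mask below, the render target array index occupies bits [26:16] of R0.0 (11 bits).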
8340 dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8341 CVariable* temp = m_currShader->GetNewVariable(dst);
8342 for (unsigned int i = 0; i < numTri; i++)
8343 {
8344 m_encoder->SetSrcRegion(0, 0, 1, 0);
8345 m_encoder->SetSrcSubReg(0, subReg + i * 5);
8346 m_encoder->SetSimdSize(simdSize);
8347 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
8348 m_encoder->SetDstSubVar(i);
8349 m_encoder->Shr(temp, reg, m_currShader->ImmToVariable(16, ISA_TYPE_UD));
8350 m_encoder->Push();
8351 }
8352 m_encoder->And(dst, temp, m_currShader->ImmToVariable(BITMASK(11), ISA_TYPE_UD));
8353 m_encoder->Push();
8354 }
8355 else if (usage == VIEWPORT_INDEX)
8356 {
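// Per the shift/mask below, the viewport index occupies bits [30:27] of R0.0 (4 bits).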
8357 dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8358 CVariable* temp = m_currShader->GetNewVariable(dst);
8359 for (unsigned int i = 0; i < numTri; i++)
8360 {
8361 m_encoder->SetSrcRegion(0, 0, 1, 0);
8362 m_encoder->SetSrcSubReg(0, subReg + i * 5);
8363 m_encoder->SetSimdSize(simdSize);
8364 m_encoder->SetMask(i == 0 ? EMASK_Q1 : EMASK_Q2);
8365 m_encoder->SetDstSubVar(i);
8366 m_encoder->Shr(temp, reg, m_currShader->ImmToVariable(27, ISA_TYPE_UD));
8367 m_encoder->Push();
8368 }
8369 m_encoder->And(dst, temp, m_currShader->ImmToVariable(BITMASK(4), ISA_TYPE_UD));
8370 m_encoder->Push();
8371 }
8372 }
8373 }
8374 break;
8375 case SAMPLEINDEX:
8376 {
8377 // The sample index is stored in one half-byte per subspan. We shift each
8378 // lane right by a different amount to extract the right value for each subspan:
8379 // shr (8) r9.0<1>:uw r1.0<0;1,0>:uw 0x0c080400:uv { Align1, NoMask, Q1 }
8380 // and (16) r9.0<1>:ud r9.0<1;4,0>:ud 0x0000000f:uw { Align1, Q1 }
8381 CVariable* shiftPos = m_currShader->ImmToVariable(0x0C080400, ISA_TYPE_UV);
8382 CVariable* temp = nullptr;
8383 {
8384 CVariable* r1 = m_currShader->BitCast(psProgram->GetR1(), ISA_TYPE_UW);
8385 temp = m_currShader->GetNewVariable(8, ISA_TYPE_UW, EALIGN_GRF,
8386 "SampleIndexExtracted");
8387 m_encoder->SetSrcRegion(0, 0, 1, 0);
8388 m_encoder->SetSimdSize(SIMDMode::SIMD8);
8389 m_encoder->SetNoMask();
8390 m_encoder->Shr(temp, r1, shiftPos);
8391 m_encoder->Push();
8392 }
8393
8394 CVariable* andMask = m_currShader->ImmToVariable(0x0000000F, ISA_TYPE_UD);
8395 dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8396 temp = m_currShader->BitCast(temp, ISA_TYPE_UD);
8397 m_encoder->SetSrcRegion(0, 1, 4, 0);
8398 m_encoder->And(dst, temp, andMask);
8399 m_encoder->Push();
8400 }
8401 break;
8402 case INPUT_COVERAGE_MASK:
8403 {
8404 CVariable* pInputCoverageMask = psProgram->GetInputCoverageMask();
8405 CopySGV(dst, pInputCoverageMask);
8406 }
8407 break;
8408 case ACTUAL_COARSE_SIZE_X:
8409 case ACTUAL_COARSE_SIZE_Y:
8410 {
8411 getCoarsePixelSize(m_destination, (usage == ACTUAL_COARSE_SIZE_X ? 0 : 1));
8412 }
8413 break;
8414 case REQUESTED_COARSE_SIZE_X:
8415 case REQUESTED_COARSE_SIZE_Y:
8416 {
8417 CVariable* requestedSize = (usage == REQUESTED_COARSE_SIZE_X) ?
8418 psProgram->GetCPSRequestedSizeX() : psProgram->GetCPSRequestedSizeY();
8419 m_encoder->SetSrcRegion(0, 1, 4, 0);
8420 m_encoder->Cast(m_destination, requestedSize);
8421 m_encoder->Push();
8422 }
8423 break;
8424 case MSAA_RATE:
8425 {
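// Per the subregister and mask below, the MSAA rate is read from the low 4 bits of the second word of R1 (R1.1:w).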
8426 dst = m_currShader->BitCast(dst, ISA_TYPE_UD);
8427 m_encoder->SetSrcRegion(0, 0, 1, 0);
8428 CVariable* r;
8429 {
8430 m_encoder->SetSrcSubReg(0, 1);
8431 r = psProgram->GetR1();
8432 }
8433 m_encoder->And(
8434 dst,
8435 m_currShader->BitCast(r, ISA_TYPE_UW),
8436 m_currShader->ImmToVariable(BITMASK(4), ISA_TYPE_UW));
8437 m_encoder->Push();
8438 }
8439 break;
8440
8441 default:
8442 IGC_ASSERT(0);
8443 break;
8444 }
8445
8446 psProgram->DeclareSGV(usage);
8447 }
8448
8449 void EmitPass::emitDSSGV(llvm::GenIntrinsicInst* pInst)
8450 {
8451 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::DOMAIN_SHADER);
8452 CDomainShader* dsProgram = static_cast<CDomainShader*>(m_currShader);
8453 SGVUsage usage = static_cast<SGVUsage>(llvm::dyn_cast<llvm::ConstantInt>(pInst->getOperand(0))->getZExtValue());
8454 if (PRIMITIVEID == usage)
8455 {
8456 if (dsProgram->m_ShaderDispatchMode == ShaderDispatchMode::DUAL_PATCH)
8457 {
8458 m_encoder->SetSrcRegion(0, 4, 4, 0);
8459 m_encoder->SetSrcSubReg(0, 0);
8460 m_encoder->Copy(m_destination, dsProgram->GetPrimitiveID());
8461 m_encoder->Push();
8462 }
8463 else
8464 {
8465 m_encoder->SetSrcRegion(0, 0, 1, 0);
8466 m_encoder->SetSrcSubReg(0, 1);
8467 m_encoder->Copy(m_destination, dsProgram->GetR0());
8468 m_encoder->Push();
8469 }
8470 }
8471 }
8472
8473 void EmitPass::emitHSSGV(llvm::GenIntrinsicInst* pInst)
8474 {
8475 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::HULL_SHADER);
8476 CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
8477 SGVUsage usage = static_cast<SGVUsage>(llvm::dyn_cast<llvm::ConstantInt>(pInst->getOperand(0))->getZExtValue());
8478 if (PRIMITIVEID == usage)
8479 {
8480 if (hsProgram->GetShaderDispatchMode() == SINGLE_PATCH_DISPATCH_MODE)
8481 {
8482 m_encoder->SetSrcRegion(0, 0, 1, 0);
8483 m_encoder->SetSrcSubReg(0, 1);
8484 m_encoder->Copy(m_destination, hsProgram->GetR0());
8485 m_encoder->Push();
8486 }
8487 else
8488 {
8489 // eight patch dispatch mode
8490 m_encoder->Copy(m_destination, hsProgram->GetR2());
8491 m_encoder->Push();
8492 }
8493 }
8494 else
8495 {
8496 IGC_ASSERT_MESSAGE(0, "Hull Shader SGV not supported");
8497 }
8498 }
8499
8500
8501 // Store integer pixel position in the destination variable.
8502 // Only X and Y components are handled here!
8503 void EmitPass::getPixelPosition(CVariable* destination, const uint component, bool isCodePatchCandidate)
8504 {
8505 IGC_ASSERT(component < 2);
8506 IGC_ASSERT(nullptr != destination);
8507 IGC_ASSERT(nullptr != m_encoder);
8508 IGC_ASSERT(m_encoder->IsIntegerType(destination->GetType()));
8509
8510 const bool getX = (component == 0);
8511
8512 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
8513 CVariable* imm = m_currShader->ImmToVariable(
8514 getX ? 0x10101010 : 0x11001100, ISA_TYPE_V, isCodePatchCandidate);
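// The packed :v immediate encodes eight signed 4-bit lane offsets (low nibble first):
// 0x10101010 decodes to X deltas {0,1,0,1,...} and 0x11001100 to Y deltas {0,0,1,1,...},
// i.e. the per-pixel offsets within each 2x2 subspan.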
8515 CVariable* pixelSize = nullptr;
8516 if (psProgram->GetPhase() == PSPHASE_COARSE)
8517 {
8518 // Coarse pixel sizes are in R1 for both simd32 halves.
8519 CVariable* r;
8520 bool isR1Lo = false;
8521 {
8522 r = m_currShader->GetVarHalf(psProgram->GetR1(), 0);
8523 isR1Lo = true;
8524 }
8525 CVariable* CPSize = m_currShader->BitCast(r, ISA_TYPE_UB);
8526 if (isR1Lo && isCodePatchCandidate)
8527 {
8528 psProgram->AppendR1Lo(CPSize);
8529 }
8530 pixelSize =
8531 m_currShader->GetNewVariable(
8532 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW, EALIGN_GRF, CName::NONE);
8533 m_encoder->SetSrcRegion(0, 0, 1, 0);
8534 uint subReg;
8535 {
8536 subReg = getX ? 0 : 1;
8537 }
8538 m_encoder->SetSrcSubReg(0, subReg);
8539 if (isCodePatchCandidate)
8540 {
8541 m_encoder->SetPayloadSectionAsPrimary();
8542 m_currShader->AddPatchTempSetup(pixelSize);
8543 }
8544 m_encoder->Mul(pixelSize, CPSize, imm);
8545 m_encoder->Push();
8546 if (isCodePatchCandidate)
8547 {
8548 m_encoder->SetPayloadSectionAsSecondary();
8549 }
8550 }
8551 else
8552 {
8553 pixelSize = imm;
8554 }
8555
8556 {
8557 CVariable* position = m_currShader->BitCast(psProgram->GetR1(), ISA_TYPE_UW);
8558 // subreg 4 as position_x and subreg 5 as position_y
8559 m_encoder->SetSrcSubReg(0, getX ? 4 : 5);
8560 m_encoder->SetSrcRegion(0, 2, 4, 0);
8561 if (isCodePatchCandidate)
8562 {
8563 m_encoder->SetPayloadSectionAsPrimary();
8564 psProgram->AppendR1Lo(position);
8565 m_currShader->AddPatchTempSetup(destination);
8566 }
8567 m_encoder->Add(destination, position, pixelSize);
8568 m_encoder->Push();
8569 if (isCodePatchCandidate)
8570 {
8571 m_encoder->SetPayloadSectionAsSecondary();
8572 }
8573 }
8574 }
8575
8576
8577 void EmitPass::emitPixelPosition(llvm::GenIntrinsicInst* inst)
8578 {
8579 const GenISAIntrinsic::ID IID = inst->getIntrinsicID();
8580 const uint component = IID == GenISAIntrinsic::GenISA_PixelPositionX ? 0 : 1;
8581 getPixelPosition(m_destination, component);
8582 }
8583
8584 void EmitPass::emitSGV(SGVIntrinsic* inst)
8585 {
8586 switch (m_currShader->GetShaderType())
8587 {
8588 case ShaderType::PIXEL_SHADER:
8589 emitPSSGV(inst);
8590 break;
8591 case ShaderType::COMPUTE_SHADER:
8592 emitCSSGV(inst);
8593 break;
8594 case ShaderType::DOMAIN_SHADER:
8595 emitDSSGV(inst);
8596 break;
8597 case ShaderType::HULL_SHADER:
8598 emitHSSGV(inst);
8599 break;
8600 case ShaderType::GEOMETRY_SHADER:
8601 emitGS_SGV(inst);
8602 break;
8603 default:
8604 IGC_ASSERT_MESSAGE(0, "This shader should not have SGV");
8605 break;
8606 }
8607 }
8608
8609 void EmitPass::emitAluNoModifier(llvm::GenIntrinsicInst* inst)
8610 {
8611 CVariable* pSrc0 = GetSymbol(inst->getOperand(0));
8612 CVariable* pSrc1;
8613 CVariable* pSrc2;
8614 CVariable* dst;
8615
8616 switch (inst->getIntrinsicID())
8617 {
8618 case GenISAIntrinsic::GenISA_bfi:
8619 {
8620 pSrc1 = GetSymbol(inst->getOperand(1));
8621 pSrc2 = GetSymbol(inst->getOperand(2));
8622 CVariable* pSrc3 = GetSymbol(inst->getOperand(3));
8623 m_encoder->Bfi(m_destination, pSrc0, pSrc1, pSrc2, pSrc3);
8624 }
8625 break;
8626 case GenISAIntrinsic::GenISA_ibfe:
8627 pSrc1 = GetSymbol(inst->getOperand(1));
8628 pSrc2 = GetSymbol(inst->getOperand(2));
8629 m_encoder->Bfe(m_destination, pSrc0, pSrc1, pSrc2);
8630 break;
8631 case GenISAIntrinsic::GenISA_ubfe:
8632 pSrc1 = GetSymbol(inst->getOperand(1));
8633 pSrc2 = GetSymbol(inst->getOperand(2));
8634 pSrc0 = m_currShader->BitCast(pSrc0, ISA_TYPE_UD);
8635 pSrc1 = m_currShader->BitCast(pSrc1, ISA_TYPE_UD);
8636 pSrc2 = m_currShader->BitCast(pSrc2, ISA_TYPE_UD);
8637 dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8638 m_encoder->Bfe(dst, pSrc0, pSrc1, pSrc2);
8639 break;
8640 case GenISAIntrinsic::GenISA_firstbitLo:
8641 dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8642 m_encoder->Fbl(dst, pSrc0);
8643 break;
8644 case GenISAIntrinsic::GenISA_firstbitHi:
8645 pSrc0 = m_currShader->BitCast(pSrc0, ISA_TYPE_UD);
8646 dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8647 m_encoder->Fbh(dst, pSrc0);
8648 break;
8649 case GenISAIntrinsic::GenISA_firstbitShi:
8650 dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
8651 m_encoder->Fbh(dst, pSrc0);
8652 break;
8653 default:
8654 break;
8655 }
8656 m_encoder->Push();
8657 }
8658
8659 void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
8660 {
8661 switch (inst->getIntrinsicID())
8662 {
8663 case GenISAIntrinsic::GenISA_OUTPUT:
8664 emitOutput(inst);
8665 break;
8666 case GenISAIntrinsic::GenISA_RTWrite:
8667 emitRenderTargetWrite(cast<RTWritIntrinsic>(inst), false);
8668 break;
8669 case GenISAIntrinsic::GenISA_RTDualBlendSource:
8670 emitDualBlendRT(cast<RTDualBlendSourceIntrinsic>(inst), false);
8671 break;
8672 case GenISAIntrinsic::GenISA_simdLaneId:
8673 emitSimdLaneId(inst);
8674 break;
8675 case GenISAIntrinsic::GenISA_patchInstanceId:
8676 emitPatchInstanceId(inst);
8677 break;
8678 case GenISAIntrinsic::GenISA_simdSize:
8679 emitSimdSize(inst);
8680 break;
8681 case GenISAIntrinsic::GenISA_simdShuffleDown:
8682 emitSimdShuffleDown(inst);
8683 break;
8684 case GenISAIntrinsic::GenISA_simdBlockRead:
8685 emitSimdBlockRead(inst);
8686 break;
8687 case GenISAIntrinsic::GenISA_simdBlockReadBindless:
8688 emitSimdBlockRead(inst, inst->getOperand(1));
8689 break;
8690 case GenISAIntrinsic::GenISA_simdBlockWrite:
8691 emitSimdBlockWrite(inst);
8692 break;
8693 case GenISAIntrinsic::GenISA_simdBlockWriteBindless:
8694 emitSimdBlockWrite(inst, inst->getOperand(2));
8695 break;
8696 case GenISAIntrinsic::GenISA_MediaBlockRead:
8697 emitMediaBlockIO(inst, true);
8698 break;
8699 case GenISAIntrinsic::GenISA_MediaBlockWrite:
8700 emitMediaBlockIO(inst, false);
8701 break;
8702 case GenISAIntrinsic::GenISA_MediaBlockRectangleRead:
8703 emitMediaBlockRectangleRead(inst);
8704 break;
8705 case GenISAIntrinsic::GenISA_simdMediaBlockRead:
8706 emitSimdMediaBlockRead(inst);
8707 break;
8708 case GenISAIntrinsic::GenISA_simdMediaBlockWrite:
8709 emitSimdMediaBlockWrite(inst);
8710 break;
8711 case GenISAIntrinsic::GenISA_frc:
8712 emitFrc(inst);
8713 break;
8714 case GenISAIntrinsic::GenISA_RenderTargetRead:
8715 case GenISAIntrinsic::GenISA_RenderTargetReadSampleFreq:
8716 emitRenderTargetRead(inst);
8717 break;
8718 case GenISAIntrinsic::GenISA_URBWrite:
8719 emitURBWrite(inst);
8720 break;
8721 case GenISAIntrinsic::GenISA_URBRead:
8722 case GenISAIntrinsic::GenISA_URBReadOutput:
8723 emitURBRead(inst);
8724 break;
8725 case GenISAIntrinsic::GenISA_cycleCounter:
8726 emitcycleCounter(inst);
8727 break;
8728 case GenISAIntrinsic::GenISA_SetDebugReg:
8729 emitSetDebugReg(inst);
8730 break;
8731 case GenISAIntrinsic::GenISA_vmeSendIME:
8732 emitVMESendIME(inst);
8733 break;
8734 case GenISAIntrinsic::GenISA_vmeSendIME2:
8735 emitVMESendIME2(inst);
8736 break;
8737 case GenISAIntrinsic::GenISA_vmeSendFBR:
8738 emitVMESendFBR(inst);
8739 break;
8740 case GenISAIntrinsic::GenISA_vmeSendFBR2:
8741 emitVMESendFBR2(inst);
8742 break;
8743 case GenISAIntrinsic::GenISA_vmeSendSIC2:
8744 emitVMESendSIC2(inst);
8745 break;
8746 case GenISAIntrinsic::GenISA_vmeSendSIC:
8747 emitVMESendSIC(inst);
8748 break;
8749 case GenISAIntrinsic::GenISA_vaErode:
8750 case GenISAIntrinsic::GenISA_vaDilate:
8751 case GenISAIntrinsic::GenISA_vaMinMax:
8752 emitVideoAnalyticSLM(inst, 1);
8753 break;
8754 case GenISAIntrinsic::GenISA_vaMinMaxFilter:
8755 emitVideoAnalyticSLM(inst, 8);
8756 break;
8757 case GenISAIntrinsic::GenISA_vaConvolve:
8758 case GenISAIntrinsic::GenISA_vaCentroid:
8759 emitVideoAnalyticSLM(inst, 4);
8760 break;
8761 case GenISAIntrinsic::GenISA_vaConvolveGRF_16x1:
8762 case GenISAIntrinsic::GenISA_vaConvolveGRF_16x4:
8763 emitVideoAnalyticGRF(inst, 1);
8764 break;
8765 case GenISAIntrinsic::GenISA_vaBoolSum:
8766 case GenISAIntrinsic::GenISA_vaBoolCentroid:
8767 emitVideoAnalyticSLM(inst, 2);
8768 break;
8769 case GenISAIntrinsic::GenISA_createMessagePhasesNoInit:
8770 case GenISAIntrinsic::GenISA_createMessagePhasesNoInitV:
8771 break;
8772 case GenISAIntrinsic::GenISA_createMessagePhases:
8773 case GenISAIntrinsic::GenISA_createMessagePhasesV:
8774 emitCreateMessagePhases(inst);
8775 break;
8776 case GenISAIntrinsic::GenISA_getMessagePhaseX:
8777 case GenISAIntrinsic::GenISA_getMessagePhaseXV:
8778 emitGetMessagePhaseX(inst);
8779 break;
8780 case GenISAIntrinsic::GenISA_simdGetMessagePhase:
8781 case GenISAIntrinsic::GenISA_simdGetMessagePhaseV:
8782 emitSimdGetMessagePhase(inst);
8783 break;
8784 case GenISAIntrinsic::GenISA_broadcastMessagePhase:
8785 case GenISAIntrinsic::GenISA_broadcastMessagePhaseV:
8786 emitBroadcastMessagePhase(inst);
8787 return;
8788 case GenISAIntrinsic::GenISA_simdSetMessagePhase:
8789 case GenISAIntrinsic::GenISA_simdSetMessagePhaseV:
8790 emitSimdSetMessagePhase(inst);
8791 break;
8792 case GenISAIntrinsic::GenISA_simdMediaRegionCopy:
8793 emitSimdMediaRegionCopy(inst);
8794 break;
8795 case GenISAIntrinsic::GenISA_extractMVAndSAD:
8796 emitExtractMVAndSAD(inst);
8797 break;
8798 case GenISAIntrinsic::GenISA_cmpSADs:
8799 emitCmpSADs(inst);
8800 break;
8801 case GenISAIntrinsic::GenISA_setMessagePhaseX_legacy:
8802 emitSetMessagePhaseX_legacy(inst);
8803 break;
8804 case GenISAIntrinsic::GenISA_setMessagePhase_legacy:
8805 emitSetMessagePhase_legacy(inst);
8806 break;
8807 case GenISAIntrinsic::GenISA_setMessagePhaseX:
8808 case GenISAIntrinsic::GenISA_setMessagePhaseXV:
8809 emitSetMessagePhaseX(inst);
8810 break;
8811 case GenISAIntrinsic::GenISA_getMessagePhase:
8812 case GenISAIntrinsic::GenISA_getMessagePhaseV:
8813 emitGetMessagePhase(inst);
8814 break;
8815 case GenISAIntrinsic::GenISA_setMessagePhase:
8816 case GenISAIntrinsic::GenISA_setMessagePhaseV:
8817 emitSetMessagePhase(inst);
8818 break;
8819 case GenISAIntrinsic::GenISA_DCL_ShaderInputVec:
8820 case GenISAIntrinsic::GenISA_DCL_inputVec:
8821 emitInput(inst);
8822 break;
8823 case GenISAIntrinsic::GenISA_PullSampleIndexBarys:
8824 case GenISAIntrinsic::GenISA_PullSnappedBarys:
8825 case GenISAIntrinsic::GenISA_PullCentroidBarys:
8826 emitEvalAttribute(inst);
8827 break;
8828 case GenISAIntrinsic::GenISA_Interpolate:
8829 emitInterpolate(inst);
8830 break;
8831 case GenISAIntrinsic::GenISA_Interpolate2:
8832 emitInterpolate2(inst);
8833 break;
8834 case GenISAIntrinsic::GenISA_Interpolant:
8835 emitInterpolant(inst);
8836 break;
8837 case GenISAIntrinsic::GenISA_DCL_DSCntrlPtInputVec:
8838 emitInput(inst);
8839 break;
8840 case GenISAIntrinsic::GenISA_ldptr:
8841 emitLdInstruction(inst);
8842 break;
8843 case GenISAIntrinsic::GenISA_sampleptr:
8844 case GenISAIntrinsic::GenISA_sampleBptr:
8845 case GenISAIntrinsic::GenISA_sampleCptr:
8846 case GenISAIntrinsic::GenISA_sampleDptr:
8847 case GenISAIntrinsic::GenISA_sampleDCptr:
8848 case GenISAIntrinsic::GenISA_sampleLptr:
8849 case GenISAIntrinsic::GenISA_sampleLCptr:
8850 case GenISAIntrinsic::GenISA_sampleBCptr:
8851 case GenISAIntrinsic::GenISA_lodptr:
8852 case GenISAIntrinsic::GenISA_sampleKillPix:
8853 emitSampleInstruction(cast<SampleIntrinsic>(inst));
8854 break;
8855 case GenISAIntrinsic::GenISA_discard:
8856 emitDiscard(inst);
8857 break;
8858 case GenISAIntrinsic::GenISA_resinfoptr:
8859 case GenISAIntrinsic::GenISA_sampleinfoptr:
8860 emitInfoInstruction(cast<InfoIntrinsic>(inst));
8861 break;
8862 case GenISAIntrinsic::GenISA_gather4ptr:
8863 case GenISAIntrinsic::GenISA_gather4Cptr:
8864 case GenISAIntrinsic::GenISA_gather4POptr:
8865 case GenISAIntrinsic::GenISA_gather4POCptr:
8866 emitGather4Instruction(cast<SamplerGatherIntrinsic>(inst));
8867 break;
8868 case GenISAIntrinsic::GenISA_ldmcsptr:
8869 case GenISAIntrinsic::GenISA_ldmsptr:
8870 case GenISAIntrinsic::GenISA_ldmsptr16bit:
8871 emitLdmsInstruction(inst);
8872 break;
8873 case GenISAIntrinsic::GenISA_DCL_SystemValue:
8874 emitSGV(cast<SGVIntrinsic>(inst));
8875 break;
8876 case GenISAIntrinsic::GenISA_PixelPositionX:
8877 case GenISAIntrinsic::GenISA_PixelPositionY:
8878 emitPixelPosition(inst);
8879 break;
8880 case GenISAIntrinsic::GenISA_DCL_GSsystemValue:
8881 emitGS_SGV(cast<SGVIntrinsic>(inst));
8882 break;
8883 case GenISAIntrinsic::GenISA_SampleOffsetX:
8884 case GenISAIntrinsic::GenISA_SampleOffsetY:
8885 emitSampleOffset(inst);
8886 break;
8887 case GenISAIntrinsic::GenISA_typedread:
8888 emitTypedRead(inst);
8889 break;
8890 case GenISAIntrinsic::GenISA_typedwrite:
8891 emitTypedWrite(inst);
8892 break;
8893 case GenISAIntrinsic::GenISA_threadgroupbarrier:
8894 case GenISAIntrinsic::GenISA_threadgroupbarrier_signal:
8895 case GenISAIntrinsic::GenISA_threadgroupbarrier_wait:
8896 emitThreadGroupBarrier(inst);
8897 break;
8898 case GenISAIntrinsic::GenISA_memoryfence:
8899 emitMemoryFence(inst);
8900 break;
8901 case GenISAIntrinsic::GenISA_flushsampler:
8902 emitFlushSamplerCache();
8903 break;
8904 case GenISAIntrinsic::GenISA_typedmemoryfence:
8905 emitTypedMemoryFence(inst);
8906 break;
8907 case GenISAIntrinsic::GenISA_assume_uniform:
8908 // nothing to do
8909 break;
8910 case GenISAIntrinsic::GenISA_intatomicraw:
8911 case GenISAIntrinsic::GenISA_floatatomicraw:
8912 case GenISAIntrinsic::GenISA_intatomicrawA64:
8913 case GenISAIntrinsic::GenISA_floatatomicrawA64:
8914 case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
8915 case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
8916 case GenISAIntrinsic::GenISA_icmpxchgatomicrawA64:
8917 case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
8918 emitAtomicRaw(inst);
8919 break;
8920 case GenISAIntrinsic::GenISA_intatomictyped:
8921 case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
8922 emitAtomicTyped(inst);
8923 break;
8924 case GenISAIntrinsic::GenISA_atomiccounterinc:
8925 case GenISAIntrinsic::GenISA_atomiccounterpredec:
8926 emitAtomicCounter(inst);
8927 break;
8928 case GenISAIntrinsic::GenISA_bfi:
8929 case GenISAIntrinsic::GenISA_ubfe:
8930 case GenISAIntrinsic::GenISA_ibfe:
8931 case GenISAIntrinsic::GenISA_firstbitLo:
8932 case GenISAIntrinsic::GenISA_firstbitHi:
8933 case GenISAIntrinsic::GenISA_firstbitShi:
8934 emitAluNoModifier(inst);
8935 break;
8936 case GenISAIntrinsic::GenISA_OutputTessFactors:
8937 emitHSTessFactors(inst);
8938 break;
8939 case GenISAIntrinsic::GenISA_f32tof16_rtz:
8940 emitf32tof16_rtz(inst);
8941 break;
8942 case GenISAIntrinsic::GenISA_ftoi_rtn:
8943 case GenISAIntrinsic::GenISA_ftoi_rtp:
8944 case GenISAIntrinsic::GenISA_ftoi_rte:
8945 case GenISAIntrinsic::GenISA_ftoui_rtn:
8946 case GenISAIntrinsic::GenISA_ftoui_rtp:
8947 case GenISAIntrinsic::GenISA_ftoui_rte:
8948 emitftoi(inst);
8949 break;
8950 case GenISAIntrinsic::GenISA_itof_rtn:
8951 case GenISAIntrinsic::GenISA_itof_rtp:
8952 case GenISAIntrinsic::GenISA_itof_rtz:
8953 case GenISAIntrinsic::GenISA_uitof_rtn:
8954 case GenISAIntrinsic::GenISA_uitof_rtp:
8955 case GenISAIntrinsic::GenISA_uitof_rtz:
8956 case GenISAIntrinsic::GenISA_ftof_rte:
8957 case GenISAIntrinsic::GenISA_ftof_rtn:
8958 case GenISAIntrinsic::GenISA_ftof_rtp:
8959 case GenISAIntrinsic::GenISA_ftof_rtz:
8960 emitfitof(inst);
8961 break;
8962 case GenISAIntrinsic::GenISA_ftobf:
8963 case GenISAIntrinsic::GenISA_bftof:
8964 case GenISAIntrinsic::GenISA_2fto2bf:
8965 emitfcvt(inst);
8966 break;
8967 case GenISAIntrinsic::GenISA_uavSerializeAll:
8968 case GenISAIntrinsic::GenISA_uavSerializeOnResID:
8969 emitUAVSerialize();
8970 break;
8971 case GenISAIntrinsic::GenISA_globalSync:
8972 emitMemoryFence();
8973 break;
8974 case GenISAIntrinsic::GenISA_PHASE_OUTPUT:
8975 case GenISAIntrinsic::GenISA_PHASE_OUTPUTVEC:
8976 emitPhaseOutput(inst);
8977 break;
8978 case GenISAIntrinsic::GenISA_PHASE_INPUT:
8979 case GenISAIntrinsic::GenISA_PHASE_INPUTVEC:
8980 emitPhaseInput(inst);
8981 break;
8982 case GenISAIntrinsic::GenISA_ldrawvector_indexed:
8983 case GenISAIntrinsic::GenISA_ldraw_indexed:
8984 emitLoadRawIndexed(
8985 cast<LdRawIntrinsic>(inst),
8986 cast<LdRawIntrinsic>(inst)->getOffsetValue(),
8987 nullptr);
8988 break;
8989 case GenISAIntrinsic::GenISA_storerawvector_indexed:
8990 case GenISAIntrinsic::GenISA_storeraw_indexed:
8991 emitStoreRawIndexed(
8992 cast<StoreRawIntrinsic>(inst),
8993 cast<StoreRawIntrinsic>(inst)->getOffsetValue(),
8994 nullptr);
8995 break;
8996 case GenISAIntrinsic::GenISA_GetBufferPtr:
8997 emitGetBufferPtr(inst);
8998 break;
8999 case GenISAIntrinsic::GenISA_readsurfaceinfoptr:
9000 emitSurfaceInfo(inst);
9001 break;
9002 case GenISAIntrinsic::GenISA_mov_identity:
9003 {
9004 // Use Or instead of a Copy, as VISA will remove redundant movs.
9005 auto Var = GetSymbol(inst->getOperand(0));
9006 CVariable* Zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
9007 m_encoder->Or(Var, Var, Zero);
9008 m_encoder->Push();
9009 break;
9010 }
9011 case GenISAIntrinsic::GenISA_source_value:
9012 {
9013 m_encoder->Copy(m_currShader->GetNULL(), GetSymbol(inst->getOperand(0)));
9014 m_encoder->Push();
9015 break;
9016 }
9017 case GenISAIntrinsic::GenISA_movcr:
9018 {
9019 m_encoder->SetSrcSubReg(0, static_cast<uint16_t>(GetImmediateVal(inst->getOperand(0))));
9020 m_encoder->Copy(m_destination, m_currShader->GetCR0());
9021 m_encoder->Push();
9022 break;
9023 }
9024 case GenISAIntrinsic::GenISA_hw_thread_id:
9025 case GenISAIntrinsic::GenISA_hw_thread_id_alloca:
9026 {
9027 m_encoder->Copy(m_destination, m_currShader->GetHWTID());
9028 m_encoder->Push();
9029 break;
9030 }
9031 case GenISAIntrinsic::GenISA_slice_id:
9032 {
9033 if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE ||
9034 m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN9_CORE)
9035 emitStateRegID(14, 15);
9036 else if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN12_CORE ||
9037 m_currShader->m_Platform->GetPlatformFamily() == IGFX_XE_HP_CORE)
9038 emitStateRegID(11, 13);
9039 else
9040 emitStateRegID(12, 14);
9041 break;
9042 }
9043 case GenISAIntrinsic::GenISA_subslice_id:
9044 {
9045 if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE ||
9046 m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN9_CORE)
9047 emitStateRegID(12, 13);
9048 else
9049 emitStateRegID(8, 8);
9050 break;
9051 }
9052 case GenISAIntrinsic::GenISA_dual_subslice_id:
9053 {
9054 if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN11_CORE ||
9055 m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN11LP_CORE ||
9056 m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN12LP_CORE)
9057 emitStateRegID(9, 11);
9058 else if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN12_CORE ||
9059 m_currShader->m_Platform->GetPlatformFamily() == IGFX_XE_HP_CORE)
9060 emitStateRegID(9, 10);
9061 else
9062 IGC_ASSERT_MESSAGE(0, "No support for Dual Subslice in current platform");
9063 break;
9064 }
9065 case GenISAIntrinsic::GenISA_eu_id:
9066 {
9067 if (m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE ||
9068 m_currShader->m_Platform->GetPlatformFamily() == IGFX_GEN9_CORE)
9069 emitStateRegID(8, 11);
9070 else
9071 emitStateRegID(4, 7);
9072 break;
9073 }
9074 case GenISAIntrinsic::GenISA_getSR0:
9075 {
9076 m_encoder->SetSrcSubReg(0, static_cast<uint16_t>(GetImmediateVal(inst->getOperand(0))));
9077 m_encoder->Copy(m_destination, m_currShader->GetSR0());
9078 m_encoder->Push();
9079 break;
9080 }
9081 case GenISAIntrinsic::GenISA_getSR0_0:
9082 {
9083 m_encoder->SetSrcSubReg(0, 0);
9084 m_encoder->Copy(m_destination, m_currShader->GetSR0());
9085 m_encoder->Push();
9086 break;
9087 }
9088 case GenISAIntrinsic::GenISA_eu_thread_id:
9089 emitStateRegID(0, 2);
9090 break;
9091 case GenISAIntrinsic::GenISA_eu_thread_pause:
9092 emitThreadPause(inst);
9093 break;
9094 case GenISAIntrinsic::GenISA_pair_to_ptr:
9095 emitPairToPtr(inst);
9096 break;
9097 case GenISAIntrinsic::GenISA_StackAlloca:
9098 emitStackAlloca(inst);
9099 break;
9100 case GenISAIntrinsic::GenISA_VLAStackAlloca:
9101 emitVLAStackAlloca(inst);
9102 break;
9103 case GenISAIntrinsic::GenISA_WaveBallot:
9104 emitWaveBallot(inst);
9105 break;
9106 case GenISAIntrinsic::GenISA_WaveInverseBallot:
9107 emitWaveInverseBallot(inst);
9108 break;
9109 case GenISAIntrinsic::GenISA_WaveShuffleIndex:
9110 emitSimdShuffle(inst);
9111 break;
9112 case GenISAIntrinsic::GenISA_WavePrefix:
9113 emitWavePrefix(cast<WavePrefixIntrinsic>(inst));
9114 break;
9115 case GenISAIntrinsic::GenISA_QuadPrefix:
9116 emitQuadPrefix(cast<QuadPrefixIntrinsic>(inst));
9117 break;
9118 case GenISAIntrinsic::GenISA_WaveAll:
9119 emitWaveAll(inst);
9120 break;
9121 case GenISAIntrinsic::GenISA_WaveClustered:
9122 emitWaveClustered(inst);
9123 break;
9124 case GenISAIntrinsic::GenISA_InitDiscardMask:
9125 emitInitDiscardMask(inst);
9126 break;
9127 case GenISAIntrinsic::GenISA_UpdateDiscardMask:
9128 emitUpdateDiscardMask(inst);
9129 break;
9130 case GenISAIntrinsic::GenISA_GetPixelMask:
9131 emitGetPixelMask(inst);
9132 break;
9133 case GenISAIntrinsic::GenISA_dp4a_ss:
9134 case GenISAIntrinsic::GenISA_dp4a_uu:
9135 case GenISAIntrinsic::GenISA_dp4a_su:
9136 case GenISAIntrinsic::GenISA_dp4a_us:
9137 emitDP4A(inst);
9138 break;
9139 case GenISAIntrinsic::GenISA_evaluateSampler:
9140 // nothing to do
9141 break;
9142 case GenISAIntrinsic::GenISA_wavebarrier:
9143 // nothing to do
9144 break;
9145 case GenISAIntrinsic::GenISA_mul_rtz:
9146 case GenISAIntrinsic::GenISA_fma_rtz:
9147 case GenISAIntrinsic::GenISA_add_rtz:
9148 emitFPOrtz(inst);
9149 break;
9150 case GenISAIntrinsic::GenISA_fma_rtp:
9151 emitFMArtp(inst);
9152 break;
9153 case GenISAIntrinsic::GenISA_fma_rtn:
9154 emitFMArtn(inst);
9155 break;
9156 case GenISAIntrinsic::GenISA_CatchAllDebugLine:
9157 emitDebugPlaceholder(inst);
9158 break;
9159 case GenISAIntrinsic::GenISA_getR0:
9160 case GenISAIntrinsic::GenISA_getPayloadHeader:
9161 case GenISAIntrinsic::GenISA_getWorkDim:
9162 case GenISAIntrinsic::GenISA_getNumWorkGroups:
9163 case GenISAIntrinsic::GenISA_getLocalSize:
9164 case GenISAIntrinsic::GenISA_getGlobalSize:
9165 case GenISAIntrinsic::GenISA_getEnqueuedLocalSize:
9166 case GenISAIntrinsic::GenISA_getLocalID_X:
9167 case GenISAIntrinsic::GenISA_getLocalID_Y:
9168 case GenISAIntrinsic::GenISA_getLocalID_Z:
9169 case GenISAIntrinsic::GenISA_getPrivateBase:
9170 case GenISAIntrinsic::GenISA_getPrintfBuffer:
9171 case GenISAIntrinsic::GenISA_getStageInGridOrigin:
9172 case GenISAIntrinsic::GenISA_getStageInGridSize:
9173 case GenISAIntrinsic::GenISA_getSyncBuffer:
9174 emitImplicitArgIntrinsic(inst);
9175 break;
9176 case GenISAIntrinsic::GenISA_dummyInst:
9177 emitDummyInst(inst);
9178 break;
9179 case GenISAIntrinsic::GenISA_vectorUniform:
9180 break; // pseudo instruction, do nothing
9181 case GenISAIntrinsic::GenISA_staticConstantPatchValue:
9182 emitStaticConstantPatchValue(cast<StaticConstantPatchIntrinsic>(inst)); break;
9183 case GenISAIntrinsic::GenISA_SetImplicitBufferPtr:
9184 emitStoreImplBufferPtr(inst);
9185 break;
9186 case GenISAIntrinsic::GenISA_SetLocalIdBufferPtr:
9187 emitStoreLocalIdBufferPtr(inst);
9188 break;
9189 case GenISAIntrinsic::GenISA_GetImplicitBufferPtr:
9190 emitLoadImplBufferPtr(inst);
9191 break;
9192 case GenISAIntrinsic::GenISA_GetLocalIdBufferPtr:
9193 emitLoadLocalIdBufferPtr(inst);
9194 break;
9195 default:
9196 // We assume that some gen-intrinsics are always pattern-matched away and
9197 // therefore do not handle them in visa emission.
9198 // Let us know if you see one of those intrinsics hit this assertion.
9199 inst->print(IGC::Debug::ods());
9200 IGC_ASSERT_MESSAGE(0, "unknown intrinsic");
9201 break;
9202 }
9203 }
9204
9205 void EmitPass::EmitIntrinsicMessage(llvm::IntrinsicInst* inst)
9206 {
9207 switch (inst->getIntrinsicID())
9208 {
9209 case Intrinsic::lifetime_start:
9210 case Intrinsic::lifetime_end:
9211 case Intrinsic::fabs:
9212 case Intrinsic::trap:
9213 // do nothing
9214 break;
9215 case Intrinsic::stacksave:
9216 // If stack is not initialized (no SP), we can assume there's no VLA.
9217 // We can ignore llvm.stacksave and llvm.stackrestore intrinsics
9218 if (m_currShader->hasSP())
9219 emitLLVMStackSave(inst);
9220 break;
9221
9222 case Intrinsic::stackrestore:
9223 // If stack is not initialized (no SP), we can assume there's no VLA.
9224 // We can ignore llvm.stacksave and llvm.stackrestore intrinsics
9225 if (m_currShader->hasSP())
9226 emitLLVMStackRestore(inst);
9227 break;
9228
9229 case Intrinsic::bswap:
9230 emitLLVMbswap(inst);
9231 break;
9232
9233 case Intrinsic::sqrt:
9234 emitSqrt(inst);
9235 break;
9236
9237 default:
9238 inst->print(IGC::Debug::ods());
9239 IGC_ASSERT_MESSAGE(0, "unknown intrinsic");
9240 break;
9241 }
9242 }
9243
9244 bool EmitPass::validateInlineAsmConstraints(llvm::CallInst* inst, SmallVector<StringRef, 8>& constraints)
9245 {
9246 IGC_ASSERT(inst->isInlineAsm());
9247 InlineAsm* IA = cast<InlineAsm>(IGCLLVM::getCalledValue(inst));
9248 StringRef constraintStr(IA->getConstraintString());
9249 if (constraintStr.empty()) return true;
9250
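// Illustrative example: the constraint string "=rw,rw,i" declares one register output,
// one register input, and one input that must be an immediate.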
9251 //lambda for checking constraint types
9252 auto CheckConstraintTypes = [this](StringRef str, CVariable* cv = nullptr)->bool
9253 {
9254 unsigned matchVal;
9255 if (str.equals("=rw"))
9256 {
9257 return true;
9258 }
9259 else if (str.equals("rw"))
9260 {
9261 return true;
9262 }
9263 else if (str.getAsInteger(10, matchVal) == 0)
9264 {
9265 // Also allows matching input reg to output reg
9266 return true;
9267 }
9268 else if (str.equals("i"))
9269 {
9270 return cv && cv->IsImmediate();
9271 }
9272 else if (str.equals("rw.u"))
9273 {
9274 return cv && cv->IsUniform();
9275 }
9276 else
9277 {
9278 IGC_ASSERT_MESSAGE(0, "Unsupported constraint type!");
9279 return false;
9280 }
9281 };
9282
9283 // Get a list of constraint tokens
9284 constraintStr.split(constraints, ',');
9285
9286 bool success = true;
9287
9288 unsigned index = 0;
9289
9290 // Check the output constraint tokens
9291 for (; index < constraints.size(); index++)
9292 {
9293 StringRef &str = constraints[index];
9294 if (str.startswith("="))
9295 {
9296 success &= CheckConstraintTypes(str);
9297 }
9298 else
9299 {
9300 break;
9301 }
9302 }
9303 if (success)
9304 {
9305 // Check the input constraint tokens
9306 for (unsigned i = 0; i < inst->getNumArgOperands(); i++, index++)
9307 {
9308 CVariable* cv = GetSymbol(inst->getArgOperand(i));
9309 success &= CheckConstraintTypes(constraints[index], cv);
9310 }
9311 }
9312 return success;
9313 }
9314
9315 // Parse the inlined asm string to generate VISA operands
9316 // Example: "mul (M1, 16) $0(0, 0)<1> $1(0, 0)<1;1,0> $2(0, 0)<1;1,0>", "=r,r,r"(float %6, float %7)
9317 void EmitPass::EmitInlineAsm(llvm::CallInst* inst)
9318 {
9319 std::stringstream& str = m_encoder->GetVISABuilder()->GetAsmTextStream();
9320 InlineAsm* IA = cast<InlineAsm>(IGCLLVM::getCalledValue(inst));
9321 string asmStr = IA->getAsmString();
9322 smallvector<CVariable*, 8> opnds;
9323 SmallVector<StringRef, 8> constraints;
9324
9325 if (asmStr.empty())
9326 return;
9327
9328 if (!validateInlineAsmConstraints(inst, constraints))
9329 {
9330 IGC_ASSERT_MESSAGE(0, "Constraints for inline assembly cannot be validated");
9331 return;
9332 }
9333
9334 if (inst->getType()->isStructTy())
9335 {
9336 // Handle multiple outputs
9337 unsigned numOutputs = inst->getType()->getStructNumElements();
9338 std::vector<CVariable*> outputs(numOutputs);
9339 for (auto& var : outputs) var = nullptr;
9340
9341 for (auto user : inst->users())
9342 {
9343 ExtractValueInst* ex = dyn_cast<ExtractValueInst>(user);
9344 IGC_ASSERT_MESSAGE(nullptr != ex, "Invalid user of inline asm call");
9345 unsigned id = *ex->idx_begin();
9346 IGC_ASSERT(id < numOutputs);
9347 IGC_ASSERT(outputs[id] == nullptr);
9348 outputs[id] = GetSymbol(ex);
9349 }
9350 for (auto var : outputs) opnds.push_back(var);
9351 }
9352 else if (m_destination)
9353 {
9354 opnds.push_back(m_destination);
9355 }
9356 for (unsigned i = 0; i < inst->getNumArgOperands(); i++)
9357 {
9358 CVariable* cv = GetSymbol(inst->getArgOperand(i));
9359 opnds.push_back(cv);
9360 }
9361
9362 IGC_ASSERT(opnds.size() == constraints.size());
9363
9364 // Check for read/write registers
9365 if (!inst->getType()->isVoidTy())
9366 {
9367 for (unsigned i = 0; i < constraints.size(); i++)
9368 {
9369 unsigned destID;
9370 if (constraints[i].getAsInteger(10, destID) == 0)
9371 {
9372 // If input is linked to output reg, move the input value into the output
9373 CVariable* cv = opnds[i];
9374 CVariable* dest = opnds[destID];
9375 if (cv && dest && cv != dest)
9376 {
9377 if (inst->getType()->isVectorTy())
9378 {
9379 emitVectorCopy(dest, cv, int_cast<unsigned>(dyn_cast<IGCLLVM::FixedVectorType>(inst->getType())->getNumElements()));
9380 }
9381 else
9382 {
9383 m_encoder->Copy(dest, cv);
9384 m_encoder->Push();
9385 }
9386 }
9387 }
9388 }
9389 }
9390
9391 for (unsigned i = 0; i < opnds.size(); i++)
9392 {
9393 CVariable* opVar = opnds[i];
9394 StringRef constraint = constraints[i];
9395
9396 // All uniform variables must be broadcast if the 'rw' constraint was specified
9397 if (opVar && opVar->IsUniform() && constraint.equals("rw"))
9398 {
9399 opnds[i] = BroadcastIfUniform(opVar);
9400 }
9401 // Special handling: if LLVM replaced a variable with an immediate, we need to insert an extra move
9402 else if (opVar && opVar->IsImmediate() && !constraint.equals("i"))
9403 {
9404 CVariable* tempMov = m_currShader->GetNewVariable(
9405 1, opVar->GetType(), EALIGN_GRF, true, opVar->getName());
9406 m_encoder->Copy(tempMov, opVar);
9407 m_encoder->Push();
9408 opnds[i] = tempMov;
9409 }
9410 }
9411
9412 // Replace all instances of ${:uid} with a label string unique to this asm block.
9413 // Clang translates the '%=' format string to '${:uid}' in LLVMIR.
9414 // This option is useful when creating local labels and referring to them multiple times
9415 // in a single template that generates multiple assembler instructions.
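// Illustrative example: "label${:uid}: ... jmpi label${:uid}" becomes "label<hash>: ... jmpi label<hash>",
// with the same unique suffix substituted at both references.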
9416 {
9417 string hashStr = m_encoder->GetUniqueInlineAsmLabel();
9418 string uniqueIDStr = "${:uid}";
9419 size_t pos = 0;
9420 while (pos < asmStr.size())
9421 {
9422 size_t varPos = asmStr.find(uniqueIDStr, pos);
9423 if (varPos == string::npos)
9424 break;
9425 asmStr.replace(varPos, uniqueIDStr.size(), hashStr);
9426 pos = varPos + hashStr.size();
9427 }
9428 }
9429
9430 str << endl << "/// Inlined ASM" << endl;
9431 // Look for variables to replace with the VISA variable
9432 size_t startPos = 0;
9433 while (startPos < asmStr.size())
9434 {
9435 size_t varPos = asmStr.find('$', startPos);
9436 if (varPos == string::npos)
9437 break;
9438
9439 // Find the operand number
9440 const char* idStart = &(asmStr[varPos + 1]);
9441 const char* idEnd = idStart;
9442 while (*idEnd >= '0' && *idEnd <= '9')
9443 ++idEnd;
9444
9445 unsigned val = 0;
9446 if (StringRef(idStart, idEnd - idStart).getAsInteger(10, val))
9447 {
9448 IGC_ASSERT_MESSAGE(0, "Invalid operand format");
9449 return;
9450 }
9451 if (val >= opnds.size())
9452 {
9453 IGC_ASSERT_MESSAGE(0, "Invalid operand index");
9454 return;
9455 }
9456 string varName = opnds[val] ? m_encoder->GetVariableName(opnds[val]) : "null";
9457 asmStr.replace(varPos, (idEnd - idStart + 1), varName);
9458
9459 startPos = varPos + varName.size();
9460 }
9461
9462 str << asmStr;
9463 if (asmStr.back() != '\n') str << endl;
9464 str << "/// End Inlined ASM" << endl << endl;
9465 }
9466
9467 CVariable* EmitPass::Mul(CVariable* Src0, CVariable* Src1, const CVariable* DstPrototype)
9468 {
9469 bool IsSrc0Imm = Src0->IsImmediate();
9470 bool IsSrc1Imm = Src1->IsImmediate();
9471 if (IsSrc0Imm && IsSrc1Imm) {
9472 uint64_t Prod = Src0->GetImmediateValue() * Src1->GetImmediateValue();
9473 return m_currShader->ImmToVariable(Prod, DstPrototype->GetType());
9474 }
9475 if (IsSrc0Imm && !IsSrc1Imm) {
9476 std::swap(Src0, Src1);
9477 }
9478 if (IsSrc1Imm) {
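// Strength-reduce multiplication by an immediate below: x*0 -> 0, x*1 -> x, and x*2^n -> x << n.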
9479 APInt Imm(m_DL->getPointerSizeInBits(), Src1->GetImmediateValue());
9480 if (Imm == 0) {
9481 return Src1;
9482 }
9483 if (Imm == 1) {
9484 return Src0;
9485 }
9486 if (Imm.isPowerOf2()) {
9487 unsigned Amt = Imm.logBase2();
9488 CVariable* VarAmt = m_currShader->ImmToVariable(Amt, ISA_TYPE_UD);
9489 CVariable* Dst = m_currShader->GetNewVariable(DstPrototype);
9490 m_encoder->Shl(Dst, Src0, VarAmt);
9491 m_encoder->Push();
9492 return Dst;
9493 }
9494 }
9495
9496 CVariable* Dst = m_currShader->GetNewVariable(DstPrototype);
9497 VISA_Type srcType = Src0->GetType();
9498
9499 // Only i64 muls need special handling, otherwise go back to standard flow
9500 if (srcType != ISA_TYPE_Q && srcType != ISA_TYPE_UQ)
9501 {
9502 m_encoder->Mul(Dst, Src0, Src1);
9503 m_encoder->Push();
9504 }
9505 else {
9506 CVariable* src[] = { Src0, Src1 };
9507 Mul64(Dst, src, m_currShader->m_SIMDSize);
9508 }
9509 return Dst;
9510 }
9511
9512 CVariable* EmitPass::Add(CVariable* Src0, CVariable* Src1, const CVariable* DstPrototype)
9513 {
9514 bool IsSrc0Imm = Src0->IsImmediate();
9515 bool IsSrc1Imm = Src1->IsImmediate();
9516 if (IsSrc1Imm && !Src1->GetImmediateValue()) {
9517 return Src0;
9518 }
9519 if (IsSrc0Imm && !Src0->GetImmediateValue()) {
9520 return Src1;
9521 }
9522 if (IsSrc0Imm && IsSrc1Imm) {
9523 uint64_t Sum = Src0->GetImmediateValue() + Src1->GetImmediateValue();
9524 return m_currShader->ImmToVariable(Sum, DstPrototype->GetType());
9525 }
9526 CVariable* Dst = m_currShader->GetNewVariable(DstPrototype);
9527 m_encoder->Add(Dst, Src0, Src1);
9528 m_encoder->Push();
9529 return Dst;
9530 }
9531
9532 // Insert lifetime start right before instruction I if it is a candidate.
9533 void EmitPass::emitLifetimeStart(CVariable* Var, BasicBlock* BB, Instruction* I, bool ForAllInstance)
9534 {
9535 if (m_pCtx->getVectorCoalescingControl() == 0 || Var == nullptr) {
9536 return;
9537 }
9538
9539 // m_LifetimeAt1stDefOfBB uses the dessa root of the aliasee as its key
9540 Value* ARV = m_VRA->getAliasRootValue(I);
9541 ARV = m_VRA->getRootValue(ARV);
9542
9543 auto II = m_VRA->m_LifetimeAt1stDefOfBB.find(ARV);
9544 if (II != m_VRA->m_LifetimeAt1stDefOfBB.end())
9545 {
9546 // Insert lifetime start on the root value
9547 // Note that lifetime is a kind of info directive,
9548 // thus no m_encoder->Push() is needed.
9549 CVariable* RootVar = GetSymbol(ARV);
9550 if (ForAllInstance)
9551 {
9552 for (uint instance = 0; instance < RootVar->GetNumberInstance(); instance++)
9553 {
9554 m_encoder->SetSecondHalf(instance == 0 ? false : true);
9555 m_encoder->Lifetime(LIFETIME_START, RootVar);
9556 }
9557 }
9558 else {
9559 // Current instance, set already in the calling context.
9560 m_encoder->Lifetime(LIFETIME_START, RootVar);
9561 }
9562
9563 // Once inserted, remove it from map to
9564 // prevent from inserting again.
9565 m_VRA->m_LifetimeAt1stDefOfBB.erase(II);
9566 }
9567 }
9568
9569 void EmitPass::emitGEP(llvm::Instruction* I)
9570 {
9571 GetElementPtrInst& GEP = cast<GetElementPtrInst>(*I);
9572 unsigned AddrSpace = I->getType()->getPointerAddressSpace();
9573 VISA_Type PtrTy =
9574 m_currShader->GetContext()->getRegisterPointerSizeInBits(AddrSpace) == 64 ? ISA_TYPE_UQ : ISA_TYPE_UD;
9575
9576 // First compute the offset from the base to benefit from constant folding,
9577 // and then add to the base (which is less likely to be a constant).
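// Illustrative example (hypothetical types): for a GEP into {i32, [4 x float]} with indices
// (1, 1, 2), the offset folds to 1*20 + 4 + 2*4 = 32 bytes of immediates, so only the
// final add below involves the (likely non-constant) base pointer.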
9578
9579 // vOffset is the value of the advancing offset in the loop below
9580 // Use the pre-allocated variable for storage
9581 CVariable* vOffset = m_destination;
9582 // vN is the current offset at the beginning of each iteration of the loop below
9583 CVariable* vN = m_currShader->ImmToVariable(0, PtrTy);
9584 // Note that the pointer operand may be a vector of pointers. Take the scalar
9585 // element which holds a pointer.
9586 Type* Ty = GEP.getPointerOperand()->getType()->getScalarType();
9587
9588 // Prototype temporary used for cloning from
9589 CVariable* vTmp = m_currShader->GetNewVariable(
9590 numLanes(m_currShader->m_SIMDSize),
9591 PtrTy,
9592 m_currShader->getGRFAlignment(),
9593 m_destination->IsUniform(),
9594 CName::NONE);
9595
9596 gep_type_iterator GTI = gep_type_begin(GEP);
9597 for (auto OI = GEP.op_begin() + 1, E = GEP.op_end(); OI != E; ++OI, ++GTI) {
9598 Value* Idx = *OI;
9599 // Offset of element contributed by current index being visited
9600 CVariable* vElemOffset;
9601 if (StructType * StTy = GTI.getStructTypeOrNull()) {
9602 // GEP indices into structs are always constant i32's
9603 unsigned Field = int_cast<unsigned>(cast<Constant>(Idx)->getUniqueInteger().getZExtValue());
9604 uint64_t Offset = 0;
9605 if (Field) {
9606 Offset = m_DL->getStructLayout(StTy)->getElementOffset(Field);
9607 }
9608 vElemOffset = m_currShader->ImmToVariable(Offset, ISA_TYPE_UD);
9609 Ty = StTy->getElementType(Field);
9610 }
9611 else {
9612 Ty = GTI.getIndexedType();
9613 // vElemOffset = vIdx * vElemSize
9614 CVariable* vElemSize = m_currShader->ImmToVariable(m_DL->getTypeAllocSize(Ty), PtrTy);
9615 CVariable* vIdx = GetSymbol(Idx);
9616 // The Mul does a push and takes care of constant folding
9617 vElemOffset = Mul(vIdx, vElemSize, vTmp);
9618 }
9619 // vOffset = vN + vElemOffset
9620 vOffset = Add(vElemOffset, vN, vTmp); // The Add does a m_encoder->push
9621 vN = vOffset; // After eating an index operand, advance the current offset
9622 }
9623
9624 CVariable* vBasePtr = GetSymbol(GEP.getPointerOperand());
9625 // GEP = vBasePtr + vOffset
9626 vTmp = Add(vBasePtr, vOffset, vTmp); // The Add does a m_encoder->push
9627 // Copy the result
9628 if (CEncoder::GetCISADataTypeSize(vTmp->GetType()) <
9629 CEncoder::GetCISADataTypeSize(m_destination->GetType()))
9630 {
9631 // If both offset and the base are immediates, we may end up with an offset of a smaller
9632 // type than the destination, due to immediate creation optimizations in the Add.
9633 m_encoder->Cast(m_destination, vTmp);
9634 }
9635 else
9636 {
9637 m_encoder->Copy(m_destination, vTmp);
9638 }
9639 m_encoder->Push();
9640 }
9641
9642 void EmitPass::emitIntToPtr(llvm::IntToPtrInst* I2P)
9643 {
9644 CVariable* src = GetSymbol(I2P->getOperand(0));
9645 CVariable* IntVar = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
9646 m_encoder->Cast(m_destination, IntVar);
9647 m_encoder->Push();
9648 }
9649
9650 void EmitPass::emitBitCast(llvm::BitCastInst* btCst)
9651 {
9652 Type* srcType = btCst->getOperand(0)->getType();
9653 Type* dstType = btCst->getType();
9654 unsigned int numSrcElement = srcType->isVectorTy() ? (unsigned)cast<IGCLLVM::FixedVectorType>(srcType)->getNumElements() : 1;
9655 unsigned int numDstElement = dstType->isVectorTy() ? (unsigned)cast<IGCLLVM::FixedVectorType>(dstType)->getNumElements() : 1;
9656
9657 if (srcType->isPointerTy())
9658 {
9659 IGC_ASSERT_MESSAGE(dstType->isPointerTy(), "Expected both src and dst have pointer type.");
9660 }
9661
9662 if (btCst->getOperand(0)->getType()->isVectorTy() ||
9663 btCst->getType()->isVectorTy())
9664 {
9665 emitVectorBitCast(btCst);
9666 return;
9667 }
9668
9669 CVariable* src = GetSymbol(btCst->getOperand(0));
9670 CVariable* dst = m_destination;
9671 IGC_ASSERT(nullptr != src);
9672 IGC_ASSERT(nullptr != dst);
9673 IGC_ASSERT_MESSAGE(numSrcElement == 1, "vector to vector bitcast not supported");
9674 IGC_ASSERT_MESSAGE(numDstElement == 1, "vector to vector bitcast not supported");
9675
9676 src = m_currShader->BitCast(src, dst->GetType());
9677 m_encoder->Copy(dst, src);
9678 m_encoder->Push();
9679 }
9680
9681 void EmitPass::emitPtrToInt(llvm::PtrToIntInst* P2I)
9682 {
9683 CVariable* dst = m_currShader->BitCast(m_destination, GetUnsignedType(m_destination->GetType()));
9684 CVariable* PtrVar = GetSymbol(P2I->getOperand(0));
9685 m_encoder->Cast(dst, PtrVar);
9686 m_encoder->Push();
9687 }
9688
9689 void EmitPass::emitAddrSpaceToGenericCast(llvm::AddrSpaceCastInst* addrSpaceCast, CVariable* srcV, unsigned tag)
9690 {
9691 if (m_pCtx->m_hasEmu64BitInsts && m_currShader->m_Platform->hasNoFullI64Support())
9692 {
9693 if (m_currShader->GetContext()->getRegisterPointerSizeInBits(addrSpaceCast->getSrcAddressSpace()) == 32)
9694 {
9695 // Add tag to high part
9696 CVariable* dstAlias = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
9697 // Low:
9698 m_encoder->SetDstRegion(2);
9699 m_encoder->Copy(dstAlias, srcV);
9700 m_encoder->Push();
9701 // High:
9702 m_encoder->SetDstSubReg(1);
9703 m_encoder->SetDstRegion(2);
9704 m_encoder->Copy(dstAlias, m_currShader->ImmToVariable(tag << 29, ISA_TYPE_UD));
9705 m_encoder->Push();
9706 }
9707 else
9708 {
9709 // Src
9710 CVariable* srcAlias = m_currShader->GetNewAlias(srcV, ISA_TYPE_UD, 0, 0);
9711 CVariable* srcLow = m_currShader->GetNewVariable(
9712 numLanes(m_currShader->m_SIMDSize),
9713 ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9714 CName(srcV->getName(), "Lo"));
9715 CVariable* srcHigh = m_currShader->GetNewVariable(
9716 numLanes(m_currShader->m_SIMDSize),
9717 ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9718 CName(srcV->getName(), "Hi"));
9719
9720 // Split Src into {Low, High}
9721 // Low:
9722 m_encoder->SetSrcSubReg(0, 0);
9723 m_encoder->SetSrcRegion(0, 2, 1, 0);
9724 m_encoder->Copy(srcLow, srcAlias);
9725 m_encoder->Push();
9726 // High:
9727 m_encoder->SetSrcSubReg(0, 1);
9728 m_encoder->SetSrcRegion(0, 2, 1, 0);
9729 m_encoder->Copy(srcHigh, srcAlias);
9730 m_encoder->Push();
9731
9732 // Add tag to high part
9733 m_encoder->Or(srcHigh, srcHigh, m_currShader->ImmToVariable(tag << 29, ISA_TYPE_UD));
9734 m_encoder->Push();
9735
9736 // Copy result to Dst
9737 CVariable* dstAlias = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
9738 // Low:
9739 m_encoder->SetDstRegion(2);
9740 m_encoder->Copy(dstAlias, srcLow);
9741 m_encoder->Push();
9742 // High:
9743 m_encoder->SetDstSubReg(1);
9744 m_encoder->SetDstRegion(2);
9745 m_encoder->Copy(dstAlias, srcHigh);
9746 m_encoder->Push();
9747 }
9748 }
9749 else
9750 {
9751 CVariable* pTempVar = m_currShader->GetNewVariable(
9752 numLanes(m_currShader->m_SIMDSize),
9753 ISA_TYPE_UQ, m_currShader->getGRFAlignment(),
9754 m_destination->IsUniform(), CName::NONE);
9755 m_encoder->Or(pTempVar, srcV, m_currShader->ImmToVariable(static_cast<uint64_t>(tag) << 61, ISA_TYPE_UQ));
9756 m_encoder->Cast(m_destination, pTempVar);
9757 m_encoder->Push();
9758 }
9759 }
9760
9761 void EmitPass::emitAddrSpaceCast(llvm::AddrSpaceCastInst* addrSpaceCast)
9762 {
9763 // Tags are used to determine the address space of generic pointers
9764 // casted from private, local or global pointers.
9765 // Bit[60:63] are used for this purpose. bit[60] is reserved for future use.
9766 // Address space tag on bit[61:63] can be:
9767 // 001: private
9768 // 010: local
9769 // 000/111: global
9770
9771 // In platforms that don't support 64bit operations, 64bit pointers are emulated
9772 // with pair{i32, i32}. So tags on generic pointers are added/removed by using:
9773 // - 64bit Or/And operations directly in platforms with 64bit operation support.
9774 // - 32bit Or/And operations on second element of the pair in platforms with no
9775 // 64bit operation support.
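// Illustrative example: casting a local pointer to generic ORs tag 010b into bits [61:63],
// e.g. 0x00007FFF00001000 | (2ull << 61) == 0x40007FFF00001000.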
9776
9777 CVariable* srcV = GetSymbol(addrSpaceCast->getOperand(0));
9778
9779 if ((m_pCtx->allocatePrivateAsGlobalBuffer() || m_pCtx->hasNoPrivateToGenericCast()) &&
9780 m_pCtx->hasNoLocalToGenericCast())
9781 {
9782 // If forcing global memory allocation and there are no generic pointers to local AS,
9783 // there is no need to tag generic pointers.
9784 m_encoder->Cast(m_destination, srcV);
9785 m_encoder->Push();
9786 return;
9787 }
9788
9789 if (srcV->IsImmediate() && srcV->GetImmediateValue() == 0x0)
9790 {
9791 // If casting from null, don't do tagging
9792 m_encoder->Cast(m_destination, srcV);
9793 m_encoder->Push();
9794 return;
9795 }
9796
9797 unsigned sourceAddrSpace = addrSpaceCast->getSrcAddressSpace();
9798 unsigned destAddrSpace = addrSpaceCast->getDestAddressSpace();
9799
9800 if (destAddrSpace == ADDRESS_SPACE_GENERIC)
9801 {
9802 // Address space cast is in the form of {private, local, global} -> generic
9803 // A tag is added according to the address space of the source
9804
9805 MDNode* genericMD = addrSpaceCast->getMetadata("generic.arith");
9806 if (genericMD)
9807 {
9808 m_encoder->Cast(m_destination, srcV);
9809 m_encoder->Push();
9810 return;
9811 }
9812
9813 if (sourceAddrSpace == ADDRESS_SPACE_PRIVATE && !m_pCtx->allocatePrivateAsGlobalBuffer())
9814 {
9815 emitAddrSpaceToGenericCast(addrSpaceCast, srcV, 1);
9816 }
9817 else if (sourceAddrSpace == ADDRESS_SPACE_LOCAL)
9818 {
9819 emitAddrSpaceToGenericCast(addrSpaceCast, srcV, 2);
9820 }
9821 else // ADDRESS_SPACE_GLOBAL
9822 {
9823 m_encoder->Cast(m_destination, srcV);
9824 m_encoder->Push();
9825 }
9826 }
9827 else if (sourceAddrSpace == ADDRESS_SPACE_GENERIC &&
9828 (destAddrSpace == ADDRESS_SPACE_PRIVATE || destAddrSpace == ADDRESS_SPACE_LOCAL))
9829 {
9830 // Address space cast is in the form of generic -> {private, local, global}
9831 // Tag is removed according to the address space of the destination
9832
9833 // The initial address may be in canonical form, meaning that bit 47 is
9834 // replicated into the upper bits. Since bits [60:63] have been spoiled by the
9835 // tag, we must restore the address to canonical form. This is done with a
9836 // shl/ashr-by-4 pair, replicating bit 59 (assumed still canonical) into bits [60:63].
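// Worked example (a sketch): a canonical address with bit 47 set has a high
// dword of 0xFFFF'8123; tagged as private (001b in bits [29:31]) it reads
// 0x3FFF'8123. shl 4 gives 0xFFF8'1230 and ashr 4 gives 0xFFFF'8123 again:
// the tag is gone and bit 59's value has been replicated into bits [60:63].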
9837
9838 if (m_pCtx->m_hasEmu64BitInsts && m_currShader->m_Platform->hasNoFullI64Support())
9839 {
9840 if (m_currShader->GetContext()->getRegisterPointerSizeInBits(destAddrSpace) == 32)
9841 {
9842 // Src
9843 CVariable* srcAlias = m_currShader->GetNewAlias(srcV, ISA_TYPE_UD, 0, 0);
9844 CVariable* srcLow = m_currShader->GetNewVariable(
9845 numLanes(m_currShader->m_SIMDSize),
9846 ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9847 CName(srcV->getName(), "Lo"));
9848
9849 // Get low part of srcV
9850 m_encoder->SetSrcSubReg(0, 0);
9851 m_encoder->SetSrcRegion(0, 2, 1, 0);
9852 m_encoder->Copy(srcLow, srcAlias);
9853 m_encoder->Push();
9854
9855 // Copy result to Dst
9856 m_encoder->Cast(m_destination, srcLow);
9857 m_encoder->Push();
9858 }
9859 else
9860 {
9861 // Src
9862 CVariable* srcAlias = m_currShader->GetNewAlias(srcV, ISA_TYPE_UD, 0, 0);
9863 CVariable* srcLow = m_currShader->GetNewVariable(
9864 numLanes(m_currShader->m_SIMDSize),
9865 ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9866 CName(srcV->getName(), "Lo"));
9867 CVariable* srcHigh = m_currShader->GetNewVariable(
9868 numLanes(m_currShader->m_SIMDSize),
9869 ISA_TYPE_UD, EALIGN_GRF, m_destination->IsUniform(),
9870 CName(srcV->getName(), "Hi"));
9871 CVariable* tempVar = m_currShader->GetNewVariable(
9872 numLanes(m_currShader->m_SIMDSize),
9873 ISA_TYPE_D, EALIGN_GRF, m_destination->IsUniform(),
9874 CName::NONE);
9875
9876 // Split Src into {Low, High}
9877 // Low:
9878 m_encoder->SetSrcSubReg(0, 0);
9879 m_encoder->SetSrcRegion(0, 2, 1, 0);
9880 m_encoder->Copy(srcLow, srcAlias);
9881 m_encoder->Push();
9882 // High:
9883 m_encoder->SetSrcSubReg(0, 1);
9884 m_encoder->SetSrcRegion(0, 2, 1, 0);
9885 m_encoder->Copy(srcHigh, srcAlias);
9886 m_encoder->Push();
9887
9888 // Clear tag in the high part and restore address canonical form
9889 m_encoder->Shl(tempVar, srcHigh, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9890 m_encoder->IShr(srcHigh, tempVar, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9891 m_encoder->Push();
9892
9893 // Copy to Dst
9894 CVariable* dstAlias = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
9895 // Low:
9896 m_encoder->SetDstRegion(2);
9897 m_encoder->Copy(dstAlias, srcLow);
9898 m_encoder->Push();
9899 // High:
9900 m_encoder->SetDstSubReg(1);
9901 m_encoder->SetDstRegion(2);
9902 m_encoder->Copy(dstAlias, srcHigh);
9903 m_encoder->Push();
9904 }
9905 }
9906 else
9907 {
9908 CVariable* pTempVar = m_currShader->GetNewVariable(
9909 numLanes(m_currShader->m_SIMDSize),
9910 ISA_TYPE_Q, m_currShader->getGRFAlignment(),
9911 m_destination->IsUniform(), CName::NONE);
9912 // Clear tag in the high part and restore address canonical form
9913 m_encoder->Shl(pTempVar, srcV, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9914 m_encoder->IShr(pTempVar, pTempVar, m_currShader->ImmToVariable(4, ISA_TYPE_D));
9915 m_encoder->Cast(m_destination, pTempVar);
9916 m_encoder->Push();
9917 }
9918 }
9919 else // ADDRESS_SPACE_GLOBAL
9920 {
9921 m_encoder->Cast(m_destination, srcV);
9922 m_encoder->Push();
9923 }
9924 }
9925
9926 void EmitPass::emitExtract(llvm::Instruction* inst)
9927 {
9928 IGC_ASSERT(llvm::isa<llvm::ExtractElementInst>(inst));
9929 llvm::ExtractElementInst* Extract = llvm::cast<llvm::ExtractElementInst>(inst);
9930 llvm::Value* vecOperand = Extract->getVectorOperand();
9931 auto vectorBCI = dyn_cast<BitCastInst>(vecOperand);
9932 CVariable* vector = m_currShader->GetSymbol(vecOperand, true);
9933
9934 if (llvm::ConstantInt * pConstElem = llvm::dyn_cast<llvm::ConstantInt>(Extract->getIndexOperand()))
9935 {
9936 uint element = m_currShader->AdjustExtractIndex(vecOperand, int_cast<uint16_t>(pConstElem->getZExtValue()));
9937 // Do not use allocated type to compute the offsets; otherwise the computed
9938 // offsets may be out-of-bound. The alignment information of the base
9939 // element type should not impact the offset.
9940 uint eltBytes = GetScalarTypeSizeInRegister(Extract->getType());
9941 IGC_ASSERT_MESSAGE(eltBytes, "illegal ExtractElement instruction");
9942
9943 if (m_currShader->CanTreatAsAlias(Extract))
9944 {
9945 if (vectorBCI && m_currShader->getCVarForVectorBCI(vectorBCI, element))
9946 {
9947 //do nothing as we can reuse the symbol from the vector bitcast
9948 return;
9949 }
9950 uint offset = 0;
9951 if (m_currShader->GetIsUniform(inst->getOperand(0)))
9952 {
9953 offset = element * eltBytes;
9954 }
9955 else
9956 {
9957 offset = vector->getOffsetMultiplier() * element * numLanes(m_currShader->m_SIMDSize) * eltBytes;
9958 }
9959 // the symbol table should have coalesced those two values;
9960 // TODO: clean up when we get generic coalescing
9961 IGC_ASSERT(vector == m_destination->GetAlias() || vector->GetAlias() == m_destination->GetAlias());
9962 IGC_ASSERT(m_destination->GetAliasOffset() == (offset + vector->GetAliasOffset()));
9963 }
9964 else
9965 {
9966 if (vectorBCI)
9967 {
9968 if (auto var = m_currShader->getCVarForVectorBCI(vectorBCI, element))
9969 {
9970 // use the separate CVar for each index instead
9971 m_encoder->Copy(m_destination, var);
9972 m_encoder->Push();
9973 return;
9974 }
9975 }
9976
9977 if (m_currShader->GetIsUniform(inst->getOperand(0)))
9978 {
9979 uint offset = element * eltBytes;
9980 m_encoder->SetSrcSubVar(0, (offset / getGRFSize()));
9981 m_encoder->SetSrcSubReg(0, ((offset % getGRFSize()) / eltBytes));
9982 }
9983 else
9984 {
9985 uint offset = vector->getOffsetMultiplier() * element * numLanes(m_currShader->m_SIMDSize) * eltBytes;
9986 uint subvar = offset / getGRFSize();
9987 m_encoder->SetSrcSubVar(0, subvar);
9988 m_encoder->SetSrcSubReg(0, ((offset % getGRFSize()) / eltBytes));
9989 }
9990 m_encoder->Copy(m_destination, vector);
9991 m_encoder->Push();
9992 }
9993 }
9994 else
9995 {
9996 // We got an index which is not a value known at compile-time.
9997 llvm::Value* pIndex = Extract->getIndexOperand();
9998 llvm::Type* pVecType = vecOperand->getType();
9999
10000 // When the index type is i32, it is better to create a uw alias since
10001 // the following address computation will be in uw.
10002 CVariable* pIndexVar = GetSymbol(pIndex);
10003 IGC_ASSERT(pIndex->getType()->getPrimitiveSizeInBits() <= 64);
10004
10005 bool DoAliasing = pIndex->getType()->getPrimitiveSizeInBits() >= 32;
10006 if (DoAliasing)
10007 {
10008 pIndexVar = m_currShader->BitCast(pIndexVar, ISA_TYPE_UW);
10009 }
10010
10011 // size of vector entry
10012 const uint vectorEntrySimdWidth = vector->IsUniform() ?
10013 1 : numLanes(m_currShader->m_SIMDSize);
10014
10015 const uint vecTypeSize = GetScalarTypeSizeInRegister(pVecType);
10016
10017 const uint offset = vectorEntrySimdWidth * vecTypeSize;
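// Worked example (a sketch): for a non-uniform <4 x float> vector at SIMD16,
// vectorEntrySimdWidth = 16 and vecTypeSize = 4, so consecutive elements sit
// 64 bytes apart and a dynamic index i contributes i * 64 to the byte offset.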
10018
10019 CVariable* pOffset1 = m_currShader->ImmToVariable(offset, ISA_TYPE_UW);
10020
10021 // offset2 is the offset within the array expressed in bytes (index*element size in bytes)
10022 CVariable* pOffset2 = m_currShader->GetNewVariable(
10023 pIndexVar->IsUniform() ? 1 : numLanes(m_currShader->m_SIMDSize),
10024 ISA_TYPE_UW,
10025 pIndexVar->IsUniform() ? EALIGN_WORD : EALIGN_HWORD,
10026 pIndexVar->IsUniform(),
10027 CName::NONE);
10028
10029 // The index was bitcast to uw, so it is an "unpacked" uw; read it with a stride-2 region.
10030 if (!pIndexVar->IsUniform() && DoAliasing)
10031 {
10032 m_encoder->SetSrcRegion(0, 2, 1, 0);
10033 }
10034
10035 m_encoder->Mul(pOffset2, pIndexVar, pOffset1);
10036 m_encoder->Push();
10037
10038 // If pIndexVar is non-uniform, we will need to use VxH addressing.
10039 // If both pIndexVar and the vector are non-uniform, we also need to add
10040 // per-lane offsets to the contents of the address register.
10041 CVariable* pOffset3 = nullptr;
10042 if (!pIndexVar->IsUniform() && !vector->IsUniform())
10043 {
10044 pOffset3 = m_currShader->GetNewVariable(
10045 numLanes(m_currShader->m_SIMDSize),
10046 ISA_TYPE_UW,
10047 EALIGN_HWORD,
10048 false,
10049 CName::NONE);
10050 CVariable* OffsetVar = getOrCreatePerLaneOffsetVariable(vecTypeSize);
10051 m_encoder->Add(pOffset3, pOffset2, OffsetVar);
10052 m_encoder->Push();
10053 }
10054 else
10055 {
10056 // no need to add per-lane offsets
10057 pOffset3 = pOffset2;
10058 }
10059
10060 {
10061 // address variable represents register a0
10062 CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
10063 pIndexVar->IsUniform() ? 1 : numLanes(m_currShader->m_SIMDSize),
10064 m_destination->GetType(),
10065 pIndexVar->IsUniform(),
10066 vector->IsUniform(),
10067 m_destination->getName());
10068
10069 // we add offsets to the base that is the beginning of the vector variable
10070 m_encoder->AddrAdd(pDstArrElm, vector, pOffset3);
10071 m_encoder->Push();
10072
10073 // finally, we move the indirectly addressed values to the destination register
10074 m_encoder->Copy(m_destination, pDstArrElm);
10075 m_encoder->Push();
10076 }
10077 }
10078 }
10079
10080 void EmitPass::emitUAVSerialize()
10081 {
10082 m_encoder->Wait();
10083 m_encoder->Push();
10084 }
10085
10086
10087 void EmitPass::emitLoadRawIndexed(
10088 LdRawIntrinsic * inst, Value * varOffset, ConstantInt * immOffset)
10089 {
10090 Value* bufPtrv = inst->getResourceValue();
10091
10092 ResourceDescriptor resource = GetResourceVariable(bufPtrv);
10093 m_currShader->isMessageTargetDataCacheDataPort = true;
10094 IGC_ASSERT(immOffset == nullptr);
10095 emitLoad3DInner(inst, resource, varOffset);
10096 }
10097
10098 void EmitPass::emitLoad3DInner(LdRawIntrinsic* inst, ResourceDescriptor& resource, Value* elem_idxv)
10099 {
10100 IGC::e_predefSurface predDefSurface = resource.m_surfaceType;
10101 CVariable* gOffset = m_currShader->ImmToVariable(0x0, ISA_TYPE_UD);
10102
10103 CVariable* src_offset = GetSymbol(elem_idxv);
10104
10105 // still collect the buffer type here to work around alignment problems with different messages
10106 BufferType bufType = GetBufferType(inst->getOperand(0)->getType()->getPointerAddressSpace());
10107
10108 // generate oword_load if it is uniform
10109 // otherwise, generate gather/gather4
10110 if (m_currShader->GetIsUniform(inst))
10111 {
10112 IGC_ASSERT_MESSAGE(predDefSurface != ESURFACE_STATELESS, "scratch cannot be uniform");
10113 Type* loadType = inst->getType();
10114 uint numElement = loadType->isVectorTy() ? (uint)cast<IGCLLVM::FixedVectorType>(loadType)->getNumElements() : 1;
10115 if (predDefSurface == ESURFACE_SLM)
10116 {
10117 IGC_ASSERT(numElement <= 4);
10118 uint numLane = (numElement == 3) ? 4 : numElement;
10119 // there is no oword-block read for SLM, also we expect loading only up to 4-dwords
10120 CVariable* imm = m_currShader->ImmToVariable(0x0C840, ISA_TYPE_UV);
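// 0x0C840 as a UV (packed unsigned 4-bit vector) immediate encodes, nibble by
// nibble from the low end, the per-channel values {0, 4, 8, 12}: the byte
// offsets of up to four consecutive dwords added to the uniform base below.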
10121 CVariable* srcTmp = m_currShader->GetNewVariable(
10122 (uint16_t)numLane, ISA_TYPE_UD, m_currShader->getGRFAlignment(), true,
10123 CName(src_offset->getName(), "Broadcast"));
10124 m_encoder->SetNoMask();
10125 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numLane));
10126 m_encoder->Add(srcTmp, src_offset, imm);
10127 m_encoder->Push();
10128 CVariable* dstTmp = m_destination;
10129 if (numElement != numLane)
10130 {
10131 dstTmp = m_currShader->GetNewVariable(
10132 (uint16_t)numLane, ISA_TYPE_D, m_currShader->getGRFAlignment(), true,
10133 CName::NONE);
10134 }
10135 m_encoder->SetNoMask();
10136 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numLane));
10137 m_encoder->ByteGather(dstTmp, resource, srcTmp, 8, 4);
10138 m_encoder->Push();
10139
10140 // generate an extract-element due to dst-size difference when numElement == 3
10141 // \todo, we should canonicalize <floatx3> to <floatx4> before code-gen to avoid this
10142 if (dstTmp != m_destination)
10143 {
10144 for (uint i = 0; i < numElement; i++)
10145 {
10146 m_encoder->SetSrcSubReg(0, i);
10147 m_encoder->SetDstSubReg(i);
10148 m_encoder->SetSrcRegion(0, 0, 1, 0);
10149 m_encoder->Copy(m_destination, dstTmp);
10150 m_encoder->Push();
10151 }
10152 }
10153 }
10154 else if (predDefSurface == ESURFACE_SCRATCH && m_currShader->m_Platform->hasScratchSurface() && inst->getAlignment() >= 4)
10155 {
10156 IGC_ASSERT(numElement <= 8);
10157 CVariable* tmpAddress = nullptr;
10158 if (numElement > 1)
10159 {
10160 tmpAddress = m_currShader->GetNewVariable(numElement, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
10161 m_encoder->SetNoMask();
10162 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numElement));
10163 m_encoder->Shl(tmpAddress, m_currShader->ImmToVariable(0x76543210, ISA_TYPE_V), m_currShader->ImmToVariable(2, ISA_TYPE_D));
10164 m_encoder->Push();
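// 0x76543210 as a V (packed signed 4-bit vector) immediate yields the lane
// indices {0, 1, ..., 7}; shifting left by 2 scales them to the dword byte
// offsets {0, 4, ..., 28} that are then added to the uniform base offset.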
10165 m_encoder->SetNoMask();
10166 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numElement));
10167 m_encoder->Add(tmpAddress, tmpAddress, src_offset);
10168 m_encoder->Push();
10169 }
10170 else
10171 {
10172 tmpAddress = m_currShader->GetNewVariable(numElement, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
10173 m_encoder->SetNoMask();
10174 m_encoder->SetUniformSIMDSize(SIMDMode::SIMD1);
10175 m_encoder->Copy(tmpAddress, src_offset);
10176 m_encoder->Push();
10177 }
10178
10179 bool needsTempDest = numElement < 4;
10180 CVariable* destination = m_destination;
10181 if (needsTempDest)
10182 {
10183 uint elemSize = m_destination->GetElemSize();
10184 destination = m_currShader->GetNewVariable(
10185 numElement * SIZE_DWORD / elemSize, m_destination->GetType(),
10186 EALIGN_GRF, m_destination->IsUniform(), CName::NONE);
10187 }
10188
10189 m_encoder->SetNoMask();
10190 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(numElement));
10191 m_encoder->Gather4Scaled(destination, resource, tmpAddress);
10192 m_encoder->Push();
10193 if (needsTempDest)
10194 {
10195 // generate an extract-element
10196 for (uint i = 0; i < numElement; i++)
10197 {
10198 m_encoder->SetSrcSubReg(0, i);
10199 m_encoder->SetDstSubReg(i);
10200 m_encoder->SetSrcRegion(0, 0, 1, 0);
10201 m_encoder->Copy(m_destination, destination);
10202 m_encoder->Push();
10203 }
10204 }
10205 }
10206 else
10207 {
10208 bool owordAligned = false;
10209 // need to clear lower two-bits for unaligned
10210 CVariable* visaOffset = nullptr;
10211 if (bufType == CONSTANT_BUFFER)
10212 {
10213 visaOffset = src_offset;
10214 }
10215 else if (src_offset->IsImmediate())
10216 {
10217 // clear lower-two-bits
10218 visaOffset = m_currShader->ImmToVariable(src_offset->GetImmediateValue() & 0xfffffffc, ISA_TYPE_UD);
10219 }
10220 else
10221 {
10222 // clear lower-two-bits
10223 CVariable* masklast2bits = m_currShader->ImmToVariable(0xfffffffc, ISA_TYPE_UD);
10224 visaOffset = m_currShader->GetNewVariable(
10225 src_offset->GetNumberElement(),
10226 ISA_TYPE_UD,
10227 src_offset->GetAlign(),
10228 src_offset->IsUniform(),
10229 src_offset->getName());
10230 m_encoder->And(visaOffset, m_currShader->BitCast(src_offset, ISA_TYPE_UD), masklast2bits);
10231 m_encoder->Push();
10232 }
10233 if (numElement >= 4)
10234 {
10235 m_encoder->OWLoad(m_destination, resource, visaOffset, owordAligned, m_destination->GetSize());
10236 m_encoder->Push();
10237 }
10238 else
10239 {
10240 IGC_ASSERT(GetPrimitiveTypeSizeInRegisterInBits(loadType) < SIZE_DWORD * 8 * 4);
10241 uint elemSize = m_destination->GetElemSize();
10242
10243 if (elemSize > 0)
10244 {
10245 unsigned int alignment = inst->getAlignment();
10246 if (alignment < SIZE_DWORD && !(src_offset->IsImmediate() && src_offset->GetImmediateValue() % SIZE_DWORD == 0))
10247 {
10248 IGC_ASSERT(alignment == 1 || alignment == 2);
10249 IGC_ASSERT(src_offset->IsUniform());
10250 uint numElements = m_destination->GetSize() / alignment;
10251 VISA_Type realType = alignment == 1 ? ISA_TYPE_UB : ISA_TYPE_UW;
10252 CVariable* tmp = m_currShader->GetNewVariable(
10253 numElements * (SIZE_DWORD / alignment), realType, EALIGN_GRF, true, CName::NONE);
10254 if (numElements > 1)
10255 {
10256 IGC_ASSERT(numElements <= 8);
10257 CVariable* offsetVector = m_currShader->GetNewVariable(numElements, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
10258 m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10259 m_encoder->SetNoMask();
10260 m_encoder->Add(offsetVector, src_offset, m_currShader->ImmToVariable(alignment * 0x76543210, ISA_TYPE_UV));
10261 m_encoder->Push();
10262 src_offset = offsetVector;
10263 }
10264 else if (src_offset->IsImmediate() || src_offset->GetAlign() != EALIGN_GRF)
10265 {
10266 IGC_ASSERT(numElements == 1);
10267 CVariable* tmpSrcOffset = m_currShader->GetNewVariable(numElements, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
10268 m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10269 m_encoder->SetNoMask();
10270 m_encoder->Cast(tmpSrcOffset, src_offset);
10271 m_encoder->Push();
10272 src_offset = tmpSrcOffset;
10273 }
10274 m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10275 m_encoder->SetNoMask();
10276 m_encoder->ByteGather(tmp, resource, src_offset, 8, alignment);
10277 m_encoder->Push();
10278 CVariable* dstWordAlias = m_currShader->GetNewAlias(m_destination, realType, 0, 0, false);
10279 m_encoder->SetSimdSize(lanesToSIMDMode(numElements));
10280 m_encoder->SetNoMask();
10281 m_encoder->SetSrcRegion(0, SIZE_DWORD / alignment, 1, 0);
10282 m_encoder->Copy(dstWordAlias, tmp);
10283 m_encoder->Push();
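// Sketch of the packing above, assuming alignment == 2 and an 8-byte result:
// ByteGather returns each 16-bit element zero-extended in its own dword, so
// tmp holds {w0, pad, w1, pad, ...}; the stride (SIZE_DWORD / alignment == 2)
// source region then picks every other word to pack the destination densely.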
10284 }
10285 else
10286 {
10287 CVariable* tmp = m_currShader->GetNewVariable(
10288 4 * SIZE_DWORD / elemSize, m_destination->GetType(), EALIGN_GRF, m_destination->IsUniform(), CName::NONE);
10289 m_encoder->OWLoad(tmp, resource, visaOffset, owordAligned, tmp->GetSize());
10290 m_encoder->Push();
10291 // generate an extract-element
10292 for (uint i = 0; i < numElement; i++)
10293 {
10294 m_encoder->SetSrcSubReg(0, i);
10295 m_encoder->SetDstSubReg(i);
10296 m_encoder->SetSrcRegion(0, 0, 1, 0);
10297 m_encoder->Copy(m_destination, tmp);
10298 m_encoder->Push();
10299 }
10300 }
10301 }
10302 }
10303 }
10304 }
10305 else
10306 {
10307 uint label = 0;
10308 CVariable* flag = nullptr;
10309 bool needLoop = ResourceLoopHeader(resource, flag, label);
10310 uint sizeInBits = GetPrimitiveTypeSizeInRegisterInBits(inst->getType());
10311 IGC_ASSERT_MESSAGE((sizeInBits == 8) || (sizeInBits == 16) || (sizeInBits == 32) || (sizeInBits == 64) || (sizeInBits == 96) || (sizeInBits == 128),
10312 "load type must be 1/2/4/8/12/16 bytes long");
10313 IGC::CVariable* visaOffset = BroadcastIfUniform(src_offset);
10314 unsigned int alignment = inst->getAlignment();
10315 if (sizeInBits == 32 && resource.m_surfaceType == ESURFACE_STATELESS &&
10316 m_currShader->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages)
10317 {
10318 // DWORD gather
10319 CVariable* shiftedPtr = m_currShader->GetNewVariable(visaOffset);
10320 m_encoder->Shr(shiftedPtr, visaOffset, m_currShader->ImmToVariable(2, ISA_TYPE_UD));
10321 m_encoder->Push();
10322 visaOffset = shiftedPtr;
10323 m_encoder->SetPredicate(flag);
10324 m_encoder->Gather(m_destination, resource.m_resource, visaOffset, gOffset, resource.m_surfaceType, 4);
10325 m_encoder->Push();
10326 }
10327 else if (sizeInBits == 32 && (bufType == CONSTANT_BUFFER || resource.m_surfaceType == ESURFACE_STATELESS || alignment < 4))
10328 {
10329 // UAVs and resources cannot be switched to this path due to alignment issues encountered in some tests
10330 uint elementSize = 8;
10331 uint numElems = 4;
10332 m_encoder->SetPredicate(flag);
10333 m_encoder->ByteGather(m_destination, resource, visaOffset, elementSize, numElems);
10334 m_encoder->Push();
10335 }
10336 else if (sizeInBits >= 32)
10337 {
10338 // A constant buffer cannot go this way because the driver sets its surface state to RGBA-F32
10339 if (bufType == CONSTANT_BUFFER || bufType == BINDLESS_CONSTANT_BUFFER)
10340 {
10341 IGC_ASSERT(!UsesTypedConstantBuffer(m_currShader->GetContext(), bufType));
10342 }
10343
10344 m_encoder->SetPredicate(flag);
10345 m_encoder->Gather4ScaledNd(m_destination, resource, visaOffset, sizeInBits / 32);
10346 m_encoder->Push();
10347 }
10348 else if (sizeInBits == 8 || sizeInBits == 16)
10349 {
10350 uint elementSize = 8;
10351 uint numElems = sizeInBits / 8;
10352 uint hStride = 32 / sizeInBits;
10353 uint16_t vStride = numLanes(m_currShader->m_SIMDSize);
10354 CVariable* gatherDest = m_currShader->GetNewVariable(vStride, ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
10355 m_encoder->SetPredicate(flag);
10356 m_encoder->ByteGather(gatherDest, resource, visaOffset, elementSize, numElems);
10357 m_encoder->Push();
10358
10359 gatherDest = m_currShader->GetNewAlias(gatherDest, m_destination->GetType(), 0, 0);
10360 m_encoder->SetSrcRegion(0, vStride, vStride / hStride, hStride);
10361 m_encoder->Cast(m_destination, gatherDest);
10362 m_encoder->Push();
10363 }
10364 ResourceLoopBackEdge(needLoop, flag, label);
10365 }
10366 }
10367
10368 void EmitPass::emitLoad(LoadInst* inst, Value* offset, ConstantInt* immOffset)
10369 {
10370 emitVectorLoad(inst, offset, immOffset);
10371 }
10372
10373 void EmitPass::EmitNoModifier(llvm::Instruction* inst)
10374 {
10375 // This is a single instruction pattern emitter
10376 // Check if this inst has been turned into a noop due to aliasing.
10377 // If so, no code shall be emitted for it.
10378 if (m_currShader->HasBecomeNoop(inst))
10379 {
10380 return;
10381 }
10382
10383 if (IGC_IS_FLAG_ENABLED(EnableDeSSAAlias) &&
10384 m_deSSA && m_deSSA->isNoopAliaser(inst))
10385 {
10386 return;
10387 }
10388
10389 switch (inst->getOpcode())
10390 {
10391 case Instruction::Ret:
10392 emitReturn(cast<ReturnInst>(inst));
10393 break;
10394 case Instruction::Call:
10395 if (GenIntrinsicInst * I = dyn_cast<GenIntrinsicInst>(inst))
10396 {
10397 EmitGenIntrinsicMessage(I);
10398 }
10399 else if (IntrinsicInst * I = dyn_cast<IntrinsicInst>(inst))
10400 {
10401 EmitIntrinsicMessage(I);
10402 }
10403 else if (cast<CallInst>(inst)->isInlineAsm())
10404 {
10405 EmitInlineAsm(cast<CallInst>(inst));
10406 }
10407 else
10408 {
10409 emitCall(cast<CallInst>(inst));
10410 }
10411 break;
10412 case Instruction::Store:
10413 emitStore(cast<StoreInst>(inst),
10414 cast<StoreInst>(inst)->getPointerOperand(),
10415 nullptr);
10416 break;
10417 case Instruction::Load:
10418 emitLoad(
10419 cast<LoadInst>(inst),
10420 cast<LoadInst>(inst)->getPointerOperand(),
10421 nullptr);
10422 break;
10423 case Instruction::GetElementPtr:
10424 emitGEP(cast<GetElementPtrInst>(inst));
10425 break;
10426 case Instruction::BitCast:
10427 emitBitCast(cast<BitCastInst>(inst));
10428 break;
10429 case Instruction::PtrToInt:
10430 emitPtrToInt(cast<PtrToIntInst>(inst));
10431 break;
10432 case Instruction::IntToPtr:
10433 emitIntToPtr(cast<IntToPtrInst>(inst));
10434 break;
10435 case Instruction::AddrSpaceCast:
10436 emitAddrSpaceCast(cast<AddrSpaceCastInst>(inst));
10437 break;
10438 case Instruction::InsertElement:
10439 emitInsert(cast<InsertElementInst>(inst));
10440 break;
10441 case Instruction::ExtractElement:
10442 emitExtract(cast<ExtractElementInst>(inst));
10443 break;
10444 case Instruction::Unreachable:
10445 break;
10446 default:
10447 IGC_ASSERT_MESSAGE(0, "need to add code gen support for this instruction");
10448 }
10449 }
10450
10451 void EmitPass::emitPairToPtr(GenIntrinsicInst* GII) {
10452 CVariable* Lo = GetSymbol(GII->getOperand(0));
10453 CVariable* Hi = GetSymbol(GII->getOperand(1));
10454
10455 unsigned AS = GII->getType()->getPointerAddressSpace();
10456 if (m_currShader->GetContext()->getRegisterPointerSizeInBits(AS) == 32) {
10457 CVariable* Tmp = m_currShader->BitCast(Lo, GetUnsignedType(Lo->GetType()));
10458 m_encoder->Cast(m_destination, Tmp);
10459 m_encoder->Push();
10460 return;
10461 }
10462
10463 IGC_ASSERT_MESSAGE(m_currShader->GetContext()->getRegisterPointerSizeInBits(AS) == 64,
10464 "Pointer size should be either 32 or 64!");
10465
10466 CVariable* Dst32 = m_currShader->BitCast(m_destination, ISA_TYPE_UD);
10467 // Lo
10468 m_encoder->SetDstRegion(2);
10469 m_encoder->Copy(Dst32, Lo);
10470 m_encoder->Push();
10471 // Hi
10472 m_encoder->SetDstRegion(2);
10473 m_encoder->SetDstSubReg(1);
10474 m_encoder->Copy(Dst32, Hi);
10475 m_encoder->Push();
10476 }
10477
10478 void EmitPass::emitLLVMStackSave(llvm::IntrinsicInst* inst) {
10479 // save current SP
10480 CVariable* pSP = m_currShader->GetSP();
10481 m_encoder->Copy(m_destination, pSP);
10482 m_encoder->Push();
10483 }
10484
10485 void EmitPass::emitLLVMStackRestore(llvm::IntrinsicInst* inst) {
10486 // restore the SP to arg(0)
10487 CVariable* pSP = m_currShader->GetSP();
10488 CVariable* savedSP = m_currShader->GetSymbol(inst->getOperand(0));
10489 // stacksave and stackrestore are forced to be uniform in WIAnalysis,
10490 // but we still set a scalar region here just in case.
10491 m_encoder->SetSrcRegion(0, 0, 1, 0);
10492 m_encoder->Copy(pSP, savedSP);
10493 m_encoder->Push();
10494 }
10495
10496 void EmitPass::emitVLAStackAlloca(llvm::GenIntrinsicInst* intrinsic)
10497 {
10498 CVariable* pSP = m_currShader->GetSP();
10499 CVariable* lane_off = m_currShader->GetSymbol(intrinsic->getOperand(0));
10500 // m_destination = curr_SP + lane_offset
10501 emitAddPointer(m_destination, pSP, lane_off);
10502 m_encoder->Push();
10503
10504 if (m_currShader->m_numberInstance == 1 || m_encoder->IsSecondHalf()) {
10505 // SP = SP + vla_size * simdWidth
10506 CVariable* vla_size = m_currShader->GetSymbol(intrinsic->getOperand(1));
10507 // vla_size must be uniform; in case it is not, set the region to take only <0;1,0>
10508 m_encoder->SetSrcRegion(0, 0, 1, 0);
10509 m_encoder->Mul(vla_size, vla_size,
10510 m_currShader->ImmToVariable(numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UW));
10511 m_encoder->Push();
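// Illustrative figures (an assumption, not taken from the source): with a
// per-lane vla_size of 12 bytes at SIMD16, SP advances by 12 * 16 = 192
// bytes, reserving one slot per lane beyond the block just handed out.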
10512
10513 m_encoder->SetSrcRegion(1, 0, 1, 0);
10514 emitAddPointer(pSP, pSP, vla_size);
10515 m_encoder->Push();
10516 }
10517 }
10518
10519 void EmitPass::emitStackAlloca(GenIntrinsicInst* GII)
10520 {
10521 // Static private mem access is done through the FP
10522 CVariable* pFP = m_currShader->GetFP();
10523 if (IGC_IS_FLAG_ENABLED(EnableWriteOldFPToStack))
10524 {
10525 // If we have written the previous FP to the current frame's start, the start of
10526 // private memory will be offset by 16 bytes
10527 CVariable* tempFP = m_currShader->GetNewVariable(pFP);
10528 emitAddPointer(tempFP, pFP, m_currShader->ImmToVariable(getFPOffset(), ISA_TYPE_UD));
10529 pFP = tempFP;
10530 }
10531 CVariable* pOffset = m_currShader->GetSymbol(GII->getOperand(0));
10532 emitAddPointer(m_destination, pFP, pOffset);
10533 }
10534
10535 void EmitPass::emitCall(llvm::CallInst* inst)
10536 {
10537 llvm::Function* F = inst->getCalledFunction();
10538 if (!F || F->hasFnAttribute("referenced-indirectly") || (m_FGA && m_FGA->useStackCall(F)))
10539 {
10540 emitStackCall(inst);
10541 return;
10542 }
10543
10544 IGC_ASSERT_MESSAGE(!F->empty(), "unexpanded builtin?");
10545
10546 unsigned i = 0;
10547 for (auto& Arg : F->args())
10548 {
10549 // Skip unused arguments if any.
10550 if (Arg.use_empty())
10551 {
10552 ++i;
10553 continue;
10554 }
10555
10556 CVariable* Dst = m_currShader->getOrCreateArgumentSymbol(&Arg, true);
10557 CVariable* Src = GetSymbol(inst->getArgOperand(i++));
10558
10559 // When both symbols are the same, this argument passing has been
10560 // lifted to use a global vISA variable; just skip the copy.
10561 if (Dst != Src)
10562 {
10563 emitCopyAll(Dst, Src, Arg.getType());
10564 }
10565 }
10566 m_currFuncHasSubroutine = true;
10567 m_encoder->SubroutineCall(nullptr, F);
10568 m_encoder->Push();
10569
10570 // Emit the return value if used.
10571 if (!inst->use_empty())
10572 {
10573 CVariable* Dst = GetSymbol(inst);
10574 CVariable* Src = m_currShader->getOrCreateReturnSymbol(F);
10575 emitCopyAll(Dst, Src, inst->getType());
10576 }
10577 }
10578
10579 void EmitPass::emitReturn(llvm::ReturnInst* inst)
10580 {
10581 llvm::Function* F = inst->getParent()->getParent();
10582 MetaDataUtils* pMdUtils = m_currShader->GetMetaDataUtils();
10583
10584 // return from a function (not a kernel)
10585 if (!isEntryFunc(pMdUtils, F))
10586 {
10587 if (m_FGA && m_FGA->useStackCall(F))
10588 {
10589 emitStackFuncExit(inst);
10590 return;
10591 }
10592
10593 llvm::Type* RetTy = F->getReturnType();
10594 if (!RetTy->isVoidTy())
10595 {
10596 CVariable* Dst = m_currShader->getOrCreateReturnSymbol(F);
10597 CVariable* Src = GetSymbol(inst->getReturnValue());
10598 emitCopyAll(Dst, Src, RetTy);
10599 }
10600
10601 m_encoder->SubroutineRet(nullptr, F);
10602 m_encoder->Push();
10603 return;
10604 }
10605
10606 if (m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
10607 {
10608 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
10609 unsigned nRTWrites = int_cast<unsigned>(psProgram->rtWriteList.size());
10610
10611 for (unsigned i = 0; i < nRTWrites; i++)
10612 {
10613 GenIntrinsicInst* inst;
10614 bool isSecondHalf;
10615
10616 inst = cast<GenIntrinsicInst>(psProgram->rtWriteList[i].first);
10617 isSecondHalf = psProgram->rtWriteList[i].second;
10618 m_encoder->SetSecondHalf(isSecondHalf);
10619
10620 switch (inst->getIntrinsicID())
10621 {
10622 case GenISAIntrinsic::GenISA_RTWrite:
10623 emitRenderTargetWrite(cast<RTWritIntrinsic>(inst), true);
10624 break;
10625 case GenISAIntrinsic::GenISA_RTDualBlendSource:
10626 emitDualBlendRT(cast<RTDualBlendSourceIntrinsic>(inst), true);
10627 break;
10628 default:
10629 IGC_ASSERT_MESSAGE(0, "unknown intrinsic");
10630 break;
10631 }
10632 }
10633 // restore encoder's second half flag.
10634 if (psProgram->m_numberInstance == 2)
10635 {
10636 m_encoder->SetSecondHalf(false);
10637 }
10638
10639 // check to make sure we will have EOT
10640 IGC_ASSERT(psProgram->m_hasEOT || psProgram->GetPhase() != PSPHASE_LEGACY);
10641 }
10642
10643 m_currShader->AddEpilogue(inst);
10644 }
10645
10646 /// Initializes the kernel for stack call by initializing the SP and FP
10647 void EmitPass::InitializeKernelStack(Function* pKernel)
10648 {
10649 m_currShader->InitializeStackVariables();
10650 auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
10651 auto pModuleMetadata = pCtx->getModuleMetaData();
10652
10653 CVariable* pStackBufferBase = m_currShader->GetPrivateBase();
10654
10655 CVariable* pHWTID = m_currShader->GetHWTID();
10656
10657 CVariable* pSize = nullptr;
10658
10659 uint32_t MaxPrivateSize = pModuleMetadata->FuncMD[pKernel].privateMemoryPerWI;
10660 FunctionGroup* FG = m_FGA ? m_FGA->getGroup(pKernel) : nullptr;
10661 if (FG)
10662 {
10663 // Get the max PrivateMem used in the FG, which is set by
10664 // PrivateMemoryResolution.cpp after analyzing the call depth
10665 MaxPrivateSize = FG->getMaxPrivateMemOnStack();
10666
10667 // If there are indirect calls or recursions, we no longer
10668 // know the call depth, so just add 4KB and hope we don't overflow.
10669 if (FG->hasIndirectCall() || FG->hasRecursion())
10670 MaxPrivateSize += (4 * 1024);
10671 // Add another 1KB for VLA
10672 if (FG->hasVariableLengthAlloca())
10673 MaxPrivateSize += 1024;
10674 }
10675
10676 if (IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
10677 {
10678 // Experimental: Patch private memory size
10679 std::string patchName = "INTEL_PATCH_PRIVATE_MEMORY_SIZE";
10680 pSize = m_currShader->GetNewVariable(1, ISA_TYPE_UD, CVariable::getAlignment(getGRFSize()), true, CName(patchName));
10681 m_encoder->AddVISASymbol(patchName, pSize);
10682 }
10683 else
10684 {
10685 // hard-code per-workitem private-memory size to max size
10686 pSize = m_currShader->ImmToVariable(MaxPrivateSize * numLanes(m_currShader->m_dispatchSize), ISA_TYPE_UD);
10687 }
10688
10689 CVariable* pThreadOffset = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, CName::NONE);
10690 m_encoder->Mul(pThreadOffset, pHWTID, pSize);
10691 m_encoder->Push();
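// Sketch: each HW thread gets a disjoint stack slice starting at
// privateBase + HWTID * pSize. E.g., assuming MaxPrivateSize = 1KB per
// work-item and a SIMD16 dispatch, pSize is 16KB and thread 3's stack
// starts at privateBase + 48KB (figures are illustrative only).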
10692
10693 unsigned totalAllocaSize = 0;
10694
10695 // reserve space for alloca
10696 auto funcMDItr = pModuleMetadata->FuncMD.find(pKernel);
10697 if (funcMDItr != pModuleMetadata->FuncMD.end() && funcMDItr->second.privateMemoryPerWI != 0)
10698 {
10699 totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
10700 }
10701
10702 if (IGC_IS_FLAG_DISABLED(EnableRuntimeFuncAttributePatching))
10703 {
10704 // If we don't report per-function private memory sizes,
10705 // bump the private-memory size to the large worst-case setting.
10706 // This will be reported through patch-tokens as a per-kernel requirement.
10707 pModuleMetadata->FuncMD[pKernel].privateMemoryPerWI = MaxPrivateSize;
10708 }
10709
10710 // Initialize SP to per-thread kernel stack base
10711 CVariable* pSP = m_currShader->GetSP();
10712 emitAddPointer(pSP, pStackBufferBase, pThreadOffset);
10713
10714 // Push a new stack frame
10715 emitPushFrameToStack(totalAllocaSize);
10716
10717 // Set the total alloca size for the entry function
10718 m_encoder->SetFunctionAllocaStackSize(pKernel, totalAllocaSize);
10719 }
10720
10721 // Either do a block load or store to the stack-pointer given a vector of function arguments
10722 uint EmitPass::emitStackArgumentLoadOrStore(std::vector<CVariable*>& Args, bool isWrite)
10723 {
10724 uint32_t offsetS = 0;
10725 SmallVector<std::tuple<CVariable*, uint32_t, uint32_t, uint32_t>, 8> dataBlks;
10726 for (auto Arg : Args)
10727 {
10728 // stack offset is always oword-aligned
10729 offsetS = int_cast<unsigned>(llvm::alignTo(offsetS, SIZE_OWORD));
10730
10731 // calculate block sizes for each arg
10732 int32_t RmnBytes = Arg->GetSize();
10733 uint32_t ArgOffset = 0;
10734 do
10735 {
10736 uint32_t BlkSize = 0;
10737 {
10738 BlkSize = getBlockMsgSize(RmnBytes, m_currShader->m_Platform->getMaxBlockMsgSize(false));
10739 }
10740 dataBlks.push_back(std::make_tuple(Arg, offsetS, BlkSize, ArgOffset));
10741
10742 offsetS += BlkSize;
10743 ArgOffset += BlkSize;
10744 RmnBytes -= BlkSize;
10745 } while (RmnBytes > 0);
10746 }
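// Sketch of the splitting above (block sizes come from getBlockMsgSize and the
// platform's max message size, so these figures are illustrative): a 96-byte
// arg at stack offset 0 might become blocks {offset 0, 64B} and {offset 64,
// 32B}; the next arg then starts at the already oword-aligned offset 96.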
10747
10748 if (offsetS > 0)
10749 {
10750 // Get current SP
10751 CVariable* pSP = m_currShader->GetSP();
10752 if (isWrite)
10753 {
10754 // If storing to stack, first push SP by total store bytes
10755 CVariable* pPushSize = m_currShader->ImmToVariable(offsetS, ISA_TYPE_UD);
10756 emitAddPointer(pSP, pSP, pPushSize);
10757 }
10758
10759 // Load or store each OWORD block to stack
10760 for (auto& I : dataBlks)
10761 {
10762 CVariable* Arg = std::get<0>(I);
10763 uint32_t StackOffset = std::get<1>(I);
10764 uint32_t BlkSize = std::get<2>(I);
10765 uint32_t ArgOffset = std::get<3>(I);
10766 // spOffset is a negative offset from SP
10767 int32_t spOffset = StackOffset - offsetS;
10768
10769 if (isWrite) // Write args to stack
10770 {
10771 {
10772 // SP offset for each block
10773 CVariable* pTempSP = m_currShader->GetNewVariable(pSP);
10774 emitAddPointer(pTempSP, pSP, m_currShader->ImmToVariable(spOffset, ISA_TYPE_D));
10775
10776 m_encoder->OWStoreA64(Arg, pTempSP, BlkSize, ArgOffset);
10777 m_encoder->Push();
10778 }
10779 }
10780 else // Read args from stack
10781 {
10782 CVariable* LdDst = Arg;
10783 if (Arg->GetType() == ISA_TYPE_BOOL)
10784 {
10785 LdDst = m_currShader->GetNewVariable(numLanes(m_currShader->m_dispatchSize), ISA_TYPE_W, EALIGN_HWORD, false, 1, CName::NONE);
10786 }
10787
10788 int RmnBytes = LdDst->GetSize() - ArgOffset;
10789 bool needRmCopy = BlkSize == SIZE_OWORD && RmnBytes > 0 && RmnBytes < SIZE_OWORD;
10790 {
10791 // SP offset for each block
10792 CVariable* pTempSP = m_currShader->GetNewVariable(pSP);
10793 emitAddPointer(pTempSP, pSP, m_currShader->ImmToVariable(spOffset, ISA_TYPE_D));
10794
10795 if (!needRmCopy)
10796 {
10797 m_encoder->OWLoadA64(LdDst, pTempSP, BlkSize, ArgOffset);
10798 m_encoder->Push();
10799 }
10800 else
10801 {
10802 // Reading less than one oword: read a full oword, then copy only the needed bytes
10803 uint ldDstElemSize = LdDst->GetElemSize();
10804 if (ldDstElemSize > 0)
10805 {
10806 CVariable* pTempDst = m_currShader->GetNewVariable(SIZE_OWORD / ldDstElemSize, LdDst->GetType(), m_currShader->getGRFAlignment(), true, 1, CName::NONE);
10807 m_encoder->OWLoadA64(pTempDst, pTempSP, SIZE_OWORD);
10808 m_encoder->Push();
10809 emitVectorCopy(LdDst, pTempDst, RmnBytes / ldDstElemSize, ArgOffset, 0);
10810 }
10811 }
10812 }
10813 if (LdDst != Arg)
10814 {
10815 // only happens for bool
10816 IGC_ASSERT(Arg->GetType() == ISA_TYPE_BOOL);
10817 m_encoder->Cmp(EPREDICATE_NE, Arg, LdDst, m_currShader->ImmToVariable(0, LdDst->GetType()));
10818 }
10819 }
10820 }
10821 }
10822 return offsetS;
10823 }
10824
10825 void EmitPass::emitStackCall(llvm::CallInst* inst)
10826 {
10827 llvm::Function* F = inst->getCalledFunction();
10828
10829 bool isIndirectFCall = !F || F->hasFnAttribute("referenced-indirectly");
10830 bool isInvokeSIMDTarget = F && F->hasFnAttribute("invoke_simd_target");
10831 CVariable* ArgBlkVar = m_currShader->GetARGV();
10832 uint32_t offsetA = 0; // visa argument offset
10833 uint32_t offsetS = 0; // visa stack offset
10834 std::vector<CVariable*> argsOnStack;
10835 SmallVector<std::tuple<CVariable*, Type*, uint32_t>, 8> argsOnRegister;
10836
10837 for (uint32_t i = 0; i < inst->getNumArgOperands(); i++)
10838 {
10839 Value* operand = inst->getArgOperand(i);
10840 CVariable* Src = GetSymbol(operand);
10841 Type* argType = operand->getType();
10842
10843 if (!isIndirectFCall)
10844 {
10845 // Skip unused arguments if any for direct call
10846 auto argIter = F->arg_begin();
10847 std::advance(argIter, i);
10848 if (argIter->use_empty()) continue;
10849 }
10850
10851 if (Src->GetType() == ISA_TYPE_BOOL)
10852 {
10853 // bool args are treated as a vector of WORDs
10854 uint nElts = numLanes(m_currShader->m_dispatchSize);
10855 CVariable* ReplaceArg = m_currShader->GetNewVariable(
10856 nElts,
10857 ISA_TYPE_W,
10858 EALIGN_HWORD, false, 1,
10859 CName::NONE);
10860 CVariable* one = m_currShader->ImmToVariable(1, ISA_TYPE_W);
10861 CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_W);
10862 m_encoder->Select(Src, ReplaceArg, one, zero);
10863
10864 argType = IntegerType::getInt16Ty(inst->getContext());
10865 Src = ReplaceArg;
10866 }
10867
10868 // adjust offset for alignment
10869 uint align = getGRFSize();
10870 offsetA = int_cast<unsigned>(llvm::alignTo(offsetA, align));
10871 // check if an argument can be written to ARGV based upon offset + arg-size
10872 unsigned argSize = Src->GetSize();
10873 if (Src->IsUniform())
10874 {
10875 argSize = Src->GetSize() * numLanes(m_currShader->m_dispatchSize);
10876 }
10877 bool overflow = ((offsetA + argSize) > ArgBlkVar->GetSize());
10878 if (!overflow)
10879 {
10880 argsOnRegister.push_back(std::make_tuple(Src, argType, offsetA));
10881 offsetA += argSize;
10882 }
10883 else
10884 {
10885 // Vectorize, then push to stack
10886 if (Src->IsUniform())
10887 {
10888 uint16_t nElts = (uint16_t)m_currShader->GetNumElts(argType, false);
10889 CVariable* SrcVec = m_currShader->GetNewVariable(nElts, Src->GetType(), m_currShader->getGRFAlignment(), false, Src->getName());
10890 emitCopyAll(SrcVec, Src, argType);
10891 Src = SrcVec;
10892 }
10893 argsOnStack.push_back(Src);
10894 }
10895 }
10896 // Write all arguments that do not fit in the GRF arg block to the stack
10897 offsetS = emitStackArgumentLoadOrStore(argsOnStack, true);
10898
10899 uint retSize = 0;
10900 if (!inst->use_empty())
10901 {
10902 CVariable* Dst = GetSymbol(inst);
10903 if (Dst->GetType() == ISA_TYPE_BOOL)
10904 {
10905 retSize = numLanes(m_currShader->m_dispatchSize) * SIZE_WORD;
10906 }
10907 else
10908 {
10909 retSize = Dst->GetSize();
10910 }
10911 CVariable* Src = m_currShader->GetRETV();
10912 IGC_ASSERT_MESSAGE(retSize <= Src->GetSize(), "No support for return on stack!");
10913 }
10914
10915 unsigned char argSizeInGRF = (offsetA + getGRFSize() - 1) / getGRFSize();
10916 unsigned char retSizeInGRF = (retSize + getGRFSize() - 1) / getGRFSize();
10917
10918 // lambda to copy arguments to the arg register block
10919 auto CopyArgBlkVariables = [&](void)->void
10920 {
10921 for (auto& I : argsOnRegister)
10922 {
10923 CVariable * Src = std::get<0>(I);
10924 Type* argType = std::get<1>(I);
10925 uint32_t offset = std::get<2>(I);
10926
10927 uint16_t nElts = (uint16_t)m_currShader->GetNumElts(argType, false);
10928 CVariable* Dst = m_currShader->GetNewAlias(ArgBlkVar, m_currShader->GetType(argType), offset, nElts, false);
10929 emitCopyAll(Dst, Src, argType);
10930 }
10931 };
10932
10933 // lambda to read the return value
10934 auto CopyReturnValue = [this](CallInst* inst)->void
10935 {
10936 // No need to copy if there are no uses
10937 if (inst->use_empty())
10938 return;
10939
10940 CVariable* Dst = GetSymbol(inst);
10941 CVariable* Src = m_currShader->GetRETV();
10942 if (Dst->GetType() == ISA_TYPE_BOOL)
10943 {
10944 CVariable* SrcAlias = m_currShader->GetNewAlias(Src, ISA_TYPE_W, 0, numLanes(m_currShader->m_dispatchSize), false);
10945 m_encoder->Cmp(EPREDICATE_NE, Dst, SrcAlias, m_currShader->ImmToVariable(0, ISA_TYPE_W));
10946 }
10947 else
10948 {
10949 IGC_ASSERT(Dst->GetSize() <= Src->GetSize());
10950 if (Dst->GetType() != Src->GetType() || Src->IsUniform() != Dst->IsUniform())
10951 {
10952 Src = m_currShader->GetNewAlias(Src, Dst->GetType(), 0, Dst->GetNumberElement(), Dst->IsUniform());
10953 }
10954 emitCopyAll(Dst, Src, inst->getType());
10955 }
10956 };
10957
10958 CVariable* funcAddr = GetSymbol(IGCLLVM::getCalledValue(inst));
10959 if (!isIndirectFCall || isInvokeSIMDTarget)
10960 {
10961 CopyArgBlkVariables();
10962 m_encoder->StackCall(nullptr, F, argSizeInGRF, retSizeInGRF);
10963 m_encoder->Push();
10964 CopyReturnValue(inst);
10965 }
10966 else
10967 {
10968 if (funcAddr->IsUniform() || IGC_IS_FLAG_ENABLED(AssumeUniformIndirectCall))
10969 {
10970 CopyArgBlkVariables();
10971 funcAddr = TruncatePointer(funcAddr);
10972 m_encoder->IndirectStackCall(nullptr, funcAddr, argSizeInGRF, retSizeInGRF);
10973 m_encoder->Push();
10974 CopyReturnValue(inst);
10975 }
10976 else
10977 {
10978 // If the call is not uniform, we have to make a uniform call per lane
10979 // First get the execution mask for active lanes
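// In pseudo-code, the loop emitted below is roughly:
//   while (eMask != 0) {
//     addr = funcAddr[firstActiveLane(eMask)];
//     callPred = (funcAddr == addr);   // lanes sharing that address
//     if (callPred) { copy args; indirect-call addr; copy retval; }
//     eMask &= ~callPred;              // retire the lanes just called
//   }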
10980 CVariable* eMask = GetExecutionMask();
10981 // Create a label for the loop
10982 uint label = m_encoder->GetNewLabelID("non_unif_call_body");
10983 m_encoder->Label(label);
10984 m_encoder->Push();
10985
10986 // Get the first active lane's function address
10987 CVariable* offset = nullptr;
10988 funcAddr = TruncatePointer(funcAddr);
10989 CVariable* uniformAddr = UniformCopy(funcAddr, offset, eMask);
10990 // Set the predicate to true for all lanes with the same address
10991 CVariable* callPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
10992 m_encoder->Cmp(EPREDICATE_EQ, callPred, uniformAddr, funcAddr);
10993 m_encoder->Push();
10994
10995 uint callLabel = m_encoder->GetNewLabelID("non_unif_call_end");
10996 m_encoder->SetInversePredicate(true);
10997 m_encoder->Jump(callPred, callLabel);
10998 m_encoder->Push();
10999
11000 // Copy args to ArgBlk on each iteration of the loop, such that arg registers
11001 // won't be corrupted by previous iterations.
11002 CopyArgBlkVariables();
11003
11004 // Indirect call for all lanes set by the flag
11005 m_encoder->IndirectStackCall(nullptr, uniformAddr, argSizeInGRF, retSizeInGRF);
11006 m_encoder->Copy(eMask, eMask);
11007 m_encoder->Push();
11008
11009 // For non-uniform call, copy the ret inside this loop so that it'll honor the loop mask
11010 CopyReturnValue(inst);
11011
11012 // Label for lanes that skipped the call
11013 m_encoder->Label(callLabel);
11014 m_encoder->Push();
11015
11016 // Unset the bits in execution mask for lanes that were called
11017 CVariable* callMask = m_currShader->GetNewVariable(1, eMask->GetType(), eMask->GetAlign(), true, CName::NONE);
11018 CVariable* loopPred = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
11019 m_encoder->Cast(callMask, callPred);
11020 m_encoder->Not(callMask, callMask);
11021 m_encoder->And(eMask, eMask, callMask);
11022 m_encoder->Push();
11023 m_encoder->SetP(loopPred, eMask);
11024 m_encoder->Push();
11025
11026 // Loop while there are bits still left in the mask
11027 m_encoder->Jump(loopPred, label);
11028 m_encoder->Push();
11029 }
11030 }
11031
11032 if (offsetS > 0)
11033 {
11034 // Set the max stack size pushed in the parent function for this call's args
11035 m_encoder->SetFunctionMaxArgumentStackSize(inst->getParent()->getParent(), offsetS);
11036
11037 // pop stack pointer after the call
11038 CVariable* pSP = m_currShader->GetSP();
11039 CVariable* pPopSize = m_currShader->ImmToVariable((uint64_t)(~offsetS + 1), ISA_TYPE_D);
11040 emitAddPointer(pSP, pSP, pPopSize);
11041 }
11042 }
11043
11044 static inline bool isFuncSRetArg(Argument * arg)
11045 {
11046 Function * F = arg->getParent();
11047 return (arg == F->arg_begin() &&
11048 arg != F->arg_end() &&
11049 arg->hasStructRetAttr() &&
11050 F->getReturnType()->isVoidTy());
11051 }
11052
11053 void EmitPass::emitStackFuncEntry(Function* F)
11054 {
11055 m_encoder->SetDispatchSimdSize();
11056 m_currShader->InitializeStackVariables();
11057
11058 if (F->hasFnAttribute("referenced-indirectly"))
11059 {
11060 m_encoder->SetExternFunctionFlag();
11061 }
11062
11063 CVariable* ArgBlkVar = m_currShader->GetARGV();
11064 uint32_t offsetA = 0; // visa argument offset
11065 uint32_t offsetS = 0; // visa stack offset
11066 std::vector<CVariable*> argsOnStack;
11067 for (auto& Arg : F->args())
11068 {
11069 if (!F->hasFnAttribute("referenced-indirectly"))
11070 {
11071 // Skip unused arguments if any for direct call
11072 if (Arg.use_empty()) continue;
11073 }
11074
11075 // adjust offset for alignment
11076 CVariable* Dst = m_currShader->getOrCreateArgumentSymbol(&Arg, false, true);
11077 uint align = getGRFSize();
11078 offsetA = int_cast<unsigned>(llvm::alignTo(offsetA, align));
11079 uint argSize = Dst->GetSize();
11080 if (Dst->GetType() == ISA_TYPE_BOOL)
11081 {
11082 argSize = numLanes(m_currShader->m_dispatchSize) * SIZE_WORD;
11083 }
11084 // check if an argument can be written to ARGV based upon offset + arg-size
11085 bool overflow = ((offsetA + argSize) > ArgBlkVar->GetSize());
11086 if (!overflow)
11087 {
11088 if (!Arg.use_empty())
11089 {
11090 CVariable* Src = ArgBlkVar;
11091 if (Dst->GetType() == ISA_TYPE_BOOL)
11092 {
11093 Src = m_currShader->GetNewAlias(ArgBlkVar, ISA_TYPE_W, (uint16_t)offsetA, numLanes(m_currShader->m_dispatchSize), false);
11094 m_encoder->Cmp(EPREDICATE_NE, Dst, Src, m_currShader->ImmToVariable(0, ISA_TYPE_W));
11095 }
11096 else if (m_FGA->isLeafFunc(F))
11097 {
11098 // Directly map the dst register to an alias of ArgBlkVar, and update symbol mapping for future uses
11099 Dst = m_currShader->GetNewAlias(ArgBlkVar, Dst->GetType(), (uint16_t)offsetA, Dst->GetNumberElement(), Dst->IsUniform());
11100 m_currShader->UpdateSymbolMap(&Arg, Dst);
11101 }
11102 else
11103 {
11104 // If this function may make calls that clobber the ARG register, copy the argument to a temp first
11105 if (Src->GetType() != Dst->GetType() || offsetA != 0 || Src->IsUniform() != Dst->IsUniform())
11106 {
11107 Src = m_currShader->GetNewAlias(ArgBlkVar, Dst->GetType(), (uint16_t)offsetA, Dst->GetNumberElement(), Dst->IsUniform());
11108 }
11109 emitCopyAll(Dst, Src, Arg.getType());
11110 }
11111 }
11112 offsetA += argSize;
11113 }
11114 else
11115 {
11116 argsOnStack.push_back(Dst);
11117 }
11118
11119 // Get the symbol for arg0 if it has the "sret" attribute and save it.
11120 if (isFuncSRetArg(&Arg)) m_currShader->SaveSRet(Dst);
11121 }
11122 m_encoder->SetStackFunctionArgSize((offsetA + getGRFSize() - 1) / getGRFSize());
11123
11124 // Read all stack-pushed args back into registers
11125 offsetS = emitStackArgumentLoadOrStore(argsOnStack, false);
11126
11127 unsigned totalAllocaSize = 0;
11128
11129 // reserve space for all the allocas in the function subgroup
11130 auto funcMDItr = m_currShader->m_ModuleMetadata->FuncMD.find(F);
11131 if (funcMDItr != m_currShader->m_ModuleMetadata->FuncMD.end() && funcMDItr->second.privateMemoryPerWI != 0)
11132 {
11133 totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
11134 }
11135
11136 // save FP before allocation
11137 m_currShader->SaveStackState();
11138
11139 // Push a new stack frame
11140 emitPushFrameToStack(totalAllocaSize);
11141
11142 // Set the per-function private mem size
11143 m_encoder->SetFunctionAllocaStackSize(F, totalAllocaSize);
11144 }
11145
11146 void EmitPass::emitStackFuncExit(llvm::ReturnInst* inst)
11147 {
11148 // restore SP and FP
11149 m_currShader->RestoreStackState();
11150
11151 llvm::Function* F = inst->getParent()->getParent();
11152 llvm::Type* RetTy = F->getReturnType();
11153 CVariable* Dst = m_currShader->GetRETV();
11154 if (!RetTy->isVoidTy())
11155 {
11156 unsigned RetSize = 0;
11157 unsigned nLanes = numLanes(m_currShader->m_dispatchSize);
11158 CVariable* Src = GetSymbol(inst->getReturnValue());
11159
11160 if (Src->GetType() == ISA_TYPE_BOOL)
11161 {
11162 CVariable* one = m_currShader->ImmToVariable(1, ISA_TYPE_W);
11163 CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_W);
11164 CVariable* DstAlias = m_currShader->GetNewAlias(Dst, ISA_TYPE_W, 0, nLanes, false);
11165 m_encoder->Select(Src, DstAlias, one, zero);
11166 RetSize = nLanes * SIZE_WORD;
11167 }
11168 else
11169 {
11170 bool isSrcUniform = Src->IsUniform();
11171 RetSize = isSrcUniform ? nLanes * Src->GetSize() : Src->GetSize();
11172 IGC_ASSERT_MESSAGE(RetSize <= Dst->GetSize(), "No support for return on stack!");
11173
11174 if (Dst->GetType() != Src->GetType() || Dst->IsUniform() != Src->IsUniform())
11175 {
11176 unsigned elements = isSrcUniform ? Src->GetNumberElement() * nLanes : Src->GetNumberElement();
11177 Dst = m_currShader->GetNewAlias(Dst, Src->GetType(), 0, elements, false);
11178 }
11179 emitCopyAll(Dst, Src, RetTy);
11180 }
11181 m_encoder->SetStackFunctionRetSize((RetSize + getGRFSize() - 1) / getGRFSize());
11182 }
11183 else
11184 {
11185 // Based on other arch's ABIs, the sret argument is guaranteed to be written to the return register upon function exit.
11186 // vISA ABI states that the return and argument registers start at the same location. If the function is non-void, %retVal
11187 // starts at r26. Otherwise, %arg0 will start at r26.
11188 // Here we write the saved arg0 value back into arg0. Since arg0 has the "sret" attribute, the function is guaranteed to be void,
11189 // thus writing to %arg0 is the same as writing to %retval.
11190 // We still set the retSize to 0 to match the LLVM IR function signature, so we avoid writing to vISA's return reg directly.
11191 // Note: For leaf functions, we don't need to copy since we are guaranteed that %arg0 will not be overwritten.
11192 CVariable* sretPtr = m_currShader->GetAndResetSRet();
11193 if (sretPtr && isFuncSRetArg(F->arg_begin()) && !m_FGA->isLeafFunc(F))
11194 {
11195 // If the sret value is saved, copy it back into arg0
11196 CVariable* ArgBlk = m_currShader->GetARGV();
11197 CVariable* Arg0 = m_currShader->GetNewAlias(ArgBlk, sretPtr->GetType(), 0, sretPtr->GetNumberElement(), sretPtr->IsUniform());
11198 m_encoder->Copy(Arg0, sretPtr);
11199 m_encoder->Push();
11200 }
11201 m_encoder->SetStackFunctionRetSize(0);
11202 }
11203 // emit return
11204 m_encoder->StackRet(nullptr);
11205 m_encoder->Push();
11206 }
11207
11208 void EmitPass::emitSymbolRelocation(Function& F)
11209 {
11210 Module* pModule = F.getParent();
11211
11212 SmallSet<Function*, 16> funcAddrSymbols;
11213 SmallSet<GlobalVariable*, 16> globalAddrSymbols;
11214
11215 ModuleMetaData* moduleMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
11216
11217 std::function<void(Value*)> usedValues =
11218 [&usedValues, &funcAddrSymbols, &globalAddrSymbols, moduleMD, pModule]
11219 (Value* v)
11220 {
11221 if (Function* pFunc = dyn_cast<Function>(v))
11222 {
11223 if (pModule == pFunc->getParent() &&
11224 pFunc->hasFnAttribute("referenced-indirectly"))
11225 funcAddrSymbols.insert(pFunc);
11226 }
11227 else if (GlobalVariable* pGlobal = dyn_cast<GlobalVariable>(v))
11228 {
11229 if (pModule == pGlobal->getParent() &&
11230 moduleMD->inlineProgramScopeOffsets.count(pGlobal) > 0)
11231 globalAddrSymbols.insert(pGlobal);
11232 }
11233 else if (Constant* C = dyn_cast<Constant>(v))
11234 {
11235 for (auto it = C->value_op_begin(), end = C->value_op_end(); it != end; it++)
11236 usedValues(*it);
11237 }
11238 };
11239
11240 for (auto&& BB : F)
11241 {
11242 for (auto& I : BB)
11243 {
11244 for (auto it = I.value_op_begin(), end = I.value_op_end(); it != end; it++)
11245 usedValues(*it);
11246 }
11247 }
11248
11249 for (auto pFunc : funcAddrSymbols)
11250 {
11251 m_currShader->CreateFunctionSymbol(pFunc);
11252 }
11253
11254 for (auto pGlobal : globalAddrSymbols)
11255 {
11256 m_currShader->CreateGlobalSymbol(pGlobal);
11257 }
11258 }
11259
11260 void EmitPass::emitStoreRawIndexed(
11261 StoreRawIntrinsic* inst, Value* varOffset, ConstantInt* immOffset)
11262 {
11263 Value* pBufPtr = inst->getResourceValue();
11264 Value* pValToStore = inst->getStoreValue();
11265
11266 m_currShader->isMessageTargetDataCacheDataPort = true;
11267
11268 IGC_ASSERT(immOffset == nullptr);
11269 emitStore3DInner(pValToStore, pBufPtr, varOffset);
11270 }
11271
11272 void EmitPass::emitStore3D(StoreInst* inst, Value* elmIdxV)
11273 {
11274 // Currently only scratch-space stores are routed here from emitStore
11275 Value* pllValToStore = inst->getValueOperand();
11276 Value* pllDstPtr = inst->getPointerOperand();
11277
11278
11279 emitStore3DInner(pllValToStore, pllDstPtr, elmIdxV);
11280 }
11281
11282 void EmitPass::emitStore3DInner(Value* pllValToStore, Value* pllDstPtr, Value* pllElmIdx)
11283 {
11284 IGC_ASSERT(pllDstPtr != nullptr);
11285
11286 bool isPrivateMem = pllDstPtr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_PRIVATE;
11287 if (!isPrivateMem)
11288 {
11289 ForceDMask(false);
11290 }
11291
11292 ResourceDescriptor resource = GetResourceVariable(pllDstPtr);
11293
11294 uint sizeInBits = GetPrimitiveTypeSizeInRegisterInBits(pllValToStore->getType());
11295
11296 IGC_ASSERT_MESSAGE((sizeInBits == 8) || (sizeInBits == 16) || (sizeInBits == 32) || (sizeInBits == 64) || (sizeInBits == 96) || (sizeInBits == 128),
11297 "Stored type must be 1/2/4/8/12/16 bytes long");
11298
11299 CVariable* storedVal = GetSymbol(pllValToStore);
11300
11301 IGC_ASSERT(pllElmIdx);
11302 CVariable* ptr = GetSymbol(pllElmIdx);
11303
11304 IGC_ASSERT(pllDstPtr->getType()->isPointerTy());
11305 if (!IGC::isA64Ptr(cast<PointerType>(pllDstPtr->getType()), m_currShader->GetContext()))
11306 {
11307 ptr = TruncatePointer(ptr);
11308 }
11309
11310 CVariable* gOffset = m_currShader->ImmToVariable(0x0, ISA_TYPE_UD);
11311
11312 // The stored value and the ptr must be placed in GRFs aligned as SIMDSize DWORDs.
11313 // So if they are not already in this form, bring them to it:
11314 // Broadcast the value, and extend it (it doesn't matter whether it's sext, zext, or any
11315 // other kind of extend).
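// For example (assuming SIMD8 and a uniform dword value), the broadcast below
// replicates the single scalar into 8 dword lanes so the scatter message sees
// one address/value pair per lane.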
11316
11317 CVariable* storedValOriginal = storedVal;
11318 CVariable* ptrOriginal = ptr;
11319
11320 storedVal = BroadcastIfUniform(storedVal);
11321 ptr = BroadcastIfUniform(ptr);
11322
11323 uint label = 0;
11324 CVariable* flag = nullptr;
11325 bool needLoop = ResourceLoopHeader(resource, flag, label);
11326 if (sizeInBits == 32)
11327 {
11328 if (resource.m_surfaceType == ESURFACE_STATELESS &&
11329 m_currShader->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages)
11330 {
11331 // DWORD scatter
11332 CVariable* shiftedPtr = m_currShader->GetNewVariable(ptr);
11333 m_encoder->Shr(shiftedPtr, ptr, m_currShader->ImmToVariable(2, ISA_TYPE_UD));
11334 m_encoder->Push();
11335 ptr = shiftedPtr;
11336 setPredicateForDiscard(flag);
11337 m_encoder->Scatter(
11338 storedVal,
11339 resource.m_resource,
11340 ptr,
11341 gOffset,
11342 resource.m_surfaceType,
11343 4);
11344 m_encoder->Push();
11345 }
11346 else
11347 {
11348 if (m_currShader->m_Platform->emulateByteScraterMsgForSS() &&
11349 (ESURFACE_SCRATCH == resource.m_surfaceType))
11350 {
11351 setPredicateForDiscard(flag);
11352 bool isUniformInst = (ptrOriginal->IsUniform() && storedValOriginal->IsUniform());
11353 ptrOriginal = (isUniformInst ? ReAlignUniformVariable(ptrOriginal, EALIGN_GRF) : ptr);
11354 storedValOriginal = (isUniformInst ? ReAlignUniformVariable(storedValOriginal, EALIGN_GRF) : storedVal);
11355 m_encoder->Scatter4Scaled(storedValOriginal, resource, ptrOriginal);
11356 }
11357 else
11358 {
11359 // using byte scatter
11360 uint elementSize = 8;
11361 uint numElems = 4;
11362 setPredicateForDiscard(flag);
11363 m_encoder->ByteScatter(
11364 storedVal,
11365 resource,
11366 ptr,
11367 elementSize,
11368 numElems);
11369 }
11370 m_encoder->Push();
11371 }
11372 }
11373 else if (sizeInBits == 16 || sizeInBits == 8)
11374 {
11375 // using byte scatter
11376 uint elementSize = 8;
11377 uint numElems = sizeInBits / 8;
11378 VISA_Type elementType = (sizeInBits == 8) ? ISA_TYPE_UB : ISA_TYPE_UW;
11379 CVariable* val = m_currShader->GetNewVariable(
11380 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
11381 storedVal = m_currShader->GetNewAlias(storedVal, elementType, 0, 0);
11382 m_encoder->Cast(val, storedVal);
11383 setPredicateForDiscard(flag);
11384 m_encoder->ByteScatter(
11385 val,
11386 resource,
11387 ptr,
11388 elementSize,
11389 numElems);
11390 m_encoder->Push();
11391 }
11392 else // (sizeInBits > 32)
11393 {
11394 setPredicateForDiscard(flag);
11395 m_encoder->Scatter4Scaled(storedVal, resource, ptr);
11396 m_encoder->Push();
11397 }
11398 ResourceLoopBackEdge(needLoop, flag, label);
11399 if (!isPrivateMem)
11400 {
11401 ResetVMask(false);
11402 }
11403 }
11404
11405 void EmitPass::emitStore(StoreInst* inst, Value* varOffset, ConstantInt* immOffset)
11406 {
11407 emitVectorStore(inst, varOffset, immOffset);
11408 }
11409
11410 CVariable* EmitPass::GetSymbol(llvm::Value* v) const
11411 {
11412 return m_currShader->GetSymbol(v);
11413 }
11414
11415 void EmitPass::CountStatelessIndirectAccess(llvm::Value* pointer, ResourceDescriptor resource)
11416 {
11417 instrMap.clear();
11418 IGC_ASSERT_MESSAGE(isa<PointerType>(pointer->getType()), "Value should be a pointer");
11419 if (resource.m_surfaceType == ESURFACE_STATELESS && IsIndirectAccess(pointer))
11420 {
11421 m_currShader->IncIndirectStatelessCount();
11422 }
11423 }
11424
11425 bool EmitPass::IsIndirectAccess(llvm::Value* pointer)
11426 {
11427 Instruction* inst = dyn_cast<Instruction>(pointer);
11428 if (inst == nullptr)
11429 {
11430 return false;
11431 }
11432
11433 // We cache the result for each instruction,
11434 // so when we meet the same instruction again we know it has already been checked.
11435 if (instrMap.count(inst))
11436 {
11437 return instrMap.lookup(inst);
11438 }
11439
11440 bool isIndirect = false;
11441 instrMap.try_emplace(inst, isIndirect);
11442
11443 if (isa<LoadInst>(inst))
11444 {
11445 isIndirect = true;
11446 }
11447 else if (CallInst* callInstr = dyn_cast<CallInst>(inst))
11448 {
11449 // If the call instruction isn't a GenISA intrinsic, we assume the access is
11450 // indirect, because intrinsics perform rather simple arithmetic.
11451 GenIntrinsicInst* pIntrinsic = dyn_cast<GenIntrinsicInst>(callInstr);
11452 if (pIntrinsic == nullptr)
11453 {
11454 isIndirect = true;
11455 }
11456 }
11457
11458 if (!isIndirect)
11459 {
11460 for (unsigned int i = 0; i < inst->getNumOperands(); i++)
11461 {
11462 if (IsIndirectAccess(inst->getOperand(i)))
11463 {
11464 isIndirect = true;
11465 break;
11466 }
11467 }
11468 }
11469 instrMap[inst] = isIndirect; // update the placeholder added by try_emplace above; DenseMap::insert would not overwrite it
11470 return isIndirect;
11471 }
11472
11473 void EmitPass::emitInsert(llvm::Instruction* inst)
11474 {
11475 auto IEI = llvm::cast<llvm::InsertElementInst>(inst);
11476 // Skip emitting the scalar copy if this `insertelement` could be aliased.
11477 if (m_currShader->CanTreatScalarSourceAsAlias(IEI))
11478 return;
11479
11480 llvm::Type* eTy = inst->getOperand(1)->getType();
11481 // Do not use allocated type to compute the offsets; otherwise the computed
11482 // offsets may be out-of-bound. The alignment information of the base
11483 // element type should not impact the offset.
11484 uint32_t eBytes = GetScalarTypeSizeInRegister(eTy);
11485 IGC_ASSERT_MESSAGE(eBytes, "illegal InsertElementInst instruction");
11486
11487 llvm::Value* pVec = inst->getOperand(0);
11488 CVariable* pInstVar = GetSymbol(inst);
11489 CVariable* pVecVar = nullptr;
11490 llvm::Type* pVecType = inst->getType();
11491 if (!isa<UndefValue>(pVec))
11492 {
11493 if (isa<ConstantVector>(pVec))
11494 {
11495 auto CV = cast<ConstantVector>(pVec);
11496 pInstVar = m_currShader->GetConstant(CV, pInstVar);
11497 }
11498 else
11499 {
11500 pVecVar = GetSymbol(pVec);
11501 if (pVecVar != pInstVar)
11502 {
11503 emitVectorCopy(pInstVar, pVecVar, int_cast<unsigned>(dyn_cast<IGCLLVM::FixedVectorType>(pVecType)->getNumElements()));
11504 }
11505 }
11506 }
11507
11508 if (llvm::ConstantInt * pConstElem = llvm::dyn_cast<llvm::ConstantInt>(IEI->getOperand(2)))
11509 {
11510 CVariable* pElm = GetSymbol(inst->getOperand(1));
11511
11512 uint element = int_cast<uint>(pConstElem->getZExtValue());
11513 uint eStartBytes;
11514 if (m_currShader->GetIsUniform(inst) && m_currShader->GetIsUniform(pVec))
11515 {
11516 eStartBytes = eBytes * element;
11517 }
11518 else
11519 {
11520 eStartBytes = numLanes(m_currShader->m_SIMDSize) * eBytes * element;
11521 }
11522
11523 uint subVar = (eStartBytes / getGRFSize());
11524 uint subReg = (eStartBytes % getGRFSize()) / eBytes; // unit of element(eTy)
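// Worked example (assuming SIMD16, 32-byte GRFs, dword elements, element == 3,
// non-uniform vector): eStartBytes = 16 * 4 * 3 = 192, so subVar = 192 / 32 = 6
// and subReg = (192 % 32) / 4 = 0.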
11525 m_encoder->SetDstSubVar(subVar);
11526 m_encoder->SetDstSubReg(subReg);
11527 m_encoder->Copy(m_destination, pElm);
11528 m_encoder->Push();
11529 }
11530 else
11531 {
11532 // The index is not a compile-time constant, so we need to use runtime indirect addressing.
11533 llvm::Value* pElement = inst->getOperand(1); // element to insert
11534 llvm::Value* pIndex = inst->getOperand(2); // index to insert at
11535 CVariable* pIndexVar = m_currShader->BitCast(GetSymbol(pIndex), ISA_TYPE_UW);
11536 CVariable* pElemVar = GetSymbol(pElement);
11537
11538 // size of vector entry
11539 const uint vectorEntrySimdWidth = pInstVar->IsUniform() ?
11540 1 : numLanes(m_currShader->m_SIMDSize);
11541
11542 const uint vecTypeSize =
11543 GetPrimitiveTypeSizeInRegister(cast<VectorType>(pVecType)->getElementType());
11544
11545 const uint offset = vectorEntrySimdWidth * vecTypeSize;
11546
11547 CVariable* pOffset1 = m_currShader->ImmToVariable(offset, ISA_TYPE_UW);
11548
11549 // offset2 = index * sizeof(vector entry) <-- offset within the vector counted in bytes
11550 CVariable* pOffset2 = m_currShader->GetNewVariable(
11551 pIndexVar->IsUniform() ? 1 : numLanes(m_currShader->m_SIMDSize),
11552 ISA_TYPE_UW,
11553 EALIGN_WORD,
11554 pIndexVar->IsUniform(), CName::NONE);
11555
11556 if (!pIndexVar->IsUniform())
11557 {
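// The <16;8,2> region picks every other word, i.e. (assuming a dword index)
// the low word of each lane of the index that was bitcast to UW above.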
11558 m_encoder->SetSrcRegion(0, 16, 8, 2);
11559 }
11560 m_encoder->Mul(pOffset2, pIndexVar, pOffset1);
11561 m_encoder->Push();
11562
11563 // a0 = addressof(vector variable) + offset2 <-- address of element to insert at
11564 if (pIndexVar->IsUniform())
11565 {
11566 CVariable* pDstArrElm =
11567 m_currShader->GetNewAddressVariable(
11568 1,
11569 m_destination->GetType(),
11570 true,
11571 pInstVar->IsUniform(),
11572 m_destination->getName());
11573 m_encoder->AddrAdd(pDstArrElm, m_destination, pOffset2);
11574 m_encoder->Push();
11575 m_encoder->Copy(pDstArrElm, pElemVar);
11576 m_encoder->Push();
11577 }
11578 else
11579 {
11580 // Lower the execution size to avoid complaints about indirect addressing across more than two GRFs.
11581 // One example is below:
11582 //(W) mov (1|M0) f1.1<1>:uw 0x100:uw
11583 //(f1.1) mov(16|M0) r[a0.8]<1>:f r63.0 < 0; 1, 0 >:f
11584 //will be changed to
11585 //(W) mov (1|M0) f1.1<1>:uw 0x100:uw
11586 //(f1.1) mov(8|M8) r[a0.8+0x20]<1>:f r63.0 < 0; 1, 0 >:f
11587 // To avoid the complaints, we limit execSizeNew * datatypesize to the memory size of getMinDispatchMode().
11588 // In the example above, say getMinDispatchMode() == 8; then execSizeNew should be 8,
11589 // because 8 * SIZE_DWORD = getMinDispatchMode() * SIZE_DWORD.
11590 // But if the datatype is 64-bit, then execSizeNew should be 4,
11591 // because 4 * SIZE_QWORD = getMinDispatchMode() * SIZE_DWORD.
11592 // Changing to SIMD1 would need more work and might cause extra overhead as well.
11593 // For an indirect address, the emask offset should be adjusted correspondingly.
11594 SIMDMode simdMode = std::min(m_currShader->m_SIMDSize, SIMDMode::SIMD16);
11595 SIMDMode minDispatchMode = m_currShader->m_Platform->getMinDispatchMode();
11596 SIMDMode execSizeNew = minDispatchMode;
11597 bool bWAMultiGRF = false;
11598 if (!pInstVar->IsUniform() && m_currShader->m_Platform->enableMultiGRFAccessWA())
11599 {
11600 uint32_t dataTypeSize = GetScalarTypeSizeInRegisterInBits(pElement->getType());
11601 uint32_t memSizeToUse = numLanes(simdMode) * dataTypeSize / 8;
11602 uint32_t memSizeMinDisp = numLanes(minDispatchMode) * SIZE_DWORD;
11603 bWAMultiGRF = (memSizeToUse > memSizeMinDisp);
11604 if (bWAMultiGRF)
11605 {
11606 execSizeNew = lanesToSIMDMode(memSizeMinDisp * 8 / dataTypeSize);
11607 uint32_t lanesNew = numLanes(execSizeNew);
11608 int cnt = memSizeToUse / memSizeMinDisp;
11609 for (int i=1; i<cnt; i++)
11610 {
11611 CVariable* pOffset1_2ndHalf = m_currShader->ImmToVariable(memSizeMinDisp * i, ISA_TYPE_UW);
11612 uint32_t laneIdx = lanesNew * i;
11613 CVariable* pOffset2_2ndHalf = m_currShader->GetNewAlias(pOffset2, ISA_TYPE_UW, laneIdx * SIZE_WORD, 0);
11614 m_encoder->SetSrcRegion(0, lanesNew, lanesNew, 1);
11615 m_encoder->SetSimdSize(execSizeNew);
11616 m_encoder->SetMask((laneIdx / 8) % 2 ? EMASK_Q2 : EMASK_Q1);
11617 m_encoder->SetSecondNibble((laneIdx / 4) % 2 ? true : false);
11618 m_encoder->Add(pOffset2_2ndHalf, pOffset2_2ndHalf, pOffset1_2ndHalf);
11619 m_encoder->Push();
11620 }
11621 m_encoder->SetSecondNibble(false);
11622 }
11623 }
11624
11625 int loopCount = (m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_currShader->m_numberInstance == 1) ? 2 : 1;
11626 for (int i = 0; i < loopCount; ++i)
11627 {
11628 CVariable* dst = m_destination;
11629 if (i == 1)
11630 {
11631 // explicitly set second half as we are manually splitting
11632 m_encoder->SetSecondHalf(true);
11633 m_encoder->SetSrcSubReg(1, 16);
11634 dst = m_currShader->GetNewAlias(dst, dst->GetType(), 16 * dst->GetElemSize(), 0);
11635 }
11636 CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
11637 numLanes(simdMode),
11638 m_destination->GetType(),
11639 false,
11640 pInstVar->IsUniform(),
11641 m_destination->getName());
11642
11643 m_encoder->SetSimdSize(simdMode);
11644 m_encoder->AddrAdd(pDstArrElm, dst, pOffset2);
11645 m_encoder->Push();
11646
11647 // Handle the case when the index is non-uniform - we need to lookup a different value
11648 // for each simd lane.
11649 // Since HW doesn't support scattered GRF writes, we need to simulate
11650 // scattered write by a sequence of instructions, each one writing to a single simd-lane.
11651 for (uint lane = 0; lane < numLanes(simdMode); ++lane)
11652 {
11653 uint position = lane + i * 16;
11654 // a write to a uniform vector uses no mask and no predicate
11655 if (!pInstVar->IsUniform())
11656 {
11657 CVariable* immMask = m_currShader->ImmToVariable(1ULL << lane, ISA_TYPE_UD);
11658 CVariable* dstPred = m_currShader->GetNewVariable(
11659 numLanes(m_SimdMode),
11660 ISA_TYPE_BOOL,
11661 EALIGN_BYTE,
11662 CName::NONE);
11663
11664 m_encoder->SetSimdSize(simdMode);
11665 m_encoder->SetP(dstPred, immMask);
11666 m_encoder->Push();
11667 m_encoder->SetPredicate(dstPred);
11668 }
11669 if (!pElemVar->IsUniform())
11670 {
11671 m_encoder->SetSrcSubReg(0, position);
11672 }
11673 m_encoder->SetSrcRegion(0, 0, 1, 0);
11674 m_encoder->SetDstSubReg(lane);
11675 if (bWAMultiGRF)
11676 {
11677 m_encoder->SetMask((lane / 8) % 2 ? EMASK_Q2 : EMASK_Q1);
11678 if (execSizeNew == SIMDMode::SIMD4)
11679 {
11680 m_encoder->SetSecondNibble((lane / 4) % 2 ? true : false);
11681 }
11682 m_encoder->SetSimdSize(execSizeNew);
11683 }
11684 else if (pInstVar->IsUniform())
11685 {
11686 m_encoder->SetSimdSize(SIMDMode::SIMD1);
11687 m_encoder->SetNoMask();
11688 }
11689 else
11690 {
11691 m_encoder->SetSimdSize(simdMode);
11692 }
11693 m_encoder->Copy(pDstArrElm, pElemVar);
11694 m_encoder->Push();
11695 m_encoder->SetSecondNibble(false);
11696 }
11697 }
11698 }
11699 }
11700 }
11701
11702 void EmitPass::emitBranch(llvm::BranchInst* branch, const SSource& cond, e_predMode predMode)
11703 {
11704 llvm::BasicBlock* next = m_blockCoalescing->SkipEmptyBasicBlock(branch->getParent()->getNextNode());
11705 if (branch->isConditional())
11706 {
11707 CVariable* flag = GetSrcVariable(cond);
11708 bool inversePred = cond.mod == EMOD_NOT;
11709 // if it is not a fallthrough
11710 BasicBlock* succ0 = m_blockCoalescing->FollowEmptyBlock(branch->getSuccessor(0));
11711 BasicBlock* succ1 = m_blockCoalescing->FollowEmptyBlock(branch->getSuccessor(1));
11712 uint label0 = m_pattern->GetBlockId(succ0);
11713 uint label1 = m_pattern->GetBlockId(succ1);
11714
11715 m_encoder->SetPredicateMode(predMode);
11716 m_encoder->SetInversePredicate(inversePred);
11717
11718 if (next == NULL || (next != succ0 && next != succ1))
11719 {
11720 // Neither succ0 nor succ1 is next. Thus, we need one conditional jump and
11721 // one unconditional jump. There are three cases for selecting the target
11722 // of the conditional jump:
11723 // 1. both are backward: select the one with the larger ID (closer to the branch) as target
11724 // L0:
11725 // ....
11726 // L1:
11727 // ...
11728 // [+-flag] goto L1
11729 // goto L0
11730 //
11731 // 2. both are forward: select the one with the larger ID (farther from the branch) as target
11732 // [+- flag] goto L1
11733 // goto L0
11734 // ...
11735 // L0:
11736 // ......
11737 // L1:
11738 //
11739 // 3. one is backward and one is forward: select the backward one as target.
11740 //
11741 uint label = m_pattern->GetBlockId(branch->getParent());
11742 uint condTarget, uncondTarget;
11743 if ((label0 <= label && label1 <= label) || (label0 > label && label1 > label))
11744 {
11745 // case 1 & 2
11746 condTarget = (label0 < label1) ? label1 : label0;
11747 uncondTarget = (label0 < label1) ? label0 : label1;
11748 }
11749 else
11750 {
11751 // case 3
11752 condTarget = (label0 <= label) ? label0 : label1;
11753 uncondTarget = (label0 <= label) ? label1 : label0;
11754 }
11755
11756 if (condTarget == uncondTarget)
11757 { // sanity check. label0 == label1 (we don't expect it, but it's legal)
11758 m_encoder->Jump(condTarget);
11759 m_encoder->Push();
11760 }
11761 else
11762 {
11763 if (condTarget != label0)
11764 {
11765 m_encoder->SetInversePredicate(!inversePred);
11766 }
11767 m_encoder->Jump(flag, condTarget);
11768 m_encoder->Push();
11769
11770 m_encoder->Jump(uncondTarget);
11771 m_encoder->Push();
11772 }
11773 }
11774 else if (next != succ0)
11775 {
11776 IGC_ASSERT_MESSAGE(next == succ1, "next should be succ1!");
11777
11778 m_encoder->Jump(flag, label0);
11779 m_encoder->Push();
11780 }
11781 else
11782 {
11783 IGC_ASSERT_MESSAGE(next == succ0, "next should be succ0");
11784
11785 m_encoder->SetInversePredicate(!inversePred);
11786 m_encoder->Jump(flag, label1);
11787 m_encoder->Push();
11788 }
11789 }
11790 else
11791 {
11792 BasicBlock* succ = m_blockCoalescing->FollowEmptyBlock(branch->getSuccessor(0));
11793 if ((next == NULL) || (next != succ))
11794 {
11795 uint label = m_pattern->GetBlockId(succ);
11796 m_encoder->Jump(label);
11797 m_encoder->Push();
11798 }
11799 }
11800 }
11801
11802 void EmitPass::emitDiscardBranch(
11803 BranchInst* branch, const SSource& cond)
11804 {
11805 if (m_pattern->NeedVMask())
11806 {
11807 emitBranch(branch, cond, EPRED_ALL);
11808 }
11809 else
11810 {
11811 emitBranch(branch, cond, EPRED_NORMAL);
11812 }
11813 }
11814
11815 void EmitPass::SplitSIMD(llvm::Instruction* inst, uint numSources, uint headerSize, CVariable* payload, SIMDMode mode, uint half)
11816 {
11817 for (uint i = 0; i < numSources; ++i)
11818 {
11819 const unsigned int GRFSizeBy4 = (getGRFSize() >> 2);
11820 IGC_ASSERT(GRFSizeBy4);
11821
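// GRFSizeBy4 is the number of dwords per GRF, so numLanes(mode) / GRFSizeBy4
// is the number of GRFs one SIMD-sized dword operand occupies; e.g. (assuming
// SIMD8, a 32-byte GRF, and dword payload elements) 8 / 8 = 1, giving
// subVarIdx = i + headerSize.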
11822 uint subVarIdx = numLanes(mode) / GRFSizeBy4 * i + headerSize;
11823
11824 CVariable* rawDst = payload;
11825 CVariable* src = GetSymbol(inst->getOperand(i));
11826 // The source type has to match the payload type for a raw copy
11827 if (src->GetType() != payload->GetType())
11828 {
11829 rawDst = m_currShader->BitCast(payload, src->GetType());
11830 }
11831 m_encoder->SetSimdSize(mode);
11832 m_encoder->SetDstSubVar(subVarIdx);
11833 m_encoder->SetSrcSubVar(0, half);
11834 m_encoder->SetMask(half == 0 ? EMASK_Q1 : EMASK_Q2);
11835 m_encoder->Copy(rawDst, src);
11836 m_encoder->Push();
11837 }
11838 }
11839
11840 template<size_t N>
11841 void EmitPass::JoinSIMD(CVariable* (&tempdst)[N], uint responseLength, SIMDMode mode)
11842 {
11843 auto origMode = mode == SIMDMode::SIMD8 ? SIMDMode::SIMD16 : SIMDMode::SIMD32;
11844 uint iterationCount = numLanes(m_currShader->m_SIMDSize) / numLanes(mode);
11845 for (uint half = 0; half < iterationCount; half++)
11846 {
11847 for (uint i = 0; i < responseLength; ++i)
11848 {
11849 const unsigned int GRFSizeBy4 = (getGRFSize() >> 2);
11850 IGC_ASSERT(GRFSizeBy4);
11851 m_encoder->SetSimdSize(mode);
11852 const unsigned int subVarIdx = numLanes(origMode) / GRFSizeBy4 * i;
11853 m_encoder->SetSrcSubVar(0, i);
11854 m_encoder->SetDstSubVar(subVarIdx + half);
11855 m_encoder->SetMask(half == 0 ? (mode == SIMDMode::SIMD8 ? EMASK_Q1 : EMASK_H1) :
11856 (mode == SIMDMode::SIMD8 ? EMASK_Q2 : EMASK_H2));
11857 IGC_ASSERT(half < ARRAY_COUNT(tempdst));
11858 m_encoder->Copy(m_destination, tempdst[half]);
11859 m_encoder->Push();
11860 }
11861 }
11862 }
11863
11864 CVariable* EmitPass::BroadcastIfUniform(CVariable* pVar, bool nomask)
11865 {
11866 IGC_ASSERT_MESSAGE(nullptr != pVar, "pVar is null");
11867 VISA_Type VarT = pVar->GetType();
11868 bool Need64BitEmu = m_currShader->m_Platform->hasNoFullI64Support() &&
11869 (VarT == ISA_TYPE_Q || VarT == ISA_TYPE_UQ);
11870 bool IsImm = pVar->IsImmediate();
11871 if (pVar->IsUniform())
11872 {
11873 uint32_t width = numLanes(m_currShader->m_SIMDSize);
11874 uint elts = IsImm ? 1 : pVar->GetNumberElement();
11875 CVariable* pBroadcast =
11876 m_currShader->GetNewVariable(elts * width, pVar->GetType(),
11877 EALIGN_GRF, CName(pVar->getName(), "Broadcast"));
11878 CVariable* Dst = pBroadcast;
11879 CVariable* Src = pVar;
11880 CVariable* ImmLo = nullptr, * ImmHi = nullptr;
11881 unsigned Stride = 1;
11882 if (Need64BitEmu) {
11883 Dst = m_currShader->GetNewAlias(pBroadcast, ISA_TYPE_UD, 0, 0);
11884 if (IsImm) {
11885 uint64_t Imm = pVar->GetImmediateValue();
11886 ImmLo = m_currShader->ImmToVariable(Imm & 0xFFFFFFFFULL, ISA_TYPE_UD);
11887 ImmHi = m_currShader->ImmToVariable(Imm >> 32, ISA_TYPE_UD);
11888 }
11889 else {
11890 Src = m_currShader->GetNewAlias(pVar, ISA_TYPE_UD, 0, 0);
11891 }
11892 Stride = 2;
11893 }
11894
11895 for (uint i = 0; i < elts; ++i)
11896 {
11897 if (nomask)
11898 m_encoder->SetNoMask();
11899 m_encoder->SetSrcSubReg(0, i * Stride);
11900 if (Stride != 1) m_encoder->SetDstRegion(Stride);
11901 m_encoder->SetDstSubReg((i * Stride) * width);
11902 m_encoder->Copy(Dst, ImmLo ? ImmLo : Src);
11903 m_encoder->Push();
11904 if (Need64BitEmu) {
11905 if (nomask)
11906 m_encoder->SetNoMask();
11907 m_encoder->SetSrcSubReg(0, i * Stride + 1);
11908 if (Stride != 1) m_encoder->SetDstRegion(Stride);
11909 m_encoder->SetDstSubReg((i * Stride) * width + 1);
11910 m_encoder->Copy(Dst, ImmHi ? ImmHi : Src);
11911 m_encoder->Push();
11912 }
11913 }
11914
11915 pVar = pBroadcast;
11916 }
11917
11918 return pVar;
11919 }
11920
11921 // Get either the 1st or 2nd half of the execution mask, based on whether IsSecondHalf() is set.
11922 // Note that for SIMD32 kernels we always return a UD with one half zeroed out.
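// E.g. (assuming the second half of a SIMD32 dispatch) only bits 16..31 of the
// returned UD survive, i.e. mask & 0xFFFF0000.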
11923 CVariable* EmitPass::GetHalfExecutionMask()
11924 {
11925 auto& currBlock = getCurrentBlock();
11926 if (!currBlock.m_activeMask)
11927 {
11928 bool isSecondHalf = m_encoder->IsSecondHalf();
11929 bool isSubSpanDst = m_encoder->IsSubSpanDestination();
11930 m_encoder->SetSecondHalf(false);
11931 m_encoder->SetSubSpanDestination(false);
11932 CVariable* flag = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
11933 CVariable* dummyVar = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, CName::NONE);
11934 m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11935 m_encoder->Push();
11936
11937 if (m_currShader->m_dispatchSize > SIMDMode::SIMD16)
11938 {
11939 m_encoder->SetSecondHalf(true);
11940 m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11941 m_encoder->Push();
11942 }
11943 m_encoder->SetSecondHalf(isSecondHalf);
11944 m_encoder->SetSubSpanDestination(isSubSpanDst);
11945 currBlock.m_activeMask = flag;
11946 }
11947
11948 VISA_Type maskType = m_currShader->m_dispatchSize > SIMDMode::SIMD16 ? ISA_TYPE_UD : ISA_TYPE_UW;
11949 CVariable* eMask = m_currShader->GetNewVariable(1, maskType, EALIGN_DWORD, true, CName::NONE);
11950 m_encoder->SetNoMask();
11951 m_encoder->Cast(eMask, currBlock.m_activeMask);
11952 m_encoder->Push();
11953
11954 // for SIMD32, clear out the other half
11955 if (maskType == ISA_TYPE_UD)
11956 {
11957 CVariable* halfMask = m_currShader->GetNewVariable(1, maskType, EALIGN_DWORD, true, CName::NONE);
11958 m_encoder->SetNoMask();
11959 m_encoder->And(halfMask, eMask, m_currShader->ImmToVariable(m_encoder->IsSecondHalf() ? 0xFFFF0000 : 0xFFFF, ISA_TYPE_UD));
11960 m_encoder->Push();
11961 return halfMask;
11962 }
11963
11964 return eMask;
11965 }
11966
11967 CVariable* EmitPass::GetExecutionMask(CVariable*& vecMaskVar)
11968 {
11969 bool isSecondHalf = m_encoder->IsSecondHalf();
11970 bool isSubSpanDst = m_encoder->IsSubSpanDestination();
11971 m_encoder->SetSecondHalf(false);
11972 m_encoder->SetSubSpanDestination(false);
11973 CVariable* flag = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
11974
11975 CVariable* dummyVar = m_currShader->GetNewVariable(1, ISA_TYPE_UW, EALIGN_WORD, true, CName::NONE);
11976 m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11977 m_encoder->Push();
11978
11979 if (m_currShader->m_dispatchSize > SIMDMode::SIMD16 && m_currShader->m_SIMDSize != SIMDMode::SIMD32)
11980 {
11981 m_encoder->SetSecondHalf(true);
11982 m_encoder->Cmp(EPREDICATE_EQ, flag, dummyVar, dummyVar);
11983 m_encoder->Push();
11984 }
11985 m_encoder->SetSecondHalf(isSecondHalf);
11986 m_encoder->SetSubSpanDestination(isSubSpanDst);
11987 vecMaskVar = flag;
11988
11989 VISA_Type maskType = m_currShader->m_dispatchSize > SIMDMode::SIMD16 ? ISA_TYPE_UD : ISA_TYPE_UW;
11990 CVariable* eMask = m_currShader->GetNewVariable(1, maskType, EALIGN_DWORD, true, CName::NONE);
11991 m_encoder->SetNoMask();
11992 m_encoder->Cast(eMask, flag);
11993 m_encoder->Push();
11994 return eMask;
11995 }
11996
11997 CVariable* EmitPass::GetExecutionMask()
11998 {
11999 CVariable* vecMask = nullptr;
12000 return GetExecutionMask(vecMask);
12001 }
12002
12003 /// UniformCopy - Copy a non-uniform source into a uniform variable by copying
12004 /// ANY active elements.
12005
12006 CVariable* EmitPass::UniformCopy(CVariable* var)
12007 {
12008 CVariable* offset = nullptr;
12009 CVariable* eMask = nullptr;
12010 return UniformCopy(var, offset, eMask);
12011 }
12012
12013 /// Uniform copy that allows reusing the offset calculated by a previous call.
12014 /// This avoids emitting redundant code.
12015 CVariable* EmitPass::UniformCopy(CVariable* var, CVariable*& off, CVariable* eMask, bool doSub)
12016 {
12017 IGC_ASSERT_MESSAGE(!var->IsUniform(), "Expect non-uniform source!");
12018
12019 if (eMask == nullptr)
12020 {
12021 eMask = GetExecutionMask();
12022 }
12023 if (off == nullptr)
12024 {
12025 // Get the offset of any set bit. For simplicity, use 'fbl' to find the lowest set bit.
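// Worked example (assumed mask value): eMask = 0b...01101000 -> fbl returns 3,
// i.e. lane 3 is the first active lane; for a dword variable the shift below
// turns that into the byte offset 3 << 2 = 12.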
12026 off = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
12027 if (doSub && m_encoder->IsSecondHalf())
12028 {
12029 // Here our eMask is UD but we only want the upper 16 bits,
12030 // so use a UW alias to the high 16 bits instead.
12031 auto uwMask = m_currShader->GetNewAlias(eMask, ISA_TYPE_UW, 2, 1);
12032 m_encoder->Fbl(off, uwMask);
12033 }
12034 else
12035 {
12036 m_encoder->Fbl(off, eMask);
12037 }
12038 m_encoder->Push();
12039
12040 // Calculate byte offset
12041 CVariable* shAmt = nullptr;
12042 switch (var->GetElemSize()) {
12043 case 1:
12044 // No need to shift.
12045 break;
12046 case 2:
12047 shAmt = m_currShader->ImmToVariable(1, ISA_TYPE_W);
12048 break;
12049 case 4:
12050 shAmt = m_currShader->ImmToVariable(2, ISA_TYPE_W);
12051 break;
12052 case 8:
12053 shAmt = m_currShader->ImmToVariable(3, ISA_TYPE_W);
12054 break;
12055 default:
12056 IGC_ASSERT_MESSAGE(0, "Unsupported element size!");
12057 break;
12058 }
12059 if (shAmt) {
12060 m_encoder->Shl(off, off, shAmt);
12061 m_encoder->Push();
12062 }
12063 off = m_currShader->BitCast(off, ISA_TYPE_UW);
12064 }
12065 // Calculate the address of that active lane.
12066 CVariable* addr =
12067 m_currShader->GetNewAddressVariable(1, var->GetType(), true, true, var->getName());
12068
12069 // Now, we need to jump through a few hoops for SIMD32, since the variables
12070 // representing all of the SIMD lanes may not be consecutive.
12071 uint8_t numInstances = var->GetNumberInstance();
12072
12073 if (numInstances == 2)
12074 {
12075 uint16_t numElements = var->GetNumberElement();
12076 VISA_Type dataType = var->GetType();
12077
12078 // Create a variable into which we'll merge both instances of the original variable,
12079 // and an alias into the upper half.
12080 CVariable* merged = m_currShader->GetNewVariable(numElements * numInstances,
12081 dataType, var->GetAlign(), false, 1, CName(var->getName(), "Merged"));
12082 CVariable* upperMerged = m_currShader->GetNewAlias(merged, dataType,
12083 numElements * m_encoder->GetCISADataTypeSize(dataType), numElements);
12084
12085 // Now, do the copies.
12086 bool isSecondHalf = m_encoder->IsSecondHalf();
12087
12088 m_encoder->SetSecondHalf(false);
12089 m_encoder->Copy(merged, var);
12090 m_encoder->Push();
12091
12092 m_encoder->SetSecondHalf(true);
12093 m_encoder->Copy(upperMerged, var);
12094 m_encoder->Push();
12095
12096 m_encoder->SetSecondHalf(false);
12097 m_encoder->AddrAdd(addr, merged, off);
12098 m_encoder->Push();
12099 m_encoder->SetSecondHalf(isSecondHalf);
12100 }
12101 else
12102 {
12103 m_encoder->AddrAdd(addr, var, off);
12104 m_encoder->Push();
12105 }
12106
12107 // Indirect access to that active scalar register.
12108 CVariable* exVal = m_currShader->GetNewVariable(
12109 1, var->GetType(), CEncoder::GetCISADataTypeAlignment(var->GetType()), true, CName::NONE);
12110 m_encoder->Copy(exVal, addr);
12111
12112 return exVal;
12113 }
12114
12115 CVariable* EmitPass::ExtendVariable(CVariable* pVar, e_alignment uniformAlign) {
12116 if (pVar->GetElemSize() >= 4) {
12117 // There's no need to extend the operand. But if the variable holding
12118 // a uniform value is not aligned to a GRF, an additional copy is required
12119 // to align it for SIMD1 gather/scatter.
12120 if (!pVar->IsUniform())
12121 return pVar;
12122 if (!pVar->IsImmediate() && IsGRFAligned(pVar, EALIGN_GRF))
12123 return pVar;
12124 // Otherwise, we need to re-align the variable holding that uniform value.
12125 }
12126
12127 VISA_Type NewType = ISA_TYPE_UD;
12128 if (pVar->GetElemSize() > 4)
12129 NewType = ISA_TYPE_UQ;
12130
12131 // Cast to extend and/or re-align the variable.
12132 CVariable* NewVar = 0;
12133 if (pVar->IsUniform()) {
12134 NewVar = m_currShader->GetNewVariable(1, NewType, uniformAlign, true, pVar->getName());
12135 }
12136 else {
12137 NewVar = m_currShader->GetNewVariable(
12138 numLanes(m_currShader->m_SIMDSize), NewType, EALIGN_GRF, pVar->getName());
12139 }
12140
12141 if (pVar->IsImmediate()) {
12142 pVar =
12143 m_currShader->ImmToVariable(
12144 pVar->GetImmediateValue(),
12145 GetUnsignedIntegerType(pVar->GetType()));
12146 }
12147 else {
12148 pVar =
12149 m_currShader->GetNewAlias(
12150 pVar, GetUnsignedIntegerType(pVar->GetType()), 0, 0);
12151 }
12152
12153 m_encoder->Cast(NewVar, pVar);
12154 m_encoder->Push();
12155 return NewVar;
12156 }
12157
12158 CVariable* EmitPass::BroadcastAndExtend(CVariable* pVar)
12159 {
12160 VISA_Type varType = pVar->GetType();
12161 const int typeSize = CEncoder::GetCISADataTypeSize(varType);
12162
12163 if (!pVar->IsUniform() && typeSize >= 4)
12164 {
12165 return pVar;
12166 }
12167
12168 if (pVar->IsImmediate())
12169 {
12170 pVar = m_currShader->ImmToVariable(
12171 pVar->GetImmediateValue(),
12172 GetUnsignedIntegerType(pVar->GetType()));
12173 }
12174 else
12175 {
12176 pVar = m_currShader->GetNewAlias(pVar, GetUnsignedIntegerType(pVar->GetType()), 0, 0);
12177 }
12178
12179 const VISA_Type broadcastType = typeSize == 8 ? ISA_TYPE_UQ : ISA_TYPE_UD;
12180
12181 CVariable* pBroadcast = m_currShader->GetNewVariable(
12182 numLanes(m_currShader->m_SIMDSize),
12183 broadcastType,
12184 EALIGN_GRF,
12185 CName(pVar->getName(), "Broadcast"));
12186
12187 m_encoder->Cast(pBroadcast, pVar);
12188 m_encoder->Push();
12189
12190 return pBroadcast;
12191 }
12192
12193 CVariable* EmitPass::TruncatePointer(CVariable* pVar) {
12194 // TruncatePointer is used to prepare pointers for A32 and A64
12195 // messages, and in stateful loads and stores to prepare the
12196 // offset value.
12197 // For stateless messages the pointer data type can only be 32 or 64 bits wide.
12198 // For stateful messages the offset data type can be 8, 16, 32 or 64 bits wide.
12199
12200 // 32-bit integer
12201 if (pVar->GetElemSize() == 4) {
12202 if (!pVar->IsUniform())
12203 return pVar;
12204 // For uniform variable, we need to re-align to GRF to ensure it's
12205 // placed at the 1st element.
12206 if (!pVar->IsImmediate() && IsGRFAligned(pVar, EALIGN_GRF))
12207 return pVar;
12208 // Re-align the container of the pointer.
12209 }
12210
12211 // Cast to truncate and/or re-align the variable.
12212 CVariable* NewVar = 0;
12213 if (pVar->IsUniform()) {
12214 NewVar = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName(pVar->getName(), "Trunc"));
12215 }
12216 else {
12217 NewVar = m_currShader->GetNewVariable(
12218 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName(pVar->getName(), "Trunc"));
12219 }
12220 m_encoder->Cast(NewVar, pVar);
12221 m_encoder->Push();
12222
12223 return NewVar;
12224 }
12225
12226 CVariable* EmitPass::ReAlignUniformVariable(CVariable* pVar, e_alignment align) {
12227 if (!pVar->IsUniform())
12228 return pVar;
12229
12230 if (!pVar->IsImmediate() && IsGRFAligned(pVar, EALIGN_GRF))
12231 return pVar;
12232
12233 CVariable* NewVar = m_currShader->GetNewVariable(
12234 1, pVar->GetType(), align, true, pVar->getName());
12235
12236 m_encoder->Cast(NewVar, pVar);
12237 m_encoder->Push();
12238
12239 return NewVar;
12240 }
12241
12242 CVariable* EmitPass::BroadcastAndTruncPointer(CVariable* pVar)
12243 {
12244 if (pVar->GetElemSize() == 8)
12245 {
12246 // If the pointer is 64-bit, trunc it to 32-bit.
12247 // Note that we don't care if the pointer is uniform or not,
12248 // if it's uniform the trunc will also broadcast.
12249 CVariable* pTrunc = m_currShader->GetNewVariable(
12250 numLanes(m_currShader->m_SIMDSize),
12251 ISA_TYPE_UD,
12252 m_currShader->getGRFAlignment(),
12253 CName(pVar->getName(),"Broadcast64b"));
12254
12255 m_encoder->Cast(pTrunc, pVar);
12256 m_encoder->Push();
12257 pVar = pTrunc;
12258 }
12259 else
12260 {
12261 pVar = BroadcastIfUniform(pVar);
12262 }
12263
12264 return pVar;
12265 }
12266
12267 // Method used to emit reads from GS SGV variables that are not per-vertex.
12268 // Only two cases exist: PrimitiveID, GSInstanceID
12269 void EmitPass::emitGS_SGV(SGVIntrinsic* pInst)
12270 {
12271 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::GEOMETRY_SHADER);
12272 CGeometryShader* gsProgram = static_cast<CGeometryShader*>(m_currShader);
12273 switch (pInst->getUsage())
12274 {
12275 case PRIMITIVEID:
12276 {
12277 CVariable* pPrimitiveID = gsProgram->GetPrimitiveID();
12278 m_currShader->CopyVariable(m_destination, pPrimitiveID);
12279 break;
12280 }
12281 case GS_INSTANCEID:
12282 {
12283 CVariable* pInstanceID = gsProgram->GetInstanceID();
12284 IGC_ASSERT(pInstanceID != nullptr);
12285 m_currShader->CopyVariable(m_destination, pInstanceID);
12286 break;
12287 }
12288 default:
12289 IGC_ASSERT_MESSAGE(0, "This should not happen after lowering to URB reads.");
12290 }
12291 }
12292
12293 void EmitPass::emitSampleOffset(GenIntrinsicInst* inst)
12294 {
12295 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
12296 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
12297 CVariable* offsets = nullptr;
12298 if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_SampleOffsetX)
12299 {
12300 offsets = psProgram->GetSampleOffsetX();
12301 }
12302 else
12303 {
12304 offsets = psProgram->GetSampleOffsetY();
12305 }
12306
12307 CVariable* pDstArrElm = nullptr;
12308
12309 CVariable* index = GetSymbol(inst->getOperand(0));
12310
12311 CVariable* pIndexVar = m_currShader->BitCast(index, ISA_TYPE_UW);
12312
12313 {
12314 pDstArrElm = m_currShader->GetNewAddressVariable(
12315 numLanes(m_currShader->m_SIMDSize),
12316 offsets->GetType(),
12317 false,
12318 true,
12319 offsets->getName());
12320
12321 if (!pIndexVar->IsUniform())
12322 {
12323 m_encoder->SetSrcRegion(1, 16, 8, 2);
12324 }
12325
12326 m_encoder->AddrAdd(pDstArrElm, offsets, pIndexVar);
12327 m_encoder->Push();
12328 }
12329
12330 m_encoder->Cast(m_destination, pDstArrElm);
12331 m_encoder->Push();
12332
12333 }
12334
12335 // Copy identity value to dst with no mask, then src to dst with mask. Notes:
12336 // * dst may be nullptr - it will be created then
12337 // * actual second half setting is preserved
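// The identity value depends on the op; e.g. a caller would pass 0 for an
// integer add and the type's maximum for a min reduction, so the lanes left
// at <identity> do not perturb the result.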
12338 CVariable* EmitPass::ScanReducePrepareSrc(VISA_Type type, uint64_t identityValue, bool negate, bool secondHalf,
12339 CVariable* src, CVariable* dst, CVariable* flag)
12340 {
12341 if (!dst)
12342 {
12343 dst = m_currShader->GetNewVariable(
12344 numLanes(m_currShader->m_SIMDSize),
12345 type,
12346 EALIGN_GRF,
12347 false,
12348 src->getName());
12349 }
12350 else
12351 {
12352 IGC_ASSERT(0 < dst->GetElemSize());
12353 IGC_ASSERT(numLanes(m_currShader->m_SIMDSize) == (dst->GetSize() / dst->GetElemSize()));
12354 IGC_ASSERT(dst->GetType() == type);
12355 IGC_ASSERT(dst->GetAlign() == EALIGN_GRF);
12356 IGC_ASSERT(!dst->IsUniform());
12357 }
12358
12359 IGC_ASSERT(nullptr != m_encoder);
12360
12361 const bool savedSecondHalf = m_encoder->IsSecondHalf();
12362 m_encoder->SetSecondHalf(secondHalf);
12363
12364 // Set the GRF to <identity> with no mask. This will set all the registers to <identity>
12365 CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
12366 m_encoder->SetNoMask();
12367 m_encoder->Copy(dst, pIdentityValue);
12368 m_encoder->Push();
12369
12370 // Now copy the src with a mask so the disabled lanes still keep their <identity>
12371 if (negate)
12372 {
12373 m_encoder->SetSrcModifier(0, EMOD_NEG);
12374 }
12375 if (flag)
12376 {
12377 m_encoder->SetPredicate(flag);
12378 }
12379 m_encoder->Copy(dst, src);
12380 m_encoder->Push();
12381
12382 m_encoder->SetSecondHalf(savedSecondHalf);
12383
12384 return dst;
12385 }
12386
12387 // Reduction all reduce helper: dst_lane{k} = src_lane{simd + k} OP src_lane{k}, k = 0..(simd-1)
12388 CVariable* EmitPass::ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, CVariable* src)
12389 {
12390 const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12391 CEncoder::GetCISADataTypeSize(type) == 8);
12392 const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12393 const auto alignment = is64bitType ? IGC::EALIGN_QWORD : IGC::EALIGN_DWORD;
12394 CVariable* previousTemp = src;
12395 CVariable* temp = m_currShader->GetNewVariable(
12396 numLanes(simd),
12397 type,
12398 alignment,
12399 false,
12400 CName::NONE);
12401
12402 if (isInt64Mul)
12403 {
12404 m_encoder->SetSimdSize(simd);
12405 m_encoder->SetNoMask();
12406 m_encoder->SetSrcSubReg(0, numLanes(simd));
12407 m_encoder->Copy(temp, previousTemp);
12408 m_encoder->Push();
12409 CVariable* pMulSrc[2] = { previousTemp, temp };
12410 Mul64(temp, pMulSrc, simd, true /*noMask*/);
12411 }
12412 else
12413 {
12414 m_encoder->SetNoMask();
12415 m_encoder->SetSimdSize(simd);
12416 m_encoder->SetSrcSubReg(1, numLanes(simd));
12417 m_encoder->GenericAlu(op, temp, previousTemp, previousTemp);
12418 m_encoder->Push();
12419 }
12420 return temp;
12421 }
12422
12423 // Reduction all expand helper: dst_lane{0..(simd-1)} = src_lane{0} OP src_lane{1}
12424 void EmitPass::ReductionExpandHelper(e_opcode op, VISA_Type type, CVariable* src, CVariable* dst)
12425 {
12426 const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12427 CEncoder::GetCISADataTypeSize(type) == 8);
12428
12429 if (isInt64Mul)
12430 {
12431 CVariable* tmpMulSrc[2] = {};
12432 tmpMulSrc[0] = m_currShader->GetNewAlias(src, type, 0, 1, true);
12433 tmpMulSrc[1] = m_currShader->GetNewAlias(src, type, sizeof(QWORD), 1, true);
12434 Mul64(dst, tmpMulSrc, m_currShader->m_SIMDSize, false /*noMask*/);
12435 }
12436 else
12437 {
12438 m_encoder->SetSrcSubReg(1, 1);
12439 m_encoder->SetSrcRegion(0, 0, 1, 0);
12440 m_encoder->SetSrcRegion(1, 0, 1, 0);
12441 m_encoder->GenericAlu(op, dst, src, src);
12442 m_encoder->Push();
12443 }
12444 }
12445
12446 // Reduction clustered: rearrange src by copying src data elements from even subregisters
12447 // to adjacent subregisters of a new variable. Then do the same for odd src subregisters.
12448 // Rearranged src is a pair of the new variables.
12449 // Notes:
12450 // * numLanes refers to the number of elements of each of the new variables (same as the dst variable used for reduction)
12451 // * numInst cannot be deduced from numLanes and type
12452 // * second half setting is not preserved by this function
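// E.g. src = {a0, a1, a2, a3, ...} is rearranged into pSrc[0] = {a0, a2, ...}
// (even subregisters) and pSrc[1] = {a1, a3, ...} (odd subregisters), so each
// operand pair ends up at the same subregister of two different variables.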
12453 void EmitPass::ReductionClusteredSrcHelper(CVariable* (&pSrc)[2], CVariable* src, uint16_t numLanes,
12454 VISA_Type type, uint numInst, bool secondHalf)
12455 {
12456 const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12457 const auto alignment = is64bitType ? IGC::EALIGN_QWORD : IGC::EALIGN_DWORD;
12458
12459 pSrc[0] = m_currShader->GetNewVariable(
12460 numLanes,
12461 type,
12462 alignment,
12463 false, CName::NONE);
12464 pSrc[1] = m_currShader->GetNewVariable(pSrc[0]);
12465 IGC_ASSERT(pSrc[0]);
12466 IGC_ASSERT(pSrc[1]);
12467
12468 CVariable* srcTmp = src;
12469 CVariable* pSrcTmp[2] = { pSrc[0], pSrc[1] };
12470
12471 IGC_ASSERT(nullptr != m_encoder);
12472 m_encoder->SetSecondHalf(secondHalf);
12473 for (uint i = 0; i < numInst; ++i)
12474 {
12475 const e_mask mask = secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12476
12477 for (uint j = 0; j < 2; ++j)
12478 {
12479 IGC_ASSERT(numInst);
12480 m_encoder->SetSimdSize(lanesToSIMDMode(numLanes / numInst));
12481 m_encoder->SetNoMask();
12482 m_encoder->SetMask(mask);
12483 m_encoder->SetSrcRegion(0, 2, 1, 0);
12484 m_encoder->SetSrcSubReg(0, j);
12485 m_encoder->SetSrcSubVar(0, 2 * i);
12486 m_encoder->SetDstSubVar(i);
12487 m_encoder->Copy(pSrcTmp[j], srcTmp);
12488 m_encoder->Push();
12489 }
12490 }
12491 m_encoder->SetSecondHalf(false);
12492 }
12493
12494 // Reduction clustered reduce helper: dst_lane{k} = src_lane{2k} OP src_lane{2k+1}, k = 0..(simd-1)
12495 // For certain opcodes src must be rearranged, to move operation's arguments to the same subreg of different regs.
12496 // Notes:
12497 // * simd is SIMD mode after reduction
12498 // * second half setting is not preserved by this function
12499 // * src and dst may be the same variable
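// E.g. (with op == add) one SIMD4 step turns src = {a, b, c, d, e, f, g, h}
// into dst = {a+b, c+d, e+f, g+h}.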
12500 CVariable* EmitPass::ReductionClusteredReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, bool secondHalf,
12501 CVariable* src, CVariable* dst)
12502 {
12503 const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12504 const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12505 CEncoder::GetCISADataTypeSize(type) == 8);
12506 const uint numInst = is64bitType && simd == (getGRFSize() > 32 ? SIMDMode::SIMD16 : SIMDMode::SIMD8) ? 2 : 1;
12507
12508 IGC_ASSERT(simd == SIMDMode::SIMD2 || simd == SIMDMode::SIMD4 || simd == SIMDMode::SIMD8 || (simd == SIMDMode::SIMD16 && getGRFSize() > 32));
12509
12510 // The op is performed on pairs of adjacent src data elements.
12511 // In certain cases it is mandatory or might be beneficial for performance reasons
12512 // to ensure that for each such pair the src data elements are in separate GRFs
12513 // and that their regioning patterns match.
12514 bool isRearrangementRequired = isInt64Mul;
12515 if (isRearrangementRequired)
12516 {
12517 // Rearrange src
12518 CVariable* pSrc[2] = {};
12519 ReductionClusteredSrcHelper(pSrc, src, numLanes(simd), type, numInst, secondHalf);
12520
12521 // Perform reduction with op
12522 m_encoder->SetSecondHalf(secondHalf);
12523 if (isInt64Mul)
12524 {
12525 Mul64(dst, pSrc, simd, true /*noMask*/);
12526 }
12527 else
12528 {
12529 m_encoder->SetSimdSize(simd);
12530 m_encoder->SetNoMask();
12531 m_encoder->GenericAlu(op, dst, pSrc[0], pSrc[1]);
12532 m_encoder->Push();
12533 }
12534 m_encoder->SetSecondHalf(false);
12535 }
12536 else
12537 {
12538 m_encoder->SetSecondHalf(secondHalf);
12539 for (uint i = 0; i < numInst; ++i)
12540 {
12541 IGC_ASSERT(numInst);
12542 m_encoder->SetSimdSize(lanesToSIMDMode(numLanes(simd) / numInst));
12543 m_encoder->SetNoMask();
12544 const e_mask mask = secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12545 m_encoder->SetMask(mask);
12546 m_encoder->SetSrcRegion(0, 2, 1, 0);
12547 m_encoder->SetSrcSubVar(0, 2 * i);
12548 m_encoder->SetSrcSubReg(0, 0);
12549 m_encoder->SetSrcRegion(1, 2, 1, 0);
12550 m_encoder->SetSrcSubVar(1, 2 * i);
12551 m_encoder->SetSrcSubReg(1, 1);
12552 m_encoder->SetDstSubVar(i);
12553 m_encoder->GenericAlu(op, dst, src, src);
12554 m_encoder->Push();
12555 }
12556 m_encoder->SetSecondHalf(false);
12557 }
12558
12559 return dst;
12560 }
12561
12562 // Final reduction and expansion clustered expand helper: for each cluster reduce one pair of values to one value,
12563 // and broadcast it to the whole cluster.
12564 // For certain opcodes the src must be rearranged, to keep operation's arguments in the same subreg of different regs.
12565 // Notes:
12566 // * simd is shader's SIMD size
12567 // * second half setting is not preserved by this function
12568 // * src and dst may be the same variable
12569 void EmitPass::ReductionClusteredExpandHelper(e_opcode op, VISA_Type type, SIMDMode simd, const uint clusterSize,
12570 bool secondHalf, CVariable* src, CVariable* dst)
12571 {
12572 const bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
12573 const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12574 CEncoder::GetCISADataTypeSize(type) == 8);
12575 const uint numInst = is64bitType && simd == (getGRFSize() > 32 ? SIMDMode::SIMD32 : SIMDMode::SIMD16) ? 2 : 1;
12576 IGC_ASSERT(clusterSize == 2 || clusterSize == 4 || clusterSize == 8 || clusterSize == 16);
12577 IGC_ASSERT_MESSAGE(clusterSize * CEncoder::GetCISADataTypeSize(type) <= int_cast<uint>(2 * getGRFSize()),
12578 "Will generate instructions that cross 2 GRFs boundary.");
12579
12580 // For information on rearrangement see EmitPass::ReductionClusteredReduceHelper()
12581 bool isRearrangementRequired = isInt64Mul;
12582 if (isRearrangementRequired)
12583 {
12584 // Rearrange src
12585 CVariable* pSrc[2] = {};
12586 // For src, the 2-GRF boundary may be crossed for 2-clusters only in SIMD16 for 64-bit types.
12587 const uint srcNumInst = clusterSize == 2 ? numInst : 1;
12588 IGC_ASSERT(clusterSize);
12589 ReductionClusteredSrcHelper(pSrc, src, numLanes(simd) / clusterSize, type, srcNumInst, secondHalf);
12590
12591 // Perform reduction with op
12592 CVariable* tempDst = m_currShader->GetNewVariable(dst);
12593 m_encoder->SetSecondHalf(secondHalf);
12594 IGC_ASSERT(clusterSize);
12595 const SIMDMode tmpSimd = lanesToSIMDMode(numLanes(simd) / clusterSize);
12596 if (isInt64Mul)
12597 {
12598 Mul64(tempDst, pSrc, tmpSimd, true /*noMask*/);
12599 }
12600 else
12601 {
12602 m_encoder->SetSimdSize(tmpSimd);
12603 m_encoder->SetNoMask();
12604 m_encoder->GenericAlu(op, tempDst, pSrc[0], pSrc[1]);
12605 m_encoder->Push();
12606 }
12607 m_encoder->SetSecondHalf(false);
12608
12609 // In certain cases a 64-bit move may need to be split into two 32-bit uint moves
12610 const bool use32BitMov = false;
12611
12612 // Broadcast to clusters
12613 // Example for a 4-clusters of QWORDs:
12614 // * with 64-bit MOVs:
12615 // mov (8|M8) r11.0<1>:uq r21.2<1;4,0>:uq
12616 // mov (8|M0) r35.0<1>:uq r21.0<1;4,0>:uq
12617 // * with 32-bit MOVs:
12618 // mov (8|M8) r33.0<2>:ud r21.4<2;4,0>:ud
12619 // mov (8|M8) r33.1<2>:ud r21.5<2;4,0>:ud
12620 // mov (8|M0) r31.0<2>:ud r21.0<2;4,0>:ud
12621 // mov (8|M0) r31.1<2>:ud r21.1<2;4,0>:ud
12622 m_encoder->SetSecondHalf(secondHalf);
12623 for (uint i = numInst; i-- != 0;)
12624 {
12625 const uint numMovPerElement = use32BitMov ? 2u : 1u;
12626 for (uint j = 0; j < numMovPerElement; ++j)
12627 {
12628 // Outer loop is for 64-bit types in SIMD16 only (cluster size is always <= 8)
12629 // to broadcast data to upper dst's half which crosses 2-grf boundary.
12630 // The inner is for movement splitting: one 64-bit to a pair of 32-bit.
12631 IGC_ASSERT(numInst);
12632 uint lanes = numLanes(simd) / numInst;
12633 IGC_ASSERT(clusterSize);
12634 uint clustersPerInst = lanes / clusterSize;
12635 uint srcSubReg = i * clustersPerInst * numMovPerElement + j;
12636 const e_mask mask = simd == SIMDMode::SIMD32 ? (i == 1 ? EMASK_H2 : EMASK_H1) :
12637 secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12638
12639 m_encoder->SetSimdSize(lanesToSIMDMode(lanes));
12640 m_encoder->SetMask(mask);
12641 m_encoder->SetSrcRegion(0, numMovPerElement, clusterSize, 0);
12642 m_encoder->SetSrcSubReg(0, srcSubReg);
12643 m_encoder->SetSrcSubVar(0, 0);
12644 m_encoder->SetDstRegion(numMovPerElement);
12645 m_encoder->SetDstSubReg(j);
12646 m_encoder->SetDstSubVar(2 * i);
12647
12648 CVariable* broadcastSrc = tempDst;
12649 CVariable* broadcastDst = dst;
12650 if (use32BitMov)
12651 {
12652 broadcastSrc = m_currShader->GetNewAlias(broadcastSrc, VISA_Type::ISA_TYPE_UD, 0, 0);
12653 broadcastDst = m_currShader->GetNewAlias(broadcastDst, VISA_Type::ISA_TYPE_UD, 0, 0);
12654 }
12655 m_encoder->Copy(broadcastDst, broadcastSrc);
12656 m_encoder->Push();
12657 }
12658 }
12659 m_encoder->SetSecondHalf(false);
12660 }
12661 else
12662 {
12663 m_encoder->SetSecondHalf(secondHalf);
12664 for (uint i = numInst; i-- > 0;)
12665 {
12666 const uint srcSubVar = i * (4 / clusterSize);
12667 const uint srcSubReg = i * (clusterSize == 8 ? 2 : 0);
12668
12669 m_encoder->SetSimdSize(lanesToSIMDMode(numLanes(simd) / numInst));
12670 m_encoder->SetNoMask();
12671 const e_mask mask = secondHalf ? (i == 1 ? EMASK_Q4 : EMASK_Q3) : (i == 1 ? EMASK_Q2 : EMASK_Q1);
12672 m_encoder->SetMask(mask);
12673 m_encoder->SetSrcRegion(0, 2, clusterSize, 0);
12674 m_encoder->SetSrcSubReg(0, srcSubReg);
12675 m_encoder->SetSrcSubVar(0, srcSubVar);
12676 m_encoder->SetSrcRegion(1, 2, clusterSize, 0);
12677 m_encoder->SetSrcSubReg(1, srcSubReg + 1);
12678 m_encoder->SetSrcSubVar(1, srcSubVar);
12679 m_encoder->SetDstSubVar(2 * i);
12680 m_encoder->GenericAlu(op, dst, src, src);
12681 m_encoder->Push();
12682 }
12683 m_encoder->SetSecondHalf(false);
12684 }
12685 }
12686
12687 // Do a reduction accumulating all the active channels; returns a uniform value.
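// The reduction forms a tree; e.g. for a SIMD16 add:
//   step 1: t[k] = s[k] + s[k+8], k = 0..7
//   step 2: t[k] = t[k] + t[k+4], k = 0..3
//   step 3: t[k] = t[k] + t[k+2], k = 0..1
//   expand: dst = t[0] + t[1], written to all lanes.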
12688 void EmitPass::emitReductionAll(
12689 e_opcode op, uint64_t identityValue, VISA_Type type, bool negate, CVariable* src, CVariable* dst)
12690 {
12691 const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12692 CEncoder::GetCISADataTypeSize(type) == 8);
12693
12694 CVariable* srcH1 = ScanReducePrepareSrc(type, identityValue, negate, false /*secondHalf*/, src, nullptr /*dst*/);
12695 CVariable* temp = srcH1;
12696 if (m_currShader->m_dispatchSize == SIMDMode::SIMD32)
12697 {
12698 if (m_currShader->m_numberInstance == 1)
12699 {
12700 temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp);
12701 }
12702 else
12703 {
12704 CVariable* srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /*secondHalf*/, src, nullptr /*dst*/);
12705
12706 temp = m_currShader->GetNewVariable(
12707 numLanes(SIMDMode::SIMD16),
12708 type,
12709 EALIGN_GRF,
12710 false,
12711 CName::NONE);
12712 if (isInt64Mul)
12713 {
12714 CVariable* tmpMulSrc[2] = { srcH1, srcH2 };
12715 Mul64(temp, tmpMulSrc, SIMDMode::SIMD16, true /*noMask*/);
12716 }
12717 else
12718 {
12719 m_encoder->SetNoMask();
12720 m_encoder->SetSimdSize(SIMDMode::SIMD16);
12721 m_encoder->GenericAlu(op, temp, srcH1, srcH2);
12722 m_encoder->Push();
12723 }
12724 }
12725 }
12726 if (m_currShader->m_dispatchSize >= SIMDMode::SIMD16)
12727 {
12728 temp = ReductionReduceHelper(op, type, SIMDMode::SIMD8, temp);
12729 }
12730 temp = ReductionReduceHelper(op, type, SIMDMode::SIMD4, temp);
12731 temp = ReductionReduceHelper(op, type, SIMDMode::SIMD2, temp);
12732 ReductionExpandHelper(op, type, temp, dst);
12733 }
12734
12735 // For all the active channels within each cluster, do a reduction and accumulate; returns a non-uniform value.
12736 void EmitPass::emitReductionClustered(const e_opcode op, const uint64_t identityValue, const VISA_Type type,
12737 const bool negate, const unsigned int clusterSize, CVariable* const src, CVariable* const dst)
12738 {
12739 const bool isInt64Type = type == ISA_TYPE_Q || type == ISA_TYPE_UQ;
12740 const bool isFP64Type = type == ISA_TYPE_DF;
12741 const bool is64bitType = isInt64Type || isFP64Type;
12742 const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
12743 CEncoder::GetCISADataTypeSize(type) == 8);
12744
12745 IGC_ASSERT_MESSAGE(iSTD::BitCount(clusterSize) == 1, "Cluster size must be a power of two.");
12746 IGC_ASSERT_MESSAGE(!is64bitType || CEncoder::GetCISADataTypeSize(type) == 8, "Unsupported 64-bit type.");
12747
12748 IGC_ASSERT_MESSAGE(!isInt64Type || !m_currShader->m_Platform->hasNoFullI64Support(), "Int64 emulation is not supported.");
12749 IGC_ASSERT_MESSAGE(!isFP64Type || !m_currShader->m_Platform->hasNoFP64Inst(), "FP64 emulation is not supported.");
12750 // Src might be uniform, as its value will be broadcast during src preparation.
12751 // Dst uniformity depends on actual support in WIAnalysis; so far it is implemented for 32-clusters only.
12752 IGC_ASSERT(!dst->IsUniform() || clusterSize == 32);
12753
12754 const unsigned int dispatchSize = numLanes(m_currShader->m_dispatchSize);
12755 const bool useReduceAll = clusterSize >= dispatchSize;
12756
12757 if (clusterSize == 1)
12758 {
12759 IGC_ASSERT_MESSAGE(0, "Simple copy. For performance reasons, handle it at an earlier stage.");
12760 for (uint half = 0; half < m_currShader->m_numberInstance; ++half)
12761 {
12762 const bool secondHalf = half > 0;
12763 m_encoder->SetSecondHalf(secondHalf);
12764 if (negate)
12765 {
12766 m_encoder->SetSrcModifier(0, EMOD_NEG);
12767 }
12768 m_encoder->Copy(dst, src);
12769 m_encoder->Push();
12770 m_encoder->SetSecondHalf(false);
12771 }
12772 }
12773 else if (useReduceAll)
12774 {
12775 // TODO: consider if it is possible to detect and handle this case in frontends
12776 // and emit GenISA_WaveAll there, to enable optimizations specific to the ReduceAll intrinsic.
12777 emitReductionAll(op, identityValue, type, negate, src, dst);
12778 }
12779 else
12780 {
12781 for (uint half = 0; half < m_currShader->m_numberInstance; ++half)
12782 {
12783 const bool secondHalf = half > 0;
12784
12785 // Use the "ReduceAll()" approach if code generated by the
12786 // "optimized path" would generate instructions that cross 2-GRF
12787 // boundary. The "optimized path" is code generated by
12788 // ReductionClusteredReduceHelper() + ReductionClusteredExpandHelper().
12789 const bool mayCross2GRFs =
12790 clusterSize * CEncoder::GetCISADataTypeSize(type) > int_cast<uint>(2 * getGRFSize());
            if (mayCross2GRFs)
            {
                CVariable* temp = ScanReducePrepareSrc(type, identityValue, negate, secondHalf, src, nullptr);
                // Two halves; for each half both src and dst cross the 2-GRF boundary, so use the "ReduceAll" approach.
                m_encoder->SetSecondHalf(secondHalf);
                IGC_ASSERT(clusterSize == 16);
                temp = ReductionReduceHelper(op, type, SIMDMode::SIMD8, temp);
                temp = ReductionReduceHelper(op, type, SIMDMode::SIMD4, temp);
                temp = ReductionReduceHelper(op, type, SIMDMode::SIMD2, temp);
                ReductionExpandHelper(op, type, temp, dst);
                m_encoder->SetSecondHalf(false);
            }
            else
            {
                // For certain types it is more beneficial (e.g. due to HW restrictions) to perform clustered
                // operations on values converted to another type.
                VISA_Type tmpType = type;
                CVariable* tmpSrc = src;
                CVariable* tmpDst = dst;
                uint64_t tmpIdentityValue = identityValue;
                if (type == VISA_Type::ISA_TYPE_B || type == VISA_Type::ISA_TYPE_UB)
                {
                    const bool isSigned = type == VISA_Type::ISA_TYPE_B;
                    tmpType = isSigned ? VISA_Type::ISA_TYPE_W : VISA_Type::ISA_TYPE_UW;
                    tmpSrc = m_currShader->GetNewVariable(
                        src->GetNumberElement(),
                        tmpType,
                        IGC::EALIGN_DWORD,
                        false,
                        src->getName());
                    m_encoder->SetSecondHalf(secondHalf);
                    m_encoder->Cast(tmpSrc, src);
                    m_encoder->Push();
                    m_encoder->SetSecondHalf(false);
                    tmpDst = m_currShader->GetNewVariable(
                        dst->GetNumberElement(),
                        tmpType,
                        IGC::EALIGN_DWORD,
                        false,
                        CName::NONE);
                    switch (op)
                    {
                    case EOPCODE_MAX:
                        tmpIdentityValue = isSigned ? std::numeric_limits<int16_t>::min() :
                            std::numeric_limits<uint16_t>::min();
                        break;
                    case EOPCODE_MIN:
                        tmpIdentityValue = isSigned ? std::numeric_limits<int16_t>::max() :
                            std::numeric_limits<uint16_t>::max();
                        break;
                    case EOPCODE_AND:
                        tmpIdentityValue = 0xFFFF;
                        break;
                    default:
                        break;
                    }
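                    // Restate the identity in the widened type: any value that is
                    // neutral for the 16-bit op works, e.g. INT16_MIN for signed MAX.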
                }

                CVariable* temp = ScanReducePrepareSrc(tmpType, tmpIdentityValue, negate, secondHalf, tmpSrc, nullptr);

                SIMDMode simd = secondHalf ? SIMDMode::SIMD16 : m_currShader->m_SIMDSize;

                // Reduce with op: SIMDN -> SIMD2; that is, N/2 value pairs -> 1 value pair
                for (uint32_t reducedClusterSize = clusterSize;
                    reducedClusterSize > 2; reducedClusterSize /= 2)
                {
                    simd = lanesToSIMDMode(numLanes(simd) / 2);
                    ReductionClusteredReduceHelper(op, tmpType, simd, secondHalf, temp, temp);
                }

                ReductionClusteredExpandHelper(op, tmpType, m_currShader->m_SIMDSize, clusterSize, secondHalf, temp, tmpDst);

                if (type == VISA_Type::ISA_TYPE_B || type == VISA_Type::ISA_TYPE_UB)
                {
                    m_encoder->SetSecondHalf(secondHalf);
                    m_encoder->Cast(dst, tmpDst);
                    m_encoder->Push();
                    m_encoder->SetSecondHalf(false);
                }
            }
        }
    }
}

// Do a prefix or postfix operation across all active channels.
void EmitPass::emitPreOrPostFixOp(
    e_opcode op, uint64_t identityValue, VISA_Type type, bool negateSrc,
    CVariable* pSrc, CVariable* pSrcsArr[2], CVariable* Flag,
    bool isPrefix, bool isQuad)
{
    const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) && CEncoder::GetCISADataTypeSize(type) == 8);

    if (m_currShader->m_Platform->doScalar64bScan() && CEncoder::GetCISADataTypeSize(type) == 8 && !isQuad)
    {
        emitPreOrPostFixOpScalar(
            op, identityValue, type, negateSrc,
            pSrc, pSrcsArr, Flag,
            isPrefix);
        return;
    }

    bool isSimd32 = m_currShader->m_numberInstance == 2;
    int counter = isSimd32 ? 2 : 1;

    CVariable* maskedSrc[2] = { 0 };
    for (int i = 0; i < counter; ++i)
    {
        // This handles the case when not all lanes are enabled: the disabled lanes are filled with the identity value.
        CVariable* pSrcCopy = ScanReducePrepareSrc(type, identityValue, negateSrc, i == 1 /*secondHalf*/,
            pSrc, nullptr /*dst*/, Flag);

        m_encoder->SetSecondHalf(i == 1);

        // For the prefix case, shift the source by one lane.
        if (isPrefix)
        {
            maskedSrc[i] = pSrcCopy;
            pSrcCopy = m_currShader->GetNewVariable(pSrcCopy);
            // Lane 0 gets the identity (first half) or the last lane of the previous half.
            m_encoder->SetSimdSize(SIMDMode::SIMD1);
            m_encoder->SetNoMask();
            if (i == 0)
            {
                CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
                m_encoder->Copy(pSrcCopy, pIdentityValue);
            }
            else
            {
                m_encoder->SetSrcSubReg(0, 15);
                m_encoder->Copy(pSrcCopy, maskedSrc[i - 1]);
            }
            m_encoder->Push();
            // Copy the remaining data.
            unsigned int simdsize = numLanes(m_currShader->m_SIMDSize);
            unsigned int offset = 1;
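            // Shift the remaining lanes right by one in log2(SIMD) steps; e.g. at
            // SIMD8: copy 4 lanes to subreg 1, then 2 to subreg 5, then 1 to subreg 7.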
            while (simdsize > 1)
            {
                simdsize = simdsize >> 1;
                int numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
                    simdsize == 8 ? 2 : 1;
                for (int instNum = 0; instNum < numInst; ++instNum)
                {
                    m_encoder->SetSimdSize(lanesToSIMDMode(simdsize / numInst));
                    m_encoder->SetDstSubReg(offset + instNum * 4);
                    m_encoder->SetSrcSubReg(0, offset - 1 + instNum * 4);
                    m_encoder->SetNoMask();
                    m_encoder->Copy(pSrcCopy, maskedSrc[i]);
                    m_encoder->Push();
                }
                offset += simdsize;
            }
        }
        pSrcsArr[i] = pSrcCopy;
    }

    auto CreateAlu = [this, op, type, isInt64Mul](
        const SIMDMode simdSize,
        const uint numInst,
        CVariable* pDst,
        CVariable* pSrc0,
        CVariable* pSrc1,
        const uint src0SubReg,
        const uint src0Region[3],
        const uint src1SubReg,
        const uint src1Region[3],
        const uint dstSubReg,
        const uint dstRegion)->void
    {
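        // The region arrays are Gen regioning triples {vertStride, width, horzStride},
        // forwarded to SetSrcRegion() below.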
        if (isInt64Mul)
        {
            // The 64-bit integer multiply case is done in 3 steps:
            // - copy the source data to temporary registers to apply
            //   the sources' regioning and subregister values
            // - call the Mul64() emulation using the temporary sources and
            //   a temporary destination
            // - copy the result from the temporary destination
            //   and apply the destination's regioning and subregister
            //   values
            // Note: Consider passing the regioning information
            // directly to the Mul64() emulation function instead
            // of using the temporary registers.
            CVariable* pMulSrc[2] = {};
            const uint16_t maxNumLanes = numLanes(simdSize);
            pMulSrc[0] = m_currShader->GetNewVariable(
                maxNumLanes,
                type,
                EALIGN_GRF,
                false,
                pSrc0->getName());
            pMulSrc[1] = m_currShader->GetNewVariable(
                maxNumLanes,
                type,
                EALIGN_GRF,
                false,
                pSrc1->getName());
            CVariable* pMulDst = m_currShader->GetNewVariable(
                maxNumLanes,
                type,
                EALIGN_GRF,
                false,
                pDst->getName());

            for (uint instNum = 0; instNum < numInst; ++instNum)
            {
                // copy sources with regioning
                m_encoder->SetSimdSize(simdSize);
                m_encoder->SetNoMask();
                m_encoder->SetSrcSubVar(0, instNum * 2);
                m_encoder->SetSrcRegion(0, src0Region[0], src0Region[1], src0Region[2]);
                m_encoder->SetSrcSubReg(0, src0SubReg);
                m_encoder->Copy(pMulSrc[0], pSrc0);
                m_encoder->SetSrcRegion(0, src1Region[0], src1Region[1], src1Region[2]);
                m_encoder->SetSrcSubReg(0, src1SubReg);
                m_encoder->Copy(pMulSrc[1], pSrc1);
                m_encoder->Push();
                // create emulation code
                Mul64(pMulDst, pMulSrc, simdSize, true /*noMask*/);
                // copy destination with regioning
                m_encoder->SetSimdSize(simdSize);
                m_encoder->SetNoMask();
                m_encoder->SetDstSubVar(instNum * 2);
                m_encoder->SetDstRegion(dstRegion);
                m_encoder->SetDstSubReg(dstSubReg);
                m_encoder->Copy(pDst, pMulDst);
                m_encoder->Push();
            }
        }
        else
        {
            for (uint instNum = 0; instNum < numInst; ++instNum)
            {
                m_encoder->SetSimdSize(simdSize);
                m_encoder->SetNoMask();
                m_encoder->SetSrcSubVar(0, instNum * 2);
                m_encoder->SetSrcRegion(0, src0Region[0], src0Region[1], src0Region[2]);
                m_encoder->SetSrcSubReg(0, src0SubReg);
                m_encoder->SetSrcSubVar(1, instNum * 2);
                m_encoder->SetSrcRegion(1, src1Region[0], src1Region[1], src1Region[2]);
                m_encoder->SetSrcSubReg(1, src1SubReg);
                m_encoder->SetDstSubVar(instNum * 2);
                m_encoder->SetDstRegion(dstRegion);
                m_encoder->SetDstSubReg(dstSubReg);
                m_encoder->GenericAlu(op, pDst, pSrc0, pSrc1);
                m_encoder->Push();
            }
        }
    };


    for (int i = 0; i < counter; ++i)
    {
        /*
        Combine adjacent elements; for example, let r10 be the register and
        assume we are performing addition:
              ____      ____      ____      ____
           __|____|____|____|____|____|____|____|_
           |  7 |  6 |  5 |  4 |  9 |  5 |  3 |  2 |
            ---------------------------------------
        */

        {
            // So we start by adding r10.0 & r10.1.
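            // With the <2;1,0> region, src0 at subreg 0 reads lanes 0, 2, 4, ... and
            // src1 at subreg 1 reads lanes 1, 3, 5, ...; the dst region of 2 writes
            // every second lane, so the pair results land in lanes 1, 3, 5, ...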
            uint numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
                m_currShader->m_SIMDSize != SIMDMode::SIMD8 ? 2 : 1;
            auto simdSize = m_encoder->GetCISADataTypeSize(type) == 8 ||
                m_currShader->m_SIMDSize == SIMDMode::SIMD8 ? SIMDMode::SIMD4 : SIMDMode::SIMD8;
            const uint srcRegion[3] = { 2, 1, 0 };
            CreateAlu(
                simdSize, numInst, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
                0 /*src0 subreg*/, srcRegion /*src0 region*/,
                1 /*src1 subreg*/, srcRegion /*src1 region*/,
                1 /*dst subreg*/, 2 /*dst region*/);
        }

        /*
                 ____                ____
        ________|____|______________|____|______        ___________________________________________
        | 13 |  6 |  9 |  4 | 14 |  5 |  5 |  2 |  ==>  | 13 | 15 |  9 |  4 | 14 | 10 |  5 |  2 |
        -----------------------------------------        -------------------------------------------
        */
        // The next step combines elements with a strided pattern; it is done with narrow SIMD2/SIMD4 instructions.

        {
            uint numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
                m_currShader->m_SIMDSize != SIMDMode::SIMD8 ? 2 : 1;
            auto simdSize = m_encoder->GetCISADataTypeSize(type) == 8 ||
                m_currShader->m_SIMDSize == SIMDMode::SIMD8 ? SIMDMode::SIMD2 : SIMDMode::SIMD4;
            const uint srcRegion[3] = { 4, 1, 0 };
            CreateAlu(
                simdSize, numInst, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
                2 /*src0 subreg*/, srcRegion /*src0 region*/,
                1 /*src1 subreg*/, srcRegion /*src1 region*/,
                2 /*dst subreg*/, 4 /*dst region*/);
        }

        /*
           ___________           ___________
        __|___________|_________|___________|______      ___________________________________________
        | 13 | 15 |  9 |  4 | 14 | 10 |  5 |  2 |  ==>   | 22 | 15 |  9 |  4 | 19 | 10 |  5 |  2 |
        -------------------------------------------      -------------------------------------------
        */

        {
            uint numInst = m_encoder->GetCISADataTypeSize(type) == 8 &&
                m_currShader->m_SIMDSize != SIMDMode::SIMD8 ? 2 : 1;
            auto simdSize = m_encoder->GetCISADataTypeSize(type) == 8 ||
                m_currShader->m_SIMDSize == SIMDMode::SIMD8 ? SIMDMode::SIMD2 : SIMDMode::SIMD4;
            const uint srcRegion[3] = { 4, 1, 0 };
            CreateAlu(
                simdSize, numInst, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
                3 /*src0 subreg*/, srcRegion /*src0 region*/,
                1 /*src1 subreg*/, srcRegion /*src1 region*/,
                3 /*dst subreg*/, 4 /*dst region*/);
        }

        if (isQuad)
        {
            // For quads, we don't want ALU ops across SIMD4 lanes, so stop here.
            continue;
        }

        /*
                          ____
        __________________|____|_________________       ____________________________________________
        | 22 | 15 |  9 |  4 | 19 | 10 |  5 |  2 |  ==>  | 22 | 15 |  9 | 23 | 19 | 10 |  5 |  2 |
        -----------------------------------------       --------------------------------------------

                     _________
        _____________|_________|_________________       _____________________________________________
        | 22 | 15 |  9 |  4 | 19 | 10 |  5 |  2 |  ==>  | 22 | 15 | 28 | 23 | 19 | 10 |  5 |  2 |
        -----------------------------------------       ---------------------------------------------

                ______________
        ________|______________|_________________       _____________________________________________
        | 22 | 15 |  9 |  4 | 19 | 10 |  5 |  2 |  ==>  | 22 | 34 | 28 | 23 | 19 | 10 |  5 |  2 |
        -----------------------------------------       ---------------------------------------------

           ____________________
        __|____________________|_________________       _____________________________________________
        | 22 | 15 |  9 |  4 | 19 | 10 |  5 |  2 |  ==>  | 41 | 34 | 28 | 23 | 19 | 10 |  5 |  2 |
        -----------------------------------------       ---------------------------------------------
        */

        // Because contiguous elements are written in the step above, for SIMD16 we
        // have to split it into 2 SIMD4's.
        const unsigned int numLanesForSimd8 = numLanes(SIMDMode::SIMD8);
        IGC_ASSERT(numLanesForSimd8);
        const unsigned int numTimesToLoop = numLanes(m_currShader->m_SIMDSize) / numLanesForSimd8;

        for (uint loop_counter = 0; loop_counter < numTimesToLoop; ++loop_counter)
        {
            const uint src0Region[3] = { 0, 1, 0 };
            const uint src1Region[3] = { 4, 4, 1 };
            CreateAlu(
                SIMDMode::SIMD4, 1, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
                (loop_counter * 8 + 3) /*src0 subreg*/, src0Region /*src0 region*/,
                (loop_counter * 8 + 4) /*src1 subreg*/, src1Region /*src1 region*/,
                (loop_counter * 8 + 4) /*dst subreg*/, 1 /*dst region*/);
        }

        if (m_currShader->m_SIMDSize == SIMDMode::SIMD16 || isSimd32)
        {
            // Add the last element of the 1st GRF to all the elements of the 2nd GRF.
            const uint src0Region[3] = { 0, 1, 0 };
            const uint src1Region[3] = { 1, 1, 0 };
            CreateAlu(
                SIMDMode::SIMD8, 1, pSrcsArr[i], pSrcsArr[i], pSrcsArr[i],
                7 /*src0 subreg*/, src0Region /*src0 region*/,
                8 /*src1 subreg*/, src1Region /*src1 region*/,
                8 /*dst subreg*/, 1 /*dst region*/);
        }
    }

    if (isSimd32 && !isQuad)
    {
        // For SIMD32, combine the last element of the first half into all 16 elements of the second half.
        const uint src0Region[3] = { 0, 1, 0 };
        const uint src1Region[3] = { 1, 1, 0 };
        CreateAlu(
            SIMDMode::SIMD16, 1, pSrcsArr[1], pSrcsArr[0], pSrcsArr[1],
            (numLanes(m_currShader->m_SIMDSize) - 1) /*src0 subreg*/, src0Region /*src0 region*/,
            0 /*src1 subreg*/, src1Region /*src1 region*/,
            0 /*dst subreg*/, 1 /*dst region*/);
    }
    // reset second half state
    m_encoder->SetSecondHalf(false);
}

// Scalar version of the scan operation for 64-bit types.
void EmitPass::emitPreOrPostFixOpScalar(
    e_opcode op,
    uint64_t identityValue,
    VISA_Type type,
    bool negateSrc,
    CVariable* src,
    CVariable* result[2],
    CVariable* Flag,
    bool isPrefix)
{
    const bool isInt64Mul = (op == EOPCODE_MUL && CEncoder::IsIntegerType(type) &&
        CEncoder::GetCISADataTypeSize(type) == 8);

    bool isSimd32 = m_currShader->m_numberInstance == 2;
    int counter = isSimd32 ? 2 : 1;
    CVariable* pSrcCopy[2] = {};
    for (int i = 0; i < counter; ++i)
    {
        // This handles the case when not all lanes are enabled: the disabled lanes are filled with the identity value.
        pSrcCopy[i] = ScanReducePrepareSrc(type, identityValue, negateSrc, i == 1 /*secondHalf*/,
            src, nullptr /*dst*/, Flag);

        result[i] = m_currShader->GetNewVariable(
            numLanes(m_currShader->m_SIMDSize),
            type,
            EALIGN_GRF,
            false,
            CName::NONE);

        m_encoder->SetSecondHalf(i == 1);

        int srcIdx = 0;
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
        if (isPrefix)
        {
            // For the prefix case, shift the source by one lane.
            if (i == 0)
            {
                // (W) mov (1) result[0] identity
                CVariable* pIdentityValue = m_currShader->ImmToVariable(identityValue, type);
                m_encoder->Copy(result[i], pIdentityValue);
            }
            else
            {
                // (W) mov (1) result[16] srcCopy[15]
                m_encoder->SetSrcSubReg(0, 15);
                m_encoder->SetSrcRegion(0, 0, 1, 0);
                m_encoder->Copy(result[i], pSrcCopy[0]);
            }
        }
        else
        {
            // (W) mov (1) result[0/16] srcCopy[0/16]
            m_encoder->SetSrcSubReg(0, 0);
            m_encoder->SetSrcRegion(0, 0, 1, 0);
            m_encoder->Copy(result[i], pSrcCopy[i]);
            srcIdx = 1;
        }
        m_encoder->Push();

        CVariable* tmpDst = isInt64Mul ?
            m_currShader->GetNewVariable(
                1,
                type,
                EALIGN_GRF,
                true,
                result[0]->getName()) : nullptr;

        for (int dstIdx = 1; dstIdx < numLanes(m_currShader->m_SIMDSize); ++dstIdx, ++srcIdx)
        {
            // Do the scan one element at a time:
            // (W) op (1) result[dstIdx] srcCopy[srcIdx] result[dstIdx-1]
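            // e.g. an inclusive (postfix) SIMD8 scan emits seven scalar ops:
            // result[i] = op(srcCopy[i], result[i - 1]) for i = 1..7.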
            if (isInt64Mul)
            {
                CVariable* pMulSrc[2] = {
                    m_currShader->GetNewAlias(pSrcCopy[i], type, srcIdx * sizeof(QWORD), 1, true),
                    m_currShader->GetNewAlias(result[i], type, (dstIdx - 1) * sizeof(QWORD), 1, true) };
                Mul64(tmpDst, pMulSrc, SIMDMode::SIMD1, true /*noMask*/);
                // (W) mov (1) result[dstIdx] tmpDst
                m_encoder->SetSimdSize(SIMDMode::SIMD1);
                m_encoder->SetNoMask();
                m_encoder->SetDstSubReg(dstIdx);
                m_encoder->Copy(result[i], tmpDst);
                m_encoder->Push();
            }
            else
            {
                m_encoder->SetSimdSize(SIMDMode::SIMD1);
                m_encoder->SetNoMask();
                m_encoder->SetSrcSubReg(0, srcIdx);
                m_encoder->SetSrcRegion(0, 0, 1, 0);
                m_encoder->SetSrcRegion(1, 0, 1, 0);
                m_encoder->SetSrcSubReg(1, dstIdx - 1);
                m_encoder->SetDstSubReg(dstIdx);
                m_encoder->GenericAlu(op, result[i], pSrcCopy[i], result[i]);
                m_encoder->Push();
            }
        }

        m_encoder->SetSecondHalf(false);
    }

    if (isSimd32)
    {
        m_encoder->SetSecondHalf(true);

        // For SIMD32, combine the last element of the first half into all 16 elements of the second half.
        if (isInt64Mul)
        {
            CVariable* pMulSrc[2] = {
                m_currShader->GetNewAlias(result[0], type, 15 * sizeof(QWORD), 1, true),
                result[1] };
            Mul64(result[1], pMulSrc, SIMDMode::SIMD16, true /*noMask*/);
        }
        else
        {
            m_encoder->SetSimdSize(SIMDMode::SIMD16);
            m_encoder->SetNoMask();
            m_encoder->SetSrcRegion(0, 0, 1, 0);
            m_encoder->SetSrcSubReg(0, 15);
            m_encoder->GenericAlu(op, result[1], result[0], result[1]);
            m_encoder->Push();
        }

        m_encoder->SetSecondHalf(false);
    }
}

/*
    ScalarAtomics: This optimization attempts to reduce the number of atomic instructions
    issued when both the destination address and the source are uniform. For example,
    suppose an atomic add is performed with a constant destination address <addr> and a
    constant source <src>. At SIMD8 there are then 8 lanes trying to write to the same
    address, and the hardware serializes this into 8 back-to-back atomic instructions,
    which are extremely slow to execute.
*/
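/*
    Instead of serializing, a single combined value is computed in registers and one
    SIMD1 atomic is issued; e.g. for IADD the per-lane sources are summed and a single
    atomic add of the total is sent. When the return value is needed, the per-lane
    results are rebuilt from the lane-wise prefix sum plus the value returned by the
    one atomic (see the end of emitScalarAtomics below).
*/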
void EmitPass::emitScalarAtomics(
    llvm::Instruction* pInst,
    ResourceDescriptor& resource,
    AtomicOp atomic_op,
    CVariable* pDstAddr,
    CVariable* pSrc,
    bool isA64,
    int bitWidth)
{
    e_opcode op = EOPCODE_ADD;
    // find the identity value, i.e. the value for which opcode(x, identity) == x
    unsigned int identityValue = 0;
    switch (atomic_op)
    {
    case EATOMIC_IADD:
    case EATOMIC_SUB:
    case EATOMIC_INC:
    case EATOMIC_DEC:
        identityValue = 0;
        op = EOPCODE_ADD;
        break;
    case EATOMIC_UMAX:
        identityValue = 0;
        op = EOPCODE_MAX;
        break;
    case EATOMIC_IMAX:
        identityValue = 0x80000000;
        op = EOPCODE_MAX;
        break;
    case EATOMIC_UMIN:
        identityValue = 0xFFFFFFFF;
        op = EOPCODE_MIN;
        break;
    case EATOMIC_IMIN:
        identityValue = 0x7FFFFFFF;
        op = EOPCODE_MIN;
        break;
    default:
        IGC_ASSERT_MESSAGE(0, "unsupported scalar atomic type");
        break;
    }

    VISA_Type type =
        bitWidth == 16 ? ISA_TYPE_W :
        bitWidth == 32 ? ISA_TYPE_D :
        ISA_TYPE_Q;
    IGC_ASSERT_MESSAGE((bitWidth == 16) || (bitWidth == 32) || (bitWidth == 64), "invalid bitsize");
    if (atomic_op == EATOMIC_INC || atomic_op == EATOMIC_DEC)
    {
        if (atomic_op == EATOMIC_INC)
        {
            atomic_op = EATOMIC_IADD;
        }
        else
        {
            atomic_op = EATOMIC_SUB;
        }

        pSrc = m_currShader->ImmToVariable(1, type);
    }
    if (atomic_op == EATOMIC_UMAX || atomic_op == EATOMIC_UMIN)
    {
        type = GetUnsignedType(type);
    }
    AtomicOp uniformAtomicOp = atomic_op;
    bool negateSrc = false;
    if (atomic_op == EATOMIC_SUB)
    {
        negateSrc = true;
        uniformAtomicOp = EATOMIC_IADD;
    }
    bool returnsImmValue = (!pInst->use_empty());
    CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
        1,
        type,
        isA64 ? EALIGN_2GRF : EALIGN_GRF,
        true,
        CName::NONE);
    CVariable* pSrcsArr[2] = { nullptr, nullptr };
    if (returnsImmValue)
    {
        // Compute a running (inclusive) op across all lanes.
        emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);

        CVariable* pSrcCopy = pSrcsArr[0];
        if (m_currShader->m_numberInstance == 2)
        {
            pSrcCopy = pSrcsArr[1];
        }
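        // The inclusive scan leaves the combined value of all lanes in the last
        // lane; copy it out as the single SIMD1 atomic source.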

        m_encoder->SetSrcRegion(0, 0, 1, 0);
        m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
        m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
        m_encoder->Push();
    }
    else
    {
        emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
    }

    if (pDstAddr->IsImmediate())
    {
        CVariable* pDstAddrCopy = m_currShader->GetNewVariable(
            1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
        m_encoder->Copy(pDstAddrCopy, pDstAddr);
        m_encoder->Push();
        pDstAddr = pDstAddrCopy;
    }

    m_encoder->SetSimdSize(SIMDMode::SIMD1);
    m_encoder->SetNoMask();

    CVariable* pReturnVal = returnsImmValue ?
        m_currShader->GetNewVariable(
            1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
        nullptr;

    if (bitWidth == 16)
    {
        CVariable* pCastAtomicSrcVal =
            m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);

        m_encoder->Cast(pCastAtomicSrcVal, pFinalAtomicSrcVal);
        pFinalAtomicSrcVal = pCastAtomicSrcVal;
    }

    if (isA64)
    {
        m_encoder->AtomicRawA64(
            uniformAtomicOp, resource,
            pReturnVal, pDstAddr,
            pFinalAtomicSrcVal, nullptr,
            bitWidth);
    }
    else
    {
        m_encoder->DwordAtomicRaw(
            uniformAtomicOp, resource,
            pReturnVal, pDstAddr,
            pFinalAtomicSrcVal,
            nullptr, bitWidth == 16);
    }
    m_encoder->Push();

    if (returnsImmValue)
    {
        unsigned int counter = m_currShader->m_numberInstance;
        IGC_ASSERT_MESSAGE(op == EOPCODE_ADD, "we can only get the return value for add right now");
        for (unsigned int i = 0; i < counter; ++i)
        {
            m_encoder->SetNoMask();
            m_encoder->Add(pSrcsArr[i], pSrcsArr[i], pReturnVal);
            m_encoder->Push();

            if (atomic_op == EATOMIC_IADD)
            {
                m_encoder->SetSrcModifier(1, EMOD_NEG);
            }

            m_encoder->SetSecondHalf(i == 1);
            m_encoder->Add(m_destination, pSrcsArr[i], pSrc);
            m_encoder->Push();
        }
    }
}

//
// An atomic_load is emulated with an atomic_or of zero.
// When the atomic is uniform we can directly generate a SIMD1 atomic_or.
//
void EmitPass::emitScalarAtomicLoad(
    llvm::Instruction* pInst,
    ResourceDescriptor& resource,
    CVariable* pDstAddr,
    CVariable* pSrc,
    bool isA64,
    int bitWidth)
{
    if (pDstAddr->IsImmediate())
    {
        CVariable* pDstAddrCopy = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, pDstAddr->getName());
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
        m_encoder->Copy(pDstAddrCopy, pDstAddr);
        m_encoder->Push();
        pDstAddr = pDstAddrCopy;
    }

    {
        // pSrc is an immediate zero
        CVariable* pSrcCopy = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, pSrc->getName());
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
        m_encoder->Copy(pSrcCopy, pSrc);
        m_encoder->Push();
        pSrc = pSrcCopy;
    }

    m_encoder->SetSimdSize(SIMDMode::SIMD1);
    m_encoder->SetNoMask();

    CVariable* atomicDst = !pInst->use_empty() ?
        m_currShader->GetNewVariable(
            1,
            ISA_TYPE_UD,
            isA64 ? EALIGN_2GRF : EALIGN_GRF,
            true,
            pDstAddr->getName()) : nullptr;

    if (isA64)
    {
        m_encoder->AtomicRawA64(
            EATOMIC_OR, resource,
            atomicDst, pDstAddr,
            pSrc, nullptr,
            bitWidth);
    }
    else
    {
        m_encoder->DwordAtomicRaw(
            EATOMIC_OR, resource,
            atomicDst, pDstAddr,
            pSrc,
            nullptr, bitWidth == 16);
    }
    m_encoder->Push();

    if (!pInst->use_empty())
    {
        // We need to broadcast the return value.
        // TODO: change divergence analysis to mark scalar atomic loads as uniform.
        unsigned int counter = m_currShader->m_numberInstance;
        for (unsigned int i = 0; i < counter; ++i)
        {
            m_encoder->SetSecondHalf(i == 1);
            m_encoder->Copy(m_destination, atomicDst);
            m_encoder->Push();
        }
    }
}

bool EmitPass::IsUniformAtomic(llvm::Instruction* pInst)
{
    if (llvm::GenIntrinsicInst* pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(pInst))
    {
        GenISAIntrinsic::ID id = pIntrinsic->getIntrinsicID();

        // Dst address in bytes.
        if (id == GenISAIntrinsic::GenISA_intatomicraw ||
            id == GenISAIntrinsic::GenISA_intatomicrawA64)
        {
            Function* F = pInst->getParent()->getParent();
            if (IGC_IS_FLAG_ENABLED(DisableScalarAtomics) ||
                F->hasFnAttribute("KMPLOCK") ||
                m_currShader->m_DriverInfo->WASLMPointersDwordUnit())
                return false;
            llvm::Value* pllDstAddr = pInst->getOperand(1);
            CVariable* pDstAddr = GetSymbol(pllDstAddr);
            if (pDstAddr->IsUniform())
            {
                AtomicOp atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInst->getOperand(3))->getZExtValue());

                bool isAddAtomic = atomic_op == EATOMIC_IADD ||
                    atomic_op == EATOMIC_INC ||
                    atomic_op == EATOMIC_SUB;
                bool isMinMaxAtomic =
                    atomic_op == EATOMIC_UMAX ||
                    atomic_op == EATOMIC_UMIN ||
                    atomic_op == EATOMIC_IMIN ||
                    atomic_op == EATOMIC_IMAX;

                // Capture the special case of an atomic_or with 0 (used to simulate an atomic_load).
                bool isOrWith0Atomic = atomic_op == EATOMIC_OR &&
                    isa<ConstantInt>(pInst->getOperand(2)) && cast<ConstantInt>(pInst->getOperand(2))->isZero();

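                // Min/max atomics qualify only when the result is unused: the
                // per-lane return values cannot be rebuilt from a min/max scan,
                // while for add they can (see emitScalarAtomics).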
                if (isAddAtomic || (isMinMaxAtomic && pInst->use_empty()) || isOrWith0Atomic)
                    return true;
            }
        }
    }

    return false;
}

CVariable* EmitPass::UnpackOrBroadcastIfUniform(CVariable* pVar)
{
    if (pVar->GetElemSize() == 4 || pVar->GetElemSize() == 8)
        return BroadcastIfUniform(pVar);

    IGC_ASSERT(pVar->GetElemSize() == 2);

    uint16_t elts = numLanes(m_currShader->m_SIMDSize);
    // 16-bit atomics are still aligned at dword boundaries
    // with the upper 16 bits ignored.
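    // Zero-extend each 16-bit lane into its own dword slot.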
    CVariable* pUnpacked =
        m_currShader->GetNewVariable(elts, ISA_TYPE_UD, EALIGN_GRF, CName(pVar->getName(), "Unpacked"));

    m_encoder->Cast(pUnpacked, m_currShader->BitCast(pVar, ISA_TYPE_UW));
    return pUnpacked;
}

void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
{
    ForceDMask();
    // Currently, dword atomics are matched by two kinds of intrinsics: DwordAtomicRaw
    // and AtomicCmpXchg (which has 2 sources, unlike the other atomics).
    IGC_ASSERT(pInsn->getNumArgOperands() == 4);

    /// Immediate atomics return the value before the atomic operation is performed,
    /// so that flag needs to be set for them.
    bool returnsImmValue = !pInsn->use_empty();

    llvm::Value* pllbuffer = pInsn->getOperand(0);
    llvm::Value* pllDstAddr = pInsn->getOperand(1);
    llvm::Value* pllSrc0 = pInsn->getOperand(2);
    ResourceDescriptor resource = GetResourceVariable(pllbuffer);
    CountStatelessIndirectAccess(pllbuffer, resource);
    AtomicOp atomic_op = EATOMIC_UNDEF;

    if (pllbuffer->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
    {
        m_currShader->SetHasGlobalAtomics();
    }

    CVariable* pSrc0 = nullptr;
    CVariable* pSrc1 = nullptr;
    llvm::GenIntrinsicInst* pIntrinCall = llvm::cast<llvm::GenIntrinsicInst>(pInsn);
    GenISAIntrinsic::ID IID = pIntrinCall->getIntrinsicID();
    if (IID == GenISAIntrinsic::GenISA_icmpxchgatomicraw ||
        IID == GenISAIntrinsic::GenISA_fcmpxchgatomicraw ||
        IID == GenISAIntrinsic::GenISA_icmpxchgatomicrawA64 ||
        IID == GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64)
    {
        llvm::Value* pllSrc1 = pInsn->getOperand(3);
        pSrc1 = GetSymbol(pllSrc1);

        Function* F = pInsn->getParent()->getParent();
        if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
        {
            m_encoder->SetSimdSize(SIMDMode::SIMD1);
            m_encoder->SetNoMask();
        }

        pSrc1 = UnpackOrBroadcastIfUniform(pSrc1);
        if (IID == GenISAIntrinsic::GenISA_fcmpxchgatomicraw ||
            IID == GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64)
        {
            atomic_op = EATOMIC_FCMPWR;
        }
        else
        {
            atomic_op = EATOMIC_CMPXCHG;
        }
    }
    else
    {
        atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInsn->getOperand(3))->getZExtValue());
    }


    unsigned short bitwidth = pInsn->getType()->getScalarSizeInBits();
    const bool is16Bit = (pInsn->getType()->getScalarSizeInBits() == 16);


    // atomic_inc and atomic_dec don't have both src0 and src1.
    if (atomic_op != EATOMIC_INC && atomic_op != EATOMIC_DEC &&
        atomic_op != EATOMIC_INC64 && atomic_op != EATOMIC_DEC64 &&
        atomic_op != EATOMIC_PREDEC && atomic_op != EATOMIC_PREDEC64)
    {
        pSrc0 = GetSymbol(pllSrc0);
    }

    // Dst address in bytes.
    CVariable* pDstAddr = GetSymbol(pllDstAddr);
    // Scalar atomics are disabled by the DisableScalarAtomics and DisableIGCOptimizations
    // regkeys; they are also not enabled for 64-bit operations.
    if (IsUniformAtomic(pInsn) && bitwidth != 64)
    {
        PointerType* PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
        bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
        e_alignment uniformAlign = isA64 ? EALIGN_2GRF : EALIGN_GRF;
        // Re-align the pointer if it's not GRF aligned.
        pDstAddr = ReAlignUniformVariable(pDstAddr, uniformAlign);
        if (atomic_op == EATOMIC_OR)
        {
            // special case of atomic_load
            emitScalarAtomicLoad(pInsn, resource, pDstAddr, pSrc0, isA64, bitwidth);
        }
        else
        {
            emitScalarAtomics(pInsn, resource, atomic_op, pDstAddr, pSrc0, isA64, bitwidth);
            ResetVMask();
        }
        return;
    }

    Function* F = pInsn->getParent()->getParent();
    if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
    {
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
    }
    pDstAddr = BroadcastIfUniform(pDstAddr);

    if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
    {
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
    }
    if (pSrc0)
    {
        pSrc0 = UnpackOrBroadcastIfUniform(pSrc0);
    }

    if (F->hasFnAttribute("KMPLOCK") && m_currShader->GetIsUniform(pInsn))
    {
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
    }

    {
        CVariable* pDst = returnsImmValue ?
            m_currShader->GetNewVariable(
                numLanes(m_currShader->m_SIMDSize),
                bitwidth != 64 ? ISA_TYPE_UD : ISA_TYPE_UQ,
                EALIGN_GRF, CName::NONE) :
            nullptr;

        PointerType* PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
        bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
        bool extendPointer = (bitwidth == 64 && !isA64);
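        // A 64-bit atomic with a 32-bit address is routed through the A64 message;
        // the address is widened to a qword first.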
        if (isA64 || extendPointer)
        {
            if (extendPointer)
            {
                pDstAddr = m_currShader->BitCast(pDstAddr, GetUnsignedIntegerType(pDstAddr->GetType()));
                CVariable* pDstAddr2 = m_currShader->GetNewVariable(
                    pDstAddr->GetNumberElement(), ISA_TYPE_UQ, EALIGN_GRF, CName::NONE);
                m_encoder->Cast(pDstAddr2, pDstAddr);
                m_encoder->AtomicRawA64(atomic_op, resource, pDst, pDstAddr2, pSrc0, pSrc1, bitwidth);
                m_encoder->Push();
            }
            else
            {
                m_encoder->AtomicRawA64(atomic_op, resource, pDst, pDstAddr, pSrc0, pSrc1, bitwidth);
                m_encoder->Push();
            }

            if (returnsImmValue) // This cast is needed for repacking 16-bit atomics; otherwise it is a plain mov.
            {
                m_encoder->Cast(
                    m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType())),
                    pDst);
                m_encoder->Push();
            }
        }
        else
        {
            // TODO: SEND SLM OFFSET IN BYTES
            CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
            if (resource.m_surfaceType == ESURFACE_SLM && ctx->m_DriverInfo.WASLMPointersDwordUnit())
            {
                CVariable* pDwordAddr =
                    m_currShader->GetNewVariable(numLanes(m_currShader->m_SIMDSize),
                        ISA_TYPE_D, EALIGN_GRF, CName::NONE);

                m_encoder->Shl(pDwordAddr, pDstAddr,
                    m_currShader->ImmToVariable(0x2, ISA_TYPE_D));
                m_encoder->Push();
                pDstAddr = pDwordAddr;
            }
            pDstAddr = m_currShader->BitCast(pDstAddr, ISA_TYPE_UD);

            if (pSrc0)
            {
                pSrc0 = m_currShader->BitCast(pSrc0, bitwidth != 64 ? ISA_TYPE_UD : ISA_TYPE_UQ);
            }

            if (pSrc1)
            {
                pSrc1 = m_currShader->BitCast(pSrc1, bitwidth != 64 ? ISA_TYPE_UD : ISA_TYPE_UQ);
            }
            uint label = 0;
            CVariable* flag = nullptr;
            bool needLoop = ResourceLoopHeader(resource, flag, label);
            m_encoder->DwordAtomicRaw(
                atomic_op,
                resource,
                pDst,
                pDstAddr,
                pSrc0,
                pSrc1,
                is16Bit);
            m_encoder->Push();
            if (returnsImmValue)
            {
                m_encoder->Cast(
                    m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType())),
                    pDst);
                m_encoder->Push();
            }
            ResourceLoopBackEdge(needLoop, flag, label);
        }

    }
    ResetVMask();
    m_currShader->isMessageTargetDataCacheDataPort = true;
}

void EmitPass::emitAtomicTyped(GenIntrinsicInst* pInsn)
{
    ForceDMask();
    // Typed atomics are matched by two kinds of intrinsics: the regular typed atomic
    // and the typed cmpxchg (which has 2 sources, unlike the other atomics).
    IGC_ASSERT(pInsn->getNumArgOperands() == 6);

    /// Immediate atomics return the value before the atomic operation is performed,
    /// so that flag needs to be set for them.
    bool returnsImmValue = !pInsn->user_empty();

    llvm::Value* pllbuffer = pInsn->getOperand(0);
    llvm::Value* pllU = pInsn->getOperand(1);
    llvm::Value* pllV = pInsn->getOperand(2);
    llvm::Value* pllR = pInsn->getOperand(3);
    llvm::Value* pllSrc0 = pInsn->getOperand(4);

    AtomicOp atomic_op = EATOMIC_UNDEF;

    CVariable* pSrc0 = nullptr;
    CVariable* pSrc1 = nullptr;

    if (pInsn->getIntrinsicID() == GenISAIntrinsic::GenISA_icmpxchgatomictyped)
    {
        llvm::Value* pllSrc1 = pInsn->getOperand(5);
        pSrc1 = GetSymbol(pllSrc1);
        pSrc1 = UnpackOrBroadcastIfUniform(pSrc1);
        atomic_op = EATOMIC_CMPXCHG;
    }
    else
    {
        atomic_op = static_cast<AtomicOp>(cast<ConstantInt>(pInsn->getOperand(5))->getZExtValue());
    }

    if (atomic_op != EATOMIC_INC && atomic_op != EATOMIC_DEC)
    {
        pSrc0 = GetSymbol(pllSrc0);
        pSrc0 = UnpackOrBroadcastIfUniform(pSrc0);
    }

    ResourceDescriptor resource = GetResourceVariable(pllbuffer);

    CVariable* pU = GetSymbol(pllU);
    CVariable* pV = GetSymbol(pllV);
    CVariable* pR = GetSymbol(pllR);

    pU = BroadcastIfUniform(pU);
    pV = BroadcastIfUniform(pV);
    pR = BroadcastIfUniform(pR);

    if (m_currShader->GetIsUniform(pInsn))
    {
        IGC_ASSERT_MESSAGE(0, "Uniform DWordAtomicTyped not implemented yet");
    }
    else
    {
        uint addrDimension = 3;
        while (addrDimension > 1 && isUndefOrConstInt0(pInsn->getOperand(addrDimension)))
        {
            addrDimension--;
        }
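        // e.g. for a 1D surface V and R are undef/zero, so only U is sent.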

        TODO("Adding headers to atomic typed ops is a workaround, verify if this is needed");
        const bool headerPresent = true;

        const uint parameterLength =
            addrDimension + (pSrc0 != nullptr) + (pSrc1 != nullptr) + headerPresent;

        auto hw_atomic_op_enum = getHwAtomicOpEnum(atomic_op);
        uint responseLength = returnsImmValue;

        unsigned int bti = 0;
        if (resource.m_surfaceType == ESURFACE_BINDLESS)
        {
            bti = BINDLESS_BTI;
        }
        else if (resource.m_resource->IsImmediate())
        {
            bti = (uint)resource.m_resource->GetImmediateValue();
        }

        const auto messageType = EU_GEN7_5_DATA_CACHE_1_MESSAGE_TYPE_TYPED_ATOMIC_OPERATION;

        uint messageSpecificControl = encodeMessageDescriptorForAtomicUnaryOp(
            parameterLength,
            responseLength,
            headerPresent,
            messageType,
            returnsImmValue,
            m_currShader->m_SIMDSize,
            hw_atomic_op_enum,
            bti);

        CVariable* pMessDesc = m_currShader->ImmToVariable(messageSpecificControl, ISA_TYPE_D);
        CVariable* exDesc =
            m_currShader->ImmToVariable(EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, ISA_TYPE_D);
        if (resource.m_surfaceType == ESURFACE_BINDLESS)
        {
            CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
            m_encoder->Add(temp, resource.m_resource, exDesc);
            m_encoder->Push();

            exDesc = temp;
        }
        CVariable* tempdst = returnsImmValue ?
            m_currShader->GetNewVariable(
                numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName::NONE) :
            nullptr;
        CVariable* pPayload[2] = { nullptr, nullptr };

        const unsigned int numLanesForSimd8 = numLanes(SIMDMode::SIMD8);
        IGC_ASSERT(numLanesForSimd8);
        const unsigned int loopIter = numLanes(m_currShader->m_SIMDSize) / numLanesForSimd8;

        for (uint i = 0; i < loopIter; ++i)
        {
            pPayload[i] = m_currShader->GetNewVariable(
                parameterLength * numLanes(SIMDMode::SIMD8),
                ISA_TYPE_F,
                EALIGN_GRF,
                CName::NONE);

            int writeIndex = 0;
            if (headerPresent)
            {
                m_encoder->SetSimdSize(SIMDMode::SIMD1);
                m_encoder->SetDstSubReg(7);
                m_encoder->SetNoMask();
                m_encoder->Copy(pPayload[i], m_currShader->ImmToVariable(0xFF, ISA_TYPE_D));
                m_encoder->Push();
                ++writeIndex;
            }

            auto CopyVar = [&](CVariable* pVar)
            {
                m_encoder->SetSimdSize(SIMDMode::SIMD8);
                m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
                if (!pVar->IsUniform())
                {
                    m_encoder->SetSrcSubVar(0, i);
                }
                m_encoder->SetDstSubVar(writeIndex);
                m_encoder->Copy(pPayload[i], pVar);
                m_encoder->Push();
                ++writeIndex;
            };

            CopyVar(pU);

            if (addrDimension > 1)
                CopyVar(pV);

            if (addrDimension > 2)
                CopyVar(pR);

            if (pSrc0)
                CopyVar(pSrc0);

            if (pSrc1)
                CopyVar(pSrc1);
        }

        uint label = 0;
        CVariable* flag = nullptr;
        bool needLoop = ResourceLoopHeader(resource, flag, label);
        if (resource.m_surfaceType == ESURFACE_BINDLESS && !exDesc->IsUniform())
        {
            exDesc = UniformCopy(exDesc);
        }
        if (resource.m_surfaceType == ESURFACE_NORMAL && !resource.m_resource->IsImmediate())
        {
            CVariable* indirectMess = m_currShader->GetNewVariable(
                1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
            m_encoder->Or(indirectMess, pMessDesc, resource.m_resource);
            m_encoder->Push();
            pMessDesc = indirectMess;
        }
        for (uint i = 0; i < loopIter; ++i)
        {
            m_encoder->SetPredicate(flag);
            m_encoder->SetSimdSize(SIMDMode::SIMD8);
            m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
            m_encoder->SetDstSubVar(i);
            m_encoder->Send(tempdst, pPayload[i],
                EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, exDesc, pMessDesc);
            m_encoder->Push();
        }
        ResourceLoopBackEdge(needLoop, flag, label);

        if (returnsImmValue)
        {
            m_encoder->Cast(
                m_currShader->BitCast(m_destination, GetUnsignedIntegerType(m_destination->GetType())),
                tempdst);
            m_encoder->Push();
        }
    }
    ResetVMask();
    m_currShader->isMessageTargetDataCacheDataPort = true;
}

void setSIMDSizeMask(CEncoder* m_encoder, const CShader* m_currShader, int i)
{
    m_encoder->SetSimdSize(SIMDMode::SIMD8);
    m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
}

void EmitPass::emitTypedRead(llvm::Instruction* pInsn)
{
    const CShader::ExtractMaskWrapper writeMask(m_currShader, pInsn);
    IGC_ASSERT_MESSAGE(writeMask.hasEM() && writeMask.getEM() != 0, "Wrong write mask");

    llvm::Value* pllSrcBuffer = pInsn->getOperand(0);
    llvm::Value* pllU = pInsn->getOperand(1);
    llvm::Value* pllV = pInsn->getOperand(2);
    llvm::Value* pllR = pInsn->getOperand(3);
    llvm::Value* pllLOD = getOperandIfExist(pInsn, 4);

    CVariable* pLOD = isUndefOrConstInt0(pllLOD) ? nullptr : GetSymbol(pllLOD);
    CVariable* pR = (pLOD == nullptr && isUndefOrConstInt0(pllR)) ? nullptr : GetSymbol(pllR);
    CVariable* pV = (pR == nullptr && isUndefOrConstInt0(pllV)) ? nullptr : GetSymbol(pllV);
    CVariable* pU = GetSymbol(pllU);

    pU = BroadcastIfUniform(pU, m_currShader->GetIsUniform(pInsn));
    pV = pV ? BroadcastIfUniform(pV, m_currShader->GetIsUniform(pInsn)) : nullptr;
    pR = pR ? BroadcastIfUniform(pR, m_currShader->GetIsUniform(pInsn)) : nullptr;
    pLOD = pLOD ? BroadcastIfUniform(pLOD, m_currShader->GetIsUniform(pInsn)) : nullptr;

    ResourceDescriptor resource = GetResourceVariable(pllSrcBuffer);

    uint numChannels = iSTD::BitCount(writeMask.getEM());

    if (m_currShader->GetIsUniform(pInsn))
    {
        SIMDMode nativeDispatchMode = m_currShader->m_Platform->getMinDispatchMode();
        CVariable* tempdst = nullptr;
        tempdst = m_currShader->GetNewVariable(
            numChannels * numLanes(nativeDispatchMode),
            ISA_TYPE_F,
            EALIGN_GRF,
            CName("tyReadDest"));
        m_encoder->SetSimdSize(nativeDispatchMode);
        m_encoder->SetPredicate(nullptr);
        m_encoder->SetNoMask();
        m_encoder->TypedRead4(resource, pU, pV, pR, pLOD, tempdst, writeMask.getEM());

        m_encoder->Push();

        // Mov the required channel values to m_destination.
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();

        for (uint i = 0; i < numChannels; ++i)
        {
            m_encoder->SetSrcSubReg(0, i * numLanes(nativeDispatchMode));
            m_encoder->SetDstSubReg(i);
            m_encoder->Copy(m_destination, tempdst);
            m_encoder->Push();
        }
    }
    else
    {
        uint label = 0;
        CVariable* flag = nullptr;
        bool needLoop = ResourceLoopHeader(resource, flag, label);
        CVariable* tempdst[4] = { nullptr, nullptr, nullptr, nullptr };
        SIMDMode instWidth = std::min(
            m_currShader->m_Platform->supportsSIMD16TypedRW() ? SIMDMode::SIMD16 : SIMDMode::SIMD8,
            m_currShader->m_SIMDSize);
        bool needsSplit = m_currShader->m_SIMDSize > instWidth;
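        // e.g. a SIMD16 shader on hardware without SIMD16 typed reads issues two
        // SIMD8 reads and joins the halves afterwards (see JoinSIMD below).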

        if (!needsSplit)
        {
            m_encoder->SetPredicate(flag);
            m_encoder->TypedRead4(resource, pU, pV, pR, pLOD, m_destination, writeMask.getEM());

            m_encoder->Push();
        }
        else
        {
            const unsigned int numLanesForInstWidth = numLanes(instWidth);
            IGC_ASSERT(numLanesForInstWidth);
            const unsigned int splitInstCount = numLanes(m_currShader->m_SIMDSize) / numLanesForInstWidth;

            for (uint i = 0; i < splitInstCount; ++i)
            {
                tempdst[i] = m_currShader->GetNewVariable(
                    numChannels * numLanes(instWidth),
                    ISA_TYPE_F,
                    EALIGN_GRF,
                    CName::NONE);

                setSIMDSizeMask(m_encoder, m_currShader, i);
                m_encoder->SetSrcSubVar(0, i);
                m_encoder->SetSrcSubVar(1, i);
                m_encoder->SetSrcSubVar(2, i);
                m_encoder->SetPredicate(flag);
                m_encoder->TypedRead4(resource, pU, pV, pR, pLOD, tempdst[i], writeMask.getEM());
                m_encoder->Push();
            }
        }
        ResourceLoopBackEdge(needLoop, flag, label);

        if (m_currShader->m_SIMDSize != instWidth)
        {
            JoinSIMD(tempdst, numChannels, instWidth);
        }
    }
    m_currShader->isMessageTargetDataCacheDataPort = true;
}

void EmitPass::emitTypedWrite(llvm::Instruction* pInsn)
{
    ForceDMask();
    llvm::Value* pllDstBuffer = pInsn->getOperand(0);
    llvm::Value* pllU = pInsn->getOperand(1);
    llvm::Value* pllV = pInsn->getOperand(2);
    llvm::Value* pllR = pInsn->getOperand(3);
    llvm::Value* pllLOD = pInsn->getOperand(4);
    llvm::Value* pllSrc_X = pInsn->getOperand(5);
    llvm::Value* pllSrc_Y = pInsn->getOperand(6);
    llvm::Value* pllSrc_Z = pInsn->getOperand(7);
    llvm::Value* pllSrc_W = pInsn->getOperand(8);

    CVariable* pLOD = isUndefOrConstInt0(pllLOD) ? nullptr : GetSymbol(pllLOD);
    CVariable* pR = (pLOD == nullptr && isUndefOrConstInt0(pllR)) ? nullptr : GetSymbol(pllR);
    CVariable* pV = (pR == nullptr && isUndefOrConstInt0(pllV)) ? nullptr : GetSymbol(pllV);
    CVariable* pU = GetSymbol(pllU);

    CVariable* pSrc_X = GetSymbol(pllSrc_X);
    CVariable* pSrc_Y = GetSymbol(pllSrc_Y);
    CVariable* pSrc_Z = GetSymbol(pllSrc_Z);
    CVariable* pSrc_W = GetSymbol(pllSrc_W);

    pU = BroadcastIfUniform(pU);
    pV = pV ? BroadcastIfUniform(pV) : nullptr;
    pR = pR ? BroadcastIfUniform(pR) : nullptr;
    pLOD = pLOD ? BroadcastIfUniform(pLOD) : nullptr;

    uint writeMask =
        (!llvm::isa<UndefValue>(pllSrc_X) ? 1 : 0) |
        (!llvm::isa<UndefValue>(pllSrc_Y) ? 2 : 0) |
        (!llvm::isa<UndefValue>(pllSrc_Z) ? 4 : 0) |
        (!llvm::isa<UndefValue>(pllSrc_W) ? 8 : 0);
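    // e.g. an undef W component gives writeMask == 0b0111, so only X, Y and Z are sent.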

    ResourceDescriptor resource = GetResourceVariable(pllDstBuffer);

    if (m_currShader->GetIsUniform(pInsn))
    {
        IGC_ASSERT_MESSAGE(0, "Uniform store_uav_typed not implemented yet");
    }
    else
    {
        uint label = 0;
        CVariable* flag = nullptr;
        bool needLoop = ResourceLoopHeader(resource, flag, label);
        uint parameterLength = 4;

        SIMDMode instWidth = std::min(
            m_currShader->m_Platform->supportsSIMD16TypedRW() ? SIMDMode::SIMD16 : SIMDMode::SIMD8,
            m_currShader->m_SIMDSize);
        bool needsSplit = m_currShader->m_SIMDSize > instWidth;

        if (!needsSplit)
        {
            CVariable* pPayload = m_currShader->GetNewVariable(
                parameterLength * numLanes(m_currShader->m_SIMDSize),
                ISA_TYPE_F,
                EALIGN_GRF,
                CName::NONE);
            // pSrc_X, Y, Z & W are broadcast to uniform by this function itself.
            m_currShader->CopyVariable(pPayload, pSrc_X, 0);
            m_currShader->CopyVariable(pPayload, pSrc_Y, 1);
            m_currShader->CopyVariable(pPayload, pSrc_Z, 2);
            m_currShader->CopyVariable(pPayload, pSrc_W, 3);
            m_encoder->SetPredicate(flag);
            m_encoder->TypedWrite4(resource, pU, pV, pR, pLOD, pPayload, writeMask);

            m_encoder->Push();
        }
        else
        {
            IGC_ASSERT(instWidth == SIMDMode::SIMD8 ||
                instWidth == SIMDMode::SIMD16);
            IGC_ASSERT(m_currShader->m_SIMDSize > instWidth);
            const uint numInst = numLanes(m_currShader->m_SIMDSize) / numLanes(instWidth);
            std::vector<CVariable*> pPayload(numInst);
            for (uint i = 0; i < numInst; ++i)
            {
                pPayload[i] = m_currShader->GetNewVariable(
                    parameterLength * numLanes(instWidth),
                    ISA_TYPE_F,
                    EALIGN_GRF, CName::NONE);
                setSIMDSizeMask(m_encoder, m_currShader, i);
                if (!pSrc_X->IsUniform())
                {
                    m_encoder->SetSrcSubVar(0, i);
                }
                m_encoder->SetDstSubVar(0);
                m_encoder->Copy(pPayload[i], pSrc_X);
                m_encoder->Push();

                setSIMDSizeMask(m_encoder, m_currShader, i);
                if (!pSrc_Y->IsUniform())
                {
                    m_encoder->SetSrcSubVar(0, i);
                }
                m_encoder->SetDstSubVar(1);
                m_encoder->Copy(pPayload[i], pSrc_Y);
                m_encoder->Push();

                setSIMDSizeMask(m_encoder, m_currShader, i);
                if (!pSrc_Z->IsUniform())
                {
                    m_encoder->SetSrcSubVar(0, i);
                }
                m_encoder->SetDstSubVar(2);
                m_encoder->Copy(pPayload[i], pSrc_Z);
                m_encoder->Push();

                setSIMDSizeMask(m_encoder, m_currShader, i);
                if (!pSrc_W->IsUniform())
                {
                    m_encoder->SetSrcSubVar(0, i);
                }
                m_encoder->SetDstSubVar(3);
                m_encoder->Copy(pPayload[i], pSrc_W);
                m_encoder->Push();
                if (!m_currShader->m_Platform->canFuseTypedWrite())
                {
                    setSIMDSizeMask(m_encoder, m_currShader, i);
                    m_encoder->SetSrcSubVar(0, i);
                    m_encoder->SetSrcSubVar(1, i);
                    m_encoder->SetSrcSubVar(2, i);
                    m_encoder->SetSrcSubVar(3, i);
                    m_encoder->SetPredicate(flag);
                    m_encoder->TypedWrite4(resource, pU, pV, pR, pLOD, pPayload[i], writeMask);
                    m_encoder->Push();
                }
            }
            if (m_currShader->m_Platform->canFuseTypedWrite())
            {
                for (uint i = 0; i < numInst; ++i)
                {
                    setSIMDSizeMask(m_encoder, m_currShader, i);
                    m_encoder->SetSrcSubVar(0, i);
                    m_encoder->SetSrcSubVar(1, i);
                    m_encoder->SetSrcSubVar(2, i);
                    m_encoder->SetSrcSubVar(3, i);
                    m_encoder->SetPredicate(flag);
                    m_encoder->TypedWrite4(resource, pU, pV, pR, pLOD, pPayload[i], writeMask);
                    m_encoder->Push();
                }
            }
        }
        ResourceLoopBackEdge(needLoop, flag, label);
    }
    ResetVMask();
    m_currShader->isMessageTargetDataCacheDataPort = true;
}

static void divergentBarrierCheck(
    const CShader* Shader, const CodeGenContext& Ctx, const Instruction* I)
{
    if (IGC_IS_FLAG_DISABLED(EnableDivergentBarrierCheck))
        return;

    if (Shader->InsideWorkgroupDivergentCF(I))
    {
        Debug::DumpName name =
            IGC::Debug::GetDumpNameObj(Shader, "divergent_barrier.log");
        std::string Path = name.str();
        std::ofstream OS(Path, std::ios::app);
        if (OS.is_open())
        {
            std::string Repr;
            raw_string_ostream SS(Repr);
            I->print(SS, true);
            SS.flush();
            OS << '\n' << Repr;
            Ctx.EmitError(OS, "Possible divergent barrier found", I);
        }
    }
}

void EmitPass::emitThreadGroupBarrier(llvm::Instruction* inst)
{
    if (m_currShader->GetShaderType() == ShaderType::HULL_SHADER)
    {
        // Set the barrier counter bits in R0.2 (for use by the VISA barrier instruction).

        CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
        int instanceCount = hsProgram->DetermineInstanceCount();
        // This sets the barrier message counter bits, which is needed for HS.
        unsigned int counterBits = m_currShader->m_Platform->getBarrierCountBits(instanceCount);
        CVariable* tmpVar = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);

        if (m_currShader->m_Platform->needsHSBarrierIDWorkaround())
        {
            // Move the barrier ID into bits 27:24 of R0.2 in the payload to match the
            // GPGPU payload layout for the barrier ID. VISA assumes the barrier ID is
            // found in bits 27:24 as in the GPGPU payload, so this simple WA avoids
            // any IGC/VISA change.
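            // (R0.2 << 11) & 0x0F000000 moves the HS barrier ID from bits 16:13
            // of R0.2 into bits 27:24, where the GPGPU payload keeps it.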
            CVariable* masklower24bit = m_currShader->ImmToVariable(0xf000000, ISA_TYPE_UD);
            m_encoder->SetSrcRegion(0, 0, 1, 0);
            m_encoder->SetSrcSubReg(0, 2);
            m_encoder->Shl(tmpVar, hsProgram->GetR0(), m_currShader->ImmToVariable(11, ISA_TYPE_UD));
            m_encoder->Push();
            m_encoder->And(tmpVar, tmpVar, masklower24bit);
            m_encoder->Push();
            m_encoder->Or(tmpVar, tmpVar, m_currShader->ImmToVariable(counterBits, ISA_TYPE_UD));
            m_encoder->Push();
        }
        else
        {
            // The barrier ID bits already match the GPGPU payload.
            m_encoder->SetSrcRegion(0, 0, 1, 0);
            m_encoder->SetSrcSubReg(0, 2);
            m_encoder->Or(tmpVar, hsProgram->GetR0(), m_currShader->ImmToVariable(counterBits, ISA_TYPE_UD));
            m_encoder->Push();
        }

        m_encoder->SetDstSubReg(2);
        m_encoder->SetSimdSize(SIMDMode::SIMD1);
        m_encoder->SetNoMask();
        m_encoder->Copy(hsProgram->GetR0(), tmpVar);
        m_encoder->Push();
    }

    // OPT: Remove the barrier instruction when the thread group size is less than or equal to the SIMD size.
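    // If the whole group fits in a single HW thread, all its invocations execute
    // in lockstep and the barrier would be a no-op.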
14361 bool skipBarrierInstructionInCS = false;
14362 if (m_currShader->GetShaderType() == ShaderType::COMPUTE_SHADER)
14363 {
14364 unsigned int threadGroupSizeCS = (static_cast<CComputeShader*>(m_currShader))->GetThreadGroupSize();
14365 if (threadGroupSizeCS <= numLanes(m_SimdMode))
14366 {
14367 skipBarrierInstructionInCS = true;
14368 }
14369 }
14370 else if (m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER) {
14371 Function* F = inst->getParent()->getParent();
14372 MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
14373 uint32_t sz = IGCMetaDataHelper::getThreadGroupSize(*pMdUtils, F);
14374 if (sz != 0 && sz <= numLanes(m_SimdMode)) {
14375 skipBarrierInstructionInCS = true;
14376 }
14377 }
14378
14379 if (!skipBarrierInstructionInCS)
14380 {
14381 e_barrierKind BarrierKind = EBARRIER_NORMAL; // default
14382 GenIntrinsicInst* geninst = cast<GenIntrinsicInst>(inst);
14383 if (geninst->getIntrinsicID() == GenISAIntrinsic::GenISA_threadgroupbarrier_signal) {
14384 BarrierKind = EBARRIER_SIGNAL;
14385 }
14386 else if (geninst->getIntrinsicID() == GenISAIntrinsic::GenISA_threadgroupbarrier_wait) {
14387 BarrierKind = EBARRIER_WAIT;
14388 }
14389 m_encoder->Barrier(BarrierKind);
14390 m_encoder->Push();
14391
14392 // Set if barrier was used for this function
14393 m_encoder->SetFunctionHasBarrier(inst->getFunction());
14394
14395 divergentBarrierCheck(m_currShader, *m_pCtx, inst);
14396 }
14397 }
14398
14399
emitMemoryFence(llvm::Instruction * inst)14400 void EmitPass::emitMemoryFence(llvm::Instruction* inst)
14401 {
14402 CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
14403
14404 bool CommitEnable = llvm::cast<llvm::ConstantInt>((inst->getOperand(0)))->getValue().getBoolValue();
14405 bool L3_Flush_RW_Data = llvm::cast<llvm::ConstantInt>((inst->getOperand(1)))->getValue().getBoolValue();
14406 bool L3_Flush_Constant_Data = llvm::cast<llvm::ConstantInt>((inst->getOperand(2)))->getValue().getBoolValue();
14407 bool L3_Flush_Texture_Data = llvm::cast<llvm::ConstantInt>((inst->getOperand(3)))->getValue().getBoolValue();
14408 bool L3_Flush_Instructions = llvm::cast<llvm::ConstantInt>((inst->getOperand(4)))->getValue().getBoolValue();
14409 bool Global_Mem_Fence = true;
14410 bool L1_Invalidate = ctx->platform.hasL1ReadOnlyCache();
14411
14412 // If passed a non-constant parameter, be conservative and assume that the parameter is true
14413 if (ConstantInt* globalConst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(6)))
14414 {
14415 L1_Invalidate &= globalConst->getValue().getBoolValue();
14416 }
14417
14418 bool EmitFence = true;
14419 // If passed a non-constant parameter, be conservative and emit a fence.
14420 // We really don't want to add control-flow at this point.
14421 if (ConstantInt* globalConst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(5)))
14422 {
14423 Global_Mem_Fence = globalConst->getValue().getBoolValue();
14424 if (globalConst->isZero())
14425 {
            // Check whether we know this is a local fence (case CLK_LOCAL_MEM_FENCE).
            // If we do, skip emitting the fence, but only on BDW/SKL/BXT.
14428
14429 if (ctx->platform.localMemFenceSupress())
14430 {
14431 EmitFence = false;
14432 }
14433 }
14434 }
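    // Operand layout sketch, as decoded above: op0 = commit enable,
    // op1..op4 = L3 flush controls (RW / constant / texture / instruction),
    // op5 = global memory fence flag, op6 = L1 invalidate. A non-constant
    // op5 or op6 falls back to the conservative default chosen above.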
14435
    // For untyped memory fences an L3 flush is never necessary.
14437 L3_Flush_RW_Data = false;
14438 if (L3_Flush_RW_Data)
14439 {
        // don't flush L1 if L3 is also being flushed
14441 L1_Invalidate = false;
14442 }
14443
14444
14445 m_encoder->Fence(CommitEnable,
14446 L3_Flush_RW_Data,
14447 L3_Flush_Constant_Data,
14448 L3_Flush_Texture_Data,
14449 L3_Flush_Instructions,
14450 Global_Mem_Fence,
14451 L1_Invalidate,
14452 !EmitFence);
14453
14454 m_encoder->Push();
14455 }
14456
void EmitPass::emitMemoryFence()
14458 {
14459 m_encoder->Fence(true,
14460 false,
14461 false,
14462 false,
14463 false,
14464 true,
14465 false,
14466 false);
14467 m_encoder->Push();
14468 }
14469
void EmitPass::emitTypedMemoryFence(llvm::Instruction* inst)
14471 {
14472 CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
14473
14474 bool CommitEnable = true;
14475 bool L3_Flush_RW_Data = m_currShader->m_Platform->flushL3ForTypedMemory();
14476 bool L3_Flush_Constant_Data = false;
14477 bool L3_Flush_Texture_Data = false;
14478 bool L3_Flush_Instructions = false;
14479 bool Global_Mem_Fence = true;
14480 bool L1_Invalidate = ctx->platform.hasL1ReadOnlyCache();
14481
14482 // If passed a non-constant parameter, be conservative and assume that the parameter is true
14483 if (ConstantInt* globalConst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0)))
14484 {
14485 L1_Invalidate &= globalConst->getValue().getBoolValue();
14486 }
14487
14488
14489
14490 m_encoder->Fence(CommitEnable,
14491 L3_Flush_RW_Data,
14492 L3_Flush_Constant_Data,
14493 L3_Flush_Texture_Data,
14494 L3_Flush_Instructions,
14495 Global_Mem_Fence,
14496 L1_Invalidate,
14497 false);
14498 emitFlushSamplerCache();
14499 }
14500
14501
void EmitPass::emitFlushSamplerCache()
14503 {
14504 m_encoder->FlushSamplerCache();
14505 m_encoder->Push();
14506 }
14507
void EmitPass::emitPhaseOutput(llvm::GenIntrinsicInst* inst)
14509 {
14510 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
14511 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
14512 IGC_ASSERT(nullptr != psProgram);
14513 IGC_ASSERT(psProgram->GetPhase() == PSPHASE_COARSE);
14514
14515 unsigned int outputIndex = (unsigned int)cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
14516 CVariable* output = GetSymbol(inst->getOperand(0));
14517 if (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_PHASE_OUTPUT)
14518 {
14519 CVariable* temp =
14520 m_currShader->GetNewVariable(numLanes(m_SimdMode), output->GetType(), EALIGN_GRF, CName::NONE);
14521 m_encoder->Copy(temp, output);
14522 output = temp;
14523 }
14524
14525 psProgram->AddCoarseOutput(output, outputIndex);
14526 }
14527
void EmitPass::emitPhaseInput(llvm::GenIntrinsicInst* inst)
14529 {
14530 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
14531 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
14532 IGC_ASSERT(psProgram->GetPhase() == PSPHASE_PIXEL);
14533
14534 unsigned int inputIndex = (unsigned int)cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
14535 bool isVectorInput = inst->getIntrinsicID() == GenISAIntrinsic::GenISA_PHASE_INPUTVEC;
14536 uint16_t vectorSize = isVectorInput ?
14537 int_cast<uint16_t>(cast<ConstantInt>(inst->getArgOperand(2))->getZExtValue()) : (uint16_t)1;
14538 CVariable* input = psProgram->GetCoarseInput(inputIndex, vectorSize, m_destination->GetType());
14539
14540 // address variable represents register a0
14541 CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
14542 numLanes(m_currShader->m_SIMDSize),
14543 input->GetType(),
14544 false,
14545 true,
14546 input->getName());
14547
    // We add offsets to the base, which is the beginning of the vector variable.
14549 CVariable* index = psProgram->GetCoarseParentIndex();
14550 CVariable* byteAddress = psProgram->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_UW, EALIGN_OWORD, CName::NONE);
14551 DWORD shiftAmount = iSTD::Log2(CEncoder::GetCISADataTypeSize(input->GetType()));
14552 m_encoder->Shl(byteAddress, index, psProgram->ImmToVariable(shiftAmount, ISA_TYPE_UW));
14553 m_encoder->Push();
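    // Example of the shift above: for a D-type (4-byte) input,
    // shiftAmount = log2(4) = 2, so a parent index of 5 yields
    // byteAddress = 5 << 2 = 20 bytes from the start of the vector.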
14554
14555 if (isVectorInput)
14556 {
14557 CVariable* elementOffset = psProgram->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_UW, EALIGN_OWORD, CName::NONE);
14558 uint elementSize = numLanes(m_SimdMode) * CEncoder::GetCISADataTypeSize(input->GetType());
14559 m_encoder->Mul(elementOffset, GetSymbol(inst->getArgOperand(1)), psProgram->ImmToVariable(elementSize, ISA_TYPE_UW));
14560 m_encoder->Push();
14561 CVariable* adjustedByteAddress = psProgram->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_UW, EALIGN_OWORD, CName::NONE);
14562 m_encoder->Add(adjustedByteAddress, byteAddress, elementOffset);
14563 m_encoder->Push();
14564 byteAddress = adjustedByteAddress;
14565 }
14566
14567 m_encoder->AddrAdd(pDstArrElm, input, byteAddress);
14568 m_encoder->Push();
14569
14570 m_encoder->Copy(m_destination, pDstArrElm);
14571 m_encoder->Push();
14572 }
14573
void EmitPass::emitUniformAtomicCounter(llvm::GenIntrinsicInst* pInsn)
14575 {
14576 ForceDMask();
14577 IGC_ASSERT(pInsn->getNumOperands() == 2);
14578 GenISAIntrinsic::ID IID = pInsn->getIntrinsicID();
    /// Immediate atomics return the value from before the atomic operation
    /// is performed, so that flag needs to be set for this.
14581 bool returnsImmValue = !pInsn->user_empty();
14582
14583 llvm::Value* pllbuffer = pInsn->getOperand(0);
14584 ResourceDescriptor resource = GetResourceVariable(pllbuffer);
14585 uint binding_table_index = 0;
14586
14587 CVariable* prefixVar[2] = { nullptr, nullptr };
14588 CVariable* dst = m_destination;
14589 bool hasheader = m_currShader->m_Platform->needsHeaderForAtomicCounter();
14590
14591 EU_DATA_PORT_ATOMIC_OPERATION_TYPE atomicType = EU_DATA_PORT_ATOMIC_OPERATION_ADD;
    // For SIMD dispatch wider than 8 it is more efficient to emit a SIMD1 atomic.
14593 CVariable* src = m_currShader->ImmToVariable(
14594 IID == GenISAIntrinsic::GenISA_atomiccounterinc ? 1 : -1, ISA_TYPE_D);
14595 emitPreOrPostFixOp(EOPCODE_ADD, 0, ISA_TYPE_D, false, src, prefixVar);
14596 CVariable* pSrcCopy = prefixVar[0];
14597 if (m_currShader->m_numberInstance == 2)
14598 {
14599 pSrcCopy = prefixVar[1];
14600 }
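    // Sketch of the reduction (illustrative numbers only): emitPreOrPostFixOp
    // produces an inclusive prefix sum of +/-1 across the active lanes, so the
    // last lane holds the total, which is sent once as a SIMD1 atomic below.
    // E.g. with 4 active lanes incrementing a counter whose value is 10, the
    // prefix sums are {1,2,3,4}, the atomic adds 4 and returns 10, and each
    // lane reconstructs its own value 10 + prefix - 1, i.e. {10,11,12,13}.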
14601
14602 CVariable* pHeader = nullptr;
14603 if (hasheader)
14604 {
14605 pHeader = m_currShader->GetNewVariable(
14606 numLanes(SIMDMode::SIMD8),
14607 ISA_TYPE_UD,
14608 EALIGN_GRF, CName::NONE);
14609
14610 m_encoder->SetNoMask();
14611 m_encoder->SetSimdSize(SIMDMode::SIMD1);
14612 m_encoder->SetDstSubReg(7);
14613 m_encoder->Copy(pHeader, m_currShader->ImmToVariable(0xFFFF, ISA_TYPE_UD));
14614 m_encoder->Push();
14615 }
14616
14617 CVariable* pPayload = m_currShader->GetNewVariable(
14618 8,
14619 ISA_TYPE_D,
14620 EALIGN_GRF,
14621 true, CName::NONE);
14622 m_encoder->SetSimdSize(SIMDMode::SIMD1);
14623 m_encoder->SetSrcRegion(0, 0, 1, 0);
14624 m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
14625 m_encoder->Copy(pPayload, pSrcCopy);
14626 m_encoder->Push();
14627 dst = m_currShader->GetNewVariable(
14628 8,
14629 ISA_TYPE_D,
14630 EALIGN_GRF,
14631 true, CName::NONE);
14632
14633 if (resource.m_surfaceType == ESURFACE_SSHBINDLESS)
14634 binding_table_index = SSH_BINDLESS_BTI;
14635 else if (resource.m_surfaceType == ESURFACE_BINDLESS)
14636 binding_table_index = BINDLESS_BTI;
14637 else
14638 binding_table_index = (uint)resource.m_resource->GetImmediateValue();
14639
14640 uint messageDescriptor = encodeMessageDescriptorForAtomicUnaryOp(
14641 1,
14642 returnsImmValue ? 1 : 0,
14643 hasheader,
14644 EU_GEN7_5_DATA_CACHE_1_MESSAGE_TYPE_ATOMIC_COUNTER_OPERATION,
14645 returnsImmValue,
14646 SIMDMode::SIMD8,
14647 atomicType,
14648 binding_table_index);
14649
14650 CVariable* pMessDesc = m_currShader->ImmToVariable(messageDescriptor, ISA_TYPE_D);
14651 // src1 len = 1, SFID = DC1
14652 uint32_t src1Len = hasheader ? 1 : 0;
    // src1Len is not encoded in the extended descriptor when the 26-bit BSO format is used
14654 if (m_currShader->m_Platform->support26BitBSOFormat() &&
14655 (resource.m_surfaceType == ESURFACE_BINDLESS || resource.m_surfaceType == ESURFACE_SCRATCH))
14656 {
14657 src1Len = 0;
14658 }
14659 uint32_t exDescVal = (src1Len << 6) | EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1;
14660 CVariable* exDesc =
14661 m_currShader->ImmToVariable(exDescVal, ISA_TYPE_D);
14662
14663 if (resource.m_surfaceType == ESURFACE_BINDLESS || resource.m_surfaceType == ESURFACE_SSHBINDLESS)
14664 {
14665 CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
14666 m_encoder->Add(temp, resource.m_resource, exDesc);
14667 m_encoder->Push();
14668 exDesc = temp;
14669 }
14670
14671 m_encoder->SetSimdSize(SIMDMode::SIMD1);
14672 m_encoder->SetNoMask();
14673
14674 if (hasheader)
14675 {
14676 m_encoder->Sends(returnsImmValue ? dst : nullptr, pHeader, pPayload,
14677 EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, exDesc, pMessDesc);
14678 }
14679 else
14680 {
14681 m_encoder->Send(
14682 returnsImmValue ? dst : NULL,
14683 pPayload,
14684 EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1,
14685 exDesc,
14686 pMessDesc);
14687 }
14688 m_encoder->Push();
14689
14690 if (returnsImmValue)
14691 {
14692 unsigned int counter = m_currShader->m_numberInstance;
14693 for (unsigned int i = 0; i < counter; ++i)
14694 {
14695 m_encoder->SetSecondHalf(i == 1);
14696 m_encoder->Add(m_destination, prefixVar[i], dst);
14697 m_encoder->Push();
14698
14699 if (IID == GenISAIntrinsic::GenISA_atomiccounterinc)
14700 {
14701 CVariable* src = m_currShader->ImmToVariable(-1, ISA_TYPE_D);
14702 m_encoder->Add(m_destination, m_destination, src);
14703 m_encoder->Push();
14704 }
14705 }
14706 }
14707
14708 ResetVMask();
14709 m_currShader->isMessageTargetDataCacheDataPort = true;
14710 }
14711
void EmitPass::emitAtomicCounter(llvm::GenIntrinsicInst* pInsn)
14713 {
14714
14715 IGC_ASSERT(pInsn->getNumOperands() == 2);
14716
14717 bool uniformAtomic = IsUniformAtomic(pInsn) &&
14718 (m_currShader->m_SIMDSize != SIMDMode::SIMD8 || !m_currShader->m_Platform->HDCCoalesceAtomicCounterAccess());
14719 if (uniformAtomic)
14720 {
14721 emitUniformAtomicCounter(pInsn);
14722 return;
14723 }
14724
14725 ForceDMask();
14726 GenISAIntrinsic::ID IID = pInsn->getIntrinsicID();
    /// Immediate atomics return the value from before the atomic operation
    /// is performed, so that flag needs to be set for this.
14729 bool returnsImmValue = !pInsn->user_empty();
14730
14731 llvm::Value* pllbuffer = pInsn->getOperand(0);
14732 ResourceDescriptor resource = GetResourceVariable(pllbuffer);
14733
14734 CVariable* dst = m_destination;
14735
14736 bool hasheader = true;
14737 unsigned int num_split = m_currShader->m_SIMDSize == SIMDMode::SIMD16 ? 2 : 1;
14738
14739 // header
14740 CVariable* pPayload = m_currShader->GetNewVariable(
14741 numLanes(SIMDMode::SIMD8),
14742 ISA_TYPE_UD,
14743 EALIGN_GRF, CName::NONE);
14744 m_encoder->SetNoMask();
14745 m_encoder->SetSimdSize(SIMDMode::SIMD1);
14746 m_encoder->SetDstSubReg(7);
14747 m_encoder->Copy(pPayload, m_currShader->ImmToVariable(0xFFFF, ISA_TYPE_UD));
14748 m_encoder->Push();
14749
14750 EU_DATA_PORT_ATOMIC_OPERATION_TYPE atomicType = EU_DATA_PORT_ATOMIC_OPERATION_INC;
14751 if (IID == GenISAIntrinsic::GenISA_atomiccounterpredec)
14752 {
14753 atomicType = m_currShader->m_Platform->hasAtomicPreDec() ?
14754 EU_DATA_PORT_ATOMIC_OPERATION_PREDEC : EU_DATA_PORT_ATOMIC_OPERATION_DEC;
14755 }
14756
14757 uint label = 0;
14758 CVariable* flag = nullptr;
14759 bool needLoop = ResourceLoopHeader(resource, flag, label);
14760
14761 uint messageDescriptor = encodeMessageDescriptorForAtomicUnaryOp(
14762 1,
14763 returnsImmValue ? 1 : 0,
14764 hasheader,
14765 EU_GEN7_5_DATA_CACHE_1_MESSAGE_TYPE_ATOMIC_COUNTER_OPERATION,
14766 returnsImmValue,
14767 SIMDMode::SIMD8,
14768 atomicType,
14769 resource.m_surfaceType == ESURFACE_BINDLESS ? BINDLESS_BTI : (uint)resource.m_resource->GetImmediateValue());
14770
14771 CVariable* pMessDesc = m_currShader->ImmToVariable(messageDescriptor, ISA_TYPE_D);
14772 CVariable* exDesc =
14773 m_currShader->ImmToVariable(EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1, ISA_TYPE_D);
14774
14775 if (resource.m_surfaceType == ESURFACE_BINDLESS)
14776 {
14777 CVariable* temp = m_currShader->GetNewVariable(resource.m_resource);
14778 m_encoder->Add(temp, resource.m_resource, exDesc);
14779 m_encoder->Push();
14780
14781 exDesc = temp;
14782 }
14783
14784 for (uint32_t i = 0; i < num_split; ++i)
14785 {
14786 m_encoder->SetSimdSize(SIMDMode::SIMD8);
14787 m_encoder->SetDstSubVar(i);
14788 m_encoder->SetMask((i == 0) ? EMASK_Q1 : EMASK_Q2);
14789
14790 m_encoder->Send(
14791 returnsImmValue ? dst : NULL,
14792 pPayload,
14793 EU_GEN7_5_MESSAGE_TARGET_DATA_PORT_DATA_CACHE_1,
14794 exDesc,
14795 pMessDesc);
14796 m_encoder->Push();
14797 }
14798
14799 if (IID == GenISAIntrinsic::GenISA_atomiccounterpredec &&
14800 !m_currShader->m_Platform->hasAtomicPreDec())
14801 {
14802 unsigned int counter = m_currShader->m_numberInstance;
14803 for (unsigned int i = 0; i < counter; ++i)
14804 {
14805 m_encoder->SetSecondHalf(i == 1);
14806 CVariable* src = m_currShader->ImmToVariable(-1, ISA_TYPE_D);
14807 m_encoder->Add(m_destination, m_destination, src);
14808 m_encoder->Push();
14809 }
14810 }
14811
14812 ResourceLoopBackEdge(needLoop, flag, label);
14813 ResetVMask();
14814 m_currShader->isMessageTargetDataCacheDataPort = true;
14815 }
14816
void EmitPass::CmpBoolOp(llvm::BinaryOperator* inst,
    llvm::CmpInst::Predicate predicate,
    const SSource cmpSources[2],
    const SSource& bitSource,
    const DstModifier& modifier)
14822 {
14823
14824 DstModifier init;
14825 Cmp(predicate, cmpSources, init);
14826
14827 IGC_ASSERT(bitSource.mod == EMOD_NONE);
14828 CVariable* boolOpSource = GetSrcVariable(bitSource);
14829 m_encoder->SetDstModifier(modifier);
14830
14831 EmitSimpleAlu(inst, m_destination, m_destination, boolOpSource);
14832 }
14833
void EmitPass::emitAluConditionMod(Pattern* aluPattern, Instruction* alu, CmpInst* cmp, int aluOprdNum)
14835 {
14836 CVariable* temp = m_currShader->GetNewVector(alu);
14837 CVariable* dst = m_destination;
14838 m_destination = temp;
14839 DstModifier init;
14840
14841 aluPattern->Emit(this, init);
14842
    // condMod is in the form of "alu cmpOp 0". If the pattern is in the form
    // of "0 cmpOp alu", cmp's predicate should be swapped. aluOprdNum
    // indicates which form this pattern takes.
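    // E.g. for the IR "%c = icmp slt i32 0, %alu" the alu value is operand 1
    // (aluOprdNum == 1), so the swapped predicate "sgt" is used and we emit
    // the equivalent "cmp.gt alu, 0".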
14846 auto llvmPredicate = (aluOprdNum == 0 ? cmp->getPredicate() : cmp->getSwappedPredicate());
14847 e_predicate predicate = GetPredicate(llvmPredicate);
14848 if (IsUnsignedCmp(llvmPredicate))
14849 {
14850 temp = m_currShader->BitCast(temp, GetUnsignedType(temp->GetType()));
14851 }
14852 m_encoder->Cmp(predicate, dst, temp, m_currShader->ImmToVariable(0, temp->GetType()));
14853 m_encoder->Push();
14854 m_destination = dst;
14855 }
14856
void EmitPass::emitHSTessFactors(llvm::Instruction* pInst)
14858 {
14859 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::HULL_SHADER);
14860 CHullShader* hsProgram = static_cast<CHullShader*>(m_currShader);
14861 CVariable* payload[8];
14862
14863 for (uint32_t channel = 2; channel < 8; channel++)
14864 {
14865 payload[channel] = GetSymbol(pInst->getOperand(channel - 2));
14866 }
14867
14868 bool endOfThread = llvm::isa<llvm::ReturnInst>(pInst->getNextNode());
14869 hsProgram->EmitPatchConstantHeader(payload, endOfThread);
14870 }
14871
void EmitPass::emitRenderTargetRead(llvm::GenIntrinsicInst* inst)
14873 {
14874 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
14875 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
14876 uint RTIndex = 0;
14877 bool isRTIndexConstant = false;
14878 if (llvm::ConstantInt * pRenderTargetCnst = llvm::dyn_cast<llvm::ConstantInt>(inst->getOperand(0)))
14879 {
14880 RTIndex = (uint)llvm::cast<llvm::ConstantInt>(pRenderTargetCnst)->getZExtValue();
14881 isRTIndexConstant = true;
14882 }
14883
14884 uint bindingTableIndex = m_currShader->m_pBtiLayout->GetRenderTargetIndex(RTIndex);
14885 m_currShader->SetBindingTableEntryCountAndBitmap(isRTIndexConstant, RENDER_TARGET, RTIndex, bindingTableIndex);
14886 CVariable* pSampleIndexR0 = nullptr;
14887
14888 uint hasSampleIndex = (inst->getIntrinsicID() == GenISAIntrinsic::GenISA_RenderTargetReadSampleFreq);
14889 if (hasSampleIndex)
14890 {
14891 CVariable* pShiftedSampleIndex = nullptr;
14892 if (llvm::isa<llvm::ConstantInt>(inst->getOperand(1)))
14893 {
14894 uint sampleIndex = int_cast<uint>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
14895 pShiftedSampleIndex = m_currShader->ImmToVariable((sampleIndex << 6), ISA_TYPE_D);
14896 }
14897 else
14898 {
14899 CVariable* SampleIndex = GetSymbol(inst->getOperand(1));
14900 if (!SampleIndex->IsUniform())
14901 {
14902 SampleIndex = UniformCopy(SampleIndex);
14903 }
14904 pShiftedSampleIndex = m_currShader->GetNewVariable(SampleIndex);
14905 m_encoder->Shl(pShiftedSampleIndex, SampleIndex, m_currShader->ImmToVariable(6, ISA_TYPE_D));
14906 m_encoder->Push();
14907 }
14908
14909 // and (1) r15.0<1>:ud r0.0<0;1,0>:ud 0xFFFFFC3F:ud
14910 // or (1) r16.0<1>:ud r15.0<0;1,0>:ud r14.0<0;1,0>:ud
14911 pSampleIndexR0 = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_D, EALIGN_GRF, CName::NONE);
14912 m_encoder->SetSimdSize(SIMDMode::SIMD8);
14913 m_encoder->SetNoMask();
14914 m_encoder->Copy(pSampleIndexR0, psProgram->GetR0());
14915 m_encoder->Push();
14916
14917 m_encoder->SetSimdSize(SIMDMode::SIMD1);
14918 m_encoder->SetSrcRegion(0, 0, 1, 0);
14919 m_encoder->SetSrcSubReg(0, 0);
14920 m_encoder->SetNoMask();
14921 m_encoder->And(pSampleIndexR0, pSampleIndexR0, m_currShader->ImmToVariable(0xFFFFFC3F, ISA_TYPE_UD));
14922 m_encoder->Push();
14923
14924 m_encoder->SetSimdSize(SIMDMode::SIMD1);
14925 m_encoder->SetSrcRegion(0, 0, 1, 0);
14926 m_encoder->SetSrcSubReg(0, 0);
14927 m_encoder->SetSrcRegion(1, 0, 1, 0);
14928 m_encoder->SetSrcSubReg(1, 0);
14929 m_encoder->SetNoMask();
14930 m_encoder->Or(pSampleIndexR0, pSampleIndexR0, pShiftedSampleIndex);
14931 m_encoder->Push();
14932 }
14933
    // The RT read header is 2 GRFs
14935 uint messageLength = 2;
14936 uint responseLength = 4 * numLanes(m_currShader->m_SIMDSize) / 8;
14937 bool headerRequired = true;
14938
14939 // We shouldn't need any copies since R0 and R1 are already aligned
14940 // but we don't want to declare R0 and R1 as one variable in V-ISA
14941 // The problem could be fixed by moving away from raw_send for this message
14942 CVariable* payload =
14943 m_currShader->GetNewVariable(messageLength * (getGRFSize() >> 2), ISA_TYPE_D, EALIGN_GRF, CName::NONE);
14944 m_encoder->SetNoMask();
14945 m_encoder->SetSimdSize(SIMDMode::SIMD8);
14946 m_encoder->Copy(payload, (hasSampleIndex ? pSampleIndexR0 : psProgram->GetR0()));
14947 m_encoder->Push();
14948
14949 // The following bits must be set to 0 for render target read messages:
14950 // Bit 11 - Source0 Alpha Present to Render Target
14951 // Bit 12 - oMask to Render Target
14952 // Bit 13 - Source Depth Present to Render Target
14953 // Bit 14 - Stencil Present to Render Target
14954 m_encoder->SetSimdSize(SIMDMode::SIMD1);
14955 m_encoder->SetSrcRegion(0, 0, 1, 0);
14956 m_encoder->SetSrcSubReg(0, 0);
14957 m_encoder->SetNoMask();
14958 m_encoder->And(payload, payload, m_currShader->ImmToVariable(0xFFFF87FF, ISA_TYPE_UD));
14959 m_encoder->Push();
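    // Mask check for the AND above: bits 11..14 together form 0x7800
    // (0b0111_1000_0000_0000), and ~0x7800 = 0xFFFF87FF, so the AND clears
    // exactly those four bits.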
14960
14961 m_encoder->SetNoMask();
14962 m_encoder->SetSimdSize(SIMDMode::SIMD8);
14963 m_encoder->SetDstSubVar(1);
14964 m_encoder->Copy(payload, psProgram->GetR1());
14965 m_encoder->Push();
14966
14967 uint msgControl =
14968 (m_SimdMode == SIMDMode::SIMD8)
14969 ? EU_GEN9_DATA_PORT_RENDER_TARGET_READ_CONTROL_SIMD8_SINGLE_SOURCE_LOW
14970 : EU_GEN9_DATA_PORT_RENDER_TARGET_READ_CONTROL_SIMD16_SINGLE_SOURCE;
14971 msgControl |=
14972 m_encoder->IsSecondHalf() ? EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_SLOTGRP_HI : EU_GEN6_DATA_PORT_RENDER_TARGET_WRITE_SLOTGRP_LO;
14973 msgControl |= psProgram->IsPerSample() ? EU_GEN9_DATA_PORT_RENDER_TARGET_READ_CONTROL_PER_SAMPLE_ENABLE : 0;
14974
14975 uint Desc = DataPortRead(
14976 messageLength,
14977 responseLength,
14978 headerRequired,
14979 EU_DATA_PORT_READ_MESSAGE_TYPE_RENDER_TARGET_READ,
14980 msgControl,
14981 hasSampleIndex ? true : false,
14982 DATA_PORT_TARGET_RENDER_CACHE,
14983 bindingTableIndex);
14984
14985 uint exDesc = EU_MESSAGE_TARGET_DATA_PORT_WRITE;
14986
14987 CVariable* messDesc;
14988 if (isRTIndexConstant)
14989 {
14990 messDesc = psProgram->ImmToVariable(Desc, ISA_TYPE_UD);
14991 }
14992 else
14993 {
14994 messDesc = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, CName::NONE);
        // RTIndex is not a constant, so add the value to desc to get the correct RTIndex
14996 m_encoder->Add(messDesc, GetSymbol(inst->getOperand(0)), psProgram->ImmToVariable(Desc, ISA_TYPE_UD));
14997 m_encoder->Push();
14998 }
14999 //sendc
15000 m_encoder->SendC(m_destination, payload, exDesc, messDesc);
15001 m_encoder->Push();
15002 }
15003
ERoundingMode EmitPass::GetRoundingMode_FPCvtInt(Instruction* pInst)
15005 {
    if (isa<FPToSIInst>(pInst) || isa<FPToUIInst>(pInst))
15007 {
15008 const ERoundingMode defaultRoundingMode_FPCvtInt = static_cast<ERoundingMode>(
15009 m_pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode);
15010 return defaultRoundingMode_FPCvtInt;
15011 }
15012
15013 if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(pInst))
15014 {
15015 switch (GII->getIntrinsicID())
15016 {
15017 default:
15018 break;
15019 case GenISAIntrinsic::GenISA_ftoui_rtn:
15020 case GenISAIntrinsic::GenISA_ftoi_rtn:
15021 return ERoundingMode::ROUND_TO_NEGATIVE;
15022 case GenISAIntrinsic::GenISA_ftoui_rtp:
15023 case GenISAIntrinsic::GenISA_ftoi_rtp:
15024 return ERoundingMode::ROUND_TO_POSITIVE;
15025 case GenISAIntrinsic::GenISA_ftoui_rte:
15026 case GenISAIntrinsic::GenISA_ftoi_rte:
15027 return ERoundingMode::ROUND_TO_NEAREST_EVEN;
15028 }
15029 }
15030 // rounding not needed!
15031 return ERoundingMode::ROUND_TO_ANY;
15032 }
15033
ERoundingMode EmitPass::GetRoundingMode_FP(Instruction* inst)
15035 {
15036 // Float rounding mode
15037 ERoundingMode RM = static_cast<ERoundingMode>(m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
15038 if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(inst))
15039 {
15040 switch (GII->getIntrinsicID())
15041 {
15042 case GenISAIntrinsic::GenISA_f32tof16_rtz:
15043 case GenISAIntrinsic::GenISA_ftof_rtz:
15044 case GenISAIntrinsic::GenISA_itof_rtz:
15045 case GenISAIntrinsic::GenISA_uitof_rtz:
15046 case GenISAIntrinsic::GenISA_add_rtz:
15047 case GenISAIntrinsic::GenISA_mul_rtz:
15048 case GenISAIntrinsic::GenISA_fma_rtz:
15049 RM = ERoundingMode::ROUND_TO_ZERO;
15050 break;
15051 case GenISAIntrinsic::GenISA_ftof_rtn:
15052 case GenISAIntrinsic::GenISA_itof_rtn:
15053 case GenISAIntrinsic::GenISA_uitof_rtn:
15054 case GenISAIntrinsic::GenISA_fma_rtn:
15055 RM = ERoundingMode::ROUND_TO_NEGATIVE;
15056 break;
15057 case GenISAIntrinsic::GenISA_ftof_rtp:
15058 case GenISAIntrinsic::GenISA_itof_rtp:
15059 case GenISAIntrinsic::GenISA_uitof_rtp:
15060 case GenISAIntrinsic::GenISA_fma_rtp:
15061 RM = ERoundingMode::ROUND_TO_POSITIVE;
15062 break;
15063 case GenISAIntrinsic::GenISA_ftof_rte:
15064 RM = ERoundingMode::ROUND_TO_NEAREST_EVEN;
15065 break;
15066 case GenISAIntrinsic::GenISA_ftobf:
15067 case GenISAIntrinsic::GenISA_2fto2bf:
15068 {
15069 ConstantInt* rmVal;
15070 if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_2fto2bf) {
15071 rmVal = cast<ConstantInt>(GII->getArgOperand(2));
15072 }
15073 else {
15074 rmVal = cast<ConstantInt>(GII->getArgOperand(1));
15075 }
15076 RM = (ERoundingMode)rmVal->getZExtValue();
15077 break;
15078 }
15079 default:
15080 break;
15081 }
15082 }
15083 return RM;
15084 }
15085
bool EmitPass::ignoreRoundingMode(llvm::Instruction* inst) const
15087 {
15088 auto isFZero = [](Value* V)->bool {
15089 if (ConstantFP* FCST = dyn_cast<ConstantFP>(V))
15090 {
15091 return FCST->isZero();
15092 }
15093 return false;
15094 };
15095
15096 if (isa<InsertElementInst>(inst) ||
15097 isa<ExtractElementInst>(inst) ||
15098 isa<BitCastInst>(inst) ||
15099 isa<ICmpInst>(inst) ||
15100 isa<FCmpInst>(inst) ||
15101 isa<SelectInst>(inst) ||
15102 isa<TruncInst>(inst) ||
15103 isa<LoadInst>(inst) ||
15104 isa<StoreInst>(inst))
15105 {
15106 // these are not affected by rounding mode.
15107 return true;
15108 }
15109
15110 if (BinaryOperator* BOP = dyn_cast<BinaryOperator>(inst))
15111 {
15112 if (BOP->getType()->isIntOrIntVectorTy()) {
15113 // Integer binary op does not need rounding mode
15114 return true;
15115 }
15116
        // Float operations on EM use RTNE only and are not affected
        // by the rounding mode.
15119 if (BOP->getType()->isFPOrFPVectorTy())
15120 {
15121 switch (BOP->getOpcode())
15122 {
15123 default:
15124 break;
15125 case Instruction::FDiv:
15126 return true;
15127 case Instruction::FSub:
15128 // Negation is okay for any rounding mode
15129 if (isFZero(BOP->getOperand(0))) {
15130 return true;
15131 }
15132 break;
15133 }
15134 }
15135 }
15136 if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(inst))
15137 {
15138 switch (II->getIntrinsicID())
15139 {
15140 default:
15141 break;
15142 case IGCLLVM::Intrinsic::exp2:
15143 case IGCLLVM::Intrinsic::sqrt:
15144 return true;
15145 }
15146 }
15147
15148 if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(inst))
15149 {
15150 GenISAIntrinsic::ID id = GII->getIntrinsicID();
15151 switch (id)
15152 {
15153 case GenISAIntrinsic::GenISA_bftof:
15154 return true;
15155 default:
15156 break;
15157 }
15158 }
15159 // add more instr as needed
15160 return false;
15161 }
15162
void EmitPass::initDefaultRoundingMode()
15164 {
15165 const ERoundingMode defaultRM_FP = static_cast<ERoundingMode>(m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
15166 const ERoundingMode defaultRM_FPCvtInt = static_cast<ERoundingMode>(m_pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode);
15167
15168 // Rounding modes must meet the following restrictions
15169 // in order to be used as default:
15170 // 1. if FPCvtInt's RM is rtz, FP's RM can be any;
    // 2. otherwise, FPCvtInt's RM must be the same as FP's RM
15172 const bool supportedDefaultRoundingModes =
15173 ((defaultRM_FPCvtInt == ERoundingMode::ROUND_TO_ZERO) ||
15174 (defaultRM_FPCvtInt == defaultRM_FP));
15175
15176 IGC_ASSERT_EXIT(supportedDefaultRoundingModes);
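    // Examples of the rule above: (FP=RTE, FPCvtInt=RTZ) and
    // (FP=RTP, FPCvtInt=RTP) are acceptable defaults, while
    // (FP=RTE, FPCvtInt=RTN) trips the assert, since a non-RTZ cvt mode
    // must match the FP mode.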
15177
15178 m_roundingMode_FPCvtInt = defaultRM_FPCvtInt;
15179 m_roundingMode_FP = defaultRM_FP;
15180 }
15181
void EmitPass::SetRoundingMode_FP(ERoundingMode newRM_FP)
15183 {
15184 if (newRM_FP != ERoundingMode::ROUND_TO_ANY &&
15185 newRM_FP != m_roundingMode_FP)
15186 {
15187 m_encoder->SetRoundingMode_FP(m_roundingMode_FP, newRM_FP);
15188 m_roundingMode_FP = newRM_FP;
15189
15190 if (m_roundingMode_FPCvtInt != ERoundingMode::ROUND_TO_ZERO)
15191 {
15192 // If FPCvtInt's RM is not RTZ, it must be the same as FP's
15193 m_roundingMode_FPCvtInt = m_roundingMode_FP;
15194 }
15195 }
15196 }
15197
void EmitPass::SetRoundingMode_FPCvtInt(ERoundingMode newRM_FPCvtInt)
15199 {
15200 if (newRM_FPCvtInt != ERoundingMode::ROUND_TO_ANY &&
15201 newRM_FPCvtInt != m_roundingMode_FPCvtInt)
15202 {
15203 m_encoder->SetRoundingMode_FPCvtInt(m_roundingMode_FPCvtInt, newRM_FPCvtInt);
15204 m_roundingMode_FPCvtInt = newRM_FPCvtInt;
15205
15206 if (m_roundingMode_FPCvtInt != ERoundingMode::ROUND_TO_ZERO)
15207 {
15208 // If FPCvtInt's RM is not RTZ, it must be the same as FP's
15209 m_roundingMode_FP = m_roundingMode_FPCvtInt;
15210 }
15211 }
15212 }
15213
// Return true if inst needs a specific rounding mode; false otherwise.
//
// Currently, only gen intrinsics need a rounding mode other than the default.
bool EmitPass::setRMExplicitly(Instruction* inst)
15218 {
15219 if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(inst))
15220 {
15221 switch (GII->getIntrinsicID())
15222 {
15223 case GenISAIntrinsic::GenISA_f32tof16_rtz:
15224 case GenISAIntrinsic::GenISA_ftof_rtz:
15225 case GenISAIntrinsic::GenISA_itof_rtz:
15226 case GenISAIntrinsic::GenISA_uitof_rtz:
15227 case GenISAIntrinsic::GenISA_add_rtz:
15228 case GenISAIntrinsic::GenISA_mul_rtz:
15229 case GenISAIntrinsic::GenISA_fma_rtz:
15230 case GenISAIntrinsic::GenISA_fma_rtp:
15231 case GenISAIntrinsic::GenISA_fma_rtn:
15232 case GenISAIntrinsic::GenISA_ftof_rtn:
15233 case GenISAIntrinsic::GenISA_itof_rtn:
15234 case GenISAIntrinsic::GenISA_uitof_rtn:
15235 case GenISAIntrinsic::GenISA_ftof_rtp:
15236 case GenISAIntrinsic::GenISA_itof_rtp:
15237 case GenISAIntrinsic::GenISA_uitof_rtp:
15238 case GenISAIntrinsic::GenISA_ftobf:
15239 case GenISAIntrinsic::GenISA_2fto2bf:
15240 return true;
15241 default:
15242 break;
15243 }
15244 }
15245 return false;
15246 }
15247
void EmitPass::ResetRoundingMode(Instruction* inst)
15249 {
    // Reset rounding modes to default if they are not. However, if the
    // next inst requires a non-default mode, which it will set
    // explicitly, don't set the default rounding modes here and let the
    // next inst set them explicitly.
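    // E.g. if a GenISA_ftof_rtz left the FP mode at RTZ and the next
    // RM-sensitive instruction is another *_rtz intrinsic, nothing is
    // restored here; if instead it is a plain fadd, the default FP mode
    // is restored below before that instruction is emitted.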
15254 const ERoundingMode defaultRoundingMode_FP = static_cast<ERoundingMode>(
15255 m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
15256 const ERoundingMode defaultRoundingMode_FPCvtInt = static_cast<ERoundingMode>(
15257 m_pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode);
15258
15259 if (m_roundingMode_FP == defaultRoundingMode_FP &&
15260 m_roundingMode_FPCvtInt == defaultRoundingMode_FPCvtInt)
15261 {
15262 // Already in default mode.
15263 return;
15264 }
15265
15266 // Those two variables are set to true if default RM is required before the next
15267 // explicit-RM setting instruction (genintrinsic).
15268 bool nextImplicitFPCvtInt = false;
15269 bool nextImplicitFP = false;
15270 for (auto nextInst = GetNextInstruction(inst);
15271 nextInst != nullptr;
15272 nextInst = GetNextInstruction(nextInst))
15273 {
15274 if (ignoreRoundingMode(nextInst))
15275 {
15276 continue;
15277 }
15278 if (setRMExplicitly(nextInst))
15279 {
15280 // As nextInst will set RM explicitly, no need to go further.
15281 break;
15282 }
15283
15284 // At this point, a default RM is needed. For FPCvtInt, we know
15285 // precisely whether FPCvtInt RM is needed or not; but for FP, we
15286 // do it conservatively as we do not scan all instructions here.
15287 ERoundingMode intRM = GetRoundingMode_FPCvtInt(nextInst);
15288
15289 // If it is not ROUND_TO_ANY, it uses FPCvtInt RM;
15290 // otherwise, it does not use FPCvtInt RM.
15291 if (intRM != ERoundingMode::ROUND_TO_ANY) {
15292 nextImplicitFPCvtInt = true;
15293 }
15294 else {
15295 // Conservatively assume FP default RM is used.
15296 nextImplicitFP = true;
15297 }
15298
15299 if (nextImplicitFPCvtInt && nextImplicitFP) {
15300 break;
15301 }
15302 }
15303
15304 if (nextImplicitFPCvtInt && !nextImplicitFP)
15305 {
15306 SetRoundingMode_FPCvtInt(defaultRoundingMode_FPCvtInt);
15307 }
15308 else if (nextImplicitFP && !nextImplicitFPCvtInt)
15309 {
15310 SetRoundingMode_FP(defaultRoundingMode_FP);
15311 }
15312 else if (nextImplicitFP && nextImplicitFPCvtInt)
15313 {
15314 // Need to set default for both
15315 if (defaultRoundingMode_FPCvtInt == ERoundingMode::ROUND_TO_ZERO)
15316 {
15317 SetRoundingMode_FP(defaultRoundingMode_FP);
15318 }
15319 else
15320 {
15321 SetRoundingMode_FPCvtInt(defaultRoundingMode_FPCvtInt);
15322 }
15323 }
15324 }
15325
void EmitPass::emitf32tof16_rtz(llvm::GenIntrinsicInst* inst)
15327 {
15328 CVariable* src = GetSymbol(inst->getOperand(0));
15329 CVariable imm0_hf(0, ISA_TYPE_HF);
15330 CVariable* dst_hf = m_currShader->BitCast(m_destination, ISA_TYPE_HF);
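    // Result layout sketch: viewing the destination as HF with a destination
    // stride of 2 places the RTZ-converted f16 value in the low 16 bits of
    // each dword; the second mov below zeroes the high 16 bits.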
15331
15332 SetRoundingMode_FP(ERoundingMode::ROUND_TO_ZERO);
15333
15334 m_encoder->SetDstRegion(2);
15335 m_encoder->Cast(dst_hf, src);
15336 m_encoder->Push();
15337
15338 m_encoder->SetDstRegion(2);
15339 m_encoder->SetDstSubReg(1);
15340 m_encoder->Copy(dst_hf, &imm0_hf);
15341 m_encoder->Push();
15342
15343 ResetRoundingMode(inst);
15344 }
15345
void EmitPass::emitfitof(llvm::GenIntrinsicInst* inst)
15347 {
15348 CVariable* src = GetSymbol(inst->getOperand(0));
15349 ERoundingMode RM = GetRoundingMode_FP(inst);
15350 CVariable* dst = m_destination;
15351
15352 GenISAIntrinsic::ID id = inst->getIntrinsicID();
15353 if (id == GenISAIntrinsic::GenISA_uitof_rtn ||
15354 id == GenISAIntrinsic::GenISA_uitof_rtp ||
15355 id == GenISAIntrinsic::GenISA_uitof_rtz)
15356 {
15357 src = m_currShader->BitCast(src, GetUnsignedType(src->GetType()));
15358 }
15359
15360 SetRoundingMode_FP(RM);
15361
15362 m_encoder->Cast(dst, src);
15363 m_encoder->Push();
15364
15365 ResetRoundingMode(inst);
15366 }
15367
15368 // Emit FP Operations (FPO) using round-to-zero (rtz)
void EmitPass::emitFPOrtz(llvm::GenIntrinsicInst* inst)
15370 {
15371 IGC_ASSERT_MESSAGE(inst->getNumArgOperands() >= 2, "ICE: incorrect gen intrinsic");
15372
15373 GenISAIntrinsic::ID GID = inst->getIntrinsicID();
15374 CVariable* src0 = GetSymbol(inst->getOperand(0));
15375 CVariable* src1 = GetSymbol(inst->getOperand(1));
15376 CVariable* dst = m_destination;
15377
15378 SetRoundingMode_FP(ERoundingMode::ROUND_TO_ZERO);
15379
15380 switch (GID)
15381 {
15382 default:
15383 IGC_ASSERT_MESSAGE(0, "ICE: unexpected Gen Intrinsic");
15384 break;
15385 case GenISAIntrinsic::GenISA_mul_rtz:
15386 m_encoder->Mul(dst, src0, src1);
15387 m_encoder->Push();
15388 break;
15389 case GenISAIntrinsic::GenISA_add_rtz:
15390 m_encoder->Add(dst, src0, src1);
15391 m_encoder->Push();
15392 break;
15393 case GenISAIntrinsic::GenISA_fma_rtz:
15394 {
15395 CVariable* src2 = GetSymbol(inst->getOperand(2));
15396 m_encoder->Mad(dst, src0, src1, src2);
15397 m_encoder->Push();
15398 break;
15399 }
15400 }
15401
15402 ResetRoundingMode(inst);
15403 }
15404
15405 // Emit FP mad (FMA) using round-to-positive-infinity (rtp)
void EmitPass::emitFMArtp(llvm::GenIntrinsicInst *inst) {
15407 IGC_ASSERT_MESSAGE(inst->getNumArgOperands() == 3, "ICE: incorrect gen intrinsic");
15408
15409 CVariable *src0 = GetSymbol(inst->getOperand(0));
15410 CVariable *src1 = GetSymbol(inst->getOperand(1));
15411 CVariable *src2 = GetSymbol(inst->getOperand(2));
15412 CVariable *dst = m_destination;
15413
15414 SetRoundingMode_FP(ERoundingMode::ROUND_TO_POSITIVE);
15415
15416 m_encoder->Mad(dst, src0, src1, src2);
15417 m_encoder->Push();
15418
15419 ResetRoundingMode(inst);
15420 }
15421
15422 // Emit FP mad (FMA) using round-to-negative-infinity (rtn)
void EmitPass::emitFMArtn(llvm::GenIntrinsicInst *inst) {
15424 IGC_ASSERT_MESSAGE(inst->getNumArgOperands() == 3, "ICE: incorrect gen intrinsic");
15425
15426 CVariable *src0 = GetSymbol(inst->getOperand(0));
15427 CVariable *src1 = GetSymbol(inst->getOperand(1));
15428 CVariable *src2 = GetSymbol(inst->getOperand(2));
15429 CVariable *dst = m_destination;
15430
15431 SetRoundingMode_FP(ERoundingMode::ROUND_TO_NEGATIVE);
15432
15433 m_encoder->Mad(dst, src0, src1, src2);
15434 m_encoder->Push();
15435
15436 ResetRoundingMode(inst);
15437 }
15438
void EmitPass::emitftoi(llvm::GenIntrinsicInst* inst)
15440 {
15441 IGC_ASSERT_MESSAGE(inst->getOperand(0)->getType()->isFloatingPointTy(), "Unsupported type");
15442 CVariable* src = GetSymbol(inst->getOperand(0));
15443 CVariable* dst = m_destination;
15444 ERoundingMode RM = GetRoundingMode_FPCvtInt(inst);
15445 IGC_ASSERT_MESSAGE(RM != ERoundingMode::ROUND_TO_ANY, "Not valid FP->int rounding mode!");
15446
15447 GenISAIntrinsic::ID id = inst->getIntrinsicID();
15448 if (id == GenISAIntrinsic::GenISA_ftoui_rtn ||
15449 id == GenISAIntrinsic::GenISA_ftoui_rtp ||
15450 id == GenISAIntrinsic::GenISA_ftoui_rte)
15451 {
15452 dst = m_currShader->BitCast(dst, GetUnsignedType(dst->GetType()));
15453 }
15454
15455 SetRoundingMode_FPCvtInt(RM);
15456
15457 m_encoder->Cast(dst, src);
15458 m_encoder->Push();
15459
15460 ResetRoundingMode(inst);
15461 }
15462
bool EmitPass::isUniformStoreOCL(Value* ptr, Value* storeVal)
15464 {
15465 if (m_currShader->GetShaderType() != ShaderType::OPENCL_SHADER ||
15466 !m_currShader->GetIsUniform(ptr))
15467 {
15468 return false;
15469 }
15470
15471 Type* Ty = storeVal->getType();
15472 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
15473 uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
15474 Type* eltTy = VTy ? VTy->getElementType() : Ty;
15475
15476 // use TypeSize to be consistent with VectorLoad/Store
15477 uint32_t totalBytes = elts * ((uint32_t)m_DL->getTypeSizeInBits(eltTy) / 8);
15478
    // Note that when elts > 1, VectorProcess makes sure that the element
    // size is 4 or 8. Also, note that if totalBytes == 4, elts must be 1.
15481 bool doUniformStore = (elts == 1 ||
15482 (m_currShader->GetIsUniform(storeVal) &&
15483 (totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
15484 return doUniformStore;
15485 }
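// Illustrations of the predicate above (OCL, uniform pointer assumed):
// - storing a scalar i32 qualifies regardless of the value's uniformity
//   (elts == 1);
// - storing a uniform <4 x i32> qualifies (16 bytes);
// - storing a non-uniform <4 x i32> does not, and neither does a uniform
//   <8 x i32> (32 bytes exceeds the handled sizes).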
15486
// Return true if this store will be emitted as a uniform store
bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
15489 {
15490 return isUniformStoreOCL(SI->getPointerOperand(), SI->getValueOperand());
15491 }
15492
void EmitPass::emitVectorBitCast(llvm::BitCastInst* BCI)
15494 {
15495 const CShader::ExtractMaskWrapper destMask(m_currShader, BCI);
15496
15497 CVariable* src = GetSymbol(BCI->getOperand(0));
15498 llvm::Type* srcTy = BCI->getOperand(0)->getType();
15499 llvm::Type* dstTy = BCI->getType();
15500 llvm::Type* srcEltTy, * dstEltTy;
15501 uint32_t srcNElts, dstNElts;
15502
15503 IGC_ASSERT_MESSAGE((srcTy->isVectorTy() || dstTy->isVectorTy()), "No vector type !");
15504
15505 if (srcTy->isVectorTy())
15506 {
15507 srcEltTy = cast<VectorType>(srcTy)->getElementType();
15508 srcNElts = (uint32_t)cast<IGCLLVM::FixedVectorType>(srcTy)->getNumElements();
15509 }
15510 else
15511 {
15512 srcEltTy = srcTy;
15513 srcNElts = 1;
15514 }
15515 if (dstTy->isVectorTy())
15516 {
15517 dstEltTy = cast<VectorType>(dstTy)->getElementType();
15518 dstNElts = (uint32_t)cast<IGCLLVM::FixedVectorType>(dstTy)->getNumElements();
15519 }
15520 else
15521 {
15522 dstEltTy = dstTy;
15523 dstNElts = 1;
15524 }
15525
15526 if (src->IsImmediate())
15527 {
15528 CVariable* reg = m_currShader->GetNewVariable(
15529 1,
15530 src->GetType(),
15531 m_encoder->GetCISADataTypeAlignment(src->GetType()),
15532 true,
15533 1, CName::NONE);
15534
15535 m_encoder->Copy(reg, src);
15536 m_encoder->Push();
15537
15538 src = reg;
15539 }
15540
15541 uint32_t width = numLanes(m_currShader->m_SIMDSize);
15542 uint32_t dstEltBytes = GetPrimitiveTypeSizeInRegister(dstEltTy);
15543 uint32_t srcEltBytes = GetPrimitiveTypeSizeInRegister(srcEltTy);
15544 bool srcUniform = src->IsUniform();
15545 bool dstUniform = m_destination->IsUniform();
15546 if (srcUniform && dstUniform &&
15547 (dstNElts == 2 || dstNElts == 4 || dstNElts == 8) &&
15548 m_destination != src &&
15549 destMask.getEM() == ((1U << dstNElts) - 1)/* Full mask */ &&
15550 /* If alignment of source is safe to be aliased to the dst type. */
15551 src->GetAlign() >= CEncoder::GetCISADataTypeAlignment(m_destination->GetType()) &&
15552 /* Exclude bitcast from/to 16-bit */
15553 srcEltBytes != 2 && dstEltBytes != 2) {
        // TODO: add uniform vector bitcast support. A simple copy is
        // enough, but the ideal resolution is to teach DeSSA to handle that.
15556 CVariable* dst = m_destination;
15557 src = m_currShader->BitCast(src, dst->GetType());
15558 m_encoder->SetNoMask();
15559 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(dstNElts));
15560 m_encoder->SetSrcRegion(0, dstNElts, dstNElts, 1);
15561 m_encoder->Copy(dst, src);
15562 m_encoder->Push();
15563 return;
15564 }
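    // Example of the fast path above: a uniform <2 x double> bitcast to a
    // uniform <4 x i32> with a full extract mask becomes a single NoMask
    // SIMD4 mov of the aliased payload instead of four per-element copies.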
15565 if (srcEltBytes == dstEltBytes)
15566 {
        // This should not happen now, but generate code anyway.
        // CISABuilder does the split if any splitting is needed.
15569
15570 // Special case for: 1 element vectors to scalars
15571 // %15 = bitcast <1 x i64> %4 to i64
15572 if ((srcEltTy == dstEltTy) &&
15573 (srcNElts == dstNElts) && (srcNElts == 1))
15574 {
15575 m_encoder->Copy(m_destination, src);
15576 m_encoder->Push();
15577 }
15578 else if (m_destination != src)
15579 {
15580 for (uint32_t i = 0, offset = 0; i < dstNElts; ++i)
15581 {
15582 if (destMask.isSet(i))
15583 {
15584 m_encoder->SetSrcRegion(0,
15585 srcUniform ? 0 : 1,
15586 srcUniform ? 1 : 1,
15587 srcUniform ? 0 : 0);
15588 m_encoder->SetSrcSubReg(0, srcUniform ? i : i * width);
15589 m_encoder->SetDstRegion(1);
15590 m_encoder->SetDstSubReg(dstUniform ? offset : offset * width);
15591 m_encoder->Copy(m_destination, src);
15592 m_encoder->Push();
15593 offset++;
15594 }
15595 }
15596 }
15597 }
15598 else if (dstEltBytes > srcEltBytes)
15599 {
15600 IGC_ASSERT(0 < srcEltBytes);
15601 CVariable* aliasDst = m_currShader->GetNewAlias(m_destination, src->GetType(), 0, 0);
15602 uint32_t N = dstEltBytes / srcEltBytes;
15603 IGC_ASSERT_MESSAGE((dstEltBytes % srcEltBytes) == 0, "Basic types should be power of 2");
        // Since srcEltBytes can be the second largest element type (32bit)
        // at most and the region hstride == 1, Src will not need splitting!
        // Only dst might need splitting.
15607 bool splitDst = (!dstUniform && (dstEltBytes * width > m_currShader->getGRFSize() * 2));
        IGC_ASSERT_MESSAGE((!splitDst || (width == 16) || (width == 32)),
            "Internal Error: Dst needs splitting only under SIMD16/SIMD32!");
15610 if (N > 4)
15611 {
15612 // Special case for N = 8 as dst's stride can be 1/2/4, not 8.
15613 // for example, <1xi64> Y = bitcast <8xi8> X
15614 // we will do the following (simd8)
15615 // .decl X type=q num_elts=8
15616 // .decl Y type=b num_elts=64
15617 // .decl Y_alias type=d num_elts=16 alias=<Y,0>
15618 // .decl V0 type=d num_elts=8
15619 // .decl V1 type=d num_elts=8
15620 // .decl V0_alias type=b num_elts=32 alias=<V0, 0>
15621 // .decl V1_alias type=b num_elts=32 alias=<V1, 0>
15622 //
15623 // mov (8) V0_alias.0<4> X(0,0)<8;8:1>
15624 // mov (8) V0_alias.1<4> X(0,8)<8;8:1>
15625 // mov (8) V0_alias.2<4> X(0,16)<8;8:1>
15626 // mov (8) V0_alias.3<4> X(0,24)<8;8:1>
15627 // mov (8) V1_alias.0<4> X(1,0)<8;8:1>
15628 // mov (8) V1_alias.1<4> X(1,8)<8;8:1>
15629 // mov (8) V1_alias.2<4> X(1,16)<8;8:1>
15630 // mov (8) V1_alias.3<4> X(1,24)<8;8:1>
15631 //
15632 // then, combine V0 and V1 to create Y
15633 // mov (8) Y_alias.0<2> V0(0,0)<8;8,1>
15634 // mov (8) Y_alias.1<2> V1(0,0)<8;8,1>
15635 //
15636 // For SIMD16, the above two movs will span across two GRFs for their
15637 // dst operands, therefore, they need splitting, that is
15638 // mov (16) Y_alias.0<2> V0(0,0)<16;16,1>
15639 // mov (16) Y_alias.1<2> V1(0,0)<16;16,1>
            // should be split into the following:
15641 // mov (8, Q1) Y_alias.0<2> V0(0,0)<8;8,1>
15642 // mov (8, Q2) Y_alias.16<2> V0(1,0)<8;8,1>
15643 // mov (8, Q1) Y_alias.1<2> V1(0,0)<8;8,1>
15644 // mov (8, Q2) Y_alias.17<2> V1(1,0)<8;8,1>
15645 //
15646 IGC_ASSERT(N == 8);
15647 IGC_ASSERT(srcEltBytes == 1);
15648 const uint32_t N2 = N / 2; // 4
15649 VISA_Type TyD = (src->GetType() == ISA_TYPE_UB) ? ISA_TYPE_UD : ISA_TYPE_D;
15650 CVariable* V0 = m_currShader->GetNewVariable(dstUniform ? 1 : width, TyD, EALIGN_GRF, dstUniform, CName::NONE);
15651 CVariable* V1 = m_currShader->GetNewVariable(dstUniform ? 1 : width, TyD, EALIGN_GRF, dstUniform, CName::NONE);
15652 CVariable* V0_alias = m_currShader->GetNewAlias(V0, src->GetType(), 0, 0);
15653 CVariable* V1_alias = m_currShader->GetNewAlias(V1, src->GetType(), 0, 0);
15654 CVariable* dst_alias = m_currShader->GetNewAlias(m_destination, V0->GetType(), 0, 0);
15655 for (unsigned i = 0, offset = 0; i < dstNElts; ++i)
15656 {
15657 if (destMask.isSet(i))
15658 {
15659 for (unsigned j = 0; j < N; ++j)
15660 {
15661 bool useV0 = (j < N2);
15662 uint32_t oft = useV0 ? j : j - N2;
15663 m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15664 m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j)));
15665 m_encoder->SetDstRegion(dstUniform ? 1 : N2);
15666 m_encoder->SetDstSubReg(oft);
15667 m_encoder->Copy(useV0 ? V0_alias : V1_alias, src);
15668 m_encoder->Push();
15669 }
15670 // combine V0 and V1 into dst
15671 if (splitDst)
15672 {
15673 SIMDMode simdSize = SIMDMode::SIMD8;
15674 int exSize = simdSize == SIMDMode::SIMD16 ? 16 : 8;
15675 // Dst must not be uniform and it must be SIMD16!
15676 // first simd8/simd16 : dst_alias = V0
15677 m_encoder->SetDstRegion(2);
15678 m_encoder->SetDstSubReg(2 * offset * width);
15679 m_encoder->SetSimdSize(simdSize);
15680 m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q1 : EMASK_H1);
15681 m_encoder->Copy(dst_alias, V0);
15682 m_encoder->Push();
15683 // second simd8/simd16: dst_alias=V0
15684 m_encoder->SetSrcSubReg(0, exSize);
15685 m_encoder->SetDstRegion(2);
15686 m_encoder->SetDstSubReg(2 * offset * width + 2 * exSize);
15687 m_encoder->SetSimdSize(simdSize);
15688 m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q2 : EMASK_H2);
15689 m_encoder->Copy(dst_alias, V0);
15690 m_encoder->Push();
15691
                        // first simd8/simd16 : dst_alias = V1
15693 m_encoder->SetDstRegion(2);
15694 m_encoder->SetDstSubReg(2 * offset * width + 1);
15695 m_encoder->SetSimdSize(simdSize);
15696 m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q1 : EMASK_H1);
15697 m_encoder->Copy(dst_alias, V1);
15698 m_encoder->Push();
                        // second simd8/simd16 : dst_alias = V1
15700 m_encoder->SetSrcSubReg(0, exSize);
15701 m_encoder->SetDstRegion(2);
15702 m_encoder->SetDstSubReg(2 * offset * width + 2 * exSize + 1);
15703 m_encoder->SetSimdSize(simdSize);
15704 m_encoder->SetMask(simdSize != SIMDMode::SIMD16 ? EMASK_Q2 : EMASK_H2);
15705 m_encoder->Copy(dst_alias, V1);
15706 m_encoder->Push();
15707 }
15708 else
15709 {
15710 m_encoder->SetDstRegion(dstUniform ? 1 : 2);
15711 m_encoder->SetDstSubReg(dstUniform ? (2 * offset) : (2 * offset * width));
15712 m_encoder->Copy(dst_alias, V0);
15713 m_encoder->Push();
15714 m_encoder->SetDstRegion(dstUniform ? 1 : 2);
15715 m_encoder->SetDstSubReg(dstUniform ? (2 * offset + 1) : (2 * offset * width + 1));
15716 m_encoder->Copy(dst_alias, V1);
15717 m_encoder->Push();
15718 }
15719 offset++;
15720 }
15721 }
15722 }
15723 else
15724 {
15725 for (unsigned i = 0, offset = 0; i < dstNElts; ++i)
15726 {
15727 if (destMask.isSet(i))
15728 {
15729 for (unsigned j = 0; j < N; ++j)
15730 {
15731 if (splitDst)
15732 {
15733 // !dstUniform
15734 // first half
15735 SIMDMode mode = m_currShader->m_SIMDSize == SIMDMode::SIMD32 ? SIMDMode::SIMD16 : SIMDMode::SIMD8;
15736 int exSize = mode == SIMDMode::SIMD16 ? 16 : 8;
15737 m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15738 m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j)));
15739 m_encoder->SetDstRegion(N);
15740 m_encoder->SetDstSubReg(offset * N * width + j);
15741 m_encoder->SetSimdSize(mode);
15742 m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H1 : EMASK_Q1);
15743 m_encoder->Copy(aliasDst, src);
15744 m_encoder->Push();
15745
15746 // second half
15747 m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15748 m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j) + exSize));
15749 m_encoder->SetDstRegion(N);
15750 m_encoder->SetDstSubReg(offset * N * width + N * exSize + j);
15751 m_encoder->SetSimdSize(mode);
15752 m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H2 : EMASK_Q2);
15753 m_encoder->Copy(aliasDst, src);
15754 m_encoder->Push();
15755 }
15756 else
15757 {
15758 m_encoder->SetSrcRegion(0, srcUniform ? 0 : 1, 1, 0);
15759 m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (width * (i * N + j)));
15760 m_encoder->SetDstRegion(dstUniform ? 1 : N);
15761 m_encoder->SetDstSubReg(dstUniform ? (offset * N + j) : (offset * N * width + j));
15762 m_encoder->Copy(aliasDst, src);
15763 m_encoder->Push();
15764 }
15765 }
15766 offset++;
15767 }
15768 }
15769 }
15770 }
15771 else // (dstEltBytes < srcEltBytes)
15772 {
15773 IGC_ASSERT(0 < dstEltBytes);
        // Create an alias of src and mov the alias to the dst.
15775 CVariable* aliasSrc = m_currShader->GetNewAlias(src, m_destination->GetType(), 0, 0);
15776 uint32_t N = srcEltBytes / dstEltBytes;
        // Similar to dstEltBytes > srcEltBytes, dstEltBytes can be 32bit
        // at most and dst's stride == 1, so it will not need splitting.
15779 bool splitSrc = (!srcUniform && (srcEltBytes * width > m_currShader->getGRFSize() * 2));
        IGC_ASSERT_MESSAGE((!splitSrc || (width == 16) || (width == 32)),
            "Internal Error: Src needs splitting only under SIMD16/SIMD32!");
15782 IGC_ASSERT_MESSAGE((srcEltBytes % dstEltBytes) == 0, "Basic types should be power of 2");
        // Avoid coalescing the dst variable if all of its uses are EEI with a constant index;
        // this gives RA more freedom (e.g. for bank-conflict assignments).
15785 auto allUsesAreEEwithImm = [this](BitCastInst* BCI)
15786 {
15787 for (auto I = BCI->user_begin(), E = BCI->user_end(); I != E; ++I)
15788 {
15789 if (auto EEInst = dyn_cast<ExtractElementInst>(*I))
15790 {
15791 if (dyn_cast<ConstantInt>(EEInst->getIndexOperand()))
15792 {
15793 continue;
15794 }
15795 }
15796 return false;
15797 }
15798 return true;
15799 };
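        // IR sketch of the case the lambda accepts:
        //   %v  = bitcast i64 %x to <2 x i32>
        //   %lo = extractelement <2 x i32> %v, i32 0
        //   %hi = extractelement <2 x i32> %v, i32 1
        // Every use is an extractelement with an immediate index, so each
        // extracted element may live in its own CVariable.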
15800
15801 SmallVector<CVariable*, 8> VectorBCICVars;
15802 bool useSeparateCVar = m_currShader->m_numberInstance == 1 &&
15803 !dstUniform && srcNElts == 1 && N <= 8 &&
15804 allUsesAreEEwithImm(BCI);
15805
15806 // Once BCI has been coalesced, don't separate CVar for BCI
15807 // [todo evaluate the performance impact and let alias handle it
15808 // if needed]
15809 if (m_currShader->IsCoalesced(BCI))
15810 useSeparateCVar = false;
15811
15812 for (unsigned i = 0, offset = 0; i < srcNElts; ++i)
15813 {
15814 for (unsigned j = 0; j < N; ++j)
15815 {
15816 if (destMask.isSet(i * N + j))
15817 {
15818 if (useSeparateCVar)
15819 {
15820 CVariable* newDst = m_currShader->GetNewVariable(
15821 width, m_destination->GetType(),
15822 m_destination->GetAlign(),
15823 CName::NONE);
15824 VectorBCICVars.push_back(newDst);
15825 m_destination = newDst;
15826 }
15827 if (splitSrc)
15828 {
15829 // !srcUniform
15830 // first half
15831 SIMDMode mode = m_currShader->m_SIMDSize == SIMDMode::SIMD32 ? SIMDMode::SIMD16 : SIMDMode::SIMD8;
15832 int exSize = mode == SIMDMode::SIMD16 ? 16 : 8;
15833 m_encoder->SetSrcRegion(0, N, 1, 0); // = (0, width*N, width, N)
15834 m_encoder->SetSrcSubReg(0, i * N * width + j);
15835 m_encoder->SetDstSubReg(dstUniform ? offset : (width * offset));
15836 m_encoder->SetDstRegion(1);
15837 m_encoder->SetSimdSize(mode);
15838 m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H1 : EMASK_Q1);
15839 m_encoder->Copy(m_destination, aliasSrc);
15840 m_encoder->Push();
15841
15842 // second half
15843 m_encoder->SetSrcRegion(0, N, 1, 0); // = (0, width*N, width, N)
15844 m_encoder->SetSrcSubReg(0, i * N * width + N * exSize + j);
15845 m_encoder->SetDstSubReg(dstUniform ? offset : (width * offset + exSize));
15846 m_encoder->SetDstRegion(1);
15847 m_encoder->SetSimdSize(mode);
15848 m_encoder->SetMask(mode == SIMDMode::SIMD16 ? EMASK_H2 : EMASK_Q2);
15849 m_encoder->Copy(m_destination, aliasSrc);
15850 m_encoder->Push();
15851 }
15852 else
15853 {
15854 m_encoder->SetSrcRegion(0, srcUniform ? 0 : N, 1, 0); // = (0, width*N, width, N)
15855 m_encoder->SetSrcSubReg(0, srcUniform ? (i * N + j) : (i * N * width + j));
15856 m_encoder->SetDstSubReg(dstUniform ? offset : (width * offset));
15857 m_encoder->SetDstRegion(1);
15858 m_encoder->Copy(m_destination, aliasSrc);
15859 m_encoder->Push();
15860 }
15861 if (!useSeparateCVar)
15862 {
                        // offset stays at zero if we are using distinct variables for each EEI
15864 offset++;
15865 }
15866 }
15867 }
15868 }
15869
15870 if (useSeparateCVar)
15871 {
15872 m_currShader->addCVarsForVectorBC(BCI, VectorBCICVars);
15873 }
15874 }
15875 }
15876
unsigned int EmitPass::GetPrimitiveTypeSizeInRegisterInBits(const Type* Ty) const
15878 {
15879 return m_currShader->GetPrimitiveTypeSizeInRegisterInBits(Ty);
15880 }
15881
unsigned int EmitPass::GetPrimitiveTypeSizeInRegister(const Type* Ty) const
15883 {
15884 return m_currShader->GetPrimitiveTypeSizeInRegister(Ty);
15885 }
15886
unsigned int EmitPass::GetScalarTypeSizeInRegisterInBits(const Type* Ty) const
15888 {
15889 return m_currShader->GetScalarTypeSizeInRegisterInBits(Ty);
15890 }
15891
unsigned int EmitPass::GetScalarTypeSizeInRegister(const Type* Ty) const
15893 {
15894 return m_currShader->GetScalarTypeSizeInRegister(Ty);
15895 }
15896
15897
void EmitPass::A64LSLoopHead(
    CVariable* addr, CVariable*& curMask, CVariable*& lsPred, uint& label)
15900 {
    // Create a loop to compute the LS's predicate (lsPred), which makes sure
    // that, for every active lane of the LS, the address hi part is the same.
15903 //
15904 // pseudo code (including A64LSLoopHead and A64LSLoopTail):
15905 // addrHigh = packed addr hi part
15906 // curMask = executionMask
15907 // label:
15908 // uniformAddrHi = the_first_active_lane_of_CurMask(addrHigh)
15909 // lsPred = cmp(uniformAddrHi, addrHigh)
15910 // (lsPred) send // the original LS instruction
15911 // lsPred = ~lsPred
15912 // CurMask = lsPred & CurMask
15913 // lsPred = CurMask
15914 // (lsPred) jmp label
15915
15916 SIMDMode simdMode = m_encoder->GetSimdSize();
15917 uint16_t execSize = numLanes(simdMode);
15918 IGC_ASSERT(simdMode == SIMDMode::SIMD8 || simdMode == SIMDMode::SIMD16);
15919
15920 // get address hi part
15921 CVariable* addrAlias = m_currShader->GetNewAlias(addr, ISA_TYPE_UD, 0, execSize * 2);
15922 CVariable* addrHigh = m_currShader->GetNewVariable(
15923 execSize, ISA_TYPE_UD, EALIGN_GRF, false, CName::NONE);
15924 m_encoder->SetSrcSubReg(0, 1);
15925 m_encoder->SetSrcRegion(0, 2, 1, 0);
15926 m_encoder->Copy(addrHigh, addrAlias);
15927 m_encoder->Push();
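    // Region sketch for the copy above: a 64-bit address variable viewed as
    // UD pairs is {lo0, hi0, lo1, hi1, ...}; starting at subreg 1 with
    // region <2;1,0> picks hi0, hi1, ... so addrHigh collects the high
    // dword of every lane.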
15928
15929 curMask = GetHalfExecutionMask();
15930
15931 // create loop
15932 label = m_encoder->GetNewLabelID("a64_loop");
15933 m_encoder->Label(label);
15934 m_encoder->Push();
15935
15936 // Get the first active lane's address-hi
15937 CVariable* ufoffset = nullptr;
15938 CVariable* uniformAddrHi = UniformCopy(addrHigh, ufoffset, curMask, true);
15939
15940 // Set the predicate lsPred to true for all lanes with the same address_hi
15941 lsPred = m_currShader->GetNewVariable(
15942 numLanes(m_currShader->m_dispatchSize), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
15943 m_encoder->Cmp(EPREDICATE_EQ, lsPred, uniformAddrHi, addrHigh);
15944 m_encoder->Push();
15945 }
15946
15947 void EmitPass::A64LSLoopTail(CVariable* curMask, CVariable* lsPred, uint label)
15948 {
15949 // Unset the bits in the mask for lanes that were executed
15950 bool tmpSh = m_encoder->IsSecondHalf();
15951 m_encoder->SetSecondHalf(false);
15952
15953 CVariable* tmpLsPred = m_currShader->GetNewVariable(1, curMask->GetType(), curMask->GetAlign(), true, CName::NONE);
15954 m_encoder->Cast(tmpLsPred, lsPred);
15955
15956 m_encoder->SetSrcModifier(1, EMOD_NOT);
15957 m_encoder->And(curMask, curMask, tmpLsPred);
15958 m_encoder->Push();
15959 m_encoder->SetP(lsPred, curMask);
15960 m_encoder->Push();
15961 m_encoder->Jump(lsPred, label);
15962 m_encoder->Push();
15963
15964 m_encoder->SetSecondHalf(tmpSh);
15965 }
15966
15967 bool EmitPass::hasA64WAEnable() const
15968 {
15969 // Check WA table entry for current platform.
15970 if (!m_currShader->m_Platform->WaEnableA64WA())
15971 return false;
15972
15973 // -intel-force-enable-a64WA
15974 if (m_pCtx->getModuleMetaData()->compOpt.ForceEnableA64WA)
15975 return true;
15976
15977 // -intel-disable-a64WA
15978 if (m_pCtx->getModuleMetaData()->compOpt.DisableA64WA)
15979 return false;
15980
15981 // Disable A64WA for kernels which specify work_group_size_hint(1, 1, 1).
15982 MetaDataUtils* pMdUtils = m_currShader->GetMetaDataUtils();
15983 uint32_t WGSize = IGCMetaDataHelper::getThreadGroupSizeHint(*pMdUtils, m_currShader->entry);
15984 if (WGSize == 1)
15985 return false;
15986
15987 return true;
15988 }
15989
15990 void EmitPass::emitGatherA64(Value* loadInst, CVariable* dst, CVariable* offset, unsigned elemSize, unsigned numElems, bool addrUniform)
15991 {
15992 if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
15993 CVariable* curMask = nullptr;
15994 CVariable* lsPred = nullptr;
15995 uint label = 0;
15996 A64LSLoopHead(offset, curMask, lsPred, label);
15997
15998 // do send with pred
15999 if (isa<LoadInst>(loadInst) && !m_currShader->IsCoalesced(loadInst))
16000 {
16001 // load inst is the single def of the vISA variable and therefore a kill
16002 m_encoder->Lifetime(LIFETIME_START, dst);
16003 }
16004 m_encoder->SetPredicate(lsPred);
16005 m_encoder->GatherA64(dst, offset, elemSize, numElems);
16006 m_encoder->Push();
16007
16008 A64LSLoopTail(curMask, lsPred, label);
16009
16010 } else {
16011 m_encoder->GatherA64(dst, offset, elemSize, numElems);
16012 }
16013 }
16014
16015 void EmitPass::emitGather4A64(Value* loadInst, CVariable* dst, CVariable* offset, bool addrUniform)
16016 {
16017 if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
16018 CVariable* curMask = nullptr;
16019 CVariable* lsPred = nullptr;
16020 uint label = 0;
16021 A64LSLoopHead(offset, curMask, lsPred, label);
16022
16023 // do send with pred
16024 if (isa<LoadInst>(loadInst) && !m_currShader->IsCoalesced(loadInst))
16025 {
16026 // load inst is the single def of the vISA variable and therefore a kill
16027 m_encoder->Lifetime(LIFETIME_START, dst);
16028 }
16029 m_encoder->SetPredicate(lsPred);
16030 m_encoder->Gather4A64(dst, offset);
16031 m_encoder->Push();
16032
16033 A64LSLoopTail(curMask, lsPred, label);
16034
16035 }
16036 else {
16037 m_encoder->Gather4A64(dst, offset);
16038 }
16039 }
16040
16041 void EmitPass::emitScatterA64(CVariable* val, CVariable* offset, unsigned elementSize, unsigned numElems, bool addrUniform)
16042 {
16043 if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
16044 CVariable* curMask = nullptr;
16045 CVariable* lsPred = nullptr;
16046 uint label = 0;
16047 A64LSLoopHead(offset, curMask, lsPred, label);
16048
16049 // do send with pred
16050 m_encoder->SetPredicate(lsPred);
16051 m_encoder->ScatterA64(val, offset, elementSize, numElems);
16052 m_encoder->Push();
16053
16054 A64LSLoopTail(curMask, lsPred, label);
16055
16056 }
16057 else {
16058 m_encoder->ScatterA64(val, offset, elementSize, numElems);
16059 }
16060 }
16061
16062 void EmitPass::emitScatter4A64(CVariable* src, CVariable* offset, bool addrUniform)
16063 {
16064 if (hasA64WAEnable() && !offset->IsUniform() && !addrUniform) {
16065 CVariable* curMask = nullptr;
16066 CVariable* lsPred = nullptr;
16067 uint label = 0;
16068 A64LSLoopHead(offset, curMask, lsPred, label);
16069
16070 // do send with pred
16071 m_encoder->SetPredicate(lsPred);
16072 m_encoder->Scatter4A64(src, offset);
16073 m_encoder->Push();
16074
16075 A64LSLoopTail(curMask, lsPred, label);
16076
16077 }
16078 else {
16079 m_encoder->Scatter4A64(src, offset);
16080 }
16081 }
16082
16083
16084
16085 void EmitPass::emitVectorLoad(LoadInst* inst, Value* offset, ConstantInt* immOffset)
16086 {
16087 int immOffsetInt = 0;
16088 if (immOffset)
16089 immOffsetInt = static_cast<int>(immOffset->getSExtValue());
16090
16091 Value* Ptr = inst->getPointerOperand();
16092 PointerType* ptrType = cast<PointerType>(Ptr->getType());
16093 bool useA32 = !IGC::isA64Ptr(ptrType, m_currShader->GetContext());
16094
16095 ResourceDescriptor resource = GetResourceVariable(Ptr);
16096 CountStatelessIndirectAccess(Ptr, resource);
16097 // eOffset is in bytes
16098 // offset corresponds to Int2Ptr operand obtained during pattern matching
16099 CVariable* eOffset = GetSymbol(immOffset ? offset : Ptr);
16100 if (useA32)
16101 {
16102 eOffset = TruncatePointer(eOffset);
16103 }
16104
16105 Type* Ty = inst->getType();
16106 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
16107 Type* eltTy = VTy ? VTy->getElementType() : Ty;
16108 uint32_t eltBytes = GetScalarTypeSizeInRegister(eltTy);
16109 IGC_ASSERT_MESSAGE((eltBytes == 1) || (eltBytes == 2) || (eltBytes == 4) || (eltBytes == 8),
16110 "Load's type (element type if vector) must be 1/2/4/8-byte long");
16111
16112 uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
16113 uint32_t totalBytes = eltBytes * elts;
16114
16115 bool destUniform = m_destination->IsUniform();
16116 bool srcUniform = eOffset->IsUniform();
16117 // Not possible to have uniform dest AND non-uniform src.
16118 IGC_ASSERT_MESSAGE(!(destUniform && !srcUniform),
16119 "If ld's dest is uniform, ld's src must be uniform");
16120
16121 unsigned align = inst->getAlignment();
16122 VISA_Type destType = m_destination->GetType();
16123 uint32_t width = numLanes(m_currShader->m_SIMDSize);
16124 uint bufferIndex = 0;
16125 bool directIndexing = false;
16126 BufferType bufType = DecodeAS4GFXResource(ptrType->getAddressSpace(), directIndexing, bufferIndex);
16127
16128 if (bufType == STATELESS_A32)
16129 {
16130 // Lower address-space (5) loads to A32 oword ld
16131 CVariable* loadDest = m_destination;
16132 uint size = loadDest->GetSize();
16133 auto newDest = loadDest;
16134 if (bufType == STATELESS_A32)
16135 {
16136 auto r0 = m_currShader->GetR0();
16137 m_encoder->SetSimdSize(SIMDMode::SIMD1);
16138 m_encoder->SetNoMask();
16139 m_encoder->SetSrcRegion(0, 0, 1, 0);
16140 m_encoder->SetSrcSubReg(0, 0);
16141 auto dst = m_currShader->GetNewVariable(1, VISA_Type::ISA_TYPE_D, eOffset->GetAlign(), true, "OWOff");
16142 m_encoder->And(dst, r0, m_currShader->ImmToVariable(0xffffffe0, ISA_TYPE_UD));
16143 m_encoder->Push();
16144 m_encoder->SetSimdSize(SIMDMode::SIMD1);
16145 m_encoder->SetNoMask();
16146 m_encoder->Add(dst, dst, eOffset);
16147 m_encoder->Push();
16148 eOffset = dst;
16149 if (!iSTD::IsPowerOfTwo(size) || size < SIZE_OWORD)
16150 {
16151 // Ensure the payload size is a power of 2 and at least one OWord (16 bytes)
16152 if (size < SIZE_OWORD)
16153 {
16154 size = std::max<unsigned int>(size, SIZE_OWORD);
16155 }
16156 else if (!iSTD::IsPowerOfTwo(size))
16157 {
16158 // The LLVM optimizer converts a vector load <4 x i64> into <3 x i64> if the
16159 // last element isn't used. Recompute size as the next higher power of 2.
16160 size = (uint)std::pow(2, std::ceil(std::log2(size)));
16161 }
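// For example (illustrative): a <3 x i64> load gives size = 24 bytes, which is
// rounded up to 32 so the oword message has a legal power-of-2 payload size.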
16162 newDest = m_currShader->GetNewVariable(size / loadDest->GetElemSize(), loadDest->GetType(), EALIGN_GRF, true, CName::NONE);
16163 }
16164 }
16165 m_encoder->OWLoad(newDest, resource, eOffset, false, size);
16166 if (newDest != loadDest)
16167 {
16168 emitVectorCopy(loadDest, newDest, loadDest->GetNumberElement());
16169 }
16170 return;
16171 }
16172
16173 // First, special handling for less than 4 bytes of loaded value
16174 if (totalBytes < 4)
16175 {
16176 // totalBytes is either 1 or 2, and the value must be a scalar or a 1-element vector;
16177 // we do not expect <2 x i8> or <3 x i8> here.
16178 IGC_ASSERT(elts == 1);
16179 IGC_ASSERT(totalBytes != 3);
16180
16181 uint16_t nbelts = srcUniform ? 1 : width;
16182 e_alignment align = EALIGN_GRF;
16183
16184 eOffset = ReAlignUniformVariable(eOffset, align);
16185
16186 bool needTemporary = (totalBytes < 4) || !IsGRFAligned(m_destination, EALIGN_GRF);
16187 CVariable* gatherDst = m_destination;
16188 if (needTemporary)
16189 {
16190 gatherDst = m_currShader->GetNewVariable(nbelts, ISA_TYPE_UD, align, srcUniform, CName::NONE);
16191 }
16192
16193 if (srcUniform)
16194 {
16195 m_encoder->SetNoMask();
16196 m_encoder->SetUniformSIMDSize(SIMDMode::SIMD1);
16197 }
16198
16199 if (useA32)
16200 {
16201 m_encoder->ByteGather(gatherDst, resource, eOffset, 8, totalBytes);
16202 }
16203 else
16204 {
16205 emitGatherA64(inst, gatherDst, eOffset, 8, totalBytes, srcUniform);
16206 }
16207
16208 m_encoder->Push();
16209
16210 if (needTemporary)
16211 {
16212 gatherDst = m_currShader->GetNewAlias(gatherDst, destType, 0, 0);
16213 uint32_t vStride = srcUniform ? 0 : ((totalBytes == 1) ? 4 : 2);
16214 m_encoder->SetSrcRegion(0, vStride, 1, 0);
16215 m_encoder->Copy(m_destination, gatherDst);
16216 m_encoder->Push();
16217 }
16218 return;
16219 }
16220
16221
16222 bool bEmulateDWAligned = false;
16223
16224 // generate oword-load if possible
16225 if (VTy && srcUniform)
16226 {
16227 //uint32_t totalBytes = eltBytes * VTy->getNumElements();
16228 bool rightBlockSize = (totalBytes == 16 || totalBytes == 32 || totalBytes == 64 || totalBytes == 128);
16229 bool useDWAligned = (resource.m_surfaceType != ESURFACE_SLM && align && align >= 4);
16230 // If an unaligned-DW message would originally be used for SSS (scratch surface) on XeHP_SDV and above, emulate it with Gather4Scaled
16231 bEmulateDWAligned = (rightBlockSize && useDWAligned &&
16232 m_currShader->m_Platform->hasScratchSurface() && resource.m_surfaceType == ESURFACE_SCRATCH && align && align >= 4);
16233 useDWAligned &= (!bEmulateDWAligned);
16234 bool useOWAligned = (resource.m_surfaceType == ESURFACE_SLM && align && align >= 16 &&
16235 m_currShader->m_Platform->supportSLMBlockMessage());
16236
16237 if (rightBlockSize && (useDWAligned || useOWAligned))
16238 {
16239 bool needTemp = (!destUniform || !IsGRFAligned(m_destination, EALIGN_GRF));
16240 CVariable * loadDest = m_destination;
16241
16242 if (useOWAligned)
16243 {
16244 // Offset needs to be in OW!
16245 // Need to create a new cvar as eOffset could be used by others.
16246
16247 CVariable* tmp = m_currShader->GetNewVariable(eOffset);
16248 m_encoder->Shr(tmp, eOffset, m_currShader->ImmToVariable(4, ISA_TYPE_UD));
16249 m_encoder->Push();
16250 eOffset = tmp;
16251 }
16252 eOffset = ReAlignUniformVariable(eOffset, EALIGN_GRF);
16253 if (needTemp)
16254 {
16255 loadDest = m_currShader->GetNewVariable(
16256 int_cast<uint16_t>(VTy->getNumElements()),
16257 m_destination->GetType(),
16258 EALIGN_GRF,
16259 true, CName::NONE);
16260 }
16261
16262 if (useA32)
16263 {
16264 m_encoder->OWLoad(loadDest, resource, eOffset, useOWAligned, loadDest->GetSize());
16265 }
16266 else
16267 {
16268 IGC_ASSERT_MESSAGE(!useOWAligned, "SLM's pointer size must be 32 bit!");
16269 // emit svm block read
16270 m_encoder->OWLoadA64(loadDest, eOffset, loadDest->GetSize());
16271 }
16272 m_encoder->Push();
16273
16274 if (needTemp)
16275 {
16276 emitVectorCopy(m_destination, loadDest, int_cast<unsigned>(VTy->getNumElements()));
16277 }
16278 return;
16279 }
16280 }
16281
16282 // Only handle 4/8/12/16/32 bytes here. For aligned 16/32 bytes, it should've been handled
16283 // by oword already (except for SLM). We have 12 bytes for load of int3 (either aligned or
16284 // unaligned[vload]).
16285 //
16286 // Note that for simplicity, don't do it if totalBytes = 32 and 64-bit integer
16287 // adds are needed on a platform that does not support 64-bit integer add.
16288 // Note: it doesn't seem necessary to check hasNoFP64Inst() here.
16289 if (srcUniform && (totalBytes == 4 || totalBytes == 8 || totalBytes == 12 || totalBytes == 16 ||
16290 (totalBytes == 32 && (useA32 || !m_currShader->m_Platform->hasNoFullI64Support()))))
16291 {
16292 bool needTemp = !destUniform ||
16293 !IsGRFAligned(m_destination, EALIGN_GRF) ||
16294 totalBytes == 12;
16295 // For uniform src, we can map value to messages (vector re-layout) as follows
16296 // 1. A64:
16297 // <1 x i64> for align=8 && totalBytes=8 (eltBytes == 4 or 8);
16298 // [ (blksize, nblk) = (64, 1) ]
16299 // <n x i32> for align=4; [ (blksize, nblk) = (32, 1) ]
16300 // <n x S> for align < 4,
16301 // where S = <8xi8> if eltBytes = 8, or S = <4xi8> otherwise;
16302 // [ (blksize, nblk) = (8, 8) or (8, 4) ]
16303 // 2. A32:
16304 // <n x S>, where S = <4 x i8>, ie, block size = 8 bits and #blocks = 4
16305 // [ (blksize, nblk) = (8, 4) ]
16306 // where n is the number of elements
16307
16308 // use A64 scattered RW with QW block size; Note that totalBytes == 16 with align >=4
16309 // should be handled by oword already (except for SLM).
16310 bool useQW = (!useA32) && (totalBytes == 8 || totalBytes == 16) &&
16311 (align >= 8 || eltBytes == 8);
16312
16313 // activelanes is the number of lanes that are needed.
16314 // nbelts is activelanes rounded up to the next power of 2.
16315 uint16_t activelanes = useQW ? (totalBytes / 8) : (totalBytes / 4);
16316 uint16_t nbelts = (activelanes == 3 ? 4 : activelanes);
16317
16318 // For scattered RW
16319 uint32_t blkBits = useA32 ? 8 : (align < 4 ? 8 : (useQW ? 64 : 32));
16320 uint32_t nBlks = useA32 ? 4 : (align < 4 ? (useQW ? 8 : 4) : 1);
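// For example (illustrative): an A64 load of <2 x i64> with align >= 8 takes the
// QW path: blkBits = 64, nBlks = 1, activelanes = 2. An A32 load of <4 x i32>
// uses blkBits = 8, nBlks = 4 instead.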
16321
16322 VISA_Type ldType = useQW ? ISA_TYPE_UQ : ISA_TYPE_UD;
16323 CVariable* gatherDst;
16324 if (needTemp)
16325 {
16326 gatherDst = m_currShader->GetNewVariable(
16327 nbelts, ldType, EALIGN_GRF, true /*srcUniform*/, CName::NONE);
16328 }
16329 else
16330 {
16331 gatherDst = m_destination;
16332 if (m_destination->GetType() != ldType)
16333 {
16334 gatherDst = m_currShader->GetNewAlias(gatherDst, ldType, 0, nbelts);
16335 }
16336 }
16337
16338 SIMDMode simdmode = lanesToSIMDMode(nbelts);
16339 eOffset = ReAlignUniformVariable(eOffset, useA32 ? EALIGN_GRF : EALIGN_2GRF);
16340 CVariable* gatherOff = eOffset;
16341 if (nbelts > 1)
16342 {
16343 gatherOff = m_currShader->GetNewVariable(
16344 nbelts, eOffset->GetType(), eOffset->GetAlign(), true /*srcUniform*/, CName::NONE);
16345 // May have the following
16346 // lane 0 1 2 3 4 5 6 7
16347 // eOff 0 4 8 C 10 14 18 1C // DW per lane
16348 // eOff 0 8 // QW per lane
16349 // When activelanes = 3, lane 3 is not used. Since we don't have simd3,
16350 // use simd4 and set lane3 to lane2.
16351 uint32_t incImm = 0;
16352 uint32_t incImm1 = 0; // for activelanes=8
16353 switch (activelanes) {
16354 default:
16355 IGC_ASSERT_MESSAGE(0, "ICE: something went wrong when computing activelanes!");
16356 break;
16357 case 2:
16358 // only can have QW in this case
16359 incImm = useQW ? 0x80 : 0x40;
16360 break;
16361 case 3:
16362 // set lane3 to be the same as lane2 (it is 8)
16363 incImm = 0x8840;
16364 break;
16365 case 4:
16366 incImm = 0xC840;
16367 break;
16368 case 8:
16369 // Make sure incImm + incImm1 = {0 4 8 C 10 14 18 1C}
16370 incImm = 0xD951C840;
16371 incImm1 = 0xFFFF0000;
16372 break;
16373 }
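// A :uv immediate packs eight 4-bit values (one nibble per lane), so each lane
// offset must fit in 0..0xF. For example, incImm = 0xC840 yields per-lane
// offsets {0, 4, 8, C}. For activelanes = 8 the upper lanes need offsets
// 0x10..0x1C, which do not fit in a nibble; hence the second add below with
// incImm1, whose lanes are {0,0,0,0,F,F,F,F}, so the sums give
// {0, 4, 8, C, 10, 14, 18, 1C}.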
16374
16375 CVariable* immVar = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
16376 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16377 emitAddPair(gatherOff, eOffset, immVar);
16378 }
16379 else {
16380 m_encoder->SetNoMask();
16381 m_encoder->SetUniformSIMDSize(simdmode);
16382 m_encoder->SetSrcRegion(0, 0, 1, 0);
16383 m_encoder->Add(gatherOff, eOffset, immVar);
16384 m_encoder->Push();
16385 }
16386
16387 if (activelanes == 8) {
16388 CVariable* immVar1 = m_currShader->ImmToVariable(incImm1, ISA_TYPE_UV);
16389 m_encoder->SetNoMask();
16390 m_encoder->SetUniformSIMDSize(simdmode);
16391 m_encoder->SetSrcRegion(0, 8, 8, 1);
16392 m_encoder->Add(gatherOff, gatherOff, immVar1);
16393 m_encoder->Push();
16394 }
16395 }
16396
16397 m_encoder->SetNoMask();
16398 m_encoder->SetUniformSIMDSize(simdmode);
16399 if (useA32)
16400 {
16401 m_encoder->SetNoMask();
16402 m_encoder->SetUniformSIMDSize(simdmode);
16403 if (m_currShader->m_Platform->hasScratchSurface() &&
16404 align >= 4 &&
16405 (m_currShader->m_Platform->emulateByteScraterMsgForSS() || bEmulateDWAligned) &&
16406 (ESURFACE_SCRATCH == resource.m_surfaceType))
16407 {
16408 m_encoder->Gather4Scaled(gatherDst, resource, gatherOff);
16409 }
16410 else
16411 {
16412 m_encoder->ByteGather(gatherDst, resource, gatherOff, blkBits, nBlks);
16413 }
16414 }
16415 else
16416 {
16417 emitGatherA64(inst, gatherDst, gatherOff, blkBits, nBlks, srcUniform);
16418 }
16419 m_encoder->Push();
16420
16421 if (needTemp)
16422 {
16423 CVariable* newDst = m_currShader->GetNewAlias(
16424 gatherDst, destType, 0, m_destination->GetNumberElement());
16425 emitVectorCopy(m_destination, newDst, elts);
16426 }
16427 return;
16428 }
16429 CVariable* subLoadDst;
16430 CVariable* rawAddrVar;
16431
16432 // some driver describe constant buffer as typed which forces us to use byte scatter message
16433 bool forceByteScatteredRW =
16434 bufType == CONSTANT_BUFFER &&
16435 UsesTypedConstantBuffer(m_currShader->GetContext(), bufType);
16436
16437 VectorMessage VecMessInfo(this);
16438 VecMessInfo.getInfo(Ty, align, useA32, forceByteScatteredRW);
16439
16440 // Handle uniform case in general
16441 if (srcUniform)
16442 {
16443 // Use a full GRF of DWords (width8 lanes) as the payload width; only the
16444 // value of the first lane is used. NoMask must be set in order to have a
16445 // valid value in the first lane.
16446 uint32_t width8 = getGRFSize() / 4;
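// For example, width8 = 8 on platforms with a 32-byte GRF, and 16 with a 64-byte GRF.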
16447 for (uint32_t i = 0; i < VecMessInfo.numInsts; ++i)
16448 {
16449 // raw operand, eltOffBytes is in bytes.
16450 uint32_t eltOffBytes = VecMessInfo.insts[i].startByte;
16451 uint32_t blkInBytes = VecMessInfo.insts[i].blkInBytes;
16452 uint32_t numBlks = VecMessInfo.insts[i].numBlks;
16453
16454 uint32_t eltOff = eltOffBytes / eltBytes; // in unit of element
16455 uint32_t blkBits = 8 * blkInBytes;
16456 uint32_t instTotalBytes = blkInBytes * numBlks;
16457 uint32_t instElts = instTotalBytes / eltBytes;
16458 uint32_t nbelts = instElts * width8;
16459
16460 if (i > 0)
16461 {
16462 // Calculate the new element offset
16463 rawAddrVar = m_currShader->GetNewVariable(eOffset);
16464 CVariable* ImmVar = m_currShader->ImmToVariable(eltOffBytes, ISA_TYPE_UD);
16465 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16466 emitAddPair(rawAddrVar, eOffset, ImmVar);
16467 }
16468 else {
16469 m_encoder->SetNoMask();
16470 m_encoder->Add(rawAddrVar, eOffset, ImmVar);
16471 m_encoder->Push();
16472 }
16473 }
16474 else
16475 {
16476 rawAddrVar = eOffset;
16477 }
16478 CVariable* addrVarSIMD8 = m_currShader->GetNewVariable(
16479 getGRFSize() / 4, rawAddrVar->GetType(), EALIGN_GRF, CName::NONE);
16480 m_encoder->SetNoMask();
16481 m_encoder->SetSimdSize(lanesToSIMDMode(addrVarSIMD8->GetNumberElement()));
16482 m_encoder->Copy(addrVarSIMD8, rawAddrVar);
16483
16484 subLoadDst = m_currShader->GetNewVariable(
16485 (uint16_t)nbelts, destType, EALIGN_GRF, CName::NONE);
16486 m_encoder->SetNoMask();
16487 m_encoder->SetSimdSize(lanesToSIMDMode(addrVarSIMD8->GetNumberElement()));
16488 VectorMessage::MESSAGE_KIND messageType = VecMessInfo.insts[i].kind;
16489 switch (messageType) {
16490 case VectorMessage::MESSAGE_A32_BYTE_SCATTERED_RW:
16491 m_encoder->ByteGather(subLoadDst, resource, addrVarSIMD8, blkBits, numBlks);
16492 break;
16493 case VectorMessage::MESSAGE_A32_UNTYPED_SURFACE_RW:
16494 m_encoder->Gather4Scaled(subLoadDst, resource, addrVarSIMD8);
16495 break;
16496 case VectorMessage::MESSAGE_A64_UNTYPED_SURFACE_RW:
16497 emitGather4A64(inst, subLoadDst, addrVarSIMD8, true);
16498 break;
16499 case VectorMessage::MESSAGE_A64_SCATTERED_RW:
16500 emitGatherA64(inst, subLoadDst, addrVarSIMD8, blkBits, numBlks, true);
16501 break;
16502 default:
16503 IGC_ASSERT_MESSAGE(0, "Internal Error: unexpected message kind for load!");
16504 }
16505 m_encoder->Push();
16506
16507 for (uint32_t n = 0; n < instElts; ++n)
16508 {
16509 m_encoder->SetSrcRegion(0, 0, 1, 0);
16510 m_encoder->SetSrcSubReg(0, n * width8);
16511 m_encoder->SetDstSubReg(eltOff + (destUniform ? n : n * width));
16512 m_encoder->Copy(m_destination, subLoadDst);
16513 m_encoder->Push();
16514 }
16515 }
16516
16517 return;
16518 }
16519
16520 // Second, src isn't uniform
16521 for (uint32_t i = 0; i < VecMessInfo.numInsts; ++i)
16522 {
16523 // raw operand, eltOffBytes is in bytes.
16524 uint32_t eltOffBytes = VecMessInfo.insts[i].startByte * width;
16525 uint32_t blkInBytes = VecMessInfo.insts[i].blkInBytes;
16526 uint32_t numBlks = VecMessInfo.insts[i].numBlks;
16527 uint32_t eltOff = eltOffBytes / eltBytes;
16528 uint32_t blkBits = 8 * blkInBytes;
16529 uint32_t instTotalBytes = blkInBytes * numBlks;
16530 uint32_t instElts = instTotalBytes / eltBytes;
16531 uint32_t nbelts = instElts * width;
16532
16533 if (i > 0)
16534 {
16535 // Calculate the new element offset
16536 rawAddrVar = m_currShader->GetNewVariable(eOffset);
16537 CVariable* ImmVar = m_currShader->ImmToVariable(VecMessInfo.insts[i].startByte, ISA_TYPE_UD);
16538 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16539 emitAddPair(rawAddrVar, eOffset, ImmVar);
16540 }
16541 else {
16542 m_encoder->Add(rawAddrVar, eOffset, ImmVar);
16543 m_encoder->Push();
16544 }
16545 }
16546 else
16547 {
16548 rawAddrVar = eOffset;
16549 }
16550
16551 bool needTemp = (!IsGRFAligned(m_destination, EALIGN_GRF));
16552 CVariable* gatherDst;
16553 if (needTemp)
16554 {
16555 gatherDst = m_currShader->GetNewVariable(
16556 (uint16_t)nbelts, destType, EALIGN_GRF, CName::NONE);
16557 }
16558 else
16559 {
16560 // No need to copy, load directly into m_destination
16561 gatherDst = m_currShader->GetNewAlias(m_destination,
16562 destType, (uint16_t)eltOffBytes, (uint16_t)nbelts);
16563 }
16564 VectorMessage::MESSAGE_KIND messageType = VecMessInfo.insts[i].kind;
16565 switch (messageType) {
16566 case VectorMessage::MESSAGE_A32_BYTE_SCATTERED_RW:
16567 m_encoder->ByteGather(gatherDst, resource, rawAddrVar, blkBits, numBlks);
16568 break;
16569 case VectorMessage::MESSAGE_A32_UNTYPED_SURFACE_RW:
16570 m_encoder->Gather4Scaled(gatherDst, resource, rawAddrVar);
16571 break;
16572 case VectorMessage::MESSAGE_A64_UNTYPED_SURFACE_RW:
16573 emitGather4A64(inst, gatherDst, rawAddrVar, false);
16574 break;
16575 case VectorMessage::MESSAGE_A64_SCATTERED_RW:
16576 emitGatherA64(inst, gatherDst, rawAddrVar, blkBits, numBlks, false);
16577 break;
16578 default:
16579 IGC_ASSERT_MESSAGE(0, "Internal Error: unexpected message kind for load!");
16580 }
16581 m_encoder->Push();
16582
16583 if (needTemp)
16584 {
16585 emitVectorCopy(m_destination, gatherDst, instElts, eltOff, 0);
16586 }
16587 }
16588 }
16589
16590 void EmitPass::emitVectorStore(StoreInst* inst, Value* offset, ConstantInt* immOffset)
16591 {
16592 int immOffsetInt = 0;
16593 if (immOffset)
16594 immOffsetInt = static_cast<int>(immOffset->getSExtValue());
16595
16596 Value* Ptr = inst->getPointerOperand();
16597 PointerType* ptrType = cast<PointerType>(Ptr->getType());
16598
16599 ResourceDescriptor resource = GetResourceVariable(Ptr);
16600 CountStatelessIndirectAccess(Ptr, resource);
16601 if (ptrType->getPointerAddressSpace() != ADDRESS_SPACE_PRIVATE)
16602 {
16603 ForceDMask(false);
16604 }
16605 // eOffset is in bytes
16606 // offset corresponds to Int2Ptr operand obtained during pattern matching
16607 CVariable* eOffset = GetSymbol(immOffset ? offset : Ptr);
16608 bool useA32 = !isA64Ptr(ptrType, m_currShader->GetContext());
16609 if (useA32)
16610 {
16611 eOffset = TruncatePointer(eOffset);
16612 }
16613
16614 // If eOffset isn't GRF-aligned, a copy needs to be created.
16615 // A non-uniform variable should already be GRF-aligned.
16616 eOffset = ReAlignUniformVariable(eOffset, EALIGN_GRF);
16617
16618 Value* storedVal = inst->getValueOperand();
16619 Type* Ty = storedVal->getType();
16620 IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
16621 Type* eltTy = VTy ? VTy->getElementType() : Ty;
16622 uint32_t eltBytes = GetScalarTypeSizeInRegister(eltTy);
16623
16624 IGC_ASSERT_MESSAGE((eltBytes == 1) || (eltBytes == 2) || (eltBytes == 4) || (eltBytes == 8),
16625 "Store type must be 1/2/4/8-bytes long");
16626
16627 uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
16628 uint32_t totalBytes = elts * eltBytes;
16629 unsigned align = inst->getAlignment();
16630 CVariable* storedVar = GetSymbol(storedVal);
16631 unsigned int width = numLanes(m_currShader->m_SIMDSize);
16632
16633 bool srcUniform = storedVar->IsUniform();
16634 bool dstUniform = eOffset->IsUniform();
16635
16636 // Handle two cases:
16637 // 1. less than 4 bytes: need to extend it to 4 bytes
16638 // 2. destination is scalar and uniform (handle vector if needed)
16639 if (totalBytes < 4 || isUniformStoreOCL(inst))
16640 {
16641 IGC_ASSERT_MESSAGE((totalBytes == 1) || (totalBytes == 2) || (totalBytes == 4) || (totalBytes == 8) || (totalBytes == 12) || (totalBytes == 16),
16642 "Wrong total Bytes!");
16643
16644 SIMDMode simdmode = SIMDMode::SIMD1;
16645 e_alignment grfAlign = useA32 ? EALIGN_GRF : EALIGN_2GRF;
16646 uint32_t blkBits, nBlks;
16647 if (elts > 1)
16648 {
16649 // Vector uniform store: handle uniform value only.
16650 // For elts > 1, the eltBytes must be either 4 or 8; only elts = 2|3|4 are handled.
16651 IGC_ASSERT_MESSAGE((eltBytes == 4) || (eltBytes == 8), "ICE: wrong element bytes!");
16652 IGC_ASSERT_MESSAGE(dstUniform, "ICE: for vector uniform store, both dst and src must be uniform!");
16653 IGC_ASSERT_MESSAGE(srcUniform, "ICE: for vector uniform store, both dst and src must be uniform!");
16654
16655 // As we use SIMD8 for vectors (SKL HW WA), converting DW to QW
16656 // makes sense only if the final value is a scalar (a single QW).
16657 bool useQW = (!useA32) &&
16658 (eltBytes == 8 || // requested by vector layout
16659 (eltBytes == 4 && totalBytes == 8 && align >= 8)); // convert DW to QW
16660
16661 // activelanes is the number of lanes that are needed.
16662 // nbelts is activelanes rounded up to the next power of 2.
16663 uint16_t activelanes = useQW ? (totalBytes / 8) : (totalBytes / 4);
16664 uint16_t nbelts = (activelanes == 3 ? 4 : activelanes);
16665
16666 // Work around a possible SKL HW bug when using send(4) for "store <4xi32> v, *p":
16667 // use SIMD8 for A64 vector stores to get around
16668 // this issue.
16669
16670 // This is the SIMD mode we want, but we need to work around the A64 HW bug
16671 SIMDMode simdWanted = lanesToSIMDMode(nbelts);
16672 uint16_t nbeltsWanted = nbelts;
16673 if (!useA32 && nbelts > 1) {
16674 nbelts = 8;
16675 }
16676 simdmode = lanesToSIMDMode(nbelts);
16677
16678 // compute offset
16679 // We have the following :
16680 // lane 0 1 2 3
16681 // eOff 0 4 8 C // DW per lane
16682 // eOff 0 8 // QW per lane
16683 // When elts = 3, lane 3 is not used. Since we don't have simd3,
16684 // use simd4 and set lane3 to the same as lane2(8).
16685 //
16686 // When using simd8, all unused lanes will be the same as lane0.
16687 // Make sure offset & stored value are correctly set up.
16688 if (nbelts > 1)
16689 {
16690 CVariable* NewOff = m_currShader->GetNewVariable(
16691 nbelts, eOffset->GetType(), grfAlign, true /*dstUniform*/, CName::NONE);
16692 uint32_t incImm =
16693 useQW ? 0x80 : (activelanes == 2 ? 0x40 : (activelanes == 3 ? 0x8840 : 0xC840));
16694 CVariable* immVar = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
16695
16696 // When working around the A64 SKL silicon limitation on SIMD4, we use SIMD8
16697 // (nbelts > nbeltsWanted), in which all upper four channels hold eOffset[0].
16698 // Later, the stored value must use storedVar[0] for those extra lanes.
16699 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16700 emitAddPair(NewOff, eOffset, immVar);
16701 }
16702 else {
16703 m_encoder->SetNoMask();
16704 m_encoder->SetUniformSIMDSize(simdmode);
16705 m_encoder->SetSrcRegion(0, 0, 1, 0);
16706 m_encoder->Add(NewOff, eOffset, immVar);
16707 m_encoder->Push();
16708 }
16709
16710 eOffset = NewOff;
16711 }
16712 else
16713 {
16714 eOffset = ReAlignUniformVariable(eOffset, grfAlign);
16715 }
16716
16717
16718 // (We could have useQW == true AND eltBytes == 4. Note that if useQW
16719 // is false, eltBytes must be 4.)
16720 IGC_ASSERT_MESSAGE(useQW || (eltBytes == 4), "ICE: wrong vector element type!");
16721
16722 // Since we might change element type, need to create copy.
16723 if (useQW && eltBytes == 4)
16724 {
16725 CVariable* tmp = m_currShader->GetNewVariable(
16726 nbeltsWanted, ISA_TYPE_UQ, grfAlign, true /*srcUniform*/, CName::NONE);
16727 CVariable* tmpAlias = m_currShader->GetNewAlias(tmp,
16728 storedVar->GetType(), 0, 2 * nbeltsWanted);
16729 IGC_ASSERT_MESSAGE((2 * nbeltsWanted) == storedVar->GetNumberElement(),
16730 "Mismatch of the number of elements: sth wrong!");
16731 emitVectorCopy(tmpAlias, storedVar, 2 * nbeltsWanted);
16732 storedVar = tmp;
16733 }
16734
16735 // Prepare stored value
16736 if (storedVar->IsImmediate() || activelanes < nbelts ||
16737 !IsGRFAligned(storedVar, grfAlign))
16738 {
16739 CVariable* NewVar = m_currShader->GetNewVariable(
16740 nbelts, storedVar->GetType(), grfAlign, true /*srcUniform*/, CName::NONE);
16741
16742 // A64 SKL HW issue work-around: set remaining lanes to storedVar[0]
16743 // as eOffset has been set to the first element already.
16744 if (nbeltsWanted < nbelts)
16745 {
16746 m_encoder->SetNoMask();
16747 m_encoder->SetUniformSIMDSize(simdmode);
16748 m_encoder->SetSrcRegion(0, 0, 1, 0);
16749 m_encoder->Copy(NewVar, storedVar);
16750 m_encoder->Push();
16751 }
16752
16753 // Values that we care about
16754 if (activelanes == 3)
16755 {
16756 m_encoder->SetNoMask();
16757 m_encoder->SetUniformSIMDSize(SIMDMode::SIMD2);
16758 m_encoder->SetSrcRegion(0, 2, 2, 1);
16759 m_encoder->Copy(NewVar, storedVar);
16760 m_encoder->Push();
16761
16762 // offset is 0x8840, so duplicate lane2
16763 m_encoder->SetNoMask();
16764 m_encoder->SetUniformSIMDSize(SIMDMode::SIMD2);
16765 m_encoder->SetDstSubReg(2);
16766 m_encoder->SetSrcSubReg(0, 2);
16767 m_encoder->SetSrcRegion(0, 0, 1, 0);
16768 m_encoder->Copy(NewVar, storedVar);
16769 m_encoder->Push();
16770 }
16771 else
16772 {
16773 m_encoder->SetNoMask();
16774 m_encoder->SetUniformSIMDSize(simdWanted);
16775 m_encoder->SetSrcRegion(0, nbeltsWanted, nbeltsWanted, 1);
16776 m_encoder->Copy(NewVar, storedVar);
16777 m_encoder->Push();
16778 }
16779 storedVar = NewVar;
16780 }
16781
16782 // each lane will store either DW or QW
16783 blkBits = useQW ? (align >= 8 ? 64 : 8)
16784 : (!useA32 && align >= 4) ? 32 : 8;
16785 nBlks = useQW ? (64 / blkBits) : (32 / blkBits);
16786 }
16787 else
16788 {
16789 // scalar case (elts == 1)
16790 if (dstUniform)
16791 {
16792 eOffset = ReAlignUniformVariable(eOffset, grfAlign);
16793 if (!srcUniform)
16794 {
16795 storedVar = UniformCopy(storedVar);
16796 }
16797 else
16798 {
16799 storedVar = ReAlignUniformVariable(storedVar, grfAlign);
16800 }
16801 storedVar = ExtendVariable(storedVar, grfAlign);
16802 }
16803 else
16804 {
16805 storedVar = BroadcastAndExtend(storedVar);
16806 }
16807
16808 // use either A32 byte scatter or A64 scatter messages.
16809 // A32 should use byte as block size always here.
16810 // A64 uses byte/DW/QW as block size based on align and element size.
16811 // Note that this is for elts = 1, so totalBytes is bytes per-lane.
16812 blkBits = useA32 ? 8 : ((eltBytes >= 4 && align >= eltBytes) ? eltBytes * 8 : 8);
16813 nBlks = (totalBytes * 8) / blkBits;
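// For example (illustrative): an A64 store of a single i64 with align >= 8 uses
// blkBits = 64, nBlks = 1; an A32 store of a single i32 uses blkBits = 8, nBlks = 4.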
16814 }
16815 setPredicateForDiscard();
16816
16817 if (useA32)
16818 {
16819 m_encoder->ByteScatter(storedVar, resource, eOffset, blkBits, nBlks);
16820 }
16821 else
16822 {
16823 emitScatterA64(storedVar, eOffset, blkBits, nBlks, true);
16824 }
16825
16826 if (dstUniform)
16827 {
16828 m_encoder->SetNoMask();
16829 m_encoder->SetUniformSIMDSize(simdmode);
16830 }
16831 m_encoder->Push();
16832 }
16833 else
16834 {
16835 eOffset = BroadcastIfUniform(eOffset);
16836 storedVar = BroadcastIfUniform(storedVar);
16837
16838 VectorMessage VecMessInfo(this);
16839 VecMessInfo.getInfo(Ty, align, useA32);
16840
16841 for (uint32_t i = 0; i < VecMessInfo.numInsts; ++i)
16842 {
16843 // raw operand, eltOff is in bytes
16844 uint32_t eltOffBytes = VecMessInfo.insts[i].startByte * width;
16845 uint32_t blkInBytes = VecMessInfo.insts[i].blkInBytes;
16846 uint32_t numBlks = VecMessInfo.insts[i].numBlks;
16847 uint32_t blkBits = 8 * blkInBytes;
16848 uint32_t instTotalBytes = blkInBytes * numBlks;
16849 uint32_t instElts = instTotalBytes / eltBytes;
16850 uint32_t nbelts = instElts * width;
16851
16852 CVariable* rawAddrVar;
16853 if (i > 0)
16854 {
16855 // Calculate the new element offset
16856 rawAddrVar = m_currShader->GetNewVariable(eOffset);
16857 CVariable* ImmVar = m_currShader->ImmToVariable(VecMessInfo.insts[i].startByte, ISA_TYPE_UD);
16858 if (!useA32 && m_currShader->m_Platform->hasNoInt64AddInst()) {
16859 emitAddPair(rawAddrVar, eOffset, ImmVar);
16860 }
16861 else {
16862 m_encoder->Add(rawAddrVar, eOffset, ImmVar);
16863 m_encoder->Push();
16864 }
16865 }
16866 else
16867 {
16868 rawAddrVar = eOffset;
16869 }
16870 setPredicateForDiscard();
16871 VISA_Type storedType = storedVar->GetType();
16872 IGC_ASSERT_MESSAGE((eltOffBytes < (UINT16_MAX)), "eltOffBytes must fit in 16 bits");
16873 IGC_ASSERT_MESSAGE((nbelts < (UINT16_MAX)), "nbelts must fit in 16 bits");
16874 CVariable* subStoredVar = m_currShader->GetNewAlias(storedVar, storedType, (uint16_t)eltOffBytes, (uint16_t)nbelts);
16875 switch (VecMessInfo.insts[i].kind) {
16876 case VectorMessage::MESSAGE_A32_BYTE_SCATTERED_RW:
16877 m_encoder->ByteScatter(subStoredVar, resource, rawAddrVar, blkBits, numBlks);
16878 break;
16879 case VectorMessage::MESSAGE_A32_UNTYPED_SURFACE_RW:
16880 m_encoder->Scatter4Scaled(subStoredVar, resource, rawAddrVar);
16881 break;
16882 case VectorMessage::MESSAGE_A64_UNTYPED_SURFACE_RW:
16883 emitScatter4A64(subStoredVar, rawAddrVar, false);
16884 break;
16885 case VectorMessage::MESSAGE_A64_SCATTERED_RW:
16886 emitScatterA64(subStoredVar, rawAddrVar, blkBits, numBlks, false);
16887 break;
16888 default:
16889 IGC_ASSERT_MESSAGE(0, "Internal Error: unexpected Message kind for store");
16890 }
16891 m_encoder->Push();
16892 }
16893 }
16894 if (ptrType->getPointerAddressSpace() != ADDRESS_SPACE_PRIVATE)
16895 {
16896 ResetVMask(false);
16897 }
16898 }
16899
16900 // prepareAddressForUniform(): for both load and store
16901 // prepareDataForUniform(): for store only
16902 // Unaligned (less than 4 bytes) uniform load/store. One for address payload,
16903 // and the other for data payload.
16904 //
16905 // Example 1: "store <4xi32> V, <4xi32>* P, align 2"
16906 // A new pointer pVar is created with 4 elements.
16907 //
16908 // add (4|M0_NM) pVar<1>:ud P<0;1,0>:UD 0xC840:UV
16909 // send (4|M0_NM) pVar V
16910 //
16911 // prepareAddressForUniform() : create pVar
16912 // prepareDataForUniform() : return V (assuming V can be used directly)
16913 //
16914 // Example 2: "store <3xi32> V, <3xi32>* P, align 2"
16915 // Non-power of 2 vector size is rounded up to the next power of 2.
16916 // Additional elements are duplicated with the first vector element.
16917
16918 // add (4|M0_NM) pVar<1>:ud P<0;1,0>:UD 0x0840:UV
16919 // mov (4|M0_NM) vVar<1>:ud V<0;1,0>:ud
16920 // mov (2|M0_NM) vVar<1>:ud V<1;1,0>:ud
16921 // mov (1|M0_NM) vVar.2<1>:ud V.2<1;1,0>:ud
16922 // send (4|M0_NM) vVar pVar
16923 //
16924 // prepareAddressForUniform() : create pVar
16925 // prepareDataForUniform() : return vVar
16926 //
16927 // This function handles vector size up to 8. It also handles QW element size.
16928 // When vector size > 4, it uses 0x76543210, left-shifted by 2 (DW) or 3 (QW)
16929 // as an immediate to be added to 'AddrVar' to form a new address var.
16930 //
16931 // In addition, if 64bit add is not supported, emitAddPair() will be used to
16932 // use 32bit add/addc to emulate 64bit add.
16933 //
16934 // Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is
16935 // its return var. The argument 'DataVar' in prepareDataForUniform() is uniform,
16936 // so is its return var.
16937 //
16938 CVariable* EmitPass::prepareAddressForUniform(
16939 CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t RequiredNElts, e_alignment Align)
16940 {
16941 // If RequiredNElts == 0, use the next power of 2 of NElts as the return var's number of elements;
16942 // otherwise, use RequiredNElts as the return var's number of elements.
16943 uint32_t pow2NElts = (uint32_t)PowerOf2Ceil(NElts);
16944 uint32_t allocNElts = (RequiredNElts > 0 ? RequiredNElts : pow2NElts);
16945 IGC_ASSERT(NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
16946 IGC_ASSERT(allocNElts >= pow2NElts);
16947 if (allocNElts == NElts && NElts == 1 && IsGRFAligned(AddrVar, Align))
16948 {
16949 // No need to create a new var.
16950 return AddrVar;
16951 }
16952 bool isA64 = (AddrVar->GetElemSize() == 8);
16953 SIMDMode simdmode = lanesToSIMDMode(pow2NElts);
16954 CVariable* newVar = m_currShader->GetNewVariable(allocNElts, AddrVar->GetType(), Align, true, CName::NONE);
16955
16956 CVariable* off;
16957 uint32_t incImm = (0x76543210 & maskTrailingOnes<uint32_t>(NElts * 4));
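// For example (illustrative): NElts = 4 keeps the low four nibbles, 0x3210;
// scaled by EltBytes = 4 (the shift below, or the mul in the else branch)
// this becomes 0xC840, i.e. per-lane byte offsets {0, 4, 8, 12}.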
16958 if ((pow2NElts <= 4 && EltBytes == 4) || (pow2NElts <= 2 && EltBytes == 8))
16959 {
16960 // This case needs a single UV immediate
16961 incImm = incImm << (EltBytes == 4 ? 2 : 3);
16962 off = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
16963 }
16964 else
16965 {
16966 // Need a temporary var to calculate offsets.
16967 // (Note that the temp is non-uniform, otherwise emitAddPair() won't work.)
16968 off = m_currShader->GetNewVariable(pow2NElts, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
16969
16970 // Need a mov and mul
16971 m_encoder->SetNoMask();
16972 m_encoder->SetSimdSize(simdmode);
16973 m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
16974 m_encoder->Push();
16975
16976 m_encoder->SetNoMask();
16977 m_encoder->SetSimdSize(simdmode);
16978 m_encoder->SetSrcRegion(0, 1, 1, 0);
16979 m_encoder->SetSrcRegion(1, 0, 1, 0);
16980 m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
16981 m_encoder->Push();
16982 }
16983
16984 // Only need to initialize pow2NElts elements.
16985 if (allocNElts > pow2NElts)
16986 {
16987 newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
16988 }
16989
16990 // Currently, splitting is impossible because NElts <= 8. In the future, NElts
16991 // could be 32 and we may need to split.
16992 bool needSplit = ((pow2NElts * newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
16993 if (needSplit)
16994 {
16995 IGC_ASSERT(!off->IsImmediate());
16996 uint32_t halfNElts = pow2NElts / 2;
16997 uint32_t bytes1 = halfNElts * newVar->GetElemSize();
16998 uint32_t bytes2 = halfNElts * off->GetElemSize();
16999 CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, halfNElts);
17000 CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, halfNElts);
17001 CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, halfNElts);
17002 CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, halfNElts);
17003
17004 if (isA64 && m_currShader->m_Platform->hasNoInt64AddInst())
17005 {
17006 emitAddPair(newVarHi, AddrVar, offHi);
17007 emitAddPair(newVarLo, AddrVar, offLo);
17008 }
17009 else
17010 {
17011 SIMDMode sm = lanesToSIMDMode(halfNElts);
17012 m_encoder->SetNoMask();
17013 m_encoder->SetUniformSIMDSize(sm);
17014 m_encoder->SetSrcRegion(0, 0, 1, 0);
17015 m_encoder->SetSrcRegion(1, 1, 1, 0);
17016 m_encoder->Add(newVarHi, AddrVar, offHi);
17017 m_encoder->Push();
17018
17019 m_encoder->SetNoMask();
17020 m_encoder->SetUniformSIMDSize(sm);
17021 m_encoder->SetSrcRegion(0, 0, 1, 0);
17022 m_encoder->SetSrcRegion(1, 1, 1, 0);
17023 m_encoder->Add(newVarLo, AddrVar, offLo);
17024 m_encoder->Push();
17025 }
17026 }
17027 else if (isA64 && m_currShader->m_Platform->hasNoInt64AddInst() && pow2NElts > 1)
17028 {
17029 emitAddPair(newVar, AddrVar, off);
17030 }
17031 else
17032 {
17033 m_encoder->SetNoMask();
17034 m_encoder->SetUniformSIMDSize(simdmode);
17035 m_encoder->SetSrcRegion(0, 0, 1, 0);
17036 m_encoder->SetSrcRegion(1, 1, 1, 0);
17037 if (pow2NElts > 1) {
17038 m_encoder->Add(newVar, AddrVar, off);
17039 }
17040 else {
17041 m_encoder->Copy(newVar, AddrVar);
17042 }
17043 m_encoder->Push();
17044 }
17045 return newVar;
17046 }
17047
17048 CVariable* EmitPass::prepareDataForUniform(
17049 CVariable* DataVar, uint32_t RequiredNElts, e_alignment Align)
17050 {
17051 uint32_t NElts = DataVar->GetNumberElement();
17052 uint32_t EltBytes = DataVar->GetElemSize();
17053 uint32_t pow2NElts = (uint32_t)PowerOf2Ceil(NElts);
17054 uint32_t allocNElts = RequiredNElts > 0 ? RequiredNElts : pow2NElts;
17055 IGC_ASSERT(allocNElts >= pow2NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
17056 if (NElts == allocNElts && !DataVar->IsImmediate() && IsGRFAligned(DataVar, Align))
17057 {
17058 return DataVar;
17059 }
17060 CVariable* newVar = m_currShader->GetNewVariable(allocNElts, DataVar->GetType(), Align, true, CName::NONE);
17061
17062 // Need to return a var with pow2NElts elements
17063 if (allocNElts > pow2NElts)
17064 {
17065 newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
17066 }
17067
17068 // Initialize with DataVar's first element (lanes from NElts upward are set to the first element).
17069 bool initWithElem0 = (pow2NElts > NElts);
17070 bool needSplit = ((pow2NElts *newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
17071 if (initWithElem0)
17072 {
17073 if (needSplit)
17074 {
17075 uint32_t esz = pow2NElts / 2;
17076 uint32_t bytes = esz * newVar->GetElemSize();
17077 CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
17078 CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
17079
17080 m_encoder->SetNoMask();
17081 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
17082 m_encoder->SetSrcRegion(0, 0, 1, 0);
17083 m_encoder->Copy(newVarHi, DataVar);
17084 m_encoder->Push();
17085
17086 m_encoder->SetNoMask();
17087 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
17088 m_encoder->SetSrcRegion(0, 0, 1, 0);
17089 m_encoder->Copy(newVarLo, DataVar);
17090 m_encoder->Push();
17091 }
17092 else
17093 {
17094 m_encoder->SetNoMask();
17095 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(pow2NElts));
17096 m_encoder->SetSrcRegion(0, 0, 1, 0);
17097 m_encoder->Copy(newVar, DataVar);
17098 m_encoder->Push();
17099 }
17100 }
17101
17102 if (!initWithElem0 || NElts != 1)
17103 {
17104 emitVectorCopy(newVar, DataVar, NElts);
17105 }
17106 return newVar;
17107 }
17108
17109
17110 void EmitPass::emitVectorCopy(CVariable* Dst, CVariable* Src, uint32_t nElts,
17111 uint32_t DstSubRegOffset, uint32_t SrcSubRegOffset)
17112 {
17113 unsigned int width = numLanes(m_currShader->m_SIMDSize);
17114 bool srcUniform = Src->IsUniform();
17115 bool dstUniform = Dst->IsUniform();
17116 unsigned doff = DstSubRegOffset, soff = SrcSubRegOffset;
17117
17118 // Uniform vector copy.
17119 if (srcUniform && dstUniform)
17120 {
17121 // The starting index of elements to be copied.
17122 unsigned i = 0;
17123 auto partialCopy = [=, &i](SIMDMode mod)
17124 {
17125 unsigned w = numLanes(mod);
17126 if (i + w > nElts)
17127 {
17128 return false;
17129 }
17130
17131 unsigned vStride = (mod == SIMDMode::SIMD1) ? 0 : 1;
17132 m_encoder->SetUniformSIMDSize(mod);
17133 m_encoder->SetSrcRegion(0, vStride, 1, 0);
17134 m_encoder->SetSrcSubReg(0, soff + i);
17135 m_encoder->SetDstSubReg(doff + i);
17136 m_encoder->Copy(Dst, Src);
17137 m_encoder->Push();
17138
17139 i += w;
17140 return true;
17141 };
17142
17143 // We may select the initial simd size based on the element type.
17144 while (partialCopy(SIMDMode::SIMD8))
17145 ;
17146 partialCopy(SIMDMode::SIMD4);
17147 partialCopy(SIMDMode::SIMD2);
17148 partialCopy(SIMDMode::SIMD1);
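// For example (illustrative): nElts = 7 is copied as one SIMD4, one SIMD2 and
// one SIMD1 mov (SIMD8 does not fit), covering elements 0..3, 4..5, and 6.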
17149 return;
17150 }
17151
17152 for (uint32_t i = 0; i < nElts; ++i)
17153 {
17154 uint SrcSubReg = srcUniform ? soff + i : soff + width * i;
17155 uint DstSubReg = dstUniform ? doff + i : doff + width * i;
17156
17157 uint SrcWidth = srcUniform ? 1 : width;
17158 uint DstWidth = dstUniform ? 1 : width;
17159
17160 if (SrcSubReg >= Src->GetNumberElement() ||
17161 DstSubReg >= Dst->GetNumberElement())
17162 {
17163 break;
17164 }
17165
17166 bool SrcOverflow = (SrcSubReg + SrcWidth > Src->GetNumberElement());
17167 bool DstOverflow = (DstSubReg + DstWidth > Dst->GetNumberElement());
17168
17169 // This is currently used for VME payloads whose LLVM type doesn't
17170 // necessarily match the associated CVariable size (the LLVM type
17171 // will be at least as big as the CVariable). Here, we make sure that,
17172 // if an entire vector element is not copied, we emit movs to just
17173 // read or write the appropriate number of bytes.
17174 if (SrcOverflow || DstOverflow)
17175 {
17176 if (srcUniform)
17177 {
17178 auto partialCopy = [&](SIMDMode mode)
17179 {
17180 unsigned w = numLanes(mode);
17181
17182 if (DstSubReg + w > Dst->GetNumberElement())
17183 return;
17184
17185 m_encoder->SetSimdSize(mode);
17186 m_encoder->SetSrcSubReg(0, SrcSubReg);
17187 m_encoder->SetDstSubReg(DstSubReg);
17188 m_encoder->Copy(Dst, Src);
17189 m_encoder->Push();
17190
17191 DstSubReg += w;
17192 };
17193
17194 partialCopy(SIMDMode::SIMD8);
17195 partialCopy(SIMDMode::SIMD4);
17196 partialCopy(SIMDMode::SIMD2);
17197 partialCopy(SIMDMode::SIMD1);
17198 }
17199 else
17200 {
17201 auto partialCopy = [&](SIMDMode mode)
17202 {
17203 unsigned w = numLanes(mode);
17204
17205 if (DstSubReg + w > Dst->GetNumberElement() ||
17206 SrcSubReg + w > Src->GetNumberElement())
17207 return;
17208
17209 m_encoder->SetSimdSize(mode);
17210 m_encoder->SetSrcSubReg(0, SrcSubReg);
17211 m_encoder->SetDstSubReg(DstSubReg);
17212 m_encoder->Copy(Dst, Src);
17213 m_encoder->Push();
17214
17215 DstSubReg += w;
17216 SrcSubReg += w;
17217 };
17218
17219 partialCopy(SIMDMode::SIMD8);
17220 partialCopy(SIMDMode::SIMD4);
17221 partialCopy(SIMDMode::SIMD2);
17222 partialCopy(SIMDMode::SIMD1);
17223 }
17224
17225 break;
17226 }
17227
17228 m_encoder->SetSrcSubReg(0, SrcSubReg);
17229 m_encoder->SetDstSubReg(DstSubReg);
17230 m_encoder->Copy(Dst, Src);
17231 m_encoder->Push();
17232 }
17233 }
17234
17235 // Handle Copy intrinsic
17236 void EmitPass::emitGenISACopy(GenIntrinsicInst* GenCopyInst)
17237 {
17238 CVariable* Dst = m_destination;
17239 CVariable* Src = GetSymbol(GenCopyInst->getArgOperand(0));
17240 Type* Ty = GenCopyInst->getType();
17241 emitCopyAll(Dst, Src, Ty);
17242 }
17243
17244 // Push a new frame onto the stack by:
17245 // Update FP to the current SP
17246 // Increment SP by pushSize
17247 // Store value of previous frame's FP to the address of updated FP (for stack-walk)
17248 void EmitPass::emitPushFrameToStack(unsigned& pushSize)
17249 {
17250 CVariable* pFP = m_currShader->GetFP();
17251 CVariable* pSP = m_currShader->GetSP();
17252
17253 // Set FP = SP
17254 m_encoder->Copy(pFP, pSP);
17255 m_encoder->Push();
17256
17257 if (IGC_IS_FLAG_ENABLED(EnableWriteOldFPToStack))
17258 {
17259 // Allocate 1 extra oword to store previous frame's FP
17260 pushSize += SIZE_OWORD;
17261 }
17262
17263 // Since we use unaligned oword writes, pushSize should be an OWord-aligned address
17264 if (pushSize % SIZE_OWORD > 0)
17265 pushSize += (SIZE_OWORD - (pushSize % SIZE_OWORD));
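// For example (illustrative): pushSize = 40 is rounded up to 48, the next
// multiple of an OWord (16 bytes).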
17266
17267 // Update SP by pushSize
17268 emitAddPointer(pSP, pSP, m_currShader->ImmToVariable(pushSize, ISA_TYPE_UD));
17269
17270 if (IGC_IS_FLAG_ENABLED(EnableWriteOldFPToStack))
17271 {
17272 // Store old FP value to current FP
17273 CVariable* pOldFP = m_currShader->GetPrevFP();
17274 // If previous FP is null (for kernel frame), we initialize it to 0
17275 if (pOldFP == nullptr)
17276 {
17277 pOldFP = m_currShader->GetNewVariable(pFP);
17278 m_encoder->Copy(pOldFP, m_currShader->ImmToVariable(0, ISA_TYPE_UQ));
17279 m_encoder->Push();
17280 }
17281
17282 pFP = ReAlignUniformVariable(pFP, EALIGN_GRF);
17283 {
17284 m_encoder->OWStoreA64(pOldFP, pFP, SIZE_OWORD, 0);
17285 m_encoder->Push();
17286 }
17287 }
17288 }
17289
17290 void EmitPass::emitAddPointer(CVariable* Dst, CVariable* Src, CVariable* offset)
17291 {
17292 if (m_currShader->m_Platform->hasNoInt64AddInst() &&
17293 (Dst->GetType() == ISA_TYPE_Q || Dst->GetType() == ISA_TYPE_UQ) &&
17294 (Src->GetType() == ISA_TYPE_Q || Src->GetType() == ISA_TYPE_UQ))
17295 {
17296 emitAddPair(Dst, Src, offset);
17297 }
17298 else
17299 {
17300 m_encoder->Add(Dst, Src, offset);
17301 m_encoder->Push();
17302 }
17303 }
17304
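// Emulate a 64-bit add on platforms without a native int64 add: split each
// 64-bit source into 32-bit halves, add the low halves (producing a carry),
// add the carry and the high halves via the encoder's AddPair, then reassemble.
// For example (illustrative): 0x00000001'FFFFFFFF + 2 gives Lo = 0x00000001
// with a carry out, so Hi = 0x00000001 + 1 = 0x00000002, i.e. the 64-bit
// result 0x00000002'00000001.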
17305 void EmitPass::emitAddPair(CVariable* Dst, CVariable* Src0, CVariable* Src1) {
17306 IGC_ASSERT(Dst->GetType() == ISA_TYPE_Q || Dst->GetType() == ISA_TYPE_UQ);
17307 IGC_ASSERT(Src0->GetType() == ISA_TYPE_Q || Src0->GetType() == ISA_TYPE_UQ);
17308 IGC_ASSERT(Src1->GetType() == ISA_TYPE_UV || Src1->GetType() == ISA_TYPE_UD || Src1->GetType() == ISA_TYPE_D);
17309
17310 bool IsUniformDst = Dst->IsUniform();
17311
17312 unsigned short NumElts = Dst->GetNumberElement();
17313 SIMDMode Mode = lanesToSIMDMode(NumElts);
17314
17315 VISA_Type NewType = ISA_TYPE_UD;
17316 CVariable* SrcAlias = m_currShader->GetNewAlias(Src0, NewType, 0, 0);
17317 CVariable* L0 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src0->getName(), "Lo32"));
17318 CVariable* H0 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src0->getName(), "Hi32"));
17319
17320 // Split Src0 into L0 and H0
17321 // L0 := Offset[0];
17322 if (IsUniformDst) {
17323 m_encoder->SetNoMask();
17324 m_encoder->SetUniformSIMDSize(Mode);
17325 }
17326 if (Src0->IsUniform())
17327 m_encoder->SetSrcRegion(0, 0, 1, 0);
17328 else
17329 m_encoder->SetSrcRegion(0, 2, 1, 0);
17330 m_encoder->Copy(L0, SrcAlias);
17331 m_encoder->Push();
17332 // H0 := Offset[1];
17333 if (IsUniformDst) {
17334 m_encoder->SetNoMask();
17335 m_encoder->SetUniformSIMDSize(Mode);
17336 }
17337 m_encoder->SetSrcSubReg(0, 1);
17338 if (Src0->IsUniform())
17339 m_encoder->SetSrcRegion(0, 0, 1, 0);
17340 else
17341 m_encoder->SetSrcRegion(0, 2, 1, 0);
17342 m_encoder->Copy(H0, SrcAlias);
17343 m_encoder->Push();
17344
17345 // If Src1 is a signed value, sign-extend it into L1 and H1. Otherwise we can
17346 // ignore its high 32-bit part, which will be all zeros.
17347 CVariable* L1 = nullptr;
17348 CVariable* H1 = nullptr;
17349 if (Src1->GetType() == ISA_TYPE_D) {
17350 L1 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src1->getName(), "Lo32"));
17351 H1 = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Src1->getName(), "Hi32"));
17352
17353 // L1 := Offset[0];
17354 if (IsUniformDst) {
17355 m_encoder->SetNoMask();
17356 m_encoder->SetUniformSIMDSize(Mode);
17357 }
17358 if (Src1->IsUniform())
17359 m_encoder->SetSrcRegion(0, 0, 1, 0);
17360 else
17361 m_encoder->SetSrcRegion(0, 1, 1, 0);
17362 m_encoder->Copy(L1, Src1);
17363 m_encoder->Push();
17364 // H1 := Offset[1];
17365 if (IsUniformDst) {
17366 m_encoder->SetNoMask();
17367 m_encoder->SetUniformSIMDSize(Mode);
17368 }
17369 if (Src1->IsUniform())
17370 m_encoder->SetSrcRegion(0, 0, 1, 0);
17371 else
17372 m_encoder->SetSrcRegion(0, 1, 1, 0);
17373 m_encoder->IShr(H1, Src1, m_currShader->ImmToVariable(31, ISA_TYPE_UD));
17374 m_encoder->Push();
17375 }
17376
17377 CVariable* Lo = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Dst->getName(), "Lo32"));
17378 CVariable* Hi = m_currShader->GetNewVariable(NumElts, NewType, EALIGN_GRF, IsUniformDst, CName(Dst->getName(), "Hi32"));
17379 // (Lo, Hi) := AddPair(L0, H0, L1, H1), or AddPair(L0, H0, Src1) when Src1 is unsigned;
17380 if (IsUniformDst) {
17381 m_encoder->SetNoMask();
17382 m_encoder->SetUniformSIMDSize(Mode);
17383 m_encoder->SetSrcRegion(0, 1, 1, 0);
17384 m_encoder->SetSrcRegion(1, 1, 1, 0);
17385 }
17386 if (L1 != nullptr)
17387 m_encoder->AddPair(Lo, Hi, L0, H0, L1, H1);
17388 else
17389 m_encoder->AddPair(Lo, Hi, L0, H0, Src1);
17390 m_encoder->Push();
17391
17392 CVariable* DstAlias = m_currShader->GetNewAlias(Dst, NewType, 0, 0);
17393 // Offset[0] := Lo;
17394 if (IsUniformDst) {
17395 m_encoder->SetNoMask();
17396 m_encoder->SetUniformSIMDSize(Mode);
17397 m_encoder->SetSrcRegion(0, 1, 1, 0);
17398 }
17399 m_encoder->SetDstRegion(2);
17400 m_encoder->Copy(DstAlias, Lo);
17401 m_encoder->Push();
17402 // Offset[1] := Hi;
17403 if (IsUniformDst) {
17404 m_encoder->SetNoMask();
17405 m_encoder->SetUniformSIMDSize(Mode);
17406 m_encoder->SetSrcRegion(0, 1, 1, 0);
17407 }
17408 m_encoder->SetDstSubReg(1);
17409 m_encoder->SetDstRegion(2);
17410 m_encoder->Copy(DstAlias, Hi);
17411 m_encoder->Push();
17412 }
17413
17414 /// \brief Copy all values from the src variable to the dst variable.
17415 /// The last argument is the underlying value type.
17416 void EmitPass::emitCopyAll(CVariable* Dst, CVariable* Src, llvm::Type* Ty)
17417 {
17418 if (Src->GetVarType() == EVARTYPE_PREDICATE)
17419 {
17420 IGC_ASSERT_MESSAGE(!Ty->isVectorTy(), "vector of predicates?");
17421 IGC_ASSERT(Dst->GetVarType() == Src->GetVarType());
17422 CVariable* Zero = m_currShader->ImmToVariable(0, ISA_TYPE_BOOL);
17423 m_encoder->Or(Dst, Src, Zero);
17424 m_encoder->Push();
17425 }
17426 else if (Ty->isVectorTy())
17427 {
17428 unsigned NElts = (unsigned)cast<IGCLLVM::FixedVectorType>(Ty)->getNumElements();
17429 emitVectorCopy(Dst, Src, NElts);
17430 }
17431 else if (Ty->isStructTy())
17432 {
17433 IGC_ASSERT(Dst->GetType() == ISA_TYPE_B);
17434 IGC_ASSERT(Src->GetType() == ISA_TYPE_B);
17435
17436 if (!Src->IsUniform() && Dst->IsUniform())
17437 {
17438 IGC_ASSERT_MESSAGE(0, "Non-uniform to uniform struct copy is not supported");
17439 }
17440
17441 StructType* STy = dyn_cast<StructType>(Ty);
17442 const StructLayout* SL = m_DL->getStructLayout(STy);
17443 unsigned srcLanes = Src->IsUniform() ? 1 : numLanes(m_currShader->m_dispatchSize);
17444 unsigned dstLanes = Dst->IsUniform() ? 1 : numLanes(m_currShader->m_dispatchSize);
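// Struct members are laid out per-lane, so a member at byte offset B in the
// LLVM struct layout starts at byte B * lanes in the CVariable. For example
// (illustrative): at SIMD16, a member at offset 8 of a non-uniform struct
// starts at byte 128.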
17445 for (unsigned i = 0; i < STy->getNumElements(); i++)
17446 {
17447 unsigned elementOffset = (unsigned)SL->getElementOffset(i);
17448 Type* elementType = STy->getElementType(i);
17449
17450 unsigned numElements = 1;
17451 if (auto elementVectorType = dyn_cast<IGCLLVM::FixedVectorType>(elementType))
17452 {
17453 numElements = (unsigned)elementVectorType->getNumElements();
17454 }
17455
17456 VISA_Type visaTy = m_currShader->GetType(elementType);
17457
17458 CVariable* srcElement = m_currShader->GetNewAlias(Src, visaTy, elementOffset * srcLanes, numElements * srcLanes, Src->IsUniform());
17459 CVariable* dstElement = m_currShader->GetNewAlias(Dst, visaTy, elementOffset * dstLanes, numElements * dstLanes, Dst->IsUniform());
17460 emitCopyAll(dstElement, srcElement, elementType);
17461 }
17462 }
17463 else
17464 {
17465 IGC_ASSERT_MESSAGE(Ty->isSingleValueType(), "not supported");
17466 m_encoder->Copy(Dst, Src);
17467 m_encoder->Push();
17468 }
17469 }
17470
17471 void EmitPass::emitSqrt(Instruction* inst)
17472 {
17473 GenIntrinsicInst* intrinCall = llvm::cast<GenIntrinsicInst>(inst);
17474 CVariable* src0 = GetSymbol(intrinCall->getArgOperand(0));
17475 src0 = BroadcastIfUniform(src0);
17476
17477 m_encoder->Sqrt(m_destination, src0);
17478 }
17479
17480 void EmitPass::emitFrc(llvm::GenIntrinsicInst* inst)
17481 {
17482 CVariable* src0 = GetSymbol(inst->getArgOperand(0));
17483 src0 = BroadcastIfUniform(src0);
17484
17485 m_encoder->Frc(m_destination, src0);
17486 }
17487
17488 void IGC::EmitPass::emitCanonicalize(llvm::Instruction* inst, const DstModifier& modifier)
17489 {
17490 // Flush a denormal fp value to zero. Two possible implementations:
17491 // 1. add inputVal, -0.0
17492 // 2. mul inputVal, 1.0
17493 // A normalized fp value is left unchanged (x + (-0.0) == x for any normal x).
17494 // The operation is emitted only if the corresponding denorm-flush flag is set.
17495 // If the instruction must be emitted regardless, flushing a subnormal to zero has to be implemented in some other way.
17496 CodeGenContext* pCodeGenContext = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
17497 bool flushVal = pCodeGenContext->m_floatDenormMode16 == ::IGC::FLOAT_DENORM_FLUSH_TO_ZERO && inst->getType()->isHalfTy();
17498 flushVal = flushVal || (pCodeGenContext->m_floatDenormMode32 == ::IGC::FLOAT_DENORM_FLUSH_TO_ZERO && inst->getType()->isFloatTy());
17499 flushVal = flushVal || (pCodeGenContext->m_floatDenormMode64 == ::IGC::FLOAT_DENORM_FLUSH_TO_ZERO && inst->getType()->isDoubleTy());
17500 if (flushVal || modifier.sat)
17501 {
17502 CVariable* inputVal = GetSymbol(inst->getOperand(0));
17503 CVariable* negativeZero = m_currShader->GetScalarConstant(llvm::ConstantFP::get(inst->getType(), -0.0));
17504 m_encoder->SetDstModifier(modifier);
17505 m_encoder->Add(m_destination, inputVal, negativeZero);
17506 }
17507 }
17508
17509 void IGC::EmitPass::emitStaticConstantPatchValue(llvm::StaticConstantPatchIntrinsic* staticConstantPatch32)
17510 {
17511 std::string patchName = staticConstantPatch32->getPatchName().str();
17512 m_encoder->AddVISASymbol(patchName, m_destination);
17513 }
17514
17515 // emit llvm.bswap
17516 void EmitPass::emitLLVMbswap(IntrinsicInst* inst)
17517 {
17518 Type* Ty = inst->getType();
17519 Value* Arg = inst->getArgOperand(0);
17520 uint32_t nBytes = int_cast<uint32_t>(m_DL->getTypeSizeInBits(Ty));
17521 IGC_ASSERT_MESSAGE(nBytes % 16 == 0, "Incorrect llvm.bswap");
17522 IGC_ASSERT_MESSAGE(!Ty->isVectorTy(), "Incorrect llvm.bswap");
17523 nBytes >>= 3; // now nBytes is in units of bytes
17524
17525 CVariable* Src = GetSymbol(Arg);
17526 CVariable* Dst = m_destination;
17527 uint32_t width = numLanes(m_currShader->m_SIMDSize);
17528 bool srcUniform = Src->IsUniform();
17529 bool dstUniform = Dst->IsUniform();
17530
17531 CVariable* SrcB = m_currShader->GetNewAlias(Src, ISA_TYPE_UB, 0, 0);
17532 if (nBytes == 2 || nBytes == 4)
17533 {
17534 CVariable* DstB = m_currShader->GetNewAlias(Dst, ISA_TYPE_UB, 0, 0);
17535
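// A sketch of the emitted sequence for nBytes == 4 with non-uniform SIMD8
// operands (two byte movs per iteration of the loop below):
//   mov (8) DstB.3<4>  SrcB.0<4;1,0>   // byte 0 -> byte 3
//   mov (8) DstB.0<4>  SrcB.3<4;1,0>   // byte 3 -> byte 0
//   mov (8) DstB.2<4>  SrcB.1<4;1,0>   // byte 1 -> byte 2
//   mov (8) DstB.1<4>  SrcB.2<4;1,0>   // byte 2 -> byte 1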
17536 // Generating byte mov
17537 for (unsigned i = 0; i < nBytes / 2; ++i)
17538 {
17539 // swap bytes[i] with bytes[j].
17540 uint32_t j = (nBytes - 1) - i;
17541
17542 m_encoder->SetSrcSubReg(0, i);
17543 m_encoder->SetSrcRegion(0, srcUniform ? 0 : nBytes, 1, 0);
17544 m_encoder->SetDstSubReg(j);
17545 m_encoder->SetDstRegion(dstUniform ? 1 : nBytes);
17546 m_encoder->Copy(DstB, SrcB);
17547 m_encoder->Push();
17548
17549 m_encoder->SetSrcSubReg(0, j);
17550 m_encoder->SetSrcRegion(0, srcUniform ? 0 : nBytes, 1, 0);
17551 m_encoder->SetDstSubReg(i);
17552 m_encoder->SetDstRegion(dstUniform ? 1 : nBytes);
17553 m_encoder->Copy(DstB, SrcB);
17554 m_encoder->Push();
17555 }
17556 }
17557 else if (nBytes == 8)
17558 {
17559 // Need to do the lower DW and the upper DW separately first.
17560 m_currShader->GetNewAlias(Src, ISA_TYPE_UD, 0, 0);
17561 CVariable* DstH = m_currShader->GetNewVariable(
17562 Src->GetNumberElement(),
17563 ISA_TYPE_UD,
17564 Src->GetAlign(),
17565 srcUniform,
17566 CName::NONE);
17567 CVariable* DstL = m_currShader->GetNewVariable(
17568 Src->GetNumberElement(),
17569 ISA_TYPE_UD,
17570 Src->GetAlign(),
17571 srcUniform,
17572 CName::NONE);
17573 CVariable* DstHB = m_currShader->GetNewAlias(DstH, ISA_TYPE_UB, 0, 0);
17574 CVariable* DstLB = m_currShader->GetNewAlias(DstL, ISA_TYPE_UB, 0, 0);
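// bswap64 is performed as two bswap32-style passes plus a DW exchange: the
// byte-reversed low DW of Src lands in DstH (the future high DW of the
// result), the byte-reversed high DW lands in DstL, and the two are then
// interleaved into Dst below.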
17575
17576 bool split = (width == 16);
17577 for (unsigned n = 0; n < 2; ++n)
17578 {
17579 for (unsigned i = 0; i < 4; ++i)
17580 {
17581 // swap bytes[i] and bytes[j]
17582 uint32_t j = 3 - i;
17583 if (split && !srcUniform)
17584 {
17585 m_encoder->SetSrcSubReg(0, 4 * n + i);
17586 m_encoder->SetSrcRegion(0, 8, 1, 0);
17587 m_encoder->SetDstSubReg(j);
17588 m_encoder->SetDstRegion(4);
17589 m_encoder->SetSimdSize(SIMDMode::SIMD8);
17590 m_encoder->SetMask(EMASK_Q1);
17591 m_encoder->Copy(n == 0 ? DstHB : DstLB, SrcB);
17592 m_encoder->Push();
17593
17594 m_encoder->SetSrcSubReg(0, 2 * getGRFSize() + 4 * n + i);
17595 m_encoder->SetSrcRegion(0, 8, 1, 0);
17596 m_encoder->SetDstSubReg(getGRFSize() + j);
17597 m_encoder->SetDstRegion(4);
17598 m_encoder->SetSimdSize(SIMDMode::SIMD8);
17599 m_encoder->SetMask(EMASK_Q2);
17600 m_encoder->Copy(n == 0 ? DstHB : DstLB, SrcB);
17601 m_encoder->Push();
17602 }
17603 else
17604 {
17605 // DstH[B]/DstL[B] have the same uniformity as Src!
17606 m_encoder->SetSrcSubReg(0, 4 * n + i);
17607 m_encoder->SetSrcRegion(0, srcUniform ? 0 : 8, 1, 0);
17608 m_encoder->SetDstSubReg(j);
17609 m_encoder->SetDstRegion(srcUniform ? 1 : 4);
17610 m_encoder->Copy(n == 0 ? DstHB : DstLB, SrcB);
17611 m_encoder->Push();
17612 }
17613 }
17614 }
17615
17616 // Now, mov DstH and DstL to Dst
17617 CVariable* DstD = m_currShader->GetNewAlias(Dst, ISA_TYPE_UD, 0, 0);
17618
17619 // When dst is uniform, it does not cross two GRFs, so no split is needed.
17620 if (split && !dstUniform)
17621 {
17622 m_encoder->SetSimdSize(SIMDMode::SIMD8);
17623 m_encoder->SetMask(EMASK_Q1);
17624 m_encoder->SetDstRegion(2);
17625 m_encoder->Copy(DstD, DstL);
17626 m_encoder->Push();
17627
17628 m_encoder->SetSimdSize(SIMDMode::SIMD8);
17629 m_encoder->SetMask(EMASK_Q2);
17630 m_encoder->SetSrcSubReg(0, srcUniform ? 0 : 8);
17631 m_encoder->SetDstSubReg(16);
17632 m_encoder->SetDstRegion(2);
17633 m_encoder->Copy(DstD, DstL);
17634 m_encoder->Push();
17635
17636 m_encoder->SetSimdSize(SIMDMode::SIMD8);
17637 m_encoder->SetMask(EMASK_Q1);
17638 m_encoder->SetDstSubReg(1);
17639 m_encoder->SetDstRegion(2);
17640 m_encoder->Copy(DstD, DstH);
17641 m_encoder->Push();
17642
17643 m_encoder->SetSimdSize(SIMDMode::SIMD8);
17644 m_encoder->SetMask(EMASK_Q2);
17645 m_encoder->SetSrcSubReg(0, srcUniform ? 0 : 8);
17646 m_encoder->SetDstSubReg(17);
17647 m_encoder->SetDstRegion(2);
17648 m_encoder->Copy(DstD, DstH);
17649 m_encoder->Push();
17650 }
17651 else
17652 {
17653 m_encoder->SetDstRegion(dstUniform ? 1 : 2);
17654 m_encoder->Copy(DstD, DstL);
17655 m_encoder->Push();
17656 m_encoder->SetDstSubReg(1);
17657 m_encoder->SetDstRegion(dstUniform ? 1 : 2);
17658 m_encoder->Copy(DstD, DstH);
17659 m_encoder->Push();
17660 }
17661 }
17662 else
17663 {
17664 IGC_ASSERT_MESSAGE(0, "Unsupported type for llvm.bswap!");
17665 return;
17666 }
17667 }
17668
17669 void EmitPass::setPredicateForDiscard(CVariable* pPredicate)
17670 {
17671 // The input predicate parameter is used when the resource variable is
17672 // non-uniform and the compiler needs to create a resource loop.
17673 bool isInversePredicate = false;
17674 if (m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
17675 {
17676 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
17677 if (psProgram->HasDiscard() && psProgram->GetDiscardPixelMask())
17678 {
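// Combine the resource-loop predicate with the discard mask:
// pPredicate := ~pPredicate | discardMask, which is then applied with the
// inverse-predicate flag, so the access executes only for lanes where the
// original predicate is set and the pixel is not discarded. When no loop
// predicate exists, the discard mask alone is used, again inverted.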
17679 if (pPredicate != nullptr)
17680 {
17681 m_encoder->SetNoMask();
17682 m_encoder->GenericAlu(EOPCODE_NOT, pPredicate, pPredicate, nullptr);
17683 m_encoder->Push();
17684 m_encoder->SetNoMask();
17685 m_encoder->GenericAlu(EOPCODE_OR, pPredicate, pPredicate, psProgram->GetDiscardPixelMask());
17686 m_encoder->Push();
17687 }
17688 else
17689 {
17690 pPredicate = psProgram->GetDiscardPixelMask();
17691 }
17692 isInversePredicate = true;
17693 }
17694 }
17695 if (pPredicate != nullptr)
17696 {
17697 m_encoder->SetPredicate(pPredicate);
17698 m_encoder->SetInversePredicate(isInversePredicate);
17699 }
17700 }
17701
17702 void EmitPass::ForceDMask(bool createJmpForDiscard)
17703 {
17704 if (createJmpForDiscard &&
17705 m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
17706 {
17707 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
17708 if (psProgram->HasDiscard() && psProgram->GetDiscardPixelMask())
17709 {
17710 m_labelForDMaskJmp = m_encoder->GetNewLabelID("discard");
17711 m_encoder->Jump(psProgram->GetDiscardPixelMask(),
17712 m_labelForDMaskJmp);
17713 m_encoder->Push();
17714 }
17715 }
17716
17717 if (m_pattern->NeedVMask())
17718 {
17719 m_encoder->SetVectorMask(false);
17720 }
17721 }
17722
17723 void EmitPass::ResetVMask(bool createJmpForDiscard)
17724 {
17725 if (m_pattern->NeedVMask())
17726 {
17727 m_encoder->SetVectorMask(true);
17728 }
17729
17730 if (createJmpForDiscard &&
17731 m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER)
17732 {
17733 CPixelShader* psProgram = static_cast<CPixelShader*>(m_currShader);
17734 if (psProgram->HasDiscard() && psProgram->GetDiscardPixelMask())
17735 {
17736 m_encoder->Label(m_labelForDMaskJmp);
17737 m_encoder->Push();
17738 }
17739 }
17740 }
17741
17742 void EmitPass::emitGetBufferPtr(GenIntrinsicInst* inst)
17743 {
17744 Value* buf_idxv = inst->getOperand(0);
17745 Value* bufTyVal = inst->getOperand(1);
17746 IGC_ASSERT(isa<ConstantInt>(bufTyVal));
17747 BufferType bufType = (BufferType)(cast<ConstantInt>(bufTyVal)->getZExtValue());
17748
17749 uint bti = 0;
17750 switch (bufType)
17751 {
17752 case UAV:
17753 bti = m_currShader->m_pBtiLayout->GetUavIndex(0);
17754 break;
17755 case CONSTANT_BUFFER:
17756 bti = m_currShader->m_pBtiLayout->GetConstantBufferIndex(0);
17757 break;
17758 case RESOURCE:
17759 bti = m_currShader->m_pBtiLayout->GetTextureIndex(0);
17760 break;
17761 case RENDER_TARGET:
17762 bti = m_currShader->m_pBtiLayout->GetRenderTargetIndex(0);
17763 break;
17764 case SAMPLER:
17765 bti = 0;
17766 break;
17767 default:
17768 IGC_ASSERT_MESSAGE(0, "unexpected buffer type for GetBufferPtr");
17769 break;
17770 }
17771 CVariable* indexCVar = GetSymbol(buf_idxv);
17772
17773 if (bti)
17774 {
17775 CVariable* btiCVar = m_currShader->ImmToVariable(bti, ISA_TYPE_UD);
17776 m_encoder->Add(m_destination, indexCVar, btiCVar);
17777 }
17778 else
17779 {
17780 m_encoder->Copy(m_destination, indexCVar);
17781 }
17782 m_encoder->Push();
17783
17784 // Set BTI; a BTI of zero is also a valid value.
17785 bool directIdx = llvm::isa<llvm::ConstantInt>(inst->getOperand(0));
17786 m_currShader->SetBindingTableEntryCountAndBitmap(directIdx, bufType, 0, bti);
17787 }
17788
17789 ResourceDescriptor EmitPass::GetResourceVariable(Value* resourcePtr)
17790 {
17791 ResourceDescriptor resource;
17792 BufferType bufType = BUFFER_TYPE_UNKNOWN;
17793 uint as = 0;
17794 if (auto *GII = dyn_cast<GenIntrinsicInst>(resourcePtr); GII &&
17795 GII->getIntrinsicID() == GenISAIntrinsic::GenISA_GetBufferPtr)
17796 {
17797 // from GetBufferPtr
17798 Value* bufTyVal = GII->getOperand(1);
17799 IGC_ASSERT(isa<ConstantInt>(bufTyVal));
17800 bufType = (BufferType)(cast<ConstantInt>(bufTyVal)->getZExtValue());
17801 resource.m_resource = GetSymbol(resourcePtr);
17802 }
17803 else
17804 {
17805 as = resourcePtr->getType()->getPointerAddressSpace();
17806 uint bufferIndex = 0;
17807 bool directIndexing = false;
17808
17809 bufType = DecodeAS4GFXResource(as, directIndexing, bufferIndex);
17810
17811 if (IsBindless(bufType) || !directIndexing)
17812 {
17813 if (auto* i2p = dyn_cast<IntToPtrInst>(resourcePtr))
17814 {
17815 resource.m_resource = GetSymbol(i2p->getOperand(0));
17817 }
17818 else
17819 {
17820 resource.m_resource = GetSymbol(resourcePtr);
17821 }
17822
17823 if (resource.m_resource->GetElemSize() < 4)
17824 {
17825 // vISA assumes all BTIs to be 32 bit. Need to cast, otherwise higher bits would be uninitialized.
17826 unsigned numInstance = resource.m_resource->GetNumberInstance();
17827 CVariable* newResource = m_currShader->GetNewVariable(
17828 resource.m_resource->GetNumberElement(),
17829 ISA_TYPE_UD,
17830 resource.m_resource->IsUniform() ? EALIGN_DWORD : EALIGN_GRF,
17831 resource.m_resource->IsUniform(),
17832 numInstance,
17833 CName::NONE);
17834
17835 m_encoder->Cast(newResource, resource.m_resource);
17836
17837 if (numInstance == 2)
17838 {
17839 m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
17840 m_encoder->Cast(newResource, resource.m_resource);
17841 m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
17842 }
17843
17844 resource.m_resource = newResource;
17845 }
17846
17847 if (!directIndexing)
17848 {
17849 m_currShader->SetBindingTableEntryCountAndBitmap(false, bufType, 0, 0);
17850 }
17851 }
17852 else
17853 {
17854 uint bti = 0;
17855 switch (bufType)
17856 {
17857 case UAV:
17858 bti = m_currShader->m_pBtiLayout->GetUavIndex(bufferIndex);
17859 break;
17860 case CONSTANT_BUFFER:
17861 bti = m_currShader->m_pBtiLayout->GetConstantBufferIndex(bufferIndex);
17862 break;
17863 case RESOURCE:
17864 bti = m_currShader->m_pBtiLayout->GetTextureIndex(bufferIndex);
17865 break;
17866 case RENDER_TARGET:
17867 IGC_ASSERT(m_currShader->GetShaderType() == ShaderType::PIXEL_SHADER);
17868 bti = m_currShader->m_pBtiLayout->GetRenderTargetIndex(bufferIndex);
17869 break;
17870 case SLM:
17871 bti = 254; // \todo, remove hard-coding
17872 break;
17873 default:
17874 bti = m_currShader->m_pBtiLayout->GetStatelessBindingTableIndex();
17875 break;
17876 }
17877 resource.m_resource = m_currShader->ImmToVariable(bti, ISA_TYPE_UD);
17878 m_currShader->SetBindingTableEntryCountAndBitmap(directIndexing, bufType, bufferIndex, bti);
17879 }
17880 }
17881
17882
17883 if (IsBindless(bufType))
17884 {
17885 resource.m_surfaceType = ESURFACE_BINDLESS;
17886 }
17887 else if (IsSSHbindless(bufType))
17888 {
17889 resource.m_surfaceType = ESURFACE_SSHBINDLESS;
17890 }
17891 else if (bufType == SLM)
17892 {
17893 resource.m_surfaceType = ESURFACE_SLM;
17894 }
17895 else if (bufType == CONSTANT_BUFFER || bufType == UAV ||
17896 bufType == RESOURCE || bufType == RENDER_TARGET)
17897 {
17898 resource.m_surfaceType = ESURFACE_NORMAL;
17899 }
17900 else
17901 {
17902 resource.m_surfaceType = ESURFACE_STATELESS;
17903 }
17904 return resource;
17905 }
17906
17907 SamplerDescriptor EmitPass::GetSamplerVariable(Value* sampleOp)
17908 {
17909 SamplerDescriptor sampler;
17910 unsigned int samplerIdx = 0;
17911 BufferType sampType = BUFFER_TYPE_UNKNOWN;
17912
17913 if (GenIntrinsicInst* sample = dyn_cast<GenIntrinsicInst>(sampleOp))
17914 {
17915 if (sample->getIntrinsicID() == GenISAIntrinsic::GenISA_GetBufferPtr)
17916 {
17917 Value* bufTyVal = cast<GenIntrinsicInst>(sampleOp)->getOperand(1);
17918 IGC_ASSERT(isa<ConstantInt>(bufTyVal));
17919 sampType = (BufferType)(cast<ConstantInt>(bufTyVal)->getZExtValue());
17920 sampler.m_sampler = GetSymbol(sampleOp);
17921 IGC_ASSERT(sampType == SAMPLER);
17922 sampler.m_samplerType = ESAMPLER_NORMAL;
17923 return sampler;
17924 }
17925 }
17926
17927 bool isBindless = false;
17928 bool directIdx = false;
17929
17930 sampType = DecodeAS4GFXResource(
17931 sampleOp->getType()->getPointerAddressSpace(),
17932 directIdx, samplerIdx);
17933 isBindless = (sampType == BINDLESS_SAMPLER);
17934 sampler.m_samplerType =
17935 isBindless ? ESAMPLER_BINDLESS : ESAMPLER_NORMAL;
17936
17937 if (isBindless || !directIdx)
17938 {
17939 sampler.m_sampler = GetSymbol(sampleOp);
17940 }
17941 else
17942 {
17943 sampler.m_sampler = m_currShader->ImmToVariable(
17944 samplerIdx, ISA_TYPE_UD);
17945 }
17946 return sampler;
17947 }
17948
17949 bool EmitPass::ResourceLoopHeader(
17950 ResourceDescriptor& resource,
17951 CVariable*& flag,
17952 uint& label)
17953 {
17954 SamplerDescriptor sampler;
17955 return ResourceLoopHeader(resource, sampler, flag, label);
17956 }
17957
17958 // Insert a loop header to handle a non-uniform resource or sampler.
17959 // This generates sub-optimal code for SIMD32; it can be revisited if we need better code generation.
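// In outline, the emitted construct looks like this (a sketch; UniformCopy
// broadcasts the value taken from an enabled lane):
//   resource_loop:
//     uniformRes = UniformCopy(resource)        // scalar candidate
//     flag       = (uniformRes == resource)     // per-lane match
//     <the guarded access is emitted under 'flag'>
//     (!flag) jump resource_loop                // see ResourceLoopBackEdge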
17960 bool EmitPass::ResourceLoopHeader(
17961 ResourceDescriptor& resource,
17962 SamplerDescriptor& sampler,
17963 CVariable*& flag,
17964 uint& label)
17965 {
17966 if (resource.m_surfaceType != ESURFACE_BINDLESS &&
17967 resource.m_surfaceType != ESURFACE_SSHBINDLESS &&
17968 resource.m_surfaceType != ESURFACE_NORMAL)
17969 {
17970 // Loop only needed for access with surface state
17971 return false;
17972 }
17973 bool uniformResource = resource.m_resource == nullptr || resource.m_resource->IsUniform();
17974 bool uniformSampler = sampler.m_sampler == nullptr || sampler.m_sampler->IsUniform();
17975 if (uniformResource && uniformSampler)
17976 {
17977 return false;
17978 }
17979 CVariable* resourceFlag = nullptr;
17980 CVariable* samplerFlag = nullptr;
17981 CVariable* offset = nullptr;
17982 label = m_encoder->GetNewLabelID("resource_loop");
17983 m_encoder->Label(label);
17984 m_encoder->Push();
17985 if (!uniformResource)
17986 {
17987 ResourceDescriptor uniformResource;
17988 resourceFlag = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
17989 uniformResource.m_surfaceType = resource.m_surfaceType;
17990 uniformResource.m_resource = UniformCopy(resource.m_resource, offset);
17991 m_encoder->Cmp(EPREDICATE_EQ, resourceFlag, uniformResource.m_resource, resource.m_resource);
17992 m_encoder->Push();
17993 resource = uniformResource;
17994 }
17995 if (!uniformSampler)
17996 {
17997 SamplerDescriptor uniformSampler;
17998 samplerFlag = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
17999 uniformSampler.m_samplerType = sampler.m_samplerType;
18000 uniformSampler.m_sampler = UniformCopy(sampler.m_sampler, offset);
18001 m_encoder->Cmp(EPREDICATE_EQ, samplerFlag, uniformSampler.m_sampler, sampler.m_sampler);
18002 m_encoder->Push();
18003 sampler = uniformSampler;
18004 }
18005 if (resourceFlag && samplerFlag)
18006 {
18007 flag = m_currShader->GetNewVariable(numLanes(m_SimdMode), ISA_TYPE_BOOL, EALIGN_BYTE, CName::NONE);
18008 m_encoder->And(flag, resourceFlag, samplerFlag);
18009 m_encoder->Push();
18010 }
18011 else
18012 {
18013 flag = resourceFlag != nullptr ? resourceFlag : samplerFlag;
18014 }
18015 if (m_SimdMode == SIMDMode::SIMD32 && m_currShader->m_numberInstance == 2)
18016 {
18017 // For SIMD32 we need to initialize the other half of the flag to 1.
18018 // ToDo: check whether this is actually necessary, as the other half should not get used.
18019 m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
18020 m_encoder->SetSrcRegion(0, 0, 1, 0);
18021 m_encoder->SetSrcRegion(1, 0, 1, 0);
18022 m_encoder->Cmp(EPREDICATE_EQ, flag, m_currShader->GetR0(), m_currShader->GetR0());
18023 m_encoder->Push();
18024 m_encoder->SetSecondHalf(!m_encoder->IsSecondHalf());
18025 }
18026 return true;
18027 }
18028
18029 void EmitPass::ResourceLoopBackEdge(bool needLoop, CVariable* flag, uint label)
18030 {
18031 if (needLoop)
18032 {
18033 m_encoder->SetInversePredicate(true);
18034 m_encoder->Jump(flag, label);
18035 m_encoder->Push();
18036
18037 m_currShader->GetContext()->Stats().IncreaseI64("ResourceLoopCount", 1, numLanes(m_currShader->m_dispatchSize));
18038 }
18039 }
18040
18041 void EmitPass::emitStateRegID(uint32_t BitStart, uint32_t BitEnd)
18042 {
18043 // For example, emitStateRegID(14, 18) would return the value in the
18044 // range [18:14].
18045 uint32_t and_imm = BITMASK_RANGE(BitStart, BitEnd);
18046 uint32_t shr_imm = BitStart;
18047 m_encoder->And(m_destination, m_currShader->GetSR0(), m_currShader->ImmToVariable(and_imm, ISA_TYPE_UD));
18048 m_encoder->Shr(m_destination, m_destination, m_currShader->ImmToVariable(shr_imm, ISA_TYPE_UD));
18049 m_encoder->Push();
18050 }
18051
18052 void EmitPass::emitMulAdd16(Instruction* I, const SSource Sources[2], const DstModifier& DstMod)
18053 {
18054 CVariable* LVar = GetSrcVariable(Sources[0]);
18055 CVariable* RVar = GetSrcVariable(Sources[1]);
18056 VISA_Type LTy = LVar->GetType();
18057 VISA_Type RTy = RVar->GetType();
18058
18059 // Use SetSourceModifiers() to set subReg correctly.
18060 SetSourceModifiers(0, Sources[0]);
18061 SetSourceModifiers(1, Sources[1]);
18062 if (!LVar->IsUniform() && (!Sources[0].region_set) &&
18063 (LTy == ISA_TYPE_W || LTy == ISA_TYPE_UW))
18064 {
18065 m_encoder->SetSrcRegion(0, 16, 8, 2);
18066 }
18067 if (!RVar->IsUniform() && (!Sources[1].region_set) &&
18068 (RTy == ISA_TYPE_W || RTy == ISA_TYPE_UW))
18069 {
18070 m_encoder->SetSrcRegion(1, 16, 8, 2);
18071 }
18072
18073 unsigned opc = I->getOpcode();
18074 if (opc == Instruction::Mul) {
18075 m_encoder->Mul(m_destination, LVar, RVar);
18076 }
18077 else if (opc == Instruction::Sub) {
18078 e_modifier mod = CombineModifier(EMOD_NEG, Sources[1].mod);
18079 m_encoder->SetSrcModifier(1, mod); // override modifier
18080 m_encoder->Add(m_destination, LVar, RVar);
18081 }
18082 else {
18083 IGC_ASSERT_MESSAGE(I->getOpcode() == Instruction::Add, "Unknown Opcode.");
18084 m_encoder->Add(m_destination, LVar, RVar);
18085 }
18086 m_encoder->Push();
18087 }
18088
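// Returns the dispatch mask as an alias of sr0: dword 2 of sr0 holds the
// dispatch mask and dword 3 the vector mask used when VMask is needed, which
// matches the (NeedVMask() ? 3 : 2) * SIZE_DWORD offset below.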
18089 CVariable* EmitPass::GetDispatchMask()
18090 {
18091 return m_currShader->GetNewAlias(
18092 m_currShader->GetSR0(),
18093 ISA_TYPE_UD,
18094 (m_pattern->NeedVMask() ? 3 : 2) * SIZE_DWORD,
18095 1);
18096 }
18097
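// The requested pause length is clamped to [4, 31] and scaled by 32
// (var << 5), yielding TSC pause encodings between 0x0080 and 0x03E0; this is
// a reading of the clamping logic below, not a spec reference.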
18098 void EmitPass::emitThreadPause(llvm::GenIntrinsicInst* inst)
18099 {
18100 CVariable* TSC_reg = m_currShader->GetTSC();
18101 CVariable* TSC_pause = m_currShader->GetNewAlias(TSC_reg, ISA_TYPE_UD, 16, 1);
18102 uint64_t var = GetImmediateVal(inst->getOperand(0));
18103 if (var >= 32)
18104 var = 0x03E0;
18105 else if (var <= 4)
18106 var = 0x0080;
18107 else
18108 var <<= 5;
18109 m_encoder->Copy(TSC_pause, m_currShader->ImmToVariable(var, ISA_TYPE_UD));
18110 m_encoder->Push();
18111 }
18112
18113
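// WaveBallot returns a scalar bitmask with one bit per lane, set where the
// boolean operand is true. Outside divergent control flow this is just the
// flag converted to an integer, masked by the dispatch mask when the dispatch
// mask is not full; inside divergent control flow the execution mask is
// folded in as well.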
18114 void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
18115 {
18116 CVariable* destination = m_destination;
18117 if (!m_destination->IsUniform())
18118 {
18119 destination = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
18120 }
18121
18122 bool uniform_active_lane = false;
18123 if (ConstantInt * pConst = dyn_cast<ConstantInt>(inst->getOperand(0)))
18124 {
18125 if (pConst->getZExtValue() == 1)
18126 uniform_active_lane = true;
18127 }
18128
18129
18130 if (!m_currShader->InsideDivergentCF(inst))
18131 {
18132 CVariable* f0 = GetSymbol(inst->getOperand(0));
18133
18134 if (m_currShader->m_dispatchSize == SIMDMode::SIMD8 && m_currShader->HasFullDispatchMask())
18135 {
18136 // for SIMD8 make sure the higher 8 bits of the flag are not copied
18137 destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE);
18138 }
18139 m_encoder->BoolToInt(destination, f0);
18140 if (!m_currShader->HasFullDispatchMask())
18141 {
18142 m_encoder->And(destination, GetDispatchMask(), destination);
18143 }
18144 }
18145 else
18146 {
18147 CVariable* exeMask = GetExecutionMask();
18148 if (!uniform_active_lane)
18149 {
18150 // (W) and (1|M0) r1.0:ud r0.0<0;1;0>:ud f0.0:uw
18151 CVariable* f0 = GetSymbol(inst->getOperand(0));
18152 CVariable* vf0 = m_currShader->GetNewVariable(
18153 1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
18154 m_encoder->SetSimdSize(SIMDMode::SIMD1);
18155 m_encoder->SetNoMask();
18156 m_encoder->BoolToInt(vf0, f0);
18157 m_encoder->Push();
18158
18159 m_encoder->SetSimdSize(SIMDMode::SIMD1);
18160 m_encoder->SetNoMask();
18161 m_encoder->And(destination, exeMask, vf0);
18162 m_encoder->Push();
18163 }
18164 else
18165 {
18166 m_encoder->Cast(destination, exeMask);
18167 m_encoder->Push();
18168 }
18169 }
18170
18171 if (destination != m_destination)
18172 {
18173 m_encoder->Cast(m_destination, destination);
18174 m_encoder->Push();
18175 }
18176 }
18177
18178 void EmitPass::emitWaveInverseBallot(llvm::GenIntrinsicInst* inst)
18179 {
18180 CVariable* Mask = GetSymbol(inst->getOperand(0));
18181
18182 if (Mask->IsUniform())
18183 {
18184 if (m_encoder->IsSecondHalf())
18185 return;
18186
18187 m_encoder->SetP(m_destination, Mask);
18188 return;
18189 }
18190
18191 // The uniform case should by far be the most common. Otherwise,
18192 // fall back and compute:
18193 //
18194 // (val & (1 << id)) != 0
18195 CVariable* Temp = m_currShader->GetNewVariable(
18196 numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF, CName::NONE);
18197
18198 m_currShader->GetSimdOffsetBase(Temp);
18199 m_encoder->Shl(Temp, m_currShader->ImmToVariable(1, ISA_TYPE_UD), Temp);
18200 m_encoder->And(Temp, Mask, Temp);
18201 m_encoder->Cmp(EPREDICATE_NE,
18202 m_destination, Temp, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
18203 }
18204
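// Maps a WaveOps opcode to its vISA opcode, operand type, and identity value,
// i.e. the value x such that (x op y) == y for all y. For example:
// SUM -> 0, PROD -> 1, IMIN -> INT_MAX, UMAX -> 0, AND -> all ones,
// FMAX -> -inf.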
18205 static void GetReductionOp(WaveOps op, Type* opndTy, uint64_t& identity, e_opcode& opcode, VISA_Type& type)
18206 {
18207 auto getISAType = [](Type* ty, bool isSigned = true)
18208 {
18209 if (ty->isHalfTy())
18210 {
18211 return ISA_TYPE_HF;
18212 }
18213 if (ty->isFloatTy())
18214 {
18215 return ISA_TYPE_F;
18216 }
18217 if (ty->isDoubleTy())
18218 {
18219 return ISA_TYPE_DF;
18220 }
18221 IGC_ASSERT_MESSAGE(ty->isIntegerTy(), "expect integer type");
18222 auto width = cast<IntegerType>(ty)->getBitWidth();
18223 IGC_ASSERT(width == 8 || width == 16 || width == 32 || width == 64);
18224 if (isSigned)
18225 {
18226 return width == 64 ? ISA_TYPE_Q : (width == 16 ? ISA_TYPE_W : (width == 8 ? ISA_TYPE_B : ISA_TYPE_D));
18227 }
18228 else
18229 {
18230 return width == 64 ? ISA_TYPE_UQ : (width == 16 ? ISA_TYPE_UW : (width == 8 ? ISA_TYPE_UB : ISA_TYPE_UD));
18231 }
18232 };
18233 auto getMaxVal = [](VISA_Type ty) -> uint64_t
18234 {
18235 switch (ty)
18236 {
18237 case ISA_TYPE_D:
18238 return std::numeric_limits<int>::max();
18239 case ISA_TYPE_UD:
18240 return std::numeric_limits<uint32_t>::max();
18241 case ISA_TYPE_B:
18242 return std::numeric_limits<int8_t>::max();
18243 case ISA_TYPE_UB:
18244 return std::numeric_limits<uint8_t>::max();
18245 case ISA_TYPE_W:
18246 return std::numeric_limits<int16_t>::max();
18247 case ISA_TYPE_UW:
18248 return std::numeric_limits<uint16_t>::max();
18249 case ISA_TYPE_Q:
18250 return std::numeric_limits<int64_t>::max();
18251 case ISA_TYPE_UQ:
18252 return std::numeric_limits<uint64_t>::max();
18253 default:
18254 IGC_ASSERT_MESSAGE(0, "unexpected visa type");
18255 return std::numeric_limits<int>::max();
18256 }
18257 };
18258 auto getMinVal = [](VISA_Type ty) -> uint64_t
18259 {
18260 switch (ty)
18261 {
18262 case ISA_TYPE_D:
18263 return std::numeric_limits<int>::min();
18264 case ISA_TYPE_UD:
18265 return std::numeric_limits<uint32_t>::min();
18266 case ISA_TYPE_B:
18267 return std::numeric_limits<int8_t>::min();
18268 case ISA_TYPE_UB:
18269 return std::numeric_limits<uint8_t>::min();
18270 case ISA_TYPE_W:
18271 return std::numeric_limits<int16_t>::min();
18272 case ISA_TYPE_UW:
18273 return std::numeric_limits<uint16_t>::min();
18274 case ISA_TYPE_Q:
18275 return std::numeric_limits<int64_t>::min();
18276 case ISA_TYPE_UQ:
18277 return std::numeric_limits<uint64_t>::min();
18278 default:
18279 IGC_ASSERT_MESSAGE(0, "unexpected visa type");
18280 return std::numeric_limits<int>::min();
18281 }
18282 };
18283
18284 switch (op)
18285 {
18286 case WaveOps::SUM:
18287 identity = 0;
18288 opcode = EOPCODE_ADD;
18289 type = getISAType(opndTy);
18290 break;
18291 case WaveOps::PROD:
18292 identity = 1;
18293 opcode = EOPCODE_MUL;
18294 type = getISAType(opndTy);
18295 break;
18296 case WaveOps::UMAX:
18297 opcode = EOPCODE_MAX;
18298 type = getISAType(opndTy, false);
18299 identity = getMinVal(type);
18300 break;
18301 case WaveOps::UMIN:
18302 opcode = EOPCODE_MIN;
18303 type = getISAType(opndTy, false);
18304 identity = getMaxVal(type);
18305 break;
18306 case WaveOps::IMAX:
18307 opcode = EOPCODE_MAX;
18308 type = getISAType(opndTy);
18309 identity = getMinVal(type);
18310 break;
18311 case WaveOps::IMIN:
18312 opcode = EOPCODE_MIN;
18313 type = getISAType(opndTy);
18314 identity = getMaxVal(type);
18315 break;
18316 case WaveOps::OR:
18317 identity = 0;
18318 opcode = EOPCODE_OR;
18319 type = getISAType(opndTy, false);
18320 break;
18321 case WaveOps::XOR:
18322 identity = 0;
18323 opcode = EOPCODE_XOR;
18324 type = getISAType(opndTy, false);
18325 break;
18326 case WaveOps::AND:
18327 opcode = EOPCODE_AND;
18328 type = getISAType(opndTy, false);
18329 identity = cast<IntegerType>(opndTy)->getBitMask();
18330 break;
18331 case WaveOps::FSUM:
18332 opcode = EOPCODE_ADD;
18333 type = getISAType(opndTy);
18334 identity = 0;
18335 break;
18336 case WaveOps::FPROD:
18337 opcode = EOPCODE_MUL;
18338 type = getISAType(opndTy);
18339 identity = getFPOne(type);
18340 break;
18341 case WaveOps::FMIN:
18342 opcode = EOPCODE_MIN;
18343 type = getISAType(opndTy);
18344 identity = cast<ConstantFP>(ConstantFP::getInfinity(opndTy))->getValueAPF().bitcastToAPInt().getZExtValue();
18345 break;
18346 case WaveOps::FMAX:
18347 opcode = EOPCODE_MAX;
18348 type = getISAType(opndTy);
18349 identity = cast<ConstantFP>(ConstantFP::getInfinity(opndTy, true))->getValueAPF().bitcastToAPInt().getZExtValue();
18350 break;
18351 default:
18352 IGC_ASSERT(0);
18353 }
18354 }
18355
18356 void EmitPass::emitWavePrefix(WavePrefixIntrinsic* I)
18357 {
18358 Value* Mask = I->getMask();
18359 if (auto * CI = dyn_cast<ConstantInt>(Mask))
18360 {
18361 // If the mask is all set, then we just pass a null
18362 // mask to emitScan() indicating we don't want to
18363 // emit any predication.
18364 if (CI->isAllOnesValue())
18365 Mask = nullptr;
18366 }
18367 emitScan(
18368 I->getSrc(), I->getOpKind(), I->isInclusiveScan(), Mask, false);
18369 }
18370
18371 void EmitPass::emitQuadPrefix(QuadPrefixIntrinsic* I)
18372 {
18373 emitScan(
18374 I->getSrc(), I->getOpKind(), I->isInclusiveScan(), nullptr, true);
18375 }
18376
18377 void EmitPass::emitScan(
18378 Value* Src, IGC::WaveOps Op,
18379 bool isInclusiveScan, Value* Mask, bool isQuad)
18380 {
18381 VISA_Type type;
18382 e_opcode opCode;
18383 uint64_t identity = 0;
18384 GetReductionOp(Op, Src->getType(), identity, opCode, type);
18385 CVariable* src = GetSymbol(Src);
18386 CVariable* dst[2] = { nullptr, nullptr };
18387 CVariable* Flag = Mask ? GetSymbol(Mask) : nullptr;
18388
18389 emitPreOrPostFixOp(
18390 opCode, identity, type,
18391 false, src, dst, Flag,
18392 !isInclusiveScan, isQuad);
18393
18394 // Now that we've computed the result in temporary registers,
18395 // make sure we only write the results to lanes participating in the
18396 // scan as specified by 'mask'.
18397 if (Flag)
18398 m_encoder->SetPredicate(Flag);
18399 m_encoder->Copy(m_destination, dst[0]);
18400 if (m_currShader->m_numberInstance == 2)
18401 {
18402 m_encoder->SetSecondHalf(true);
18403 m_encoder->Copy(m_destination, dst[1]);
18404 }
18405 m_encoder->Push();
18406 }
18407
18408 void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
18409 {
18410 CVariable* src = GetSymbol(inst->getOperand(0));
18411 const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
18412 VISA_Type type;
18413 e_opcode opCode;
18414 uint64_t identity = 0;
18415 GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
18416 CVariable* dst = m_destination;
18417 emitReductionAll(opCode, identity, type, false, src, dst);
18418 }
18419
18420 void EmitPass::emitWaveClustered(llvm::GenIntrinsicInst* inst)
18421 {
18422 CVariable* src = GetSymbol(inst->getOperand(0));
18423 const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
18424 const unsigned int clusterSize = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(2))->getZExtValue());
18425 VISA_Type type;
18426 e_opcode opCode;
18427 uint64_t identity = 0;
18428 GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
18429 CVariable *dst = m_destination;
18430 emitReductionClustered(opCode, identity, type, false, clusterSize, src, dst);
18431 }
18432
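// dp4a computes, per lane, dst = src0 + dot(src1, src2), where src1 and src2
// are each treated as four packed 8-bit integers; the _ss/_su/_us/_uu
// intrinsic variants select the signedness of src1/src2, applied via the
// bitcasts below.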
18433 void EmitPass::emitDP4A(GenIntrinsicInst* GII, const SSource* Sources, const DstModifier& modifier) {
18434 GenISAIntrinsic::ID GIID = GII->getIntrinsicID();
18435 CVariable* dst = m_destination;
18436 CVariable *src0, *src1, *src2;
18437
18438 // Check if Sources was set in PatternMatch
18439 if (!Sources)
18440 {
18441 src0 = GetSymbol(GII->getOperand(0));
18442 src1 = GetSymbol(GII->getOperand(1));
18443 src2 = GetSymbol(GII->getOperand(2));
18444 }
18445 else
18446 {
18447 m_encoder->SetSrcRegion(1, Sources[1].region[0], Sources[1].region[1], Sources[1].region[2]);
18448 src0 = GetSrcVariable(Sources[0]);
18449 src1 = GetSrcVariable(Sources[1]);
18450 src2 = GetSrcVariable(Sources[2]);
18451 }
18452
18453 // Set correct signedness of src1.
18454 if (GIID == GenISAIntrinsic::GenISA_dp4a_ss ||
18455 GIID == GenISAIntrinsic::GenISA_dp4a_su)
18456 src1 = m_currShader->BitCast(src1, ISA_TYPE_D);
18457 if (GIID == GenISAIntrinsic::GenISA_dp4a_uu ||
18458 GIID == GenISAIntrinsic::GenISA_dp4a_us)
18459 src1 = m_currShader->BitCast(src1, ISA_TYPE_UD);
18460 // Set correct signedness of src2.
18461 if (GIID == GenISAIntrinsic::GenISA_dp4a_ss ||
18462 GIID == GenISAIntrinsic::GenISA_dp4a_us)
18463 src2 = m_currShader->BitCast(src2, ISA_TYPE_D);
18464 if (GIID == GenISAIntrinsic::GenISA_dp4a_uu ||
18465 GIID == GenISAIntrinsic::GenISA_dp4a_su)
18466 src2 = m_currShader->BitCast(src2, ISA_TYPE_UD);
18467 // Emit dp4a.
18468 m_encoder->SetDstModifier(modifier);
18469 m_encoder->dp4a(dst, src0, src1, src2);
18470 m_encoder->Push();
18471 }
18472
18473 void EmitPass::emitUnmaskedRegionBoundary(bool start)
18474 {
18475 if (start) {
18476 m_encoder->BeginForcedNoMaskRegion();
18477 } else {
18478 m_encoder->EndForcedNoMaskRegion();
18479 }
18480 }
18481
18482 void EmitPass::emitDebugPlaceholder(llvm::GenIntrinsicInst* I)
18483 {
18484 m_encoder->Loc(I->getDebugLoc().getLine());
18485 m_encoder->DebugLinePlaceholder();
18486 }
18487
18488 // Dummy instruction that won't be optimized away.
18489 void EmitPass::emitDummyInst(llvm::GenIntrinsicInst* GII)
18490 {
18491 CVariable* dst = m_currShader->GetNULL();
18492 CVariable* src = m_currShader->GetR0();
18493 m_encoder->Copy(dst, src);
18494 m_encoder->Push();
18495 }
18496
18497 void EmitPass::emitImplicitArgIntrinsic(llvm::GenIntrinsicInst* I)
18498 {
18499 Function* parentFunc = I->getParent()->getParent();
18500 MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
18501
18502 // We can just drop the intrinsic if there are no uses of it.
18503 // It should have been lowered in the LowerImplicitArgIntrinsics pass, but did not get cleaned up.
18504 if (I->getNumUses() == 0) return;
18505
18506 if (I->getIntrinsicID() == GenISAIntrinsic::ID::GenISA_getR0)
18507 {
18508 // Returns the predefined R0 register
18509 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(m_currShader->getGRFSize() / SIZE_DWORD));
18510 m_encoder->SetNoMask();
18511 m_currShader->CopyVariable(GetSymbol(I), m_currShader->GetR0());
18512 return;
18513 }
18514
18515 Function* groupHead = nullptr;
18516 if (!m_FGA || m_FGA->isGroupHead(parentFunc)) {
18517 groupHead = parentFunc;
18518 }
18519 else {
18520 groupHead = m_FGA->getSubGroupMap(parentFunc);
18521 }
18522
18523 if (isEntryFunc(pMdUtils, groupHead))
18524 {
18525 // Map to the root kernel's implicit arg symbol
18526 ImplicitArgs IAS(*groupHead, pMdUtils);
18527 ImplicitArg::ArgType IAtype = ImplicitArgs::getArgType(I->getIntrinsicID());
18528 Argument* arg = IAS.getImplicitArg(*groupHead, IAtype);
18529 IGC_ASSERT_MESSAGE(arg, "Implicit argument not found!");
18530 if (arg)
18531 {
18532 m_encoder->SetNoMask();
18533 m_currShader->CopyVariable(GetSymbol(I), m_currShader->getOrCreateArgumentSymbol(arg, false));
18534 }
18535 }
18536 else
18537 {
18538 IGC_ASSERT_MESSAGE(0, "Intrinsics used in stackcalls have not been lowered!");
18539 }
18540 }
18541
18542 void EmitPass::emitStoreImplBufferPtr(llvm::GenIntrinsicInst* I)
18543 {
18544 if (m_currShader->HasStackCalls())
18545 m_currShader->CopyVariable(m_currShader->GetImplArgBufPtr(), GetSymbol(I->getArgOperand(0)));
18546 }
18547
18548 void EmitPass::emitStoreLocalIdBufferPtr(llvm::GenIntrinsicInst* I)
18549 {
18550 if(m_currShader->HasStackCalls())
18551 m_currShader->CopyVariable(m_currShader->GetLocalIdBufPtr(), GetSymbol(I->getArgOperand(0)));
18552 }
18553
18554 void EmitPass::emitLoadImplBufferPtr(llvm::GenIntrinsicInst* I)
18555 {
18556 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(1));
18557 m_encoder->SetNoMask();
18558 m_encoder->SetSrcSubReg(0, 0);
18559 m_encoder->Copy(m_destination, m_currShader->GetImplArgBufPtr());
18560 m_encoder->Push();
18561 }
18562
18563 void EmitPass::emitLoadLocalIdBufferPtr(llvm::GenIntrinsicInst* I)
18564 {
18565 m_encoder->SetUniformSIMDSize(lanesToSIMDMode(1));
18566 m_encoder->SetNoMask();
18567 m_encoder->SetSrcSubReg(0, 0);
18568 m_encoder->Copy(m_destination, m_currShader->GetLocalIdBufPtr());
18569 m_encoder->Push();
18570 }
18571
18572
18573
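// dpas performs a systolic dot-product-accumulate of the form
// dst = input + activation * weight, where sdepth (SD) is the systolic depth
// and rcount (RC) the repeat count; the dpasw variant shares part of the
// payload between the fused EU pair, per the flag passed below.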
18574 void EmitPass::emitDpas(GenIntrinsicInst* GII, const SSource* Sources, const DstModifier& modifier)
18575 {
18576 // Note that in the intrinsic's arguments, activation goes before weight,
18577 // but in vISA (Gen ISA), weight goes before activation.
18578 CVariable* dst = m_destination;
18579 CVariable* activation = GetSrcVariable(Sources[1]);
18580 CVariable* weight = GetSrcVariable(Sources[2]);
18581
18582 // input may be null if it is integer 0 or positive float 0.0f
18583 CVariable* input = nullptr;
18584 Constant* CSTVal = dyn_cast<Constant>(Sources[0].value);
18585 if (!(CSTVal && CSTVal->isNullValue()))
18586 {
18587 input = GetSrcVariable(Sources[0]);
18588 }
18589
18590 // float dpas uses short as bfloat16 for either input or dst.
18591 ConstantInt* pa = dyn_cast<ConstantInt>(GII->getOperand(3)); // Activation's precision
18592 ConstantInt* pb = dyn_cast<ConstantInt>(GII->getOperand(4)); // Weight's precision
18593 ConstantInt* sdepth = dyn_cast<ConstantInt>(GII->getOperand(5));
18594 ConstantInt* rcount = dyn_cast<ConstantInt>(GII->getOperand(6));
18595 ConstantInt* dpasw = dyn_cast<ConstantInt>(GII->getOperand(7));
18596 int PA = (int)pa->getSExtValue();
18597 int PB = (int)pb->getSExtValue();
18598 int SD = (int)sdepth->getSExtValue();
18599 int RC = (int)rcount->getSExtValue();
18600 bool IsDpasw = dpasw->getValue().getBoolValue();
18601
18602 // Make sure all operands are non-uniform. If any of them is uniform,
18603 // broadcast it to a non-uniform variable.
18604 // (Note that activation should be uniform for non-subgroup dpas.)
18605 if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_sub_group_dpas) {
18606 activation = BroadcastIfUniform(activation);
18607 }
18608 weight = BroadcastIfUniform(weight);
18609 if (input) {
18610 input = BroadcastIfUniform(input);
18611 }
18612
18613 // Sanity: Make sure that activation and weight are D/UD always
18614 if (activation->GetType() != ISA_TYPE_UD && activation->GetType() != ISA_TYPE_D)
18615 {
18616 activation = m_currShader->GetNewAlias(activation, ISA_TYPE_UD, 0, 0);
18617 }
18618 if (weight->GetType() != ISA_TYPE_UD && weight->GetType() != ISA_TYPE_D)
18619 {
18620 weight = m_currShader->GetNewAlias(weight, ISA_TYPE_UD, 0, 0);
18621 }
18622
18623 m_encoder->dpas(dst, input, weight, (PrecisionType)PB, activation, (PrecisionType)PA,
18624 (uint8_t)SD, (uint8_t)RC, IsDpasw);
18625 m_encoder->Push();
18626 }
18627
18628 // Conversion between float types
18629 void EmitPass::emitfcvt(llvm::GenIntrinsicInst* GII)
18630 {
18631 /// Divide N into multiples of 16, and the remainder into 8, 4, 2, 1.
18632 /// Each sequence takes two elements in execsizeSeq: the first one has the
18633 /// execsize, and the second one the starting offset. E.g. N = 23 gives {16, 0}, {4, 16}, {2, 20}, {1, 22}.
18634 auto getAllExecsize = [=](SmallVector<uint32_t, 16> & execsizeSeq, uint32_t N) {
18635 // Max execution size is 16.
18636 int n = (int)N / 16;
18637 uint32_t offset = 0;
18638 for (int i = 0; i < n; ++i) {
18639 execsizeSeq.push_back(16);
18640 execsizeSeq.push_back(offset);
18641 offset += 16;
18642 }
18643
18644 int m = (int)N % 16;
18645 for (uint32_t s = 8; m > 0; s = s / 2)
18646 {
18647 if (m >= (int)s)
18648 {
18649 execsizeSeq.push_back(s);
18650 execsizeSeq.push_back(offset);
18651 offset += s;
18652 m -= s;
18653 }
18654 }
18655 };
18656
18657 Value* sVal = GII->getOperand(0);
18658 CVariable* src = GetSymbol(sVal);
18659 CVariable* dst = m_destination;
18660
18661 Type* dTy = GII->getType();
18662 IGCLLVM::FixedVectorType* dVTy = dyn_cast<IGCLLVM::FixedVectorType>(dTy);
18663 Type* sTy = sVal->getType();
18664 IGCLLVM::FixedVectorType* sVTy = dyn_cast<IGCLLVM::FixedVectorType>(sTy);
18665 int nelts = dVTy ? (int)dVTy->getNumElements() : 1;
18666 int src_nelts = sVTy ? (int)sVTy->getNumElements() : 1;
18667 if (nelts != src_nelts)
18668 {
18669 IGC_ASSERT_MESSAGE(0, "Different #elements in src and dst of conversion intrinsic!");
18670 return;
18671 }
18672
18673 bool isSrcUniform = src->IsUniform();
18674 bool isDstUniform = dst->IsUniform();
18675 uint16_t nsimdsize = numLanes(m_currShader->m_SIMDSize);
18676 GenISAIntrinsic::ID id = GII->getIntrinsicID();
18677
18678 ERoundingMode FP_RM = static_cast<ERoundingMode>(
18679 m_pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
18680 if (id == GenISAIntrinsic::GenISA_ftobf) {
18681 ConstantInt* CI = cast<ConstantInt>(GII->getOperand(1));
18682 FP_RM = (ERoundingMode)CI->getZExtValue();
18683 }
18684 else if (id == GenISAIntrinsic::GenISA_2fto2bf)
18685 {
18686 ConstantInt* CI = cast<ConstantInt>(GII->getOperand(2));
18687 FP_RM = (ERoundingMode)CI->getZExtValue();
18688 }
18689 else {
18690 FP_RM = ERoundingMode::ROUND_TO_ANY;
18691 }
18692
18693 if (FP_RM != ERoundingMode::ROUND_TO_ANY)
18694 SetRoundingMode_FP(FP_RM);
18695
18696 // vISA instruction doesn't support immediate source of type BF
18697 if (id == GenISAIntrinsic::GenISA_bftof && src->IsImmediate())
18698 {
18699 uint32_t imm32 = ((uint32_t)src->GetImmediateValue()) & 0xFFFF;
18700 imm32 = imm32 << 16; // make it a float immediate
18701 CVariable* fSrc = m_currShader->ImmToVariable((uint64_t)imm32, ISA_TYPE_F);
18702 m_encoder->Copy(dst, fSrc);
18703 m_encoder->Push();
18704 return;
18705 }
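// bfloat16 is the upper 16 bits of an IEEE-754 binary32 value, which is why
// the immediate above is widened with a 16-bit shift, and why the BF <-> F
// conversions below are emitted as cast/mov instructions through a BF-typed
// alias of the 16-bit operand.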
18706
18707 if (id == GenISAIntrinsic::GenISA_ftobf ||
18708 id == GenISAIntrinsic::GenISA_bftof)
18709 {
18710 CVariable* tDst = nullptr, * tSrc = nullptr;
18711 if (id == GenISAIntrinsic::GenISA_ftobf) {
18712 tDst = m_currShader->GetNewAlias(dst, ISA_TYPE_BF, 0, 0);
18713 tSrc = src;
18714 }
18715 else if (id == GenISAIntrinsic::GenISA_bftof) {
18716 tDst = dst;
18717 tSrc = m_currShader->GetNewAlias(src, ISA_TYPE_BF, 0, 0);
18718 }
18719 else {
18720 IGC_ASSERT_EXIT_MESSAGE(0, "Something wrong in cvt!");
18721 }
18722
18723 if (isSrcUniform && isDstUniform)
18724 {
18725 SmallVector<uint32_t, 16> insts;
18726 getAllExecsize(insts, nelts);
18727 for (int i = 0, s = (int)insts.size(); i < s; i += 2)
18728 {
18729 uint32_t esize = insts[i];
18730 SIMDMode simdMode = lanesToSIMDMode(esize);
18731 uint32_t offset = insts[i + 1];
18732
18733 m_encoder->SetNoMask();
18734 m_encoder->SetUniformSIMDSize(simdMode);
18735 m_encoder->SetDstSubReg(offset);
18736 m_encoder->SetSrcSubReg(0, offset);
18737 // by default, uniform's region is (0, 1, 0)
18738 if (esize > 1) {
18739 uint32_t stride = (esize >= 8 ? 8 : esize);
18740 m_encoder->SetSrcRegion(0, stride, stride, 1);
18741 }
18742 m_encoder->Cast(tDst, tSrc);
18743 m_encoder->Push();
18744 }
18745 }
18746 else
18747 {
18748 uint32_t dstOff = 0, srcOff = 0;
18749 for (int i = 0; i < nelts; ++i)
18750 {
18751 m_encoder->SetDstSubReg(dstOff);
18752 m_encoder->SetSrcSubReg(0, srcOff);
18753 m_encoder->Cast(tDst, tSrc);
18754 m_encoder->Push();
18755
18756 dstOff += (isDstUniform ? 1 : nsimdsize);
18757 srcOff += (isSrcUniform ? 1 : nsimdsize);
18758 }
18759 }
18760 }
18761 else if (id == GenISAIntrinsic::GenISA_2fto2bf)
18762 {
18763 CVariable* srcs[2];
18764 srcs[0] = src;
18765 srcs[1] = GetSymbol(GII->getOperand(1));
18766 CVariable* tDst = m_currShader->GetNewAlias(dst, ISA_TYPE_BF, 0, 0);
18767 SmallVector<uint32_t, 16> insts;
18768 getAllExecsize(insts, nelts);
18769 for (int e = 0; e < 2; ++e)
18770 {
18771 CVariable* tSrc = srcs[e];
18772 isSrcUniform = tSrc->IsUniform();
18773 if (isSrcUniform && isDstUniform)
18774 {
18775 for (int i = 0, s = (int)insts.size(); i < s; i += 2)
18776 {
18777 uint32_t esize = insts[i];
18778 SIMDMode simdMode = lanesToSIMDMode(esize);
18779 uint32_t offset = insts[i + 1];
18780
18781 m_encoder->SetNoMask();
18782 m_encoder->SetUniformSIMDSize(simdMode);
18783 m_encoder->SetDstSubReg(2 * offset + e);
18784 m_encoder->SetDstRegion(2);
18785 m_encoder->SetSrcSubReg(0, offset);
18786 // by default, uniform's region is (0, 1, 0)
18787 if (esize > 1) {
18788 uint32_t stride = (esize >= 8 ? 8 : esize);
18789 m_encoder->SetSrcRegion(0, stride, stride, 1);
18790 }
18791 m_encoder->Cast(tDst, tSrc);
18792 m_encoder->Push();
18793 }
18794 }
18795 else
18796 {
18797 uint32_t dstOff = 0, srcOff = 0;
18798 for (int i = 0; i < nelts; ++i)
18799 {
18800 m_encoder->SetDstSubReg(2 * dstOff + e);
18801 m_encoder->SetDstRegion(2);
18802 m_encoder->SetSrcSubReg(0, srcOff);
18803 m_encoder->Cast(tDst, tSrc);
18804 m_encoder->Push();
18805
18806 dstOff += (isDstUniform ? 1 : nsimdsize);
18807 srcOff += (isSrcUniform ? 1 : nsimdsize);
18808 }
18809 }
18810 }
18811 }
18812 else
18813 {
18814 IGC_ASSERT_MESSAGE(0, "ICE: unhandled gen intrinsic within cvt!");
18815 }
18816
18817 if (FP_RM != ERoundingMode::ROUND_TO_ANY) {
18818 ResetRoundingMode(GII);
18819 }
18820 }
18821
18822
18823
18824