1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "Compiler/CISACodeGen/CISABuilder.hpp"
10 #include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
11 #include "Compiler/CISACodeGen/PixelShaderCodeGen.hpp"
12 #include "Compiler/CISACodeGen/ComputeShaderCodeGen.hpp"
13 #include "common/allocator.h"
14 #include "common/Types.hpp"
15 #include "common/Stats.hpp"
16 #include "common/MemStats.h"
17 #include "common/debug/Dump.hpp"
18 #include "common/igc_regkeys.hpp"
19 #include "common/secure_mem.h"
20 #include "common/secure_string.h"
21 #include "common/shaderOverride.hpp"
22 #include "common/CompilerStatsUtils.hpp"
23 #include "inc/common/sku_wa.h"
24 #include <llvm/Support/Path.h>
25 #include <llvm/ADT/Statistic.h>
26 #include <iStdLib/utility.h>
27 #include <iostream>
28 #include <iomanip>
29 #include <sstream>
30 #include <string>
31 #include <fstream>
32 #include "Probe/Assertion.h"
33 
34 #if !defined(_WIN32)
35 #   define _strdup strdup
36 #endif
37 
38 /***********************************************************************************
39 This file defines the CEncoder class which is used to generate CISA instructions
40 ************************************************************************************/
41 
42 // macro to check the result of VISA API calls
43 #define V(x) do { int result = (x); IGC_ASSERT_MESSAGE((0 == result), "call to VISA API failed"); } while(0)
44 
45 static const unsigned  int g_cScratchSpaceMsglimit = (128 * 1024);
46 using namespace llvm;
47 
48 #define DEBUG_TYPE "cisa-builder"
49 
50 STATISTIC(SimdSize8, "Number of shader(s) with SIMD8");
51 STATISTIC(SimdSize16, "Number of shader(s) with SIMD16");
52 STATISTIC(SimdSize32, "Number of shader(s) with SIMD32");
53 
54 namespace IGC
55 {
visaExecSize(SIMDMode width)56     inline VISA_Exec_Size visaExecSize(SIMDMode width)
57     {
58         switch (width)
59         {
60         case SIMDMode::SIMD1: return EXEC_SIZE_1;
61         case SIMDMode::SIMD2: return EXEC_SIZE_2;
62         case SIMDMode::SIMD4: return EXEC_SIZE_4;
63         case SIMDMode::SIMD8: return EXEC_SIZE_8;
64         case SIMDMode::SIMD16: return EXEC_SIZE_16;
65         case SIMDMode::SIMD32: return EXEC_SIZE_32;
66         case SIMDMode::UNKNOWN:
67         default: IGC_ASSERT_MESSAGE(0, "unreachable"); break;
68         }
69         return EXEC_SIZE_ILLEGAL;
70     }
71 
convertAtomicOpEnumToVisa(AtomicOp op)72     VISAAtomicOps convertAtomicOpEnumToVisa(AtomicOp op)
73     {
74         switch (op)
75         {
76         case EATOMIC_AND:
77         case EATOMIC_AND64:
78             return ATOMIC_AND;
79         case EATOMIC_DEC:
80         case EATOMIC_DEC64:
81             return ATOMIC_DEC;
82         case EATOMIC_IADD:
83         case EATOMIC_IADD64:
84             return ATOMIC_ADD;
85         case EATOMIC_IMAX:
86         case EATOMIC_IMAX64:
87             return ATOMIC_IMAX;
88         case EATOMIC_IMIN:
89         case EATOMIC_IMIN64:
90             return ATOMIC_IMIN;
91         case EATOMIC_INC:
92         case EATOMIC_INC64:
93             return ATOMIC_INC;
94         case EATOMIC_MAX:
95         case EATOMIC_MAX64:
96             return ATOMIC_MAX;
97         case EATOMIC_MIN:
98         case EATOMIC_MIN64:
99             return ATOMIC_MIN;
100         case EATOMIC_OR:
101         case EATOMIC_OR64:
102             return ATOMIC_OR;
103         case EATOMIC_SUB:
104         case EATOMIC_SUB64:
105             return ATOMIC_SUB;
106         case EATOMIC_UMAX:
107         case EATOMIC_UMAX64:
108             return ATOMIC_MAX;
109         case EATOMIC_UMIN:
110         case EATOMIC_UMIN64:
111             return ATOMIC_MIN;
112         case EATOMIC_XOR:
113         case EATOMIC_XOR64:
114             return ATOMIC_XOR;
115         case EATOMIC_XCHG:
116         case EATOMIC_XCHG64:
117             return ATOMIC_XCHG;
118         case EATOMIC_CMPXCHG:
119         case EATOMIC_CMPXCHG64:
120             return ATOMIC_CMPXCHG;
121         case EATOMIC_PREDEC:
122         case EATOMIC_PREDEC64:
123             return ATOMIC_PREDEC;
124         case EATOMIC_FMAX:
125             return ATOMIC_FMAX;
126         case EATOMIC_FMIN:
127             return ATOMIC_FMIN;
128         case EATOMIC_FCMPWR:
129             return ATOMIC_FCMPWR;
130         case EATOMIC_FADD:
131         case EATOMIC_FADD64:
132             return ATOMIC_FADD;
133         case EATOMIC_FSUB:
134             return ATOMIC_FSUB;
135         default:
136             IGC_ASSERT_MESSAGE(0, "Atomic Op not implemented");
137             return ATOMIC_AND;
138         }
139     }
140 
visaElementSize(unsigned int m_elt_size)141     inline GATHER_SCATTER_ELEMENT_SIZE visaElementSize(unsigned int m_elt_size)
142     {
143         GATHER_SCATTER_ELEMENT_SIZE elementSize = GATHER_SCATTER_BYTE_UNDEF;
144         if (m_elt_size == 1)
145         {
146             elementSize = GATHER_SCATTER_BYTE;
147         }
148         else if (m_elt_size == 2)
149         {
150             elementSize = GATHER_SCATTER_WORD;
151         }
152         else if (m_elt_size == 4)
153         {
154             elementSize = GATHER_SCATTER_DWORD;
155         }
156         else
157         {
158             IGC_ASSERT_MESSAGE(0, "unreachable");
159         }
160         return elementSize;
161     }
162 
163     static inline VISA_SVM_Block_Type
visaBlockType(unsigned elemSize)164         visaBlockType(unsigned elemSize) {
165         switch (elemSize) {
166         case 8:  return SVM_BLOCK_TYPE_BYTE;
167         case 32: return SVM_BLOCK_TYPE_DWORD;
168         case 64: return SVM_BLOCK_TYPE_QWORD;
169         }
170 
171         IGC_ASSERT_MESSAGE(0, "Unknown block/element size. Expect 8-/32-/64-bit only!");
172         return static_cast<VISA_SVM_Block_Type>(~0U);
173     }
174 
175     static inline VISA_SVM_Block_Num
visaBlockNum(unsigned numElems)176         visaBlockNum(unsigned numElems) {
177         switch (numElems) {
178         case 1: return SVM_BLOCK_NUM_1;
179         case 2: return SVM_BLOCK_NUM_2;
180         case 4: return SVM_BLOCK_NUM_4;
181         case 8: return SVM_BLOCK_NUM_8;
182         }
183 
184         IGC_ASSERT_MESSAGE(0, "Unknown number of blocks/elements. Expect 1, 2, 4, or 8 only!");
185         return static_cast<VISA_SVM_Block_Num>(~0U);
186     }
187 
visaNumLanes(VISA_Exec_Size execSize)188     constexpr unsigned visaNumLanes(VISA_Exec_Size execSize)
189     {
190         unsigned lanes = 0;
191         switch (execSize)
192         {
193         case EXEC_SIZE_1:  lanes = 1; break;
194         case EXEC_SIZE_2:  lanes = 2; break;
195         case EXEC_SIZE_4:  lanes = 4; break;
196         case EXEC_SIZE_8:  lanes = 8; break;
197         case EXEC_SIZE_16: lanes = 16; break;
198         case EXEC_SIZE_32: lanes = 32; break;
199         default: IGC_ASSERT(0); break;
200         }
201         return lanes;
202     }
203 
204     // Take certain attributes of either src or dst instruction operand and return the size
205     // of the associated grf region, accessed during instruction's execution, in bytes.
206     // If aligned==true, the size includes length of data block starting at the beginning of grf
207     // and ending at the subReg; this is useful to check if the region crosses 2 grf boundary.
208     // If special region attribute is not set, the regioning is <1; 1, 0> for src and <1> for dst.
209     // Note that the assertions may hit in certain cases, which should be handled separately,
210     // like uniform vars with operand with special region set.
GrfRegionSize(VISA_Exec_Size execSize,unsigned elementSize,const SModifier & mod,bool isSource,bool aligned=true)211     constexpr unsigned GrfRegionSize(VISA_Exec_Size execSize, unsigned elementSize,
212         const SModifier& mod, bool isSource, bool aligned = true)
213     {
214         constexpr unsigned grfSize = 32; // in bytes
215         // If subReg is big enough to cross grf boundary, adjust it.
216         const unsigned base = (mod.subReg * elementSize) % grfSize;
217         unsigned lastInRegion = aligned ? base : 0;
218         if (isSource)
219         {
220             // Formula based on algorithm provided in the spec (see Region Parameters)
221             const unsigned vstride = mod.specialRegion ? mod.region[0] : 1;
222             const unsigned width = mod.specialRegion ? mod.region[1] : 1;
223             const unsigned hstride = mod.specialRegion ? mod.region[2] : 0;
224             if (0 == width)
225             {
226                 return unsigned(-1);
227             }
228             const unsigned height = visaNumLanes(execSize) / width;
229             if (0 == height)
230             {
231                 return unsigned(-1);
232             }
233             lastInRegion += (height - 1) * vstride * elementSize +
234                 (width - 1) * hstride * elementSize;
235         }
236         else
237         {
238             const unsigned hstride = mod.specialRegion ? mod.region[2] : 1;
239             lastInRegion += (visaNumLanes(execSize) - 1) * hstride * elementSize;
240         }
241         return lastInRegion + elementSize;
242     };
243     // Compile-time ULTs for GrfRegionSize()
244     static_assert(GrfRegionSize(EXEC_SIZE_16, 4, SModifier{}, false) == 64 &&
245         GrfRegionSize(EXEC_SIZE_16, 4, SModifier{ 16, {}, {0,0,2}, {}, {}, true }, false) == 124 &&
246         GrfRegionSize(EXEC_SIZE_16, 4, SModifier{ 15, {}, {0,0,2}, {}, {}, true }, false) == 124 + 7 * 4 &&
247         GrfRegionSize(EXEC_SIZE_8, 8, SModifier{ 1, {}, {0,0,2}, {}, {}, true }, false) == 128,
248         "GrfRegionSize compile-time test failed - dst.");
249     static_assert(GrfRegionSize(EXEC_SIZE_16, 4, SModifier{}, true) == 64 &&
250         GrfRegionSize(EXEC_SIZE_16, 4, SModifier{ {}, {}, {4,4,0}, {}, {}, true }, true) == 52 &&
251         GrfRegionSize(EXEC_SIZE_8, 8, SModifier{ 8, {}, {2,1,0}, {}, {}, true }, true) == 120 &&
252         GrfRegionSize(EXEC_SIZE_8, 8, SModifier{ 10, {}, {2,1,0}, {}, {}, true }, true) == 120 + 2 * 8,
253         "GrfRegionSize compile-time test failed - src.");
254 
255     // split a SIMD16 variable into two SIMD8 while satisfying vISA's raw operand alignment
256     // return a tuple representing the vISA raw operand (var + offset) after split
splitRawOperand(CVariable * var,bool isFirstHalf,VISA_EMask_Ctrl execMask)257     std::tuple<CVariable*, uint32_t> CEncoder::splitRawOperand(CVariable* var, bool isFirstHalf,
258         VISA_EMask_Ctrl execMask)
259     {
260 
261         if (!var || var->IsUniform() || isFirstHalf)
262         {
263             // simply return the original variable
264             return std::make_tuple(var, 0);
265         }
266 
267         uint32_t offset = 8 * var->GetElemSize();
268         if ((offset % getGRFSize()) == 0)
269         {
270             return std::make_tuple(var, offset);
271         }
272         else
273         {
274             // create a copy to make the CVariable aligned
275             auto tmpVar = m_program->GetNewVariable(
276                 8, var->GetType(), CVariable::getAlignment(getGRFSize()),
277                 CName::NONE);
278             SModifier mod;
279             mod.init();
280             auto dstOpnd = GetDestinationOperand(tmpVar, mod);
281             mod.subReg = 8;
282             auto srcOpnd = GetSourceOperand(var, mod);
283 
284             V(vKernel->AppendVISADataMovementInst(
285                 ISA_MOV, nullptr, false,
286                 SplitEMask(EXEC_SIZE_16, EXEC_SIZE_8, 1, execMask),
287                 EXEC_SIZE_8, dstOpnd, srcOpnd));
288 
289             return std::make_tuple(tmpVar, 0);
290         }
291     }
292 
GetRawOpndSplitOffset(VISA_Exec_Size fromExecSize,VISA_Exec_Size toExecSize,unsigned thePart,CVariable * var) const293     unsigned  CEncoder::GetRawOpndSplitOffset(
294         VISA_Exec_Size fromExecSize,
295         VISA_Exec_Size toExecSize,
296         unsigned thePart, CVariable* var) const
297     {
298         if (!var || var->IsUniform())
299             return 0;
300 
301         IGC_ASSERT_MESSAGE(fromExecSize == EXEC_SIZE_16, "Only support splitting from exec-size 16 to exec-size 8");
302         IGC_ASSERT_MESSAGE(toExecSize == EXEC_SIZE_8, "Only support splitting from exec-size 16 to exec-size 8");
303         IGC_ASSERT_MESSAGE((thePart == 0) || (thePart == 1), "Splitting from exec-size-16 to exec-size-8 only breaks into 2 parts");
304 
305         unsigned elemSize = var->GetElemSize();
306 
307         switch (elemSize)
308         {
309         case 4:
310             return thePart * getGRFSize() * 1;
311         case 8:
312             return thePart * getGRFSize() * 2;
313         }
314 
315         IGC_ASSERT_MESSAGE(0, "Unknown data type to split!");
316         return ~0U;
317     }
318 
size() const319     size_t URBChannelMask::size() const
320     {
321         return m_bitmask == 0 ? 0 : iSTD::bsr(m_bitmask) + 1;
322     }
323 
asVISAMask() const324     unsigned int URBChannelMask::asVISAMask() const
325     {
326         // if all bits in the mask are set we need to return 0xFF which means 'no channel mask'
327         // if all bits are set -> adding one creates a power of two, so x and x+1 has no common bits.
328         if (((m_bitmask + 1) & m_bitmask) == 0)
329         {
330             return (uint32_t)-1;
331         }
332         else
333         {
334             return (uint16_t)m_bitmask;
335         }
336     }
337 
Init()338     void CEncoder::Init()
339     {
340         m_encoderState.m_srcOperand[0].init();
341         m_encoderState.m_srcOperand[1].init();
342         m_encoderState.m_srcOperand[2].init();
343         m_encoderState.m_srcOperand[3].init();
344         m_encoderState.m_dstOperand.init();
345         m_encoderState.m_flag.init();
346         m_encoderState.m_mask = EMASK_Q1;
347         m_encoderState.m_noMask = false;
348         m_encoderState.m_simdSize = m_program->m_SIMDSize;
349         m_encoderState.m_uniformSIMDSize = SIMDMode::SIMD1;
350 
351         if (m_nestLevelForcedNoMaskRegion > 0) {
352           m_encoderState.m_noMask = true;
353         }
354     }
355 
CEncoder()356     CEncoder::CEncoder()
357     {
358         m_program = nullptr;
359         vbuilder = nullptr;
360         vAsmTextBuilder = nullptr;
361     }
362 
~CEncoder()363     CEncoder::~CEncoder()
364     {
365     }
366 
getGRFSize() const367     uint32_t CEncoder::getGRFSize() const { return m_program->getGRFSize(); }
368 
369 
GetShaderName()370     std::string CEncoder::GetShaderName() {
371         return IGC::Debug::GetDumpNameObj(m_program, "").str();
372     }
373 
SetProgram(CShader * program)374     void CEncoder::SetProgram(CShader* program)
375     {
376         m_program = program;
377         Init();
378     }
379 
SubroutineCall(CVariable * flag,llvm::Function * F)380     void CEncoder::SubroutineCall(CVariable* flag, llvm::Function* F)
381     {
382         VISA_LabelOpnd* visaLabel = GetFuncLabel(F);
383         m_encoderState.m_flag.var = flag;
384         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
385         // control flow instructions cannot be broken down into lower SIMD
386         VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
387         VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize);
388         if (F->hasFnAttribute("KMPLOCK"))
389         {
390             emask = vISA_EMASK_M1_NM;
391             execSize = EXEC_SIZE_1;
392         }
393         V(vKernel->AppendVISACFCallInst(predOpnd, emask, execSize, visaLabel));
394     }
395 
StackCall(CVariable * flag,llvm::Function * F,unsigned char argSize,unsigned char retSize)396     void CEncoder::StackCall(CVariable* flag, llvm::Function* F, unsigned char argSize, unsigned char retSize)
397     {
398 
399         m_encoderState.m_flag.var = flag;
400         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
401         // control flow instructions cannot be broken down into lower SIMD
402         VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
403         VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize);
404         V(vKernel->AppendVISACFFunctionCallInst(predOpnd, emask, execSize, F->getName().data(), argSize, retSize));
405     }
406 
IndirectStackCall(CVariable * flag,CVariable * funcPtr,unsigned char argSize,unsigned char retSize)407     void CEncoder::IndirectStackCall(CVariable* flag, CVariable* funcPtr, unsigned char argSize, unsigned char retSize)
408     {
409         m_encoderState.m_flag.var = flag;
410         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
411         // control flow instructions cannot be broken down into lower SIMD
412         VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
413         VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize);
414         VISA_VectorOpnd* funcAddrOpnd = GetSourceOperandNoModifier(funcPtr);
415         V(vKernel->AppendVISACFIndirectFuncCallInst(predOpnd, emask, execSize, funcAddrOpnd, argSize, retSize));
416     }
417 
SubroutineRet(CVariable * flag,llvm::Function * F)418     void CEncoder::SubroutineRet(CVariable* flag, llvm::Function* F)
419     {
420         m_encoderState.m_flag.var = flag;
421         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
422         // control flow instructions cannot be broken down into lower SIMD
423         VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
424         VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize);
425         if (F->hasFnAttribute("KMPLOCK"))
426         {
427             emask = vISA_EMASK_M1_NM;
428             execSize = EXEC_SIZE_1;
429         }
430         V(vKernel->AppendVISACFRetInst(predOpnd, emask, execSize));
431     }
432 
StackRet(CVariable * flag)433     void CEncoder::StackRet(CVariable* flag)
434     {
435         m_encoderState.m_flag.var = flag;
436         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
437         // control flow instructions cannot be broken down into lower SIMD
438         VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
439         VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize);
440         V(vKernel->AppendVISACFFunctionRetInst(predOpnd, emask, execSize));
441     }
442 
Jump(CVariable * flag,uint label)443     void CEncoder::Jump(CVariable* flag, uint label)
444     {
445         VISA_LabelOpnd* visaLabel = GetLabel(label);
446         m_encoderState.m_flag.var = flag;
447         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
448         // control flow instructions cannot be broken down into lower SIMD
449         VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
450         VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize);
451 
452         // visa and igc agreement.
453         //    goto (1) is used to tell visa the goto is uniform.
454         // Goto(1) is generated if
455         //   1. jump is unconditional, or
456         //   2. jump is uniform (thread uniform or above) and no EU fusion, or
457         //   3. jump is either workgroup or global uniform under EU fusion
458         //      (it is temporarily under key control for ease of debugging)
459         if (flag == nullptr ||
460             (!m_program->m_Platform->hasFusedEU() && flag->IsUniform()) ||
461             (IGC_IS_FLAG_ENABLED(EnableWorkGroupUniformGoto) &&
462              m_program->m_Platform->hasFusedEU() && flag->IsWorkGroupOrGlobalUniform()))
463         {
464             execSize = EXEC_SIZE_1;
465         }
466         V(vKernel->AppendVISACFGotoInst(predOpnd, emask, execSize, visaLabel));
467     }
468 
Label(uint label)469     void CEncoder::Label(uint label)
470     {
471         VISA_LabelOpnd* visaLabel = GetLabel(label);
472         V(vKernel->AppendVISACFLabelInst(visaLabel));
473     }
474 
GetNewLabelID(const CName & name)475     uint CEncoder::GetNewLabelID(const CName &name)
476     {
477         uint id = labelMap.size();
478         labelMap.push_back(nullptr);
479         labelNameMap.push_back(
480             CreateVisaLabelName(llvm::StringRef(name.getCString())));
481         return id;
482     }
483 
DwordAtomicRaw(AtomicOp atomic_op,const ResourceDescriptor & resource,CVariable * dst,CVariable * elem_offset,CVariable * src0,CVariable * src1,bool is16Bit)484     void CEncoder::DwordAtomicRaw(
485         AtomicOp atomic_op,
486         const ResourceDescriptor& resource,
487         CVariable* dst,
488         CVariable* elem_offset,
489         CVariable* src0,
490         CVariable* src1,
491         bool is16Bit)
492     {
493 
494         // Fix types for dword atomics
495         VISA_Type type = ISA_TYPE_UD;
496         if (atomic_op == EATOMIC_IMAX || atomic_op == EATOMIC_IMIN)
497         {
498             type = ISA_TYPE_D;
499         }
500         else if (atomic_op == EATOMIC_FMAX ||
501             atomic_op == EATOMIC_FMIN ||
502             atomic_op == EATOMIC_FADD ||
503             atomic_op == EATOMIC_FSUB ||
504             atomic_op == EATOMIC_FCMPWR)
505         {
506             type = ISA_TYPE_F;
507         }
508         if (src0 && src0->GetType() != type)
509             src0 = m_program->BitCast(src0, type);
510         if (src1 && src1->GetType() != type)
511             src1 = m_program->BitCast(src1, type);
512         if (dst && dst->GetType() != type)
513             dst = m_program->BitCast(dst, type);
514         if (elem_offset->GetType() != ISA_TYPE_UD)
515             elem_offset = m_program->BitCast(elem_offset, ISA_TYPE_UD);
516 
517         IGC_ASSERT_MESSAGE(nullptr == m_encoderState.m_flag.var, "not supported predicate");
518 
519         VISA_StateOpndHandle* pSurfStateOpndHandle = GetVISASurfaceOpnd(resource);
520         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
521         VISA_RawOpnd* pDst = GetRawDestination(dst);
522         VISA_RawOpnd* pElemOffset = GetRawSource(elem_offset);
523         VISA_RawOpnd* pSrc0 = GetRawSource(src0);
524         VISA_RawOpnd* pSrc1 = GetRawSource(src1);
525 
526         /*
527         So the problem is this - the message was added for SNB, and at the time it was implemented as
528         CMPXCHG : new = (old==src1) ? src0 : old
529 
530         In IVB this becomes untyped atomic, and it's implemented as
531         AOP_CMPWR (src0 == old_dst) ? src1 : old_dst old_dst
532 
533         Note that the source is swapped.  Since we define CMPXCHG as the former in vISA, internally we
534         perform a swap for it.  So I guess for now you'll need to swap the two source to follow the vISA
535         semantics.  We may want to add a new vISA message to fix this issue.
536         */
537         if (atomic_op == EATOMIC_CMPXCHG) {
538             std::swap(pSrc0, pSrc1);
539         }
540 
541         V(vKernel->AppendVISASurfAccessDwordAtomicInst(
542             predOpnd,
543             convertAtomicOpEnumToVisa(atomic_op),
544             is16Bit,
545             ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask),
546             visaExecSize(m_encoderState.m_simdSize),
547             pSurfStateOpndHandle,
548             pElemOffset,
549             pSrc0,
550             pSrc1,
551             pDst));
552         if (ESURFACE_STATELESS == resource.m_surfaceType)
553         {
554             this->m_program->IncStatelessWritesCount();
555         }
556     }
557 
Cmp(e_predicate p,CVariable * dst,CVariable * src0,CVariable * src1)558     void CEncoder::Cmp(e_predicate p, CVariable* dst, CVariable* src0, CVariable* src1)
559     {
560         VISA_Cond_Mod subOp = ConvertCondModToVisaType(p);
561 
562         bool flagDst = 0;
563         if (dst->GetType() == ISA_TYPE_BOOL)
564         {
565             flagDst = true;
566         }
567 
568         VISA_VectorOpnd* opnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]);
569         VISA_VectorOpnd* opnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
570 
571         if (flagDst)
572         {
573             V(vKernel->AppendVISAComparisonInst(
574                 subOp,
575                 GetAluEMask(dst),
576                 GetAluExecSize(dst),
577                 dst->visaPredVariable,
578                 opnd0,
579                 opnd1));
580         }
581         else
582         {
583             V(vKernel->AppendVISAComparisonInst(
584                 subOp,
585                 GetAluEMask(dst),
586                 GetAluExecSize(dst),
587                 GetDestinationOperand(dst, m_encoderState.m_dstOperand),
588                 opnd0,
589                 opnd1));
590         }
591     }
592 
Select(CVariable * flag,CVariable * dst,CVariable * src0,CVariable * src1)593     void CEncoder::Select(CVariable* flag, CVariable* dst, CVariable* src0, CVariable* src1)
594     {
595         m_encoderState.m_flag.var = flag;
596 
597         VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand);
598         VISA_VectorOpnd* src0Opnd = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]);
599         VISA_VectorOpnd* src1Opnd = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
600         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
601 
602         V(vKernel->AppendVISADataMovementInst(
603             ISA_SEL,
604             predOpnd,
605             IsSat(),
606             GetAluEMask(dst),
607             GetAluExecSize(dst),
608             dstOpnd,
609             src0Opnd,
610             src1Opnd));
611     }
612 
PredAdd(CVariable * flag,CVariable * dst,CVariable * src0,CVariable * src1)613     void CEncoder::PredAdd(CVariable* flag, CVariable* dst, CVariable* src0, CVariable* src1)
614     {
615         m_encoderState.m_flag.var = flag;
616 
617         Arithmetic(ISA_ADD, dst, src0, src1);
618     }
619 
SetDstSubVar(uint subVar)620     void CEncoder::SetDstSubVar(uint subVar)
621     {
622         m_encoderState.m_dstOperand.subVar = int_cast<uint8_t>(subVar);
623     }
624 
SetDstSubReg(uint subReg)625     void CEncoder::SetDstSubReg(uint subReg)
626     {
627         m_encoderState.m_dstOperand.subReg = int_cast<uint16_t>(subReg);
628     }
629 
SetSrcSubVar(uint srcNum,uint subVar)630     void CEncoder::SetSrcSubVar(uint srcNum, uint subVar)
631     {
632         IGC_ASSERT(srcNum < 4);
633         m_encoderState.m_srcOperand[srcNum].subVar = int_cast<uint8_t>(subVar);
634     }
635 
SetSrcSubReg(uint srcNum,uint subReg)636     void CEncoder::SetSrcSubReg(uint srcNum, uint subReg)
637     {
638         IGC_ASSERT(srcNum < 4);
639         m_encoderState.m_srcOperand[srcNum].subReg = int_cast<uint16_t>(subReg);
640     }
641 
SetDstModifier(e_modifier mod)642     void CEncoder::SetDstModifier(e_modifier mod)
643     {
644         IGC_ASSERT((mod == EMOD_SAT) || (mod == EMOD_NONE));
645         m_encoderState.m_dstOperand.mod = mod;
646     }
647 
SetSrcModifier(uint srcNum,e_modifier mod)648     void CEncoder::SetSrcModifier(uint srcNum, e_modifier mod)
649     {
650         IGC_ASSERT(mod != EMOD_SAT);
651         IGC_ASSERT(srcNum < 3);
652         m_encoderState.m_srcOperand[srcNum].mod = mod;
653     }
654 
SetPredicate(CVariable * flag)655     void CEncoder::SetPredicate(CVariable* flag)
656     {
657         IGC_ASSERT((nullptr == flag) || (flag->GetVarType() == EVARTYPE_PREDICATE));
658         m_encoderState.m_flag.var = flag;
659     }
660 
SetInversePredicate(bool inv)661     void CEncoder::SetInversePredicate(bool inv)
662     {
663         m_encoderState.m_flag.invertFlag = inv;
664     }
665 
SetPredicateMode(e_predMode mode)666     void CEncoder::SetPredicateMode(e_predMode mode)
667     {
668         m_encoderState.m_flag.mode = mode;
669     }
670 
SetDstModifier(const DstModifier & modifier)671     void CEncoder::SetDstModifier(const DstModifier& modifier)
672     {
673         if (modifier.sat)
674         {
675             SetDstModifier(EMOD_SAT);
676         }
677         if (modifier.flag)
678         {
679             SetPredicate(m_program->GetSymbol(modifier.flag->value));
680             SetInversePredicate(modifier.invertFlag);
681         }
682     }
683 
SetSrcRegion(uint srcNum,uint vStride,uint width,uint hStride,e_instance instance)684     void CEncoder::SetSrcRegion(uint srcNum, uint vStride, uint width, uint hStride, e_instance instance)
685     {
686         m_encoderState.m_srcOperand[srcNum].region[0] = int_cast<uint8_t>(vStride);
687         m_encoderState.m_srcOperand[srcNum].region[1] = int_cast<uint8_t>(width);
688         m_encoderState.m_srcOperand[srcNum].region[2] = int_cast<uint8_t>(hStride);
689         m_encoderState.m_srcOperand[srcNum].instance = instance;
690         m_encoderState.m_srcOperand[srcNum].specialRegion = true;
691     }
692 
SetDstRegion(uint hStride)693     void CEncoder::SetDstRegion(uint hStride)
694     {
695         m_encoderState.m_dstOperand.region[2] = int_cast<uint8_t>(hStride);
696         m_encoderState.m_dstOperand.specialRegion = (hStride != 1);
697     }
698 
GetSignBit(VISA_Type type)699     uint64_t GetSignBit(VISA_Type type)
700     {
701         switch (type)
702         {
703         case ISA_TYPE_Q:
704         case ISA_TYPE_DF:
705             return 63;
706         case ISA_TYPE_D:
707         case ISA_TYPE_F:
708             return 31;
709         case ISA_TYPE_W:
710         case ISA_TYPE_HF:
711         case ISA_TYPE_BF:
712             return 15;
713         case ISA_TYPE_B:
714             return 7;
715         default:
716             IGC_ASSERT_MESSAGE(0, "type doesn't support modifier");
717             break;
718         }
719         return 63;
720     }
721 
IsFloat(VISA_Type type)722     bool IsFloat(VISA_Type type)
723     {
724         return type == ISA_TYPE_DF || type == ISA_TYPE_F || type == ISA_TYPE_HF || type == ISA_TYPE_BF;
725     }
726 
CalculateImmediateValue(CVariable * var,e_modifier mod)727     uint64_t CalculateImmediateValue(CVariable* var, e_modifier mod)
728     {
729         IGC_ASSERT(nullptr != var);
730         uint64_t immediate = var->GetImmediateValue();
731         IGC_ASSERT((mod == EMOD_ABS) || (mod == EMOD_NEG) || (mod == EMOD_NEGABS) || (mod == EMOD_NONE));
732         // handle modifiers for immediates.
733         // Change the sign bit for floats and do logic operations for integers
734         if (IsFloat(var->GetType()))
735         {
736             if (mod == EMOD_ABS)
737             {
738                 immediate &= ~((uint64_t)(1) << GetSignBit(var->GetType()));
739             }
740             else if (mod == EMOD_NEG)
741             {
742                 immediate ^= (uint64_t)(1) << GetSignBit(var->GetType());
743             }
744             else if (mod == EMOD_NEGABS)
745             {
746                 immediate |= ((uint64_t)(1) << GetSignBit(var->GetType()));
747             }
748         }
749         else
750         {
751             if (mod == EMOD_ABS || mod == EMOD_NEGABS)
752             {
753                 uint64_t mask = (immediate >> GetSignBit(var->GetType()))& (uint64_t)0x01;
754                 immediate = (immediate + mask) ^ mask;
755             }
756             if (mod == EMOD_NEG || mod == EMOD_NEGABS)
757             {
758                 immediate = ~immediate + 1;
759             }
760         }
761         return immediate;
762     }
763 
GetSourceOperandNoModifier(CVariable * var)764     VISA_VectorOpnd* CEncoder::GetSourceOperandNoModifier(CVariable* var)
765     {
766         SModifier nullMod;
767         nullMod.init();
768         return GetSourceOperand(var, nullMod);
769     }
770 
GetSourceOperand(CVariable * var,const SModifier & mod)771     VISA_VectorOpnd* CEncoder::GetSourceOperand(CVariable* var, const SModifier& mod)
772     {
773         if (var == nullptr)
774         {
775             return nullptr;
776         }
777         VISA_VectorOpnd* operand = nullptr;
778         if (var->IsImmediate())
779         {
780             uint64_t immediate = CalculateImmediateValue(var, mod.mod);
781             V(vKernel->CreateVISAImmediate(operand, &immediate, var->GetType()));
782         }
783         else
784         {
785             if (var->GetVarType() == EVARTYPE_GENERAL)
786             {
787                 unsigned short vStride = 1;
788                 unsigned short width = 1;
789                 unsigned short hStride = 0;
790 
791                 if (mod.specialRegion)
792                 {
793                     vStride = int_cast<unsigned short>(mod.region[0]);
794                     width = int_cast<unsigned short>(mod.region[1]);
795                     hStride = int_cast<unsigned short>(mod.region[2]);
796                 }
797                 else if (var->IsUniform())
798                 {
799                     //Scalar regioning
800                     vStride = 0;
801                     width = 1;
802                     hStride = 0;
803                 }
804                 unsigned char rowOffset = 0;
805                 unsigned char colOffset = 0;
806                 GetRowAndColOffset(var, mod.subVar, mod.subReg, rowOffset, colOffset);
807                 V(vKernel->CreateVISASrcOperand(
808                     operand,
809                     GetVISAVariable(var, mod.instance),
810                     ConvertModifierToVisaType(mod.mod),
811                     vStride,
812                     width,
813                     hStride,
814                     rowOffset,
815                     colOffset));
816             }
817             else if (var->GetVarType() == EVARTYPE_ADDRESS)
818             {
819                 if (var->IsUniform())
820                 {
821                     // uniform addressing uses 1x1 indirect addressing mode
822                     unsigned short vStride = 8;
823                     unsigned short width = 8;
824                     unsigned short hStride = 1;
825 
826                     //if vector is also uniform
827                     if (var->IsVectorUniform())
828                     {
829                         vStride = 0;
830                         width = 1;
831                         hStride = 0;
832                     }
833                     unsigned short immOffset = (unsigned short)
834                         mod.subReg * GetCISADataTypeSize(var->GetType());
835                     V(vKernel->CreateVISAIndirectSrcOperand(
836                         operand,
837                         var->visaAddrVariable,
838                         MODIFIER_NONE,
839                         0,
840                         immOffset,
841                         vStride,
842                         width,
843                         hStride,
844                         var->GetType()));
845                 }
846                 else
847                 {
848                     // non-uniform addressing uses VxH indirect addressing mode
849                     // NB: this requires that all subregisters of a0 are properly
850                     // set up, including per-lane subreg offsets.
851                     V(vKernel->CreateVISAIndirectOperandVxH(
852                         operand,
853                         var->visaAddrVariable,
854                         mod.subReg,
855                         0,
856                         var->GetType()));
857                 }
858             }
859         }
860         return operand;
861     }
862 
GetDestinationOperand(CVariable * var,const SModifier & mod)863     VISA_VectorOpnd* CEncoder::GetDestinationOperand(CVariable* var, const SModifier& mod)
864     {
865         VISA_VectorOpnd* operand = NULL;
866         //Create Dst operand
867         if (var->GetVarType() == EVARTYPE_GENERAL)
868         {
869             unsigned short hStride = 1;
870             unsigned char rowOffset = 0;
871             unsigned char colOffset = 0;
872             GetRowAndColOffset(var, mod.subVar, mod.subReg, rowOffset, colOffset);
873             if (mod.specialRegion)
874             {
875                 hStride = (unsigned short)mod.region[2];
876             }
877 
878             V(vKernel->CreateVISADstOperand(
879                 operand,
880                 GetVISAVariable(var),
881                 hStride,
882                 rowOffset,
883                 colOffset));
884         }
885         else if (var->GetVarType() == EVARTYPE_ADDRESS)
886         {
887             const unsigned short hStride = 1;
888             unsigned char  addrOffset = int_cast<unsigned char>(mod.subReg);
889             unsigned short immOffset = 0;
890             if (var->IsUniform())
891             {
892                 // We are using 1x1 destination region, we must use a0.0.
893                 // Use subReg to compute immOffset.
894                 immOffset = (unsigned short)
895                     mod.subReg * GetCISADataTypeSize(var->GetType());
896                 addrOffset = 0;
897             }
898             V(vKernel->CreateVISAIndirectDstOperand(
899                 operand,
900                 var->visaAddrVariable,
901                 addrOffset,
902                 immOffset,
903                 hStride,
904                 var->GetType()));
905         }
906         return operand;
907     }
908 
GetFlagOperand(const SFlag & flag)909     VISA_PredOpnd* CEncoder::GetFlagOperand(const SFlag& flag)
910     {
911         if (flag.var == nullptr)
912         {
913             return nullptr;
914         }
915         VISA_PredOpnd* operand = nullptr;
916         VISA_PREDICATE_STATE predState = (flag.invertFlag)
917             ? PredState_INVERSE : PredState_NO_INVERSE;
918         VISA_PREDICATE_CONTROL predCtrl = PRED_CTRL_NON;
919 
920         switch (flag.mode)
921         {
922         case EPRED_ALL:     predCtrl = PRED_CTRL_ALL;   break;
923         case EPRED_ANY:     predCtrl = PRED_CTRL_ANY;   break;
924         default: break;
925         }
926 
927         V(vKernel->CreateVISAPredicateOperand(
928             operand,
929             flag.var->visaPredVariable,
930             predState,
931             predCtrl));
932         return operand;
933     }
934 
GetAluExecSize(CVariable * dst) const935     VISA_Exec_Size CEncoder::GetAluExecSize(CVariable* dst) const
936     {
937         SIMDMode simdSize = m_encoderState.m_simdSize;
938 
939         if (dst && dst->GetVarType() == EVARTYPE_ADDRESS)
940         {
941             if (dst->IsVectorUniform() && dst->IsUniform())
942             {
943                 simdSize = m_encoderState.m_uniformSIMDSize;
944             }
945         }
946         else if (dst && dst->IsUniform())
947         {
948             if (dst->GetVarType() == EVARTYPE_PREDICATE)
949             {
950                 if (dst->GetNumberElement() == 1)
951                 {
952                     simdSize = m_encoderState.m_uniformSIMDSize;
953                 }
954             }
955             else
956             {
957                 simdSize = m_encoderState.m_uniformSIMDSize;
958             }
959         }
960 
961         return visaExecSize(simdSize);
962     }
963 
GetAluEMask(CVariable * dst)964     VISA_EMask_Ctrl CEncoder::GetAluEMask(CVariable* dst)
965     {
966         e_mask mask = m_encoderState.m_mask;
967         bool noMask = m_encoderState.m_noMask;
968         if (dst)
969         {
970             if (m_encoderState.m_SubSpanDestination)
971             {
972                 noMask = true;
973             }
974             else
975             {
976                 if (dst->GetVarType() == EVARTYPE_ADDRESS)
977                 {
978                     if (dst->IsVectorUniform() && dst->IsUniform())
979                     {
980                         noMask = true;
981                     }
982                 }
983                 else if (dst->IsUniform())
984                 {
985                     noMask = true;
986                 }
987             }
988         }
989 
990         return ConvertMaskToVisaType(mask, noMask);
991     }
992 
IsSat()993     bool CEncoder::IsSat()
994     {
995         return (m_encoderState.m_dstOperand.mod == EMOD_SAT) ? true : false;
996     }
997 
MinMax(CISA_MIN_MAX_SUB_OPCODE subopcode,CVariable * dst,CVariable * src0,CVariable * src1)998     void CEncoder::MinMax(CISA_MIN_MAX_SUB_OPCODE subopcode, CVariable* dst, CVariable* src0, CVariable* src1)
999     {
1000         IGC_ASSERT_MESSAGE(nullptr == m_encoderState.m_flag.var, "min/max doesn't support predication");
1001 
1002         VISA_VectorOpnd* opnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]);
1003         VISA_VectorOpnd* opnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
1004         VISA_VectorOpnd* dstopnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand);
1005 
1006         V(vKernel->AppendVISAMinMaxInst(
1007             subopcode,
1008             IsSat(),
1009             GetAluEMask(dst),
1010             GetAluExecSize(dst),
1011             dstopnd,
1012             opnd0,
1013             opnd1));
1014     }
1015 
1016     // NeedSplitting - Check whether a variable needs splitting due to the
1017     // violation of the hardware rule of no more than 2 GRFs should be accessed.
1018     // So far, only the following cases are covered
1019     // - SIMD16
1020     //      note that SIMD32 is supported differently.
1021     // - data types of 4+ bytes or 32+ bits
1022     // - for source, we only handle limited regions.
1023     //
1024     // numParts - return the total parts to be split, e.g. if the region spans 4
1025     // GRFs, it needs splitting into 2 parts at least.
NeedSplitting(CVariable * var,const SModifier & mod,unsigned & numParts,bool isSource) const1026     bool CEncoder::NeedSplitting(CVariable* var, const SModifier& mod,
1027         unsigned& numParts, bool isSource) const
1028     {
1029         // If nothing is specified, don't split.
1030         if (!var)
1031         {
1032             return false;
1033         }
1034 
1035         // Only handle SIMD16 now! We assume all data movements in SIMD8 will honor
1036     // the region rules.
1037         VISA_Exec_Size simdSize = GetAluExecSize(var);
1038         const unsigned elemSize = var->GetElemSize();
1039 
1040         switch (simdSize)
1041         {
1042         case EXEC_SIZE_16:
1043             break;
1044         default:
1045         {
1046             // Checks for some rare cases that are not handled by the splitter, but should be detected and reported.
1047             // Example: mov (8|M0)    r4.0<1>:q     r31.0<2;1,0>:q
1048             unsigned maxBlockSize = getGRFSize() * 2; // size of 2 GRFs in bytes
1049             // For uniform variables (which implies simdSize==1) the emitter may set regions with width>1.
1050             // As it may happen in various places, we detect it here.
1051             IGC_ASSERT(var->IsUniform() || (GrfRegionSize(simdSize, elemSize, mod, isSource) <= maxBlockSize));
1052             return false;
1053         }
1054         }
1055 
1056         // Only general variables need splitting so far.
1057         if (var->GetVarType() != EVARTYPE_GENERAL)
1058         {
1059             return false;
1060         }
1061 
1062         // Only varying variable need splitting so far.
1063         // NOTE: uniform variable is assumed to take less than 2 GRF+.
1064         if (var->IsUniform())
1065         {
1066             return false;
1067         }
1068 
1069         // We assume there is no 2 GRF crossing when element size is smaller than
1070         // 4 bytes (or 32 bits), e.g. 16-bit WORD.
1071         if (elemSize < 4)
1072         {
1073             return false;
1074         }
1075 
1076         // If the data type has more than 4 bytes, i.e. 32 bits, it already crosses
1077         // 2+ GRFs by itself. There's no need to check further.
1078         if (elemSize > 4)
1079         {
1080             IGC_ASSERT_MESSAGE(8 == elemSize, "Only QWORD is supported so far");
1081             IGC_ASSERT_MESSAGE(isSource || !mod.specialRegion,
1082                 "It's expected that there's no special region associated with QWORD type destination!");
1083             if (isSource && mod.specialRegion)
1084             {
1085                 if (mod.region[1] == 1 && mod.region[0] == 0)
1086                 {
1087                     // src region is <0;1,x>, can't cross 2 GRF.  No need to split.
1088                     return false;
1089                 }
1090                 IGC_ASSERT_MESSAGE(0, "Unhandled special source region on QWORD type!");
1091             }
1092 
1093             numParts = std::max(numParts, 2U);
1094             return true;
1095 
1096         }
1097 
1098 
1099         // For 32-bit data types, without special region, they won't cross 2+ GRFs.
1100         if (!mod.specialRegion)
1101         {
1102             return false;
1103         }
1104 
1105         // Check regioning.
1106         if (isSource)
1107         {
1108             // FIXME: Need better support for region with non-1 width.
1109             if (mod.region[1] != 1)
1110             {
1111                 return false;
1112             }
1113 
1114             if (mod.region[0] < 2)
1115             {
1116                 return false;
1117             }
1118 
1119             // For src with width set to 1, region with > 1 vstride needs
1120             // splitting.
1121             numParts = std::max(numParts, unsigned(mod.region[0]));
1122             return true;
1123         }
1124 
1125         if (mod.region[2] < 2)
1126         {
1127             return false;
1128         }
1129 
1130         // For dst, region with > 1 hstride needs splitting.
1131         numParts = std::max(numParts, unsigned(mod.region[2]));
1132         return true;
1133     }
1134 
1135     // SplitVariable - Split the variable to prevent accessing 2+ GRFs.
SplitVariable(VISA_Exec_Size fromExecSize,VISA_Exec_Size toExecSize,unsigned thePart,CVariable * var,const SModifier & mod,bool isSource) const1136     SModifier CEncoder::SplitVariable(VISA_Exec_Size fromExecSize,
1137         VISA_Exec_Size toExecSize,
1138         unsigned thePart,
1139         CVariable* var, const SModifier& mod,
1140         bool isSource) const {
1141         // Splitting uniform or source scalar variables is unnecessary!
1142         bool isAddrVar = var && var->GetVarType() == EVARTYPE_ADDRESS;
1143         if (!var || (var->IsUniform() && (!isAddrVar || var->IsVectorUniform())) ||
1144             (isSource && mod.specialRegion &&
1145                 mod.region[1] == 1 && mod.region[0] == 0 && mod.region[2] == 0))
1146             return mod;
1147 
1148         IGC_ASSERT_MESSAGE(((fromExecSize == EXEC_SIZE_16) && (toExecSize == EXEC_SIZE_8)) || ((fromExecSize == EXEC_SIZE_32) && (toExecSize == EXEC_SIZE_16)),
1149             "Only support splitting from exec-size 16 to exec-size 8, or 32 to 16!");
1150         IGC_ASSERT_MESSAGE((thePart == 0) || (thePart == 1),
1151             "Splitting from exec-size-16 to exec-size-8 only breaks into 2 parts!");
1152 
1153         // Copy the original modifier first.
1154         SModifier newMod = mod;
1155         unsigned elemSize = var->GetElemSize();
1156 
1157         if (isAddrVar)
1158         {
1159             // Note that for address var, subReg has two meanings:
1160             //   1. if var is uniform (so using 1x1 addressing mode),
1161             //         subReg * (size of var's type) is a0.0's immOffset;
1162             //   2. otherwise (using VxH addressing mode),
1163             //         subReg is indeed an sub register number of a0.
1164             newMod.subReg += thePart * visaNumLanes(toExecSize);
1165             return newMod;
1166         }
1167 
1168         if (!mod.specialRegion) {
1169             // Without special regioning, split the given variable based on type.
1170             switch (elemSize) {
1171             case 1:
1172             case 2:
1173                 newMod.subReg += thePart * 8; // 8, i.e. half elements
1174                 break;
1175             case 4:
1176                 newMod.subVar += thePart * 1; // 1 GRF
1177                 break;
1178             case 8:
1179                 newMod.subVar += thePart * 2; // 2 GRFs
1180                 break;
1181             default:
1182                 IGC_ASSERT_MESSAGE(0, "Unknown data type to split!");
1183                 break;
1184             }
1185             return newMod;
1186         }
1187 
1188         unsigned theStride = 0;
1189         if (isSource) {
1190             IGC_ASSERT_MESSAGE((mod.region[1] == 1),
1191                 "Don't know how to split region with non-1 width!");
1192             theStride = mod.region[0];
1193         }
1194         else {
1195             theStride = mod.region[2];
1196         }
1197 
1198         switch (elemSize) {
1199         case 1:
1200         case 2:
1201             newMod.subReg += thePart * 8 * theStride; // 8, i.e. half elements
1202             break;
1203         case 4:
1204             newMod.subVar += thePart * 1 * theStride; // 1 GRF
1205             break;
1206         case 8:
1207             newMod.subVar += thePart * 2 * theStride; // 2 GRFs
1208             break;
1209         default:
1210             IGC_ASSERT_MESSAGE(0, "Unknown data type to split!");
1211             break;
1212         }
1213 
1214         return newMod;
1215     }
1216 
SplitExecSize(VISA_Exec_Size fromExecSize,unsigned numParts) const1217     VISA_Exec_Size  CEncoder::SplitExecSize(VISA_Exec_Size fromExecSize, unsigned numParts) const
1218     {
1219         IGC_ASSERT_MESSAGE(2 == numParts, "Only know splitting SIMD16 into SIMD8!");
1220 
1221         switch (fromExecSize) {
1222         default:
1223             break;
1224         case EXEC_SIZE_32:
1225             return EXEC_SIZE_16;
1226         case EXEC_SIZE_16:
1227             return EXEC_SIZE_8;
1228         }
1229         IGC_ASSERT_MESSAGE(0, "Unknown execution size to be split!");
1230         return static_cast<VISA_Exec_Size>(~0);
1231     }
1232 
1233     VISA_EMask_Ctrl
SplitEMask(VISA_Exec_Size fromExecSize,VISA_Exec_Size toExecSize,unsigned thePart,VISA_EMask_Ctrl execMask) const1234         CEncoder::SplitEMask(VISA_Exec_Size fromExecSize,
1235             VISA_Exec_Size toExecSize,
1236             unsigned thePart, VISA_EMask_Ctrl execMask) const {
1237         IGC_ASSERT_MESSAGE(((fromExecSize == EXEC_SIZE_16) && (toExecSize == EXEC_SIZE_8)) || ((fromExecSize == EXEC_SIZE_32) && (toExecSize == EXEC_SIZE_16)),
1238             "Only support splitting from exec-size 16 to exec-size 8, or from 32 to 16!");
1239         IGC_ASSERT_MESSAGE((thePart == 0) || (thePart == 1),
1240             "Splitting from exec-size-16 to exec-size-8 only breaks into 2 parts!");
1241 
1242         // FIXME: Better to generate a table!
1243 
1244         switch (fromExecSize) {
1245         default:
1246             break;
1247         case EXEC_SIZE_32:
1248             switch (toExecSize) {
1249             default:
1250                 break;
1251             case EXEC_SIZE_16:
1252                 switch (execMask) {
1253                 default:
1254                     break;
1255                 case vISA_EMASK_M1:     return thePart ? vISA_EMASK_M5 : vISA_EMASK_M1;
1256                 case vISA_EMASK_M1_NM:  return thePart ? vISA_EMASK_M5_NM : vISA_EMASK_M1_NM;
1257                 case vISA_EMASK_M3:     return thePart ? vISA_EMASK_M7 : vISA_EMASK_M3;
1258                 case vISA_EMASK_M3_NM:  return thePart ? vISA_EMASK_M7_NM : vISA_EMASK_M3_NM;
1259                 case vISA_EMASK_M5:     return thePart ? vISA_EMASK_M1 : vISA_EMASK_M5;
1260                 case vISA_EMASK_M5_NM:  return thePart ? vISA_EMASK_M1_NM : vISA_EMASK_M5_NM;
1261                 case vISA_EMASK_M7:     return thePart ? vISA_EMASK_M3 : vISA_EMASK_M7;
1262                 case vISA_EMASK_M7_NM:  return thePart ? vISA_EMASK_M3_NM : vISA_EMASK_M7_NM;
1263                 }
1264                 break;
1265             }
1266             break;
1267 
1268         case EXEC_SIZE_16:
1269             switch (toExecSize) {
1270             default:
1271                 break;
1272             case EXEC_SIZE_8:
1273                 switch (execMask) {
1274                 default:
1275                     break;
1276                 case vISA_EMASK_M1:     return thePart ? vISA_EMASK_M3 : vISA_EMASK_M1;
1277                 case vISA_EMASK_M1_NM:  return thePart ? vISA_EMASK_M3_NM : vISA_EMASK_M1_NM;
1278                 case vISA_EMASK_M5:     return thePart ? vISA_EMASK_M7 : vISA_EMASK_M5;
1279                 case vISA_EMASK_M5_NM:  return thePart ? vISA_EMASK_M7_NM : vISA_EMASK_M5_NM;
1280                 }
1281                 break;
1282             }
1283             break;
1284         }
1285         IGC_ASSERT_MESSAGE(0, "Unknown execution mask to be split into low part!");
1286         return static_cast<VISA_EMask_Ctrl>(~0);
1287     }
1288 
1289     // Splitting SIMD16 Message Data Payload (MDP at offset = MDPOfst) for A64
1290     // scatter/untyped write messages to two SIMD8 MDPs (V0 and V1).
SplitPayloadToLowerSIMD(CVariable * MDP,uint32_t MDPOfst,uint32_t NumBlks,CVariable * V0,CVariable * V1,uint32_t fromSize)1291     void CEncoder::SplitPayloadToLowerSIMD(CVariable* MDP, uint32_t MDPOfst, uint32_t NumBlks, CVariable* V0, CVariable* V1, uint32_t fromSize)
1292     {
1293         IGC_ASSERT(nullptr != MDP);
1294         IGC_ASSERT(nullptr != V0);
1295         IGC_ASSERT(nullptr != V1);
1296 
1297         VISA_GenVar* GV = GetVISAVariable(MDP);
1298         VISA_GenVar* v0GV = GetVISAVariable(V0);
1299         VISA_GenVar* v1GV = GetVISAVariable(V1);
1300         VISA_VectorOpnd* movDst0 = nullptr;
1301         VISA_VectorOpnd* movDst1 = nullptr;
1302         VISA_VectorOpnd* srcOpnd = nullptr;
1303         const uint32_t toSize = fromSize / 2;
1304         const VISA_Exec_Size fromESize = visaExecSize(lanesToSIMDMode(fromSize));
1305         const VISA_Exec_Size toESize = visaExecSize(lanesToSIMDMode(toSize));
1306         const uint32_t eltBytes = MDP->GetElemSize();
1307 
1308         IGC_ASSERT_MESSAGE(V0->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP.");
1309         IGC_ASSERT_MESSAGE(V1->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP.");
1310 
1311         // Number of elements per GRF
1312 
1313         if (eltBytes > 0)
1314         {
1315             uint32_t GRFElts = getGRFSize() / eltBytes;
1316 
1317             if (GRFElts > 0)
1318             {
1319                 VISA_EMask_Ctrl execNM = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask);
1320                 uint32_t MDPStart = MDPOfst / eltBytes;
1321                 for (uint32_t i = 0; i < NumBlks; ++i)
1322                 {
1323                     uint32_t dstOfst = i * toSize;
1324                     uint32_t srcOfst = i * fromSize + MDPStart;
1325                     V(vKernel->CreateVISADstOperand(movDst0, v0GV, 1, dstOfst / GRFElts, dstOfst % GRFElts));
1326                     V(vKernel->CreateVISADstOperand(movDst1, v1GV, 1, dstOfst / GRFElts, dstOfst % GRFElts));
1327 
1328                     V(vKernel->CreateVISASrcOperand(srcOpnd, GV, MODIFIER_NONE,
1329                         1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts));
1330 
1331                     V(vKernel->AppendVISADataMovementInst(
1332                         ISA_MOV, nullptr, false,
1333                         SplitEMask(fromESize, toESize, 0, execNM),
1334                         toESize, movDst0, srcOpnd));
1335 
1336                     srcOfst += toSize;
1337                     V(vKernel->CreateVISASrcOperand(srcOpnd, GV, MODIFIER_NONE,
1338                         1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts));
1339 
1340                     V(vKernel->AppendVISADataMovementInst(
1341                         ISA_MOV, nullptr, false,
1342                         SplitEMask(fromESize, toESize, 1, execNM),
1343                         toESize, movDst1, srcOpnd));
1344                 }
1345             }
1346         }
1347     }
1348 
1349     // Merge two SIMD8 MDP (V0 and V1) into a single SIMD16 MDP (MDP at offset = MDPOfst)
MergePayloadToHigherSIMD(CVariable * V0,CVariable * V1,uint32_t NumBlks,CVariable * MDP,uint32_t MDPOfst,uint32_t toSize)1350     void CEncoder::MergePayloadToHigherSIMD(CVariable* V0, CVariable* V1, uint32_t NumBlks, CVariable* MDP, uint32_t MDPOfst, uint32_t toSize)
1351     {
1352         VISA_GenVar* GV = GetVISAVariable(MDP);
1353         VISA_GenVar* v0GV = GetVISAVariable(V0);
1354         VISA_GenVar* v1GV = GetVISAVariable(V1);
1355         VISA_VectorOpnd* movDst = nullptr;
1356         VISA_VectorOpnd* movSrc0 = nullptr;
1357         VISA_VectorOpnd* movSrc1 = nullptr;
1358         const uint32_t fromSize = toSize / 2;
1359         const VISA_Exec_Size fromESize = visaExecSize(lanesToSIMDMode(toSize));
1360         const VISA_Exec_Size toESize = visaExecSize(lanesToSIMDMode(fromSize));
1361         const uint32_t eltBytes = MDP->GetElemSize();
1362         IGC_ASSERT_MESSAGE(V0->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP!");
1363         IGC_ASSERT_MESSAGE(V1->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP!");
1364 
1365         if (eltBytes > 0)
1366         {
1367             // Number of elements per GRF
1368             const uint32_t GRFElts = getGRFSize() / eltBytes;
1369 
1370             if (GRFElts > 0)
1371             {
1372                 VISA_EMask_Ctrl execNM = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask);
1373                 uint32_t MDPStart = MDPOfst / eltBytes;
1374                 for (uint32_t i = 0; i < NumBlks; ++i)
1375                 {
1376                     uint32_t dstOfst = i * toSize + MDPStart;
1377                     uint32_t srcOfst = i * fromSize;
1378                     V(vKernel->CreateVISADstOperand(movDst, GV, 1, dstOfst / GRFElts, dstOfst % GRFElts));
1379                     V(vKernel->CreateVISASrcOperand(movSrc0, v0GV, MODIFIER_NONE,
1380                         1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts));
1381                     V(vKernel->CreateVISASrcOperand(movSrc1, v1GV, MODIFIER_NONE,
1382                         1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts));
1383 
1384                     V(vKernel->AppendVISADataMovementInst(
1385                         ISA_MOV, nullptr, false,
1386                         SplitEMask(fromESize, toESize, 0, execNM),
1387                         toESize, movDst, movSrc0));
1388 
1389                     dstOfst += fromSize;
1390                     V(vKernel->CreateVISADstOperand(movDst, GV, 1, dstOfst / GRFElts, dstOfst % GRFElts));
1391                     V(vKernel->AppendVISADataMovementInst(
1392                         ISA_MOV, nullptr, false,
1393                         SplitEMask(fromESize, toESize, 1, execNM),
1394                         toESize, movDst, movSrc1));
1395                 }
1396             }
1397         }
1398     }
1399 
1400     static SModifier
EmulateVariable(CVariable * Var,SModifier Mod,bool IsHiPart,bool IsSource)1401         EmulateVariable(CVariable* Var, SModifier Mod, bool IsHiPart, bool IsSource) {
1402         if (Mod.specialRegion) {
1403             if (IsSource) {
1404                 Mod.region[0] *= 2;
1405                 Mod.region[2] *= 2;
1406             }
1407             else
1408                 Mod.region[2] *= 2;
1409         }
1410         else {
1411             if (IsSource) {
1412                 if (!Var->IsUniform()) {
1413                     Mod.region[0] = 2;
1414                     Mod.region[1] = 1;
1415                     Mod.region[2] = 0;
1416                     Mod.specialRegion = true;
1417                 }
1418             }
1419             else {
1420                 Mod.region[2] = 2;
1421                 Mod.specialRegion = true;
1422             }
1423         }
1424         Mod.subReg *= 2;
1425         if (IsHiPart)
1426             Mod.subReg += 1;
1427         return Mod;
1428     }
1429 
DataMov(ISA_Opcode opcode,CVariable * dst,CVariable * src)1430     void CEncoder::DataMov(ISA_Opcode opcode, CVariable* dst, CVariable* src)
1431     {
1432         if (opcode == ISA_SETP)
1433         {
1434             IGC_ASSERT(nullptr != dst);
1435             IGC_ASSERT(dst->GetVarType() == EVARTYPE_PREDICATE);
1436             V(vKernel->AppendVISASetP(
1437                 GetAluEMask(dst),
1438                 IsSecondHalf() ? GetAluExecSize(dst) : visaExecSize(m_program->m_dispatchSize),
1439                 dst->visaPredVariable,
1440                 GetSourceOperand(src, m_encoderState.m_srcOperand[0])));
1441         }
1442         else if (opcode == ISA_MOV && src->GetVarType() == EVARTYPE_PREDICATE)
1443         {
1444             V(vKernel->AppendVISAPredicateMove(
1445                 GetDestinationOperand(dst, m_encoderState.m_dstOperand),
1446                 src->visaPredVariable));
1447         }
1448         else
1449         {
1450             VISA_Type dstT = dst->GetType();
1451             VISA_Type srcT = src->GetType();
1452             bool Is64BitDst = (dstT == ISA_TYPE_Q || dstT == ISA_TYPE_UQ);
1453             bool Is64BitSrc = (srcT == ISA_TYPE_Q || srcT == ISA_TYPE_UQ);
1454             bool Need64BitEmu =
1455                 m_program->GetContext()->platform.hasNoInt64Inst() &&
1456                 (Is64BitDst || Is64BitSrc);
1457 
1458             // If DP is not supported, need to split mov as well.
1459             if (IGC_IS_FLAG_ENABLED(ForceDPEmulation) ||
1460                 m_program->GetContext()->platform.hasNoFP64Inst())
1461             {
1462                 if (dstT == ISA_TYPE_DF && srcT == ISA_TYPE_DF)
1463                 {
1464                     Need64BitEmu = true;
1465                     Is64BitDst = true;
1466                     Is64BitSrc = true;
1467                 }
1468                 else
1469                 {
1470                     IGC_ASSERT_MESSAGE(dstT != ISA_TYPE_DF, "double type is not expected here");
1471                     IGC_ASSERT_MESSAGE(srcT != ISA_TYPE_DF, "double type is not expected here");
1472                 }
1473             }
1474             if (dst->GetVarType() != EVARTYPE_GENERAL || src->GetVarType() != EVARTYPE_GENERAL)
1475             {
1476                 // code can't handle indirect operands, let vISA do it
1477                 // ToDo: disable int64b copy emu entirely?
1478                 Need64BitEmu = false;
1479             }
1480 
1481             CVariable* dstAlias = nullptr;
1482             CVariable* srcAlias = nullptr;
1483             VISA_VectorOpnd* srcImmLo = nullptr;
1484             VISA_VectorOpnd* srcImmHi = nullptr;
1485             if (Need64BitEmu) {
1486                 if (Is64BitDst)
1487                     dstAlias = m_program->GetNewAlias(dst, ISA_TYPE_UD, 0, 0);
1488                 else
1489                     dstAlias = dst;
1490                 if (src->IsImmediate()) {
1491                     uint64_t Imm = src->GetImmediateValue();
1492                     unsigned ImmLo = Imm & 0xFFFFFFFFULL;
1493                     unsigned ImmHi = Imm >> 32;
1494                     V(vKernel->CreateVISAImmediate(srcImmLo, &ImmLo, ISA_TYPE_UD));
1495                     V(vKernel->CreateVISAImmediate(srcImmHi, &ImmHi, ISA_TYPE_UD));
1496                 }
1497                 else {
1498                     if (Is64BitSrc)
1499                         srcAlias = m_program->GetNewAlias(src, ISA_TYPE_UD, 0, 0);
1500                     else
1501                         srcAlias = src;
1502                 }
1503             }
1504 
1505             if (Need64BitEmu)
1506             {
1507                 if (Is64BitSrc && Is64BitDst)
1508                 {
1509                     VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
1510                     if (!predOpnd && !IsSat() && dst->IsUniform() && src->IsUniform() && !src->IsImmediate() && m_encoderState.m_uniformSIMDSize == SIMDMode::SIMD1)
1511                     {
1512                         // special handling for uniform 64b copy by generating SIMD2 move instead of 2xSIMD1
1513                         // technically we need to check for src modifier and whether dst/src are indirect operand as well,
1514                         // but it doesn't look like the original code below is doing it anyway..
1515                         SModifier dstAsUDMod = m_encoderState.m_dstOperand;
1516                         dstAsUDMod.subReg *= 2;
1517                         SModifier srcAsUDMod = m_encoderState.m_srcOperand[0];
1518                         srcAsUDMod.region[0] = 1;
1519                         srcAsUDMod.region[1] = 1;
1520                         srcAsUDMod.region[2] = 0;
1521                         srcAsUDMod.specialRegion = true;
1522                         srcAsUDMod.subReg *= 2;
1523                         auto dstOpnd = GetDestinationOperand(dstAlias, dstAsUDMod);
1524                         auto SIMDSize = lanesToSIMDMode(numLanes(m_encoderState.m_uniformSIMDSize) * 2);
1525                         auto srcOpnd = GetSourceOperand(srcAlias, srcAsUDMod);
1526                         V(vKernel->AppendVISADataMovementInst(opcode, nullptr, false, vISA_EMASK_M1_NM, visaExecSize(SIMDSize),
1527                             dstOpnd, srcOpnd));
1528                     }
1529                     else
1530                     {
1531                         // Generate data movement on Lo part.
1532                         SModifier LoDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, false, false);
1533                         SModifier LoSrcMod = EmulateVariable(src, m_encoderState.m_srcOperand[0], false, true);
1534                         VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dstAlias, LoDstMod);
1535                         VISA_VectorOpnd* srcOpnd = srcImmLo ? srcImmLo : GetSourceOperand(srcAlias, LoSrcMod);
1536 
1537                         V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(),
1538                             GetAluEMask(dst),
1539                             GetAluExecSize(dst),
1540                             dstOpnd, srcOpnd));
1541                         // Generate data movement on Hi part.
1542                         SModifier HiDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, true, false);
1543                         SModifier HiSrcMod = EmulateVariable(src, m_encoderState.m_srcOperand[0], true, true);
1544                         dstOpnd = GetDestinationOperand(dstAlias, HiDstMod);
1545                         srcOpnd = srcImmHi ? srcImmHi : GetSourceOperand(srcAlias, HiSrcMod);
1546                         predOpnd = GetFlagOperand(m_encoderState.m_flag);
1547                         V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(),
1548                             GetAluEMask(dst),
1549                             GetAluExecSize(dst),
1550                             dstOpnd, srcOpnd));
1551                     }
1552                 }
1553                 else if (Is64BitSrc)
1554                 {
1555                     IGC_ASSERT_MESSAGE(!Is64BitDst, "Expect non 64-bit dst!");
1556                     // Generate data movement on Lo part only.
1557                     SModifier LoSrcMod = EmulateVariable(src, m_encoderState.m_srcOperand[0], false, true);
1558                     VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dstAlias, m_encoderState.m_dstOperand);
1559                     VISA_VectorOpnd* srcOpnd = srcImmLo ? srcImmLo : GetSourceOperand(srcAlias, LoSrcMod);
1560                     VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
1561                     V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(),
1562                         GetAluEMask(dst),
1563                         GetAluExecSize(dst),
1564                         dstOpnd, srcOpnd));
1565                 }
1566                 else
1567                 {
1568                     IGC_ASSERT_MESSAGE(Is64BitDst, "Expect 64-bit dst!");
1569                     IGC_ASSERT_MESSAGE(!Is64BitSrc, "Expect non 64-bit src");
1570 
1571                     // Generate data movement on Lo part.
1572                     SModifier LoDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, false, false);
1573                     VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dstAlias, LoDstMod);
1574                     VISA_VectorOpnd* srcOpnd = srcImmLo ? srcImmLo : GetSourceOperand(srcAlias, m_encoderState.m_srcOperand[0]);
1575                     VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
1576                     V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(),
1577                         GetAluEMask(dst),
1578                         GetAluExecSize(dst),
1579                         dstOpnd, srcOpnd));
1580                     // Generate data movement on Hi part.
1581                     unsigned ImmHi = 0U;
1582                     V(vKernel->CreateVISAImmediate(srcImmHi, &ImmHi, ISA_TYPE_UD));
1583                     SModifier HiDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, true, false);
1584                     dstOpnd = GetDestinationOperand(dstAlias, HiDstMod);
1585                     srcOpnd = srcImmHi;
1586                     predOpnd = GetFlagOperand(m_encoderState.m_flag);
1587                     V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(),
1588                         GetAluEMask(dst),
1589                         GetAluExecSize(dst),
1590                         dstOpnd, srcOpnd));
1591                 }
1592             }
1593             else
1594             {
1595                 VISA_VectorOpnd* srcOpnd = GetSourceOperand(src, m_encoderState.m_srcOperand[0]);
1596                 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand);
1597                 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
1598                 V(vKernel->AppendVISADataMovementInst(
1599                     opcode,
1600                     predOpnd,
1601                     IsSat(),
1602                     GetAluEMask(dst),
1603                     GetAluExecSize(dst),
1604                     dstOpnd,
1605                     srcOpnd));
1606             }
1607         }
1608     }
1609 
LogicOp(ISA_Opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2,CVariable * src3)1610     void CEncoder::LogicOp(
1611         ISA_Opcode opcode,
1612         CVariable* dst,
1613         CVariable* src0,
1614         CVariable* src1,
1615         CVariable* src2,
1616         CVariable* src3)
1617     {
1618         if (dst->GetVarType() == EVARTYPE_PREDICATE ||
1619             src0->GetVarType() == EVARTYPE_PREDICATE ||
1620             (src1 != nullptr && src1->GetVarType() == EVARTYPE_PREDICATE))
1621         {
1622             VISA_PredVar* src1Dcl = NULL;
1623             if (src1 != NULL)
1624                 src1Dcl = src1->visaPredVariable;
1625 
1626             // Try to use NOT instruction for predicate, we won't have phi on
1627             // predicate since Legalization pass convert i1 phi to i32.
1628             if (opcode == ISA_NOT)
1629                 SetNoMask();
1630 
1631             V(vKernel->AppendVISALogicOrShiftInst(
1632                 opcode,
1633                 GetAluEMask(dst),
1634                 GetAluExecSize(dst),
1635                 dst->visaPredVariable,
1636                 src0->visaPredVariable,
1637                 src1Dcl));
1638         }
1639         else
1640         {
1641             VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]);
1642             VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
1643             VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(src2, m_encoderState.m_srcOperand[2]);
1644             VISA_VectorOpnd* srcOpnd3 = GetSourceOperand(src3, m_encoderState.m_srcOperand[3]);
1645             VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand);
1646             VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
1647 
1648             V(vKernel->AppendVISALogicOrShiftInst(
1649                 opcode,
1650                 predOpnd,
1651                 IsSat(),
1652                 GetAluEMask(dst),
1653                 GetAluExecSize(dst),
1654                 dstOpnd,
1655                 srcOpnd0,
1656                 srcOpnd1,
1657                 srcOpnd2,
1658                 srcOpnd3));
1659         }
1660     }
1661 
Arithmetic(ISA_Opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2)1662     void CEncoder::Arithmetic(ISA_Opcode opcode, CVariable* dst, CVariable* src0, CVariable* src1, CVariable* src2)
1663     {
1664         VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]);
1665         VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
1666         VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(src2, m_encoderState.m_srcOperand[2]);
1667         VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand);
1668         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
1669         V(vKernel->AppendVISAArithmeticInst(
1670             opcode,
1671             predOpnd,
1672             IsSat(),
1673             GetAluEMask(dst),
1674             GetAluExecSize(dst),
1675             dstOpnd,
1676             srcOpnd0,
1677             srcOpnd1,
1678             srcOpnd2));
1679     }
1680 
Bfn(uint8_t booleanFuncCtrl,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2)1681     void CEncoder::Bfn(uint8_t booleanFuncCtrl, CVariable* dst, CVariable* src0, CVariable* src1, CVariable* src2)
1682     {
1683         VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]);
1684         VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
1685         VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(src2, m_encoderState.m_srcOperand[2]);
1686         VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand);
1687         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
1688 
1689         V(vKernel->AppendVISABfnInst(
1690             booleanFuncCtrl,
1691             predOpnd,
1692             IsSat(),
1693             GetAluEMask(dst),
1694             GetAluExecSize(dst),
1695             dstOpnd,
1696             srcOpnd0,
1697             srcOpnd1,
1698             srcOpnd2));
1699     }
1700 
1701     // We allow H1 to be nullptr for the common case of adding 64-bit variable with 32-bit imm
AddPair(CVariable * Lo,CVariable * Hi,CVariable * L0,CVariable * H0,CVariable * L1,CVariable * H1)1702     void CEncoder::AddPair(CVariable* Lo, CVariable* Hi, CVariable* L0, CVariable* H0, CVariable* L1, CVariable* H1) {
1703         IGC_ASSERT_MESSAGE(m_encoderState.m_dstOperand.mod == EMOD_NONE, "addPair doesn't support saturate");
1704 
1705         if (Hi == nullptr) {
1706             // When Hi part is ignored, reduce 64-bit subtraction into 32-bit.
1707             GenericAlu(EOPCODE_ADD, Lo, L0, L1);
1708             return;
1709         }
1710 
1711         if (Lo == nullptr) {
1712             // We cannot reduce the strength if only Lo is ignored.
1713             Lo = m_program->GetNewVariable(
1714                 Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(), Hi->getName());
1715         }
1716 
1717         // Use `UD` only.
1718         if (Lo->GetType() != ISA_TYPE_UD && Lo->GetType() != ISA_TYPE_UV) Lo = m_program->BitCast(Lo, ISA_TYPE_UD);
1719         if (Hi->GetType() != ISA_TYPE_UD && Hi->GetType() != ISA_TYPE_UV) Hi = m_program->BitCast(Hi, ISA_TYPE_UD);
1720         if (L0->GetType() != ISA_TYPE_UD && L0->GetType() != ISA_TYPE_UV) L0 = m_program->BitCast(L0, ISA_TYPE_UD);
1721         if (H0->GetType() != ISA_TYPE_UD && H0->GetType() != ISA_TYPE_UV) H0 = m_program->BitCast(H0, ISA_TYPE_UD);
1722         if (L1->GetType() != ISA_TYPE_UD && L1->GetType() != ISA_TYPE_UV) L1 = m_program->BitCast(L1, ISA_TYPE_UD);
1723         if (H1 && H1->GetType() != ISA_TYPE_UD && H1->GetType() != ISA_TYPE_UV) H1 = m_program->BitCast(H1, ISA_TYPE_UD);
1724 
1725         VISA_Exec_Size ExecSize = GetAluExecSize(Lo);
1726         IGC_ASSERT((ExecSize == EXEC_SIZE_32) || (ExecSize == EXEC_SIZE_16) || (ExecSize == EXEC_SIZE_8) || (ExecSize == EXEC_SIZE_4) || (ExecSize == EXEC_SIZE_2) || (ExecSize == EXEC_SIZE_1));
1727 
1728         if (needsSplitting(ExecSize))
1729         {
1730             // Have to split it because `acc0` has only 8 elements for 32-bit
1731             // integer types.
1732             unsigned NumParts = 2;
1733             VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo);
1734             VISA_Exec_Size FromExecSize = GetAluExecSize(Lo);
1735             VISA_Exec_Size ToExecSize = SplitExecSize(FromExecSize, NumParts);
1736 
1737             VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag);
1738             for (unsigned ThePart = 0; ThePart != NumParts; ++ThePart) {
1739                 SModifier NewDstMod = SplitVariable(FromExecSize, ToExecSize, ThePart, Lo, m_encoderState.m_dstOperand);
1740                 SModifier NewS0LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L0, m_encoderState.m_srcOperand[0], true);
1741                 SModifier NewS0HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H0, m_encoderState.m_srcOperand[1], true);
1742                 SModifier NewS1LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L1, m_encoderState.m_srcOperand[2], true);
1743 
1744                 VISA_VectorOpnd* S0L = GetSourceOperand(L0, NewS0LMod);
1745                 VISA_VectorOpnd* S0H = GetSourceOperand(H0, NewS0HMod);
1746                 VISA_VectorOpnd* S1L = GetSourceOperand(L1, NewS1LMod);
1747                 VISA_VectorOpnd* L = GetDestinationOperand(Lo, NewDstMod);
1748                 VISA_VectorOpnd* H = GetDestinationOperand(Hi, NewDstMod);
1749                 VISA_VectorOpnd* HIn = GetSourceOperand(Hi, NewDstMod);
1750 
1751                 unsigned NumElems = m_program->m_Platform->getAccChNumUD();
1752                 CVariable* Carry = m_program->GetNewVariable(
1753                     (uint16_t)NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(),
1754                     CName(Lo->getName(), "Carry"));
1755                 VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand);
1756                 VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, m_encoderState.m_dstOperand);
1757 
1758                 VISA_EMask_Ctrl EMask = SplitEMask(FromExecSize, ToExecSize, ThePart, ExecMask);
1759                 V(vKernel->AppendVISATwoDstArithmeticInst(
1760                     ISA_ADDC, Pred, EMask, ToExecSize,
1761                     L, AccOut, S0L, S1L));
1762 
1763                 if (H1 && !(H1->IsImmediate() && H1->GetImmediateValue() == 0))
1764                 {
1765                     SModifier NewS1HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H1, m_encoderState.m_srcOperand[3], true);
1766                     VISA_VectorOpnd* S1H = GetSourceOperand(H1, NewS1HMod);
1767                     if (m_program->m_Platform->supportAdd3Instruction())
1768                     {
1769                         H = GetDestinationOperand(Hi, NewDstMod);
1770                         V(vKernel->AppendVISAArithmeticInst(
1771                             ISA_ADD3, Pred, false, EMask, ToExecSize,
1772                             H, AccIn, S0H, S1H));
1773                     }
1774                     else
1775                     {
1776                         V(vKernel->AppendVISAArithmeticInst(
1777                             ISA_ADD, Pred, false, EMask, ToExecSize,
1778                             H, S0H, S1H));
1779                         H = GetDestinationOperand(Hi, NewDstMod);
1780                         V(vKernel->AppendVISAArithmeticInst(
1781                             ISA_ADD, Pred, false, EMask, ToExecSize,
1782                             H, AccIn, HIn));
1783                     }
1784                 }
1785                 else
1786                 {
1787                     V(vKernel->AppendVISAArithmeticInst(
1788                         ISA_ADD, Pred, false, EMask, ToExecSize,
1789                         H, AccIn, S0H));
1790                 }
1791             }
1792         }
1793         else {
1794             VISA_VectorOpnd* S0L = GetSourceOperand(L0, m_encoderState.m_srcOperand[0]);
1795             VISA_VectorOpnd* S0H = GetSourceOperand(H0, m_encoderState.m_srcOperand[1]);
1796             VISA_VectorOpnd* S1L = GetSourceOperand(L1, m_encoderState.m_srcOperand[2]);
1797             VISA_VectorOpnd* L = GetDestinationOperand(Lo, m_encoderState.m_dstOperand);
1798             VISA_VectorOpnd* H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand);
1799             VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag);
1800 
1801             unsigned short NumElems = (ExecSize == EXEC_SIZE_1) ? 1 :
1802                 (ExecSize == EXEC_SIZE_2) ? 2 :
1803                 (ExecSize == EXEC_SIZE_4) ? 4 : m_program->m_Platform->getAccChNumUD();
1804             CVariable* Carry = m_program->GetNewVariable(
1805                 NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(), CName(Lo->getName(), "Carry"));
1806             VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand);
1807 
1808             SModifier MidMod = m_encoderState.m_dstOperand;
1809             if (Lo->IsUniform() && NumElems != 1) {
1810                 MidMod.region[0] = 1;
1811                 MidMod.region[1] = 1;
1812                 MidMod.region[2] = 0;
1813                 MidMod.specialRegion = true;
1814             }
1815             VISA_VectorOpnd* HIn = GetSourceOperand(Hi, MidMod);
1816             VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, MidMod);
1817 
1818             VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo);
1819             V(vKernel->AppendVISATwoDstArithmeticInst(
1820                 ISA_ADDC, Pred, ExecMask, ExecSize,
1821                 L, AccOut, S0L, S1L));
1822 
1823             if (H1 && !(H1->IsImmediate() && H1->GetImmediateValue() == 0))
1824             {
1825                 VISA_VectorOpnd* S1H = GetSourceOperand(H1, m_encoderState.m_srcOperand[3]);
1826                 if (m_program->m_Platform->supportAdd3Instruction())
1827                 {
1828                     H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand);
1829                     V(vKernel->AppendVISAArithmeticInst(
1830                         ISA_ADD3, Pred, false, ExecMask, ExecSize,
1831                         H, AccIn, S0H, S1H));
1832                 }
1833                 else
1834                 {
1835                     V(vKernel->AppendVISAArithmeticInst(
1836                         ISA_ADD, Pred, false, ExecMask, ExecSize,
1837                         H, S0H, S1H));
1838                     H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand);
1839                     V(vKernel->AppendVISAArithmeticInst(
1840                         ISA_ADD, Pred, false, ExecMask, ExecSize,
1841                         H, AccIn, HIn));
1842                 }
1843             }
1844             else
1845             {
1846                 V(vKernel->AppendVISAArithmeticInst(
1847                     ISA_ADD, Pred, false, ExecMask, ExecSize,
1848                     H, AccIn, S0H));
1849             }
1850         }
1851     }
1852 
SubPair(CVariable * Lo,CVariable * Hi,CVariable * L0,CVariable * H0,CVariable * L1,CVariable * H1)1853     void CEncoder::SubPair(CVariable* Lo, CVariable* Hi, CVariable* L0, CVariable* H0, CVariable* L1, CVariable* H1) {
1854         IGC_ASSERT_MESSAGE(m_encoderState.m_dstOperand.mod == EMOD_NONE, "subPair doesn't support saturate");
1855 
1856         IGC_ASSERT(Lo || Hi);  // At least one is used
1857         if (Hi == nullptr) {
1858             // When Hi part is ignored, reduce 64-bit subtraction into 32-bit.
1859             SetSrcModifier(1, EMOD_NEG);
1860             GenericAlu(EOPCODE_ADD, Lo, L0, L1);
1861             return;
1862         }
1863 
1864         if (Lo == nullptr) {
1865             // We cannot reduce the strength if only Lo is ignored.
1866             Lo = m_program->GetNewVariable(
1867                 Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(), CName(Hi->getName(), "Carry"));
1868         }
1869 
1870         VISA_Exec_Size ExecSize = GetAluExecSize(Lo);
1871         IGC_ASSERT((ExecSize == EXEC_SIZE_32) || (ExecSize == EXEC_SIZE_16) || (ExecSize == EXEC_SIZE_8) || (ExecSize == EXEC_SIZE_1));
1872 
1873         // Use `UD` only.
1874         if (Lo->GetType() != ISA_TYPE_UD && Lo->GetType() != ISA_TYPE_UV) Lo = m_program->BitCast(Lo, ISA_TYPE_UD);
1875         if (Hi->GetType() != ISA_TYPE_UD && Hi->GetType() != ISA_TYPE_UV) Hi = m_program->BitCast(Hi, ISA_TYPE_UD);
1876         if (L0->GetType() != ISA_TYPE_UD && L0->GetType() != ISA_TYPE_UV) L0 = m_program->BitCast(L0, ISA_TYPE_UD);
1877         if (H0->GetType() != ISA_TYPE_UD && H0->GetType() != ISA_TYPE_UV) H0 = m_program->BitCast(H0, ISA_TYPE_UD);
1878         if (L1->GetType() != ISA_TYPE_UD && L1->GetType() != ISA_TYPE_UV) L1 = m_program->BitCast(L1, ISA_TYPE_UD);
1879         if (H1->GetType() != ISA_TYPE_UD && H1->GetType() != ISA_TYPE_UV) H1 = m_program->BitCast(H1, ISA_TYPE_UD);
1880 
1881         if (needsSplitting(ExecSize))
1882         {
1883             // Have to split it because `acc0` has only 8 elements for 32-bit
1884             // integer types.
1885             unsigned NumParts = 2;
1886             VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo);
1887             VISA_Exec_Size FromExecSize = GetAluExecSize(Lo);
1888             VISA_Exec_Size ToExecSize = SplitExecSize(FromExecSize, NumParts);
1889 
1890             // Negative `S1H`
1891             SModifier S1HMod = m_encoderState.m_srcOperand[1];
1892             IGC_ASSERT(S1HMod.mod == EMOD_NONE);
1893             S1HMod.mod = EMOD_NEG;
1894             VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag);
1895             for (unsigned ThePart = 0; ThePart != NumParts; ++ThePart) {
1896                 SModifier NewDstMod = SplitVariable(FromExecSize, ToExecSize, ThePart, Lo, m_encoderState.m_dstOperand);
1897                 SModifier NewS0LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L0, m_encoderState.m_srcOperand[0], true);
1898                 SModifier NewS0HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H0, m_encoderState.m_srcOperand[1], true);
1899                 SModifier NewS1LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L1, m_encoderState.m_srcOperand[2], true);
1900                 SModifier NewS1HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H1, S1HMod, true);
1901                 VISA_VectorOpnd* S0L = GetSourceOperand(L0, NewS0LMod);
1902                 VISA_VectorOpnd* S0H = GetSourceOperand(H0, NewS0HMod);
1903                 VISA_VectorOpnd* S1L = GetSourceOperand(L1, NewS1LMod);
1904                 VISA_VectorOpnd* S1H = GetSourceOperand(H1, NewS1HMod);
1905                 VISA_VectorOpnd* L = GetDestinationOperand(Lo, NewDstMod);
1906                 VISA_VectorOpnd* H = GetDestinationOperand(Hi, NewDstMod);
1907                 VISA_VectorOpnd* HIn = GetSourceOperand(Hi, NewDstMod);
1908 
1909                 unsigned short NumElems = m_program->m_Platform->getAccChNumUD();
1910                 CVariable* Carry =
1911                     m_program->GetNewVariable(NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(), CName(Lo->getName(), "Carry"));
1912                 VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand);
1913                 // Negative `Acc0`
1914                 SModifier AccMod = m_encoderState.m_dstOperand;
1915                 IGC_ASSERT(AccMod.mod == EMOD_NONE);
1916                 AccMod.mod = EMOD_NEG;
1917                 VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, AccMod);
1918 
1919                 VISA_EMask_Ctrl EMask = SplitEMask(FromExecSize, ToExecSize, ThePart, ExecMask);
1920                 V(vKernel->AppendVISATwoDstArithmeticInst(
1921                     ISA_SUBB, Pred, EMask, ToExecSize,
1922                     L, AccOut, S0L, S1L));
1923                 if (m_program->m_Platform->supportAdd3Instruction())
1924                 {
1925                     H = GetDestinationOperand(Hi, NewDstMod);
1926                     V(vKernel->AppendVISAArithmeticInst(
1927                         ISA_ADD3, Pred, false, EMask, ToExecSize,
1928                         H, AccIn, S0H, S1H));
1929                 }
1930                 else
1931                 {
1932                     V(vKernel->AppendVISAArithmeticInst(
1933                         ISA_ADD, Pred, false, EMask, ToExecSize,
1934                         H, S0H, S1H));
1935                     H = GetDestinationOperand(Hi, NewDstMod);
1936                     V(vKernel->AppendVISAArithmeticInst(
1937                         ISA_ADD, Pred, false, EMask, ToExecSize,
1938                         H, AccIn, HIn));
1939                 }
1940             }
1941         }
1942         else {
1943             VISA_VectorOpnd* S0L = GetSourceOperand(L0, m_encoderState.m_srcOperand[0]);
1944             VISA_VectorOpnd* S0H = GetSourceOperand(H0, m_encoderState.m_srcOperand[1]);
1945             VISA_VectorOpnd* S1L = GetSourceOperand(L1, m_encoderState.m_srcOperand[2]);
1946             // Negative `S0H`
1947             SModifier S1HMod = m_encoderState.m_srcOperand[1];
1948             IGC_ASSERT(S1HMod.mod == EMOD_NONE);
1949             S1HMod.mod = EMOD_NEG;
1950             VISA_VectorOpnd* S1H = GetSourceOperand(H1, S1HMod);
1951             VISA_VectorOpnd* L = GetDestinationOperand(Lo, m_encoderState.m_dstOperand);
1952             VISA_VectorOpnd* H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand);
1953             VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag);
1954 
1955             unsigned short NumElems = (ExecSize == 1) ? 1 : m_program->m_Platform->getAccChNumUD();
1956             CVariable* Carry = m_program->GetNewVariable(
1957                 NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(), CName(Lo->getName(), "Carry"));
1958             VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand);
1959 
1960             SModifier MidMod = m_encoderState.m_dstOperand;
1961             if (Lo->IsUniform() && NumElems != 1) {
1962                 MidMod.region[0] = 1;
1963                 MidMod.region[1] = 1;
1964                 MidMod.region[2] = 0;
1965                 MidMod.specialRegion = true;
1966             }
1967             VISA_VectorOpnd* HIn = GetSourceOperand(Hi, MidMod);
1968             // Negative `Acc0`
1969             SModifier AccMod = MidMod;
1970             IGC_ASSERT(AccMod.mod == EMOD_NONE);
1971             AccMod.mod = EMOD_NEG;
1972             VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, AccMod);
1973 
1974             VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo);
1975             V(vKernel->AppendVISATwoDstArithmeticInst(
1976                 ISA_SUBB, Pred, ExecMask, ExecSize,
1977                 L, AccOut, S0L, S1L));
1978             if (m_program->m_Platform->supportAdd3Instruction())
1979             {
1980                 H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand);
1981                 V(vKernel->AppendVISAArithmeticInst(
1982                     ISA_ADD3, Pred, false, ExecMask, ExecSize,
1983                     H, AccIn, S0H, S1H));
1984             }
1985             else
1986             {
1987                 V(vKernel->AppendVISAArithmeticInst(
1988                     ISA_ADD, Pred, false, ExecMask, ExecSize,
1989                     H, S0H, S1H));
1990                 H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand);
1991                 V(vKernel->AppendVISAArithmeticInst(
1992                     ISA_ADD, Pred, false, ExecMask, ExecSize,
1993                     H, AccIn, HIn));
1994             }
1995         }
1996     }
1997 
CarryBorrowArith(ISA_Opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1)1998     void CEncoder::CarryBorrowArith(ISA_Opcode opcode, CVariable* dst, CVariable* src0, CVariable* src1)
1999     {
2000         VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]);
2001         VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
2002         VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand);
2003         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2004         SModifier carryOperand = m_encoderState.m_dstOperand;
2005         VISA_Exec_Size execSize = GetAluExecSize(dst);
2006 
2007         switch (execSize)
2008         {
2009         case EXEC_SIZE_1:
2010             carryOperand.subReg += 1;
2011             break;
2012         case EXEC_SIZE_8:
2013             carryOperand.subVar += 1;
2014             break;
2015         case EXEC_SIZE_16:
2016             carryOperand.subVar += 2;
2017             break;
2018         default:
2019             IGC_ASSERT_MESSAGE(0, "Unknown execution size on carry-borrow-arith!");
2020             break;
2021         }
2022         VISA_VectorOpnd* carryBorrowOpnd = GetDestinationOperand(dst, carryOperand);
2023         IGC_ASSERT_MESSAGE(m_encoderState.m_dstOperand.mod == EMOD_NONE, "addc/subb doesn't support saturate");
2024 
2025         V(vKernel->AppendVISATwoDstArithmeticInst(
2026             opcode,
2027             predOpnd,
2028             GetAluEMask(dst),
2029             GetAluExecSize(dst),
2030             dstOpnd,
2031             carryBorrowOpnd,
2032             srcOpnd0,
2033             srcOpnd1));
2034     }
2035 
URBWrite(CVariable * src,const int payloadElementOffset,CVariable * offset,CVariable * urbHandle,CVariable * mask)2036     void CEncoder::URBWrite(
2037         CVariable* src,
2038         const int payloadElementOffset,
2039         CVariable* offset,
2040         CVariable* urbHandle,
2041         CVariable* mask)
2042     {
2043 
2044         IGC_ASSERT(nullptr != offset);
2045 
2046         VISA_EMask_Ctrl emask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask);
2047         VISA_Exec_Size execSize = visaExecSize(m_encoderState.m_simdSize);
2048         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2049         VISA_RawOpnd* handle = GetRawSource(urbHandle);
2050         // Two possible cases: offset may be constant (immediate) or runtime value.
2051         unsigned short immOffset = 0;
2052         VISA_RawOpnd* perSlotOffset = nullptr;
2053         if (offset->IsImmediate())
2054         {
2055             immOffset = int_cast<unsigned short>(offset->GetImmediateValue());
2056             V(vKernel->CreateVISANullRawOperand(perSlotOffset, false));
2057         }
2058         else
2059         {
2060             perSlotOffset = GetRawSource(offset);
2061         }
2062 
2063         // Three possible cases:
2064         // 1. Channel Mask is immediate value with 0xFF, so not needed to send
2065         // 2. Channel Mask is immediate value other than 0xFF, so needed to send, but as immediate value
2066         // 3. Channel Mask is not immediate value, so needed to send, but as not immediate value
2067         VISA_RawOpnd* channelMask = nullptr;
2068         unsigned char payloadSize = 0;
2069         if (!mask->IsImmediate())
2070         {
2071             channelMask = GetRawSource(mask);
2072             // All 4 elements will be send - we don't know which are masked out.
2073             payloadSize = 4;
2074         }
2075         else
2076         {
2077             unsigned int immChannelMask = int_cast<unsigned int>(mask->GetImmediateValue());
2078             URBChannelMask immMask(immChannelMask);
2079             if (immMask.isAllSet())
2080             {
2081                 V(vKernel->CreateVISANullRawOperand(channelMask, false));
2082             }
2083             else
2084             {
2085                 CVariable* tmpDst = m_program->GetNewVariable(8, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
2086                 VISA_VectorOpnd* movDst = nullptr;
2087                 V(vKernel->CreateVISADstOperand(movDst, GetVISAVariable(tmpDst), 1, 0, 0));
2088 
2089                 VISA_VectorOpnd* immSrc = nullptr;
2090                 V(vKernel->CreateVISAImmediate(immSrc, &immChannelMask, ISA_TYPE_UW));
2091 
2092                 V(vKernel->AppendVISADataMovementInst(
2093                     ISA_MOV, nullptr, false, emask,
2094                     EXEC_SIZE_8, movDst, immSrc));
2095                 V(vKernel->CreateVISARawOperand(channelMask, GetVISAVariable(tmpDst), 0));
2096             }
2097 
2098             payloadSize = int_cast<unsigned char>(immMask.size());
2099         }
2100 
2101         VISA_RawOpnd* vertexData = GetRawSource(src, payloadElementOffset);
2102 
2103         V(vKernel->AppendVISA3dURBWrite(
2104             predOpnd,
2105             emask,
2106             execSize,
2107             payloadSize,
2108             channelMask,
2109             immOffset,
2110             handle,
2111             perSlotOffset,
2112             vertexData));
2113     }
2114 
2115 
GetRawSource(CVariable * var,uint offset)2116     VISA_RawOpnd* CEncoder::GetRawSource(CVariable* var, uint offset)
2117     {
2118         VISA_RawOpnd* srcOpnd = nullptr;
2119         if (var)
2120         {
2121             if (var->IsImmediate())
2122             {
2123                 VISA_VectorOpnd* vecOpnd = nullptr;
2124                 uint immediate = int_cast<uint>(var->GetImmediateValue());
2125                 V(vKernel->CreateVISAImmediate(vecOpnd, &immediate, ISA_TYPE_UD));
2126                 srcOpnd = (VISA_RawOpnd*)vecOpnd;
2127             }
2128             else
2129             {
2130                 V(vKernel->CreateVISARawOperand(
2131                     srcOpnd,
2132                     GetVISAVariable(var),
2133                     int_cast<unsigned short>(offset + var->GetAliasOffset())));
2134             }
2135         }
2136         else
2137         {
2138             V(vKernel->CreateVISANullRawOperand(srcOpnd, false));
2139         }
2140         return srcOpnd;
2141     }
2142 
GetRawDestination(CVariable * var,unsigned offset)2143     VISA_RawOpnd* CEncoder::GetRawDestination(CVariable* var, unsigned offset)
2144     {
2145         VISA_RawOpnd* dstOpnd = nullptr;
2146         if (var)
2147         {
2148             V(vKernel->CreateVISARawOperand(
2149                 dstOpnd, GetVISAVariable(var),
2150                 m_encoderState.m_dstOperand.subVar * getGRFSize() + offset + var->GetAliasOffset()));
2151         }
2152         else
2153         {
2154             V(vKernel->CreateVISANullRawOperand(dstOpnd, true));
2155         }
2156         return dstOpnd;
2157     }
2158 
Send(CVariable * dst,CVariable * src,uint exDesc,CVariable * messDescriptor,bool isSendc)2159     void CEncoder::Send(CVariable* dst, CVariable* src, uint exDesc, CVariable* messDescriptor, bool isSendc)
2160     {
2161         if (dst && dst->IsUniform())
2162         {
2163             m_encoderState.m_simdSize = m_encoderState.m_uniformSIMDSize;
2164         }
2165         unsigned char sendc = isSendc ? 1 : 0;
2166         unsigned char srcSize = src->GetSize() / getGRFSize();
2167         unsigned char dstSize = dst ? dst->GetSize() / getGRFSize() : 0;
2168         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2169         VISA_RawOpnd* srcOpnd0 = GetRawSource(src);
2170         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
2171         VISA_VectorOpnd* desc = GetUniformSource(messDescriptor);
2172 
2173         V(vKernel->AppendVISAMiscRawSend(
2174             predOpnd,
2175             GetAluEMask(dst),
2176             visaExecSize(m_encoderState.m_simdSize),
2177             sendc,
2178             exDesc,
2179             srcSize,
2180             dstSize,
2181             desc,
2182             srcOpnd0,
2183             dstOpnd));
2184     }
2185 
Send(CVariable * dst,CVariable * src,uint ffid,CVariable * exDesc,CVariable * messDescriptor,bool isSendc)2186     void CEncoder::Send(CVariable* dst, CVariable* src, uint ffid, CVariable* exDesc, CVariable* messDescriptor, bool isSendc)
2187     {
2188         Sends(dst, src, nullptr, ffid, exDesc, messDescriptor, isSendc);
2189     }
2190 
Sends(CVariable * dst,CVariable * src0,CVariable * src1,uint ffid,CVariable * exDesc,CVariable * messDescriptor,bool isSendc,bool hasEOT)2191     void CEncoder::Sends(CVariable* dst, CVariable* src0, CVariable* src1, uint ffid, CVariable* exDesc, CVariable* messDescriptor, bool isSendc, bool hasEOT)
2192     {
2193         if (exDesc->IsImmediate() && src1 == nullptr)
2194         {
2195             Send(dst, src0, (uint)exDesc->GetImmediateValue(), messDescriptor, isSendc);
2196             return;
2197         }
2198         if (dst && dst->IsUniform())
2199         {
2200             m_encoderState.m_simdSize = m_encoderState.m_uniformSIMDSize;
2201         }
2202         unsigned char sendc = isSendc ? 1 : 0;
2203         unsigned char src0Size = src0->GetSize() / getGRFSize();
2204         unsigned char src1Size = src1 ? src1->GetSize() / getGRFSize() : 0;
2205         unsigned char dstSize = dst ? dst->GetSize() / getGRFSize() : 0;
2206         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2207         VISA_RawOpnd* srcOpnd0 = GetRawSource(src0);
2208         VISA_RawOpnd* srcOpnd1 = GetRawSource(src1);
2209         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
2210         VISA_VectorOpnd* exMessDesc = GetUniformSource(exDesc);
2211         VISA_VectorOpnd* desc = GetUniformSource(messDescriptor);
2212 
2213         V(vKernel->AppendVISAMiscRawSends(
2214             predOpnd,
2215             GetAluEMask(dst),
2216             visaExecSize(m_encoderState.m_simdSize),
2217             sendc,
2218             ffid,
2219             exMessDesc,
2220             src0Size,
2221             src1Size, // right now only one source
2222             dstSize,
2223             desc,
2224             srcOpnd0,
2225             srcOpnd1,
2226             dstOpnd,
2227             hasEOT));
2228     }
2229 
GetBTIOperand(uint bindingTableIndex)2230     VISA_StateOpndHandle* CEncoder::GetBTIOperand(uint bindingTableIndex)
2231     {
2232         IGC::e_predefSurface predDefSurface = ESURFACE_NORMAL;
2233         if (bindingTableIndex == 255)
2234             predDefSurface = ESURFACE_STATELESS;
2235         else if (bindingTableIndex == 254)
2236             predDefSurface = ESURFACE_SLM;
2237         CVariable tempImm(bindingTableIndex, ISA_TYPE_UD);
2238         return GetVISASurfaceOpnd(predDefSurface, &tempImm);
2239     }
2240 
RenderTargetWrite(CVariable * var[],bool isUndefined[],bool lastRenderTarget,bool isNullRT,bool perSample,bool coarseMode,bool headerMaskFromCe0,CVariable * bindingTableIndex,CVariable * RTIndex,CVariable * source0Alpha,CVariable * oMask,CVariable * depth,CVariable * stencil,CVariable * CPSCounter,CVariable * sampleIndex,CVariable * r1Reg)2241     void CEncoder::RenderTargetWrite(CVariable* var[],
2242         bool isUndefined[],
2243         bool lastRenderTarget,
2244         bool isNullRT,
2245         bool perSample,
2246         bool coarseMode,
2247         bool headerMaskFromCe0,
2248         CVariable* bindingTableIndex,
2249         CVariable* RTIndex,
2250         CVariable* source0Alpha,
2251         CVariable* oMask,
2252         CVariable* depth,
2253         CVariable* stencil,
2254         CVariable* CPSCounter,
2255         CVariable* sampleIndex,
2256         CVariable* r1Reg)
2257     {
2258         VISA_EMask_Ctrl emask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask);
2259         VISA_Exec_Size execSize = visaExecSize(m_encoderState.m_simdSize);
2260         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2261         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex);
2262 
2263         vISA_RT_CONTROLS cntrls;
2264         uint8_t numMsgSpecificOpnds = 0;
2265         VISA_RawOpnd* srcOpnd[8] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
2266 
2267         cntrls.isPerSample = perSample;
2268         cntrls.isCoarseMode = coarseMode;
2269         cntrls.isHeaderMaskfromCe0 = headerMaskFromCe0;
2270         IGC_ASSERT(!((predOpnd != nullptr) && cntrls.isHeaderMaskfromCe0));
2271 
2272         if (source0Alpha)
2273         {
2274             cntrls.s0aPresent = true;
2275             srcOpnd[numMsgSpecificOpnds++] = GetRawSource(source0Alpha);
2276         }
2277         else
2278             cntrls.s0aPresent = false;
2279 
2280         if (oMask)
2281         {
2282             cntrls.oMPresent = true;
2283             srcOpnd[numMsgSpecificOpnds++] = GetRawSource(oMask);
2284         }
2285         else
2286             cntrls.oMPresent = false;
2287 
2288         for (int i = 0; i < 4; i++)
2289         {
2290             if (isUndefined[i])
2291             {
2292                 V(vKernel->CreateVISANullRawOperand(srcOpnd[numMsgSpecificOpnds++], false));
2293             }
2294             else
2295             {
2296                 srcOpnd[numMsgSpecificOpnds++] = GetRawSource(var[i]);
2297             }
2298         }
2299 
2300         if (depth)
2301         {
2302             cntrls.zPresent = true;
2303             srcOpnd[numMsgSpecificOpnds++] = GetRawSource(depth);
2304         }
2305         else
2306             cntrls.zPresent = false;
2307 
2308         if (stencil)
2309         {
2310             cntrls.isStencil = true;
2311             srcOpnd[numMsgSpecificOpnds++] = GetRawSource(stencil);
2312         }
2313         else
2314             cntrls.isStencil = false;
2315 
2316         cntrls.isSampleIndex = false;
2317         VISA_VectorOpnd* sampleIndexOpnd = NULL;
2318         if (sampleIndex)
2319         {
2320             sampleIndexOpnd = GetSourceOperandNoModifier(sampleIndex);
2321             cntrls.isSampleIndex = true;
2322         }
2323         VISA_VectorOpnd* cpsCounterOpnd = GetSourceOperandNoModifier(CPSCounter);
2324 
2325         VISA_VectorOpnd* RTIndexOpnd = nullptr;
2326         cntrls.RTIndexPresent = false;
2327         // if RTIndex is 0, then no need to prepare the header for send
2328         if (!RTIndex->IsImmediate() || RTIndex->GetImmediateValue() != 0)
2329         {
2330             RTIndexOpnd = GetSourceOperandNoModifier(RTIndex);
2331             cntrls.RTIndexPresent = true;
2332         }
2333 
2334         //controls last render target select bit
2335         cntrls.isLastWrite = lastRenderTarget;
2336 
2337         // controls NULL render target enbale bit
2338         cntrls.isNullRT = isNullRT;
2339 
2340         //r1Reg should always be populated
2341         //vISA will decide whether to use it or not.
2342         VISA_RawOpnd* r1RegOpnd = GetRawSource(r1Reg);
2343 
2344 
2345         if (CPSCounter)
2346         {
2347             V(vKernel->AppendVISA3dRTWriteCPS(
2348                 predOpnd,
2349                 emask,
2350                 execSize,
2351                 RTIndexOpnd,
2352                 cntrls,
2353                 surfOpnd,
2354                 r1RegOpnd,
2355                 sampleIndexOpnd,
2356                 cpsCounterOpnd,
2357                 numMsgSpecificOpnds,
2358                 srcOpnd));
2359         }
2360         else
2361         {
2362             V(vKernel->AppendVISA3dRTWrite(
2363                 predOpnd,
2364                 emask,
2365                 execSize,
2366                 RTIndexOpnd,
2367                 cntrls,
2368                 surfOpnd,
2369                 r1RegOpnd,
2370                 sampleIndexOpnd,
2371                 numMsgSpecificOpnds,
2372                 srcOpnd));
2373         }
2374     }
2375 
GetSamplerOperand(const SamplerDescriptor & sampler,bool & isIdxLT16)2376     VISA_StateOpndHandle* CEncoder::GetSamplerOperand(
2377         const SamplerDescriptor& sampler,
2378         bool& isIdxLT16)
2379     {
2380         //Sampler index
2381         VISA_VectorOpnd* dstOpnd = nullptr;
2382         VISA_SamplerVar* samplerVar = nullptr;
2383 
2384         if (sampler.m_samplerType == ESAMPLER_NORMAL)
2385         {
2386             samplerVar = samplervar;
2387 
2388             if (sampler.m_sampler->IsImmediate())
2389             {
2390                 uint immediate = int_cast<uint>(sampler.m_sampler->GetImmediateValue());
2391                 if (immediate < 16)
2392                 {
2393                     isIdxLT16 = true;
2394                 }
2395                 else
2396                 {
2397                     isIdxLT16 = false;
2398                 }
2399             }
2400             else
2401             {
2402                 // for dynamic index, avoid generate additional code for APIs only supporting 16 samplers
2403                 if (m_program->GetContext()->m_DriverInfo.SupportMoreThan16Samplers())
2404                 {
2405                     isIdxLT16 = false;
2406                 }
2407                 else
2408                 {
2409                     isIdxLT16 = true;
2410                 }
2411             }
2412         }
2413         else
2414         {
2415             V(vKernel->GetBindlessSampler(samplerVar));
2416             isIdxLT16 = true;
2417         }
2418 
2419         V(vKernel->CreateVISAStateOperand(dstOpnd, samplerVar, 0, true));
2420 
2421         IGC_ASSERT(nullptr != sampler.m_sampler);
2422         IGC_ASSERT(sampler.m_sampler->IsUniform());
2423         VISA_VectorOpnd* sourecOpnd = GetUniformSource(sampler.m_sampler);
2424 
2425         //Add the mov special instruction for sampler
2426         V(vKernel->AppendVISADataMovementInst(
2427             ISA_MOVS,
2428             nullptr,
2429             false,
2430             vISA_EMASK_M1_NM,
2431             EXEC_SIZE_1,
2432             dstOpnd,
2433             sourecOpnd,
2434             nullptr));
2435 
2436         VISA_StateOpndHandle* samplerOpnd = nullptr;
2437         V(vKernel->CreateVISAStateOperandHandle(samplerOpnd, samplerVar));
2438         return samplerOpnd;
2439     }
2440 
GetSamplerOperand(CVariable * samplerIndex)2441     VISA_StateOpndHandle* CEncoder::GetSamplerOperand(CVariable* samplerIndex)
2442     {
2443         SamplerDescriptor sampler;
2444         bool isIdxLT16;
2445         sampler.m_sampler = samplerIndex;
2446         return GetSamplerOperand(sampler, isIdxLT16);
2447     }
2448 
Sample(EOPCODE subOpcode,uint writeMask,CVariable * offset,const ResourceDescriptor & resource,const SamplerDescriptor & sampler,uint numSources,CVariable * dst,SmallVector<CVariable *,4> & payload,bool zeroLOD,bool cpsEnable,bool feedbackEnable,bool nonUniformState)2449     void CEncoder::Sample(
2450         EOPCODE subOpcode,
2451         uint writeMask,
2452         CVariable* offset,
2453         const ResourceDescriptor& resource,
2454         const SamplerDescriptor& sampler,
2455         uint numSources,
2456         CVariable* dst,
2457         SmallVector<CVariable*, 4>& payload,
2458         bool zeroLOD,
2459         bool cpsEnable,
2460         bool feedbackEnable,
2461         bool nonUniformState)
2462     {
2463 
2464         if (!m_program->m_Platform->hasSamplerSupport())
2465             return;
2466 
2467         int numMsgSpecificOpnds = numSources;
2468         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2469         bool isIdxLT16;
2470         VISA_StateOpndHandle* samplerOpnd = GetSamplerOperand(sampler, isIdxLT16);
2471         VISA_StateOpndHandle* btiOpnd = GetVISASurfaceOpnd(resource);
2472         VISA_RawOpnd* dstVar = GetRawDestination(dst);
2473         VISA_RawOpnd* opndArray[11];
2474         for (int i = 0; i < numMsgSpecificOpnds; i++)
2475         {
2476             opndArray[i] = GetRawSource(payload[i]);
2477         }
2478 
2479         VISA_VectorOpnd* aoffimmi = GetSourceOperandNoModifier(offset);
2480         // Use bit 15 of aoffimmi to tell VISA the sample index could be greater
2481         // than 15.  In this case, we need to use msg header, and setup M0.3
2482         // to point to next 16 sampler state.
2483         if (!isIdxLT16)
2484         {
2485             uint16_t aoffimmiVal = (uint16_t)offset->GetImmediateValue() | BIT(15);
2486             V(vKernel->CreateVISAImmediate(aoffimmi, &aoffimmiVal, ISA_TYPE_UW));
2487         }
2488 
2489         {
2490             int status = vKernel->AppendVISA3dSampler(
2491                 ConvertSubOpcode(subOpcode, zeroLOD),
2492                 feedbackEnable, // pixel null mask
2493                 cpsEnable,
2494                 !nonUniformState,
2495                 predOpnd,
2496                 GetAluEMask(dst),
2497                 visaExecSize(m_encoderState.m_simdSize),
2498                 ConvertChannelMaskToVisaType(writeMask),
2499                 aoffimmi,
2500                 samplerOpnd,
2501                 btiOpnd,
2502                 dstVar,
2503                 numSources,
2504                 opndArray);
2505 
2506             V(status);
2507         }
2508     }
2509 
Load(EOPCODE subOpcode,uint writeMask,CVariable * offset,const ResourceDescriptor & resource,uint numSources,CVariable * dst,SmallVector<CVariable *,4> & payload,bool zeroLOD,bool feedbackEnable)2510     void CEncoder::Load(
2511         EOPCODE subOpcode,
2512         uint writeMask,
2513         CVariable* offset,
2514         const ResourceDescriptor& resource,
2515         uint numSources,
2516         CVariable* dst,
2517         SmallVector<CVariable*, 4>& payload,
2518         bool zeroLOD,
2519         bool feedbackEnable)
2520     {
2521 
2522         if (!m_program->m_Platform->hasSamplerSupport())
2523             return;
2524 
2525         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2526         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource);
2527         VISA_RawOpnd* dstVar = GetRawDestination(dst);
2528 
2529         VISA_RawOpnd* opndArray[11];
2530         for (unsigned int i = 0; i < numSources; i++)
2531         {
2532             opndArray[i] = GetRawSource(payload[i]);
2533         }
2534 
2535         VISA_VectorOpnd* aoffimmi = GetSourceOperandNoModifier(offset);
2536 
2537         {
2538             int status = vKernel->AppendVISA3dLoad(
2539                 ConvertSubOpcode(subOpcode, zeroLOD),
2540                 feedbackEnable, // pixel null mask
2541                 predOpnd,
2542                 GetAluEMask(dst),
2543                 GetAluExecSize(dst),
2544                 ConvertChannelMaskToVisaType(writeMask),
2545                 aoffimmi,
2546                 surfOpnd,
2547                 dstVar,
2548                 numSources,
2549                 opndArray);
2550 
2551             V(status);
2552         }
2553     }
2554 
Info(EOPCODE subOpcode,uint writeMask,const ResourceDescriptor & resource,CVariable * lod,CVariable * dst)2555     void CEncoder::Info(EOPCODE subOpcode, uint writeMask, const ResourceDescriptor& resource, CVariable* lod, CVariable* dst)
2556     {
2557         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource);
2558         VISA_RawOpnd* dstVar = GetRawDestination(dst);
2559         VISA_RawOpnd* lodVar = GetRawSource(lod);
2560 
2561         V(vKernel->AppendVISA3dInfo(
2562             ConvertSubOpcode(subOpcode, false),
2563             GetAluEMask(dst),
2564             GetAluExecSize(dst),
2565             ConvertChannelMaskToVisaType(writeMask),
2566             surfOpnd,
2567             lodVar,
2568             dstVar));
2569     }
2570 
Gather4Inst(EOPCODE subOpcode,CVariable * offset,const ResourceDescriptor & resource,const SamplerDescriptor & sampler,uint numSources,CVariable * dst,SmallVector<CVariable *,4> & payload,uint channel,bool feedbackEnable)2571     void CEncoder::Gather4Inst(
2572         EOPCODE subOpcode,
2573         CVariable* offset,
2574         const ResourceDescriptor& resource,
2575         const SamplerDescriptor& sampler,
2576         uint numSources,
2577         CVariable* dst,
2578         SmallVector<CVariable*, 4>& payload,
2579         uint channel,
2580         bool feedbackEnable)
2581     {
2582 
2583         if (!m_program->m_Platform->hasSamplerSupport())
2584             return;
2585 
2586         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2587         bool isIdxLT16;
2588         VISA_StateOpndHandle* samplerOpnd = GetSamplerOperand(sampler, isIdxLT16);
2589         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource);
2590         VISA_RawOpnd* dstVar = GetRawDestination(dst);
2591         VISA_RawOpnd* opndArray[11];
2592         for (unsigned int i = 0; i < numSources; i++)
2593         {
2594             opndArray[i] = GetRawSource(payload[i]);
2595         }
2596 
2597         VISA_VectorOpnd* aoffimmi = GetSourceOperandNoModifier(offset);
2598         if (!isIdxLT16)
2599         {
2600             uint16_t aoffimmiVal = (uint16_t)offset->GetImmediateValue() | BIT(15);
2601             V(vKernel->CreateVISAImmediate(aoffimmi, &aoffimmiVal, ISA_TYPE_UW));
2602         }
2603 
2604         {
2605             int status = vKernel->AppendVISA3dGather4(
2606                 ConvertSubOpcode(subOpcode, false),
2607                 feedbackEnable, // pixel null mask
2608                 predOpnd,
2609                 GetAluEMask(dst),
2610                 visaExecSize(m_encoderState.m_simdSize),
2611                 ConvertSingleSourceChannel(channel),
2612                 aoffimmi,
2613                 samplerOpnd,
2614                 surfOpnd,
2615                 dstVar,
2616                 numSources,
2617                 opndArray);
2618 
2619             V(status);
2620         }
2621     }
2622 
AddrAdd(CVariable * dst,CVariable * src0,CVariable * src1)2623     void CEncoder::AddrAdd(CVariable* dst, CVariable* src0, CVariable* src1)
2624     {
2625         // On ICL+ platforms address register must be initialized if it is used
2626         // in VxH indirect addressing to avoid out-of-bounds access on inactive
2627         // lanes. VISA initializes address register at the beginning of the
2628         // shader which is sufficient for shaders that use address register only
2629         // for indirect addressing but is not sufficient if shader also uses
2630         // address register in send descriptors. The latter case is handled by
2631         // the initialization below.
2632         // see VISA Optimizer::resetA0()
2633         const bool mayUseA0InSendDesc =
2634             m_program->GetContext()->m_instrTypes.mayHaveIndirectResources;
2635         const bool needsA0Reset =
2636             m_program->m_Platform->NeedResetA0forVxHA0();
2637 
2638         if (((mayUseA0InSendDesc && needsA0Reset) ||
2639             IGC_IS_FLAG_ENABLED(InitializeAddressRegistersBeforeUse)) &&
2640             !dst->IsUniform() &&
2641             !m_encoderState.m_noMask)
2642         {
2643             m_encoderState.m_noMask = true;
2644             VISA_VectorOpnd* srcOpnd = nullptr;
2645             VISA_VectorOpnd* dstOpnd = nullptr;
2646             const DWORD zero = 0;
2647             V(vKernel->CreateVISAImmediate(srcOpnd, &zero, ISA_TYPE_UW));
2648             V(vKernel->CreateVISAAddressDstOperand(dstOpnd, dst->visaAddrVariable, 0));
2649             V(vKernel->AppendVISADataMovementInst(
2650                 ISA_MOV,
2651                 nullptr,
2652                 false,
2653                 GetAluEMask(dst),
2654                 visaExecSize(m_encoderState.m_simdSize),
2655                 dstOpnd,
2656                 srcOpnd));
2657             m_encoderState.m_noMask = false;
2658         }
2659 
2660         if (dst->IsUniform())
2661         {
2662             m_encoderState.m_simdSize = SIMDMode::SIMD1;
2663             m_encoderState.m_noMask = true;
2664         }
2665         VISA_VectorOpnd* pSrc1Opnd = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]);
2666         VISA_VectorOpnd* pSrc0Addr = nullptr;
2667         V(vKernel->CreateVISAAddressOfOperand(pSrc0Addr, GetVISAVariable(src0), src0->GetAliasOffset()));
2668         VISA_VectorOpnd* pVectorOpnd = nullptr;
2669         V(vKernel->CreateVISAAddressDstOperand(pVectorOpnd, dst->visaAddrVariable, 0));
2670 
2671         V(vKernel->AppendVISAAddrAddInst(
2672             GetAluEMask(dst),
2673             visaExecSize(m_encoderState.m_simdSize),
2674             pVectorOpnd,
2675             pSrc0Addr,
2676             pSrc1Opnd));
2677     }
2678 
Barrier(e_barrierKind BarrierKind)2679     void CEncoder::Barrier(e_barrierKind BarrierKind)
2680     {
2681         if (BarrierKind == EBARRIER_SIGNAL) {
2682             // signal only
2683             V(vKernel->AppendVISASplitBarrierInst(true));
2684             return;
2685         }
2686         if (BarrierKind == EBARRIER_WAIT) {
2687             // wait only
2688             V(vKernel->AppendVISASplitBarrierInst(false));
2689             return;
2690         }
2691         V(vKernel->AppendVISASyncInst(ISA_BARRIER));
2692     }
2693 
Fence(bool CommitEnable,bool L3_Flush_RW_Data,bool L3_Flush_Constant_Data,bool L3_Flush_Texture_Data,bool L3_Flush_Instructions,bool Global_Mem_Fence,bool L1_Flush_Constant_Data,bool SWFence)2694     void CEncoder::Fence(bool CommitEnable,
2695         bool L3_Flush_RW_Data,
2696         bool L3_Flush_Constant_Data,
2697         bool L3_Flush_Texture_Data,
2698         bool L3_Flush_Instructions,
2699         bool Global_Mem_Fence,
2700         bool L1_Flush_Constant_Data,
2701         bool SWFence) // if true no ISA is emitted and the instruction is a pure code barrier
2702     {
2703         // Only a single bit set here is a valid configuration
2704         IGC_ASSERT((L3_Flush_Instructions + L3_Flush_Texture_Data + L3_Flush_Constant_Data + L3_Flush_RW_Data) <= 1);
2705 
2706         uint fenceFlags = (L3_Flush_Instructions << 1) |
2707             (L3_Flush_Texture_Data << 2) |
2708             (L3_Flush_Constant_Data << 3) |
2709             (L3_Flush_RW_Data << 4) |
2710             ((!Global_Mem_Fence) << 5) | // bit 5: 1 -- local, 0 -- global
2711             (L1_Flush_Constant_Data << 6) |
2712             (SWFence << 7) |
2713             (CommitEnable << 0);
2714 
2715         V(vKernel->AppendVISASyncInst(ISA_FENCE, int_cast<unsigned char>(fenceFlags)));
2716     }
2717 
FlushSamplerCache()2718     void CEncoder::FlushSamplerCache()
2719     {
2720         V(vKernel->AppendVISASyncInst(ISA_SAMPLR_CACHE_FLUSH));
2721     }
2722 
EOT()2723     void CEncoder::EOT()
2724     {
2725         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
2726         V(vKernel->AppendVISACFRetInst(predOpnd, vISA_EMASK_M1, EXEC_SIZE_1));
2727     }
2728 
2729     // Init Control register for denorm modes, rounding modes, etc.
initCR(VISAKernel * vKernel)2730     void CEncoder::initCR(VISAKernel* vKernel)
2731     {
2732         // Those bits must be zero'ed on entry to kernel/shader.
2733         // (If not, this function needs to be changed accordingly.)
2734         VISA_VectorOpnd* src0_Opnd = nullptr;
2735         VISA_VectorOpnd* src1_Opnd = nullptr;
2736         VISA_VectorOpnd* dst_Opnd = nullptr;
2737         VISA_GenVar* cr0_var = nullptr;
2738         uint imm_data = 0;
2739 
2740         CodeGenContext* pCtx = m_program->GetContext();
2741         if (pCtx->m_floatDenormMode16 == FLOAT_DENORM_RETAIN)
2742             imm_data |= 0x400;
2743         if (pCtx->m_floatDenormMode32 == FLOAT_DENORM_RETAIN)
2744             imm_data |= 0x80;
2745         if (pCtx->m_floatDenormMode64 == FLOAT_DENORM_RETAIN)
2746             imm_data |= 0x40;
2747 
2748         uint RM_bits = 0;
2749         ERoundingMode RM_FPCvtInt = static_cast<ERoundingMode>(pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode);
2750         ERoundingMode RM_FP = static_cast<ERoundingMode>(pCtx->getModuleMetaData()->compOpt.FloatRoundingMode);
2751         if (RM_FPCvtInt == ERoundingMode::ROUND_TO_ZERO) {
2752             // No need to set FPCvtInt, just need to set FP RM.
2753             RM_bits = getEncoderRoundingMode_FP(RM_FP);
2754         }
2755         else if (RM_FPCvtInt == RM_FP) {
2756             // Setting FPCvtInt will set both FPCvtInt and FP
2757             RM_bits = getEncoderRoundingMode_FPCvtInt(RM_FPCvtInt);
2758         }
2759         else {
2760             IGC_ASSERT_MESSAGE(0, "Unsupport combination of default rounding mode (FP and FPCvtInt)!");
2761         }
2762         imm_data |= RM_bits;
2763 
2764         // If we are in the default mode no need to set the CR
2765         if (imm_data != 0)
2766         {
2767             V(vKernel->GetPredefinedVar(cr0_var, PREDEFINED_CR0));
2768             V(vKernel->CreateVISASrcOperand(src0_Opnd, cr0_var, MODIFIER_NONE, 0, 1, 0, 0, 0));
2769             V(vKernel->CreateVISAImmediate(src1_Opnd, &imm_data, ISA_TYPE_UD));
2770             V(vKernel->CreateVISADstOperand(dst_Opnd, cr0_var, 1, 0, 0));
2771             V(vKernel->AppendVISAArithmeticInst(
2772                 ISA_OR,
2773                 nullptr,
2774                 false,
2775                 vISA_EMASK_M1_NM,
2776                 EXEC_SIZE_1,
2777                 dst_Opnd,
2778                 src0_Opnd,
2779                 src1_Opnd));
2780         }
2781     }
2782 
SetVectorMask(bool VMask)2783     void CEncoder::SetVectorMask(bool VMask)
2784     {
2785         VISA_VectorOpnd* src0_Opnd = nullptr;
2786         VISA_VectorOpnd* src1_Opnd = nullptr;
2787         VISA_VectorOpnd* dst_Opnd = nullptr;
2788         VISA_GenVar* cr0_var = nullptr;
2789         uint bitmaskImm = 1 << 3;
2790         if (!VMask)
2791         {
2792             bitmaskImm = ~bitmaskImm;
2793         }
2794         V(vKernel->GetPredefinedVar(cr0_var, PREDEFINED_CR0));
2795         V(vKernel->CreateVISASrcOperand(src0_Opnd, cr0_var, MODIFIER_NONE, 0, 1, 0, 0, 0));
2796         V(vKernel->CreateVISAImmediate(src1_Opnd, &bitmaskImm, ISA_TYPE_UD));
2797         V(vKernel->CreateVISADstOperand(dst_Opnd, cr0_var, 1, 0, 0));
2798         V(vKernel->AppendVISAArithmeticInst(
2799             VMask ? ISA_OR : ISA_AND,
2800             nullptr,
2801             false,
2802             vISA_EMASK_M1_NM,
2803             EXEC_SIZE_1,
2804             dst_Opnd,
2805             src0_Opnd,
2806             src1_Opnd));
2807     }
2808 
SetRoundingMode_FP(ERoundingMode actualRM,ERoundingMode newRM)2809     void CEncoder::SetRoundingMode_FP(ERoundingMode actualRM, ERoundingMode newRM)
2810     {
2811         IGC_ASSERT_MESSAGE(newRM != ERoundingMode::ROUND_TO_ANY, "Invalid rounding mode");
2812         if (actualRM != newRM)
2813         {
2814             RMEncoding actualRM_en = getEncoderRoundingMode_FP(actualRM);
2815             RMEncoding newRM_en = getEncoderRoundingMode_FP(newRM);
2816             SetRoundingMode(actualRM_en, newRM_en);
2817         }
2818     }
2819 
SetRoundingMode_FPCvtInt(ERoundingMode actualRM,ERoundingMode newRM)2820     void CEncoder::SetRoundingMode_FPCvtInt(ERoundingMode actualRM, ERoundingMode newRM)
2821     {
2822         IGC_ASSERT_MESSAGE(newRM != ERoundingMode::ROUND_TO_ANY, "Invalid rounding mode");
2823         if (actualRM != newRM)
2824         {
2825             RMEncoding actualRM_en = getEncoderRoundingMode_FPCvtInt(actualRM);
2826             RMEncoding newRM_en = getEncoderRoundingMode_FPCvtInt(newRM);
2827             SetRoundingMode(actualRM_en, newRM_en);
2828         }
2829     }
2830 
2831     // Set rounding mode based on given encoding.
SetRoundingMode(RMEncoding actualRM,RMEncoding newRM)2832     void CEncoder::SetRoundingMode(RMEncoding actualRM, RMEncoding newRM)
2833     {
2834         IGC_ASSERT_MESSAGE((actualRM != newRM), "Only setting RM if the new RM is different from the current RM!");
2835 
2836         VISA_VectorOpnd* src0_Opnd = nullptr;
2837         VISA_VectorOpnd* src1_Opnd = nullptr;
2838         VISA_VectorOpnd* dst_Opnd = nullptr;
2839         VISA_GenVar* cr0_var = nullptr;
2840         uint roundingMode = actualRM ^ newRM;
2841         IGC_ASSERT(nullptr != vKernel);
2842         V(vKernel->GetPredefinedVar(cr0_var, PREDEFINED_CR0));
2843         V(vKernel->CreateVISASrcOperand(src0_Opnd, cr0_var, MODIFIER_NONE, 0, 1, 0, 0, 0));
2844         V(vKernel->CreateVISAImmediate(src1_Opnd, &roundingMode, ISA_TYPE_UD));
2845         V(vKernel->CreateVISADstOperand(dst_Opnd, cr0_var, 1, 0, 0));
2846         V(vKernel->AppendVISAArithmeticInst(
2847             ISA_XOR,
2848             nullptr,
2849             false,
2850             vISA_EMASK_M1_NM,
2851             EXEC_SIZE_1,
2852             dst_Opnd,
2853             src0_Opnd,
2854             src1_Opnd));
2855     }
2856 
getEncoderRoundingMode_FP(ERoundingMode FP_RM)2857     CEncoder::RMEncoding CEncoder::getEncoderRoundingMode_FP(ERoundingMode FP_RM)
2858     {
2859         switch (FP_RM) {
2860         default:
2861             break;
2862         case ROUND_TO_POSITIVE:
2863             return RMEncoding::RoundToPositive;
2864         case ROUND_TO_NEGATIVE:
2865             return RMEncoding::RoundToNegative;
2866         case ROUND_TO_ZERO:
2867             return RMEncoding::RoundToZero;
2868         }
2869         return RMEncoding::RoundToNearestEven;
2870     }
2871 
getEncoderRoundingMode_FPCvtInt(ERoundingMode FCvtI_RM)2872     CEncoder::RMEncoding CEncoder::getEncoderRoundingMode_FPCvtInt(ERoundingMode FCvtI_RM)
2873     {
2874         switch (FCvtI_RM) {
2875         default:
2876             break;
2877         case ROUND_TO_NEAREST_EVEN:
2878             return RMEncoding::RoundToNearestEven_int;
2879         case ROUND_TO_POSITIVE:
2880             return RMEncoding::RoundToPositive_int;
2881         case ROUND_TO_NEGATIVE:
2882             return RMEncoding::RoundToNegative_int;
2883         }
2884         return RMEncoding::RoundToZero_int;
2885     }
2886 
GetLabel(uint label)2887     VISA_LabelOpnd* CEncoder::GetLabel(uint label)
2888     {
2889         VISA_LabelOpnd* visaLabel = labelMap[label];
2890         if (visaLabel == nullptr)
2891         {
2892             // all blocks should have labels; but new blocks inserted during
2893             // encoding might not
2894             VISA_Label_Kind kind = LABEL_BLOCK;
2895 
2896             std::stringstream lbl;
2897             if (labelNameMap[label].empty()) {
2898                 lbl << CreateShortLabel(labelCounter++);
2899             } else {
2900                 lbl << labelNameMap[label].getVisaCString();
2901             }
2902             V(vKernel->CreateVISALabelVar(visaLabel, lbl.str().c_str(), kind));
2903             labelMap[label] = visaLabel;
2904         }
2905         return visaLabel;
2906     }
2907 
GetStackFunction(llvm::Function * F)2908     VISAFunction* CEncoder::GetStackFunction(llvm::Function* F)
2909     {
2910         auto Iter = stackFuncMap.find(F);
2911         if (Iter != stackFuncMap.end())
2912         {
2913             return Iter->second;
2914         }
2915         VISAFunction* visaFunc = nullptr;
2916         V(vbuilder->AddFunction(visaFunc, F->getName().data()));
2917         stackFuncMap[F] = visaFunc;
2918         return visaFunc;
2919     }
2920 
GetFuncLabel(llvm::Function * F)2921     VISA_LabelOpnd* CEncoder::GetFuncLabel(llvm::Function* F)
2922     {
2923         auto Iter = funcLabelMap.find(F);
2924         if (Iter != funcLabelMap.end())
2925         {
2926             return Iter->second;
2927         }
2928 
2929         // Create a new function label.
2930         VISA_LabelOpnd* visaLabel = nullptr;
2931         V(vKernel->CreateVISALabelVar(visaLabel, F->getName().data(), LABEL_SUBROUTINE));
2932         funcLabelMap[F] = visaLabel;
2933 
2934         return visaLabel;
2935     }
2936 
Push()2937     void CEncoder::Push()
2938     {
2939         Init();
2940     }
2941 
GetUniformSource(CVariable * var)2942     VISA_VectorOpnd* CEncoder::GetUniformSource(CVariable* var)
2943     {
2944         VISA_VectorOpnd* srcOperand = nullptr;
2945         if (var == nullptr)
2946         {
2947             return nullptr;
2948         }
2949         if (var->IsImmediate())
2950         {
2951             // TODO: need support for 64 bits immediate
2952             uint immediate = int_cast<uint>(var->GetImmediateValue());
2953             V(vKernel->CreateVISAImmediate(srcOperand, &immediate, ISA_TYPE_UD));
2954         }
2955         else
2956         {
2957             unsigned char rowOffset = 0;
2958             unsigned char colOffset = 0;
2959             GetRowAndColOffset(var, 0, 0, rowOffset, colOffset);
2960             V(vKernel->CreateVISASrcOperand(srcOperand, GetVISAVariable(var), MODIFIER_NONE, 0, 1, 0, rowOffset, colOffset));
2961         }
2962         return srcOperand;
2963     }
2964 
GetVISAPlatform(const CPlatform * platform)2965     TARGET_PLATFORM GetVISAPlatform(const CPlatform* platform)
2966     {
2967         switch (platform->GetPlatformFamily())
2968         {
2969         case IGFX_GEN8_CORE:
2970             if (platform->getPlatformInfo().eProductFamily == IGFX_CHERRYVIEW)
2971             {
2972                 return GENX_CHV;
2973             }
2974             else
2975             {
2976                 return GENX_BDW;
2977             }
2978             // fall-through
2979         case IGFX_GEN9_CORE:
2980         case IGFX_GENNEXT_CORE:
2981             if (platform->getPlatformInfo().eProductFamily == IGFX_BROXTON ||
2982                 platform->getPlatformInfo().eProductFamily == IGFX_GEMINILAKE)
2983             {
2984                 return GENX_BXT;
2985             }
2986             else
2987             {
2988                 return GENX_SKL;
2989             }
2990             // fall-through
2991         case IGFX_GEN11_CORE:
2992             return GENX_ICLLP;
2993         case IGFX_GEN12_CORE:
2994         case IGFX_XE_HP_CORE:
2995         case IGFX_GEN12LP_CORE:
2996             if (   platform->getPlatformInfo().eProductFamily == IGFX_TIGERLAKE_LP
2997                 || platform->getPlatformInfo().eProductFamily == IGFX_DG1
2998                 || platform->getPlatformInfo().eProductFamily == IGFX_ROCKETLAKE
2999                 || platform->getPlatformInfo().eProductFamily == IGFX_ALDERLAKE_S
3000                 || platform->getPlatformInfo().eProductFamily == IGFX_ALDERLAKE_P
3001                )
3002             {
3003                 return GENX_TGLLP;
3004             }
3005             else if (platform->getPlatformInfo().eProductFamily == IGFX_XE_HP_SDV)
3006             {
3007                 return XeHP_SDV;
3008             }
3009             // fall-through
3010         default:
3011             IGC_ASSERT_MESSAGE(0, "unsupported platform");
3012             break;
3013         }
3014         return GENX_SKL;
3015     }
3016 
OWLoad(CVariable * dst,const ResourceDescriptor & resource,CVariable * src0,bool owordAligned,uint bytesToBeRead,uint dstOffset)3017     void CEncoder::OWLoad(CVariable* dst, const ResourceDescriptor& resource, CVariable* src0, bool owordAligned, uint bytesToBeRead, uint dstOffset)
3018     {
3019         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource);
3020         VISA_VectorOpnd* offset = GetUniformSource(src0);
3021         VISA_RawOpnd* dstVar = GetRawDestination(dst, dstOffset);
3022         uint size = (bytesToBeRead / SIZE_OWORD);
3023 
3024         V(vKernel->AppendVISASurfAccessOwordLoadStoreInst(
3025             owordAligned ? ISA_OWORD_LD : ISA_OWORD_LD_UNALIGNED,
3026             vISA_EMASK_M1_NM,  // OWord load is always nomask
3027             surfOpnd,
3028             ConvertSizeToVisaType(size),
3029             offset,
3030             dstVar));
3031     }
3032 
OWStore(CVariable * data,e_predefSurface surfaceType,CVariable * bufId,CVariable * src0,uint bytesToBeRead,uint srcOffset)3033     void CEncoder::OWStore(CVariable* data, e_predefSurface surfaceType, CVariable* bufId, CVariable* src0, uint bytesToBeRead, uint srcOffset)
3034     {
3035         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(surfaceType, bufId);
3036         VISA_VectorOpnd* offset = GetUniformSource(src0);
3037         VISA_RawOpnd* dataVar = GetRawSource(data, srcOffset);
3038         uint size = (bytesToBeRead / SIZE_OWORD);
3039 
3040         V(vKernel->AppendVISASurfAccessOwordLoadStoreInst(
3041             ISA_OWORD_ST,
3042             vISA_EMASK_M1_NM,
3043             surfOpnd,
3044             ConvertSizeToVisaType(size),
3045             offset,
3046             dataVar));
3047         if (ESURFACE_STATELESS == surfaceType)
3048         {
3049             this->m_program->IncStatelessWritesCount();
3050         }
3051     }
3052 
OWStoreA64(CVariable * data,CVariable * src0,uint bytesToBeRead,uint srcOffset)3053     void CEncoder::OWStoreA64(CVariable* data, CVariable* src0, uint bytesToBeRead, uint srcOffset)
3054     {
3055         VISA_VectorOpnd* offset = GetUniformSource(src0);
3056         VISA_RawOpnd* dataVar = GetRawDestination(data, srcOffset);
3057         uint size = (bytesToBeRead / SIZE_OWORD);
3058 
3059         V(vKernel->AppendVISASvmBlockStoreInst(
3060             ConvertSizeToVisaType(size),
3061             true,   // always unaligned for now
3062             offset,
3063             dataVar));
3064     }
3065 
OWLoadA64(CVariable * dst,CVariable * src0,uint bytesToBeRead,uint dstOffset)3066     void CEncoder::OWLoadA64(CVariable* dst, CVariable* src0, uint bytesToBeRead, uint dstOffset)
3067     {
3068         VISA_VectorOpnd* offset = GetUniformSource(src0);
3069         VISA_RawOpnd* dstVar = GetRawDestination(dst, dstOffset);
3070         uint size = (bytesToBeRead / SIZE_OWORD);
3071 
3072         V(vKernel->AppendVISASvmBlockLoadInst(
3073             ConvertSizeToVisaType(size),
3074             true,   // always unaligned for now
3075             offset,
3076             dstVar));
3077     }
3078 
MediaBlockMessage(ISA_Opcode subOpcode,CVariable * dst,e_predefSurface surfaceType,CVariable * bufId,CVariable * xOffset,CVariable * yOffset,uint modifier,unsigned char blockWidth,unsigned char blockHeight,uint plane)3079     void CEncoder::MediaBlockMessage(
3080         ISA_Opcode subOpcode,
3081         CVariable* dst,
3082         e_predefSurface surfaceType,
3083         CVariable* bufId,
3084         CVariable* xOffset,
3085         CVariable* yOffset,
3086         uint modifier,
3087         unsigned char blockWidth,
3088         unsigned char blockHeight,
3089         uint plane)
3090     {
3091         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(surfaceType, bufId);
3092         VISA_VectorOpnd* xVar = GetUniformSource(xOffset);
3093         VISA_VectorOpnd* yVar = GetUniformSource(yOffset);
3094         VISA_RawOpnd* tempVar = nullptr;
3095         if (subOpcode == ISA_MEDIA_LD)
3096         {
3097             tempVar = GetRawDestination(dst);
3098         }
3099         else if (subOpcode == ISA_MEDIA_ST)
3100         {
3101             tempVar = GetRawSource(dst);
3102         }
3103 
3104         MEDIA_LD_mod  modi = (MEDIA_LD_mod)modifier;
3105         CISA_PLANE_ID planeVar = (CISA_PLANE_ID)plane;
3106 
3107         V(vKernel->AppendVISASurfAccessMediaLoadStoreInst(
3108             subOpcode,
3109             modi,
3110             surfOpnd,
3111             blockWidth,
3112             blockHeight,
3113             xVar,
3114             yVar,
3115             tempVar,
3116             planeVar));
3117     }
3118 
TypedReadWrite(ISA_Opcode opcode,const ResourceDescriptor & resource,CVariable * pU,CVariable * pV,CVariable * pR,CVariable * pLOD,CVariable * pSrcDst,uint writeMask)3119     void CEncoder::TypedReadWrite(
3120         ISA_Opcode opcode,
3121         const ResourceDescriptor& resource,
3122         CVariable* pU,
3123         CVariable* pV,
3124         CVariable* pR,
3125         CVariable* pLOD,
3126         CVariable* pSrcDst,
3127         uint writeMask)
3128     {
3129         // only SIMD 8 reads & writes are supported.
3130         VISAChannelMask channelMask = CHANNEL_MASK_RGBA;//for typed write leaving this as before
3131         if (writeMask != 0)
3132         {
3133             channelMask = ConvertChannelMaskToVisaType(writeMask);
3134         }
3135         VISA_StateOpndHandle* pSurfStateOpndHandle = GetVISASurfaceOpnd(resource);
3136 
3137         // TODO unify the way we calculate offset for raw sources, maybe we shouldn't use offset at all
3138         VISA_RawOpnd* pUOffset = GetRawSource(pU, m_encoderState.m_srcOperand[0].subVar * getGRFSize());
3139         VISA_RawOpnd* pVOffset = GetRawSource(pV, m_encoderState.m_srcOperand[1].subVar * getGRFSize());
3140         VISA_RawOpnd* pROffset = GetRawSource(pR, m_encoderState.m_srcOperand[2].subVar * getGRFSize());
3141         VISA_RawOpnd* pLODOffset = GetRawSource(pLOD, m_encoderState.m_srcOperand[3].subVar * getGRFSize());
3142         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
3143         IGC_ASSERT(0 == m_encoderState.m_dstOperand.subVar);
3144 
3145         VISA_RawOpnd* pDstVar = nullptr;
3146         VISA_EMask_Ctrl mask;
3147         if (opcode == ISA_SCATTER4_TYPED)
3148         {
3149             pDstVar = GetRawSource(pSrcDst, 0);
3150             mask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask);
3151         }
3152         else
3153         {
3154             pDstVar = GetRawDestination(pSrcDst);
3155             mask = GetAluEMask(pSrcDst);
3156         }
3157 
3158         V(vKernel->AppendVISASurfAccessGather4Scatter4TypedInst(
3159             opcode,
3160             predOpnd,
3161             channelMask,
3162             mask,
3163             visaExecSize(m_encoderState.m_simdSize),
3164             pSurfStateOpndHandle,
3165             pUOffset,
3166             pVOffset,
3167             pROffset,
3168             pLODOffset,
3169             pDstVar));
3170     }
3171 
ScatterGather(ISA_Opcode opcode,CVariable * srcdst,CVariable * bufId,CVariable * offset,CVariable * gOffset,e_predefSurface surface,int elementSize)3172     void CEncoder::ScatterGather(ISA_Opcode opcode, CVariable* srcdst, CVariable* bufId, CVariable* offset, CVariable* gOffset, e_predefSurface surface, int elementSize)
3173     {
3174         VISA_VectorOpnd* globalOffsetOpnd = nullptr;
3175         VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(surface, bufId);
3176         if (gOffset)
3177         {
3178             globalOffsetOpnd = GetUniformSource(gOffset);
3179         }
3180         else
3181         {
3182             int value = 0;
3183             V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &value, ISA_TYPE_UD));
3184         }
3185         VISA_RawOpnd* elementOffset = GetRawSource(offset);
3186 
3187         VISA_RawOpnd* dstVar = NULL;
3188 
3189         VISA_EMask_Ctrl mask;
3190         if (opcode == ISA_GATHER)
3191         {
3192             dstVar = GetRawDestination(srcdst);
3193             mask = GetAluEMask(srcdst);
3194         }
3195         else
3196         {
3197             dstVar = GetRawSource(srcdst);
3198             mask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask);
3199         }
3200 
3201         V(vKernel->AppendVISASurfAccessGatherScatterInst(
3202             opcode,
3203             mask,
3204             visaElementSize(elementSize),
3205             visaExecSize(m_encoderState.m_simdSize),
3206             surfOpnd,
3207             globalOffsetOpnd,
3208             elementOffset,
3209             dstVar));
3210         if (ISA_SCATTER == opcode && ESURFACE_STATELESS == surface)
3211         {
3212             this->m_program->IncStatelessWritesCount();
3213         }
3214     }
3215 
GenericAlu(e_opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2)3216     void CEncoder::GenericAlu(e_opcode opcode, CVariable* dst, CVariable* src0, CVariable* src1, CVariable* src2)
3217     {
3218         ISA_Opcode visaOpcode = ConvertOpcode[opcode];
3219         switch (visaOpcode)
3220         {
3221         case ISA_MOV:
3222         case ISA_MOVS:
3223         case ISA_SETP:
3224             DataMov(visaOpcode, dst, src0);
3225             break;
3226         case ISA_FMINMAX:
3227             MinMax(opcode == EOPCODE_MIN ? CISA_DM_FMIN : CISA_DM_FMAX, dst, src0, src1);
3228             break;
3229         case ISA_AND:
3230         case ISA_ASR:
3231         case ISA_CBIT:
3232         case ISA_FBH:
3233         case ISA_FBL:
3234         case ISA_NOT:
3235         case ISA_OR:
3236         case ISA_SHL:
3237         case ISA_SHR:
3238         case ISA_ROL:
3239         case ISA_ROR:
3240         case ISA_XOR:
3241             LogicOp(visaOpcode, dst, src0, src1, src2);
3242             break;
3243         default:
3244             Arithmetic(visaOpcode, dst, src0, src1, src2);
3245             break;
3246         }
3247     }
3248 
GetVISASurfaceOpnd(const ResourceDescriptor & resource)3249     VISA_StateOpndHandle* CEncoder::GetVISASurfaceOpnd(const ResourceDescriptor& resource)
3250     {
3251         return GetVISASurfaceOpnd(resource.m_surfaceType, resource.m_resource);
3252     }
3253 
GetVISASurfaceOpnd(e_predefSurface surfaceType,CVariable * bti)3254     VISA_StateOpndHandle* CEncoder::GetVISASurfaceOpnd(e_predefSurface surfaceType, CVariable* bti)
3255     {
3256         VISA_StateOpndHandle* surfOpnd = nullptr;
3257         if (surfaceType == ESURFACE_NORMAL || surfaceType == ESURFACE_BINDLESS || surfaceType == ESURFACE_SSHBINDLESS)
3258         {
3259             VISA_SurfaceVar* surfacevar = nullptr;
3260             if (surfaceType == ESURFACE_BINDLESS)
3261             {
3262                 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_T252));
3263             }
3264             else
3265             {
3266                 surfacevar = dummySurface;
3267             }
3268             VISA_VectorOpnd* sourecOpnd = GetUniformSource(bti);
3269             VISA_VectorOpnd* dstOpnd = nullptr;
3270             V(vKernel->CreateVISAStateOperand(dstOpnd, surfacevar, 0, true));
3271 
3272             //Add the mov special instruction
3273             V(vKernel->AppendVISADataMovementInst(
3274                 ISA_MOVS,
3275                 nullptr,
3276                 false,
3277                 vISA_EMASK_M1_NM,
3278                 EXEC_SIZE_1,
3279                 dstOpnd,
3280                 sourecOpnd,
3281                 nullptr));
3282 
3283             V(vKernel->CreateVISAStateOperandHandle(surfOpnd, surfacevar));
3284         }
3285         else
3286         {
3287             VISA_SurfaceVar* surfacevar = NULL;
3288             switch (surfaceType)
3289             {
3290             case ESURFACE_SLM:
3291                 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_SLM));
3292                 break;
3293             case ESURFACE_STATELESS:
3294                 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_T255));
3295                 break;
3296             case ESURFACE_SCRATCH:
3297                 // NOTE: For scratch surface, we need to shr the surface state offset coming in R0.5 by 4.
3298                 //       This shr operation is generated by vISA in HDC path
3299                 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_SCRATCH));
3300                 break;
3301             default:
3302                 IGC_ASSERT_MESSAGE(0, "Invalid surface");
3303                 break;
3304             }
3305             V(vKernel->CreateVISAStateOperandHandle(surfOpnd, surfacevar));
3306         }
3307         return surfOpnd;
3308     }
3309 
ConvertMaskToVisaType(e_mask mask,bool noMask)3310     VISA_EMask_Ctrl CEncoder::ConvertMaskToVisaType(e_mask mask, bool noMask)
3311     {
3312         VISA_EMask_Ctrl emaskRet = vISA_EMASK_M1_NM;
3313         switch (mask)
3314         {
3315         case EMASK_Q1:
3316             if (m_encoderState.m_secondHalf)
3317             {
3318                 emaskRet = noMask ? vISA_EMASK_M5_NM : vISA_EMASK_M5;
3319             }
3320             else
3321             {
3322                 emaskRet = noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
3323             }
3324             break;
3325         case EMASK_Q2:
3326             if (m_encoderState.m_secondHalf)
3327             {
3328                 emaskRet = noMask ? vISA_EMASK_M7_NM : vISA_EMASK_M7;
3329             }
3330             else
3331             {
3332                 emaskRet = noMask ? vISA_EMASK_M3_NM : vISA_EMASK_M3;
3333             }
3334             break;
3335         case EMASK_Q3:
3336             emaskRet = noMask ? vISA_EMASK_M5_NM : vISA_EMASK_M5;
3337             break;
3338         case EMASK_Q4:
3339             emaskRet = noMask ? vISA_EMASK_M7_NM : vISA_EMASK_M7;
3340             break;
3341         case EMASK_H1:
3342             emaskRet = noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
3343             break;
3344         case EMASK_H2:
3345             emaskRet = noMask ? vISA_EMASK_M5_NM : vISA_EMASK_M5;
3346             break;
3347         default:
3348             IGC_ASSERT_MESSAGE(0, "unreachable");
3349             emaskRet = vISA_EMASK_M1_NM;
3350         }
3351 
3352         if (!m_encoderState.m_secondNibble)
3353             return emaskRet;
3354 
3355         switch (emaskRet) {
3356         case vISA_EMASK_M1:     return vISA_EMASK_M2;
3357         case vISA_EMASK_M1_NM:  return vISA_EMASK_M2_NM;
3358         case vISA_EMASK_M3:     return vISA_EMASK_M4;
3359         case vISA_EMASK_M3_NM:  return vISA_EMASK_M4_NM;
3360         case vISA_EMASK_M5:     return vISA_EMASK_M6;
3361         case vISA_EMASK_M5_NM:  return vISA_EMASK_M6_NM;
3362         case vISA_EMASK_M7:     return vISA_EMASK_M8;
3363         case vISA_EMASK_M7_NM:  return vISA_EMASK_M8_NM;
3364         default:
3365             IGC_ASSERT_MESSAGE(0, "unreachable");
3366             return vISA_EMASK_M1_NM;
3367         }
3368         return vISA_EMASK_M1_NM;
3369     }
3370 
ConvertModifierToVisaType(e_modifier modifier)3371     VISA_Modifier ConvertModifierToVisaType(e_modifier modifier)
3372     {
3373         switch (modifier)
3374         {
3375         case EMOD_NONE:
3376             return MODIFIER_NONE;
3377         case EMOD_SAT:
3378             return MODIFIER_SAT;
3379         case EMOD_ABS:
3380             return MODIFIER_ABS;
3381         case EMOD_NEG:
3382             return MODIFIER_NEG;
3383         case EMOD_NEGABS:
3384             return MODIFIER_NEG_ABS;
3385         case EMOD_NOT:
3386             return MODIFIER_NOT;
3387         default:
3388             IGC_ASSERT_MESSAGE(0, "unreachable");
3389             return MODIFIER_NONE;
3390         }
3391     }
3392 
ConvertCondModToVisaType(e_predicate condMod)3393     VISA_Cond_Mod ConvertCondModToVisaType(e_predicate condMod)
3394     {
3395         switch (condMod)
3396         {
3397         case EPREDICATE_EQ:
3398             return ISA_CMP_E;
3399         case EPREDICATE_NE:
3400             return ISA_CMP_NE;
3401         case EPREDICATE_GT:
3402             return ISA_CMP_G;
3403         case EPREDICATE_GE:
3404             return ISA_CMP_GE;
3405         case EPREDICATE_LT:
3406             return ISA_CMP_L;
3407         case EPREDICATE_LE:
3408             return ISA_CMP_LE;
3409         default:
3410             IGC_ASSERT_MESSAGE(0, "unreachable");
3411             return ISA_CMP_UNDEF;
3412         }
3413     }
3414 
ConvertSizeToVisaType(uint size)3415     VISA_Oword_Num  ConvertSizeToVisaType(uint size)
3416     {
3417         switch (size)
3418         {
3419         case 1:
3420             return OWORD_NUM_1;
3421         case 2:
3422             return OWORD_NUM_2;
3423         case 4:
3424             return OWORD_NUM_4;
3425         case 8:
3426             return OWORD_NUM_8;
3427         case 16:
3428             return OWORD_NUM_16;
3429         default:
3430             IGC_ASSERT_MESSAGE(0, "unreachable");
3431             return OWORD_NUM_ILLEGAL;
3432         }
3433     }
3434 
ConvertChannelMaskToVisaType(uint mask)3435     VISAChannelMask ConvertChannelMaskToVisaType(uint mask)
3436     {
3437         switch (mask & 0xf)
3438         {
3439         case 1:   return CHANNEL_MASK_R;
3440         case 2:   return CHANNEL_MASK_G;
3441         case 3:   return CHANNEL_MASK_RG;
3442         case 4:   return CHANNEL_MASK_B;
3443         case 5:   return CHANNEL_MASK_RB;
3444         case 6:   return CHANNEL_MASK_GB;
3445         case 7:   return CHANNEL_MASK_RGB;
3446         case 8:   return CHANNEL_MASK_A;
3447         case 9:   return CHANNEL_MASK_RA;
3448         case 0xa: return CHANNEL_MASK_GA;
3449         case 0xb: return CHANNEL_MASK_RGA;
3450         case 0xc: return CHANNEL_MASK_BA;
3451         case 0xd: return CHANNEL_MASK_RBA;
3452         case 0xe: return CHANNEL_MASK_GBA;
3453         case 0xf: return CHANNEL_MASK_RGBA;
3454         default:
3455         {
3456             IGC_ASSERT_MESSAGE(0, "Wrong mask");
3457             return CHANNEL_MASK_NOMASK;
3458         }
3459         }
3460     }
3461 
ConvertSubOpcode(EOPCODE subOpcode,bool zeroLOD)3462     VISASampler3DSubOpCode CEncoder::ConvertSubOpcode(EOPCODE subOpcode, bool zeroLOD)
3463     {
3464         switch (subOpcode)
3465         {
3466         case llvm_sampleptr:
3467             return VISA_3D_SAMPLE;
3468         case llvm_sample_bptr:
3469             return VISA_3D_SAMPLE_B;
3470         case llvm_sample_cptr:
3471             return VISA_3D_SAMPLE_C;
3472         case llvm_sample_dptr:
3473             return VISA_3D_SAMPLE_D;
3474         case llvm_sample_dcptr:
3475             return VISA_3D_SAMPLE_D_C;
3476         case llvm_sample_lptr:
3477             return zeroLOD ? VISA_3D_SAMPLE_LZ : VISA_3D_SAMPLE_L;
3478         case llvm_sample_lcptr:
3479             return zeroLOD ? VISA_3D_SAMPLE_C_LZ : VISA_3D_SAMPLE_L_C;
3480         case llvm_sample_bcptr:
3481             return VISA_3D_SAMPLE_B_C;
3482         case llvm_ld_ptr:
3483             return zeroLOD ? VISA_3D_LD_LZ : VISA_3D_LD;
3484         case llvm_resinfoptr:
3485             return VISA_3D_RESINFO;
3486         case llvm_gather4ptr:
3487             return VISA_3D_GATHER4;
3488         case llvm_gather4Cptr:
3489             return VISA_3D_GATHER4_C;
3490         case llvm_gather4POptr:
3491             return VISA_3D_GATHER4_PO;
3492         case llvm_gather4POCptr:
3493             return VISA_3D_GATHER4_PO_C;
3494         case llvm_sampleinfoptr:
3495             return VISA_3D_SAMPLEINFO;
3496         case llvm_ldmsptr:
3497         case llvm_ldmsptr16bit:
3498             return VISA_3D_LD2DMS_W;
3499         case llvm_ldmcsptr:
3500             return VISA_3D_LD_MCS;
3501         case llvm_lodptr:
3502             return VISA_3D_LOD;
3503         case llvm_sample_killpix:
3504             return VISA_3D_SAMPLE_KILLPIX;
3505         default:
3506             IGC_ASSERT_MESSAGE(0, "wrong sampler subopcode");
3507             return VISA_3D_SAMPLE;
3508         }
3509     }
3510 
IsIntegerType(VISA_Type type)3511     bool CEncoder::IsIntegerType(VISA_Type type)
3512     {
3513         return (type == ISA_TYPE_B ||
3514             type == ISA_TYPE_UB ||
3515             type == ISA_TYPE_W ||
3516             type == ISA_TYPE_UW ||
3517             type == ISA_TYPE_D ||
3518             type == ISA_TYPE_UD ||
3519             type == ISA_TYPE_Q ||
3520             type == ISA_TYPE_UQ ||
3521             0);
3522     }
3523 
IsFloatType(VISA_Type type)3524     bool CEncoder::IsFloatType(VISA_Type type)
3525     {
3526         return (type == ISA_TYPE_F ||
3527             type == ISA_TYPE_DF ||
3528             0);
3529     }
3530 
ConvertSingleSourceChannel(uint srcChannel)3531     VISASourceSingleChannel ConvertSingleSourceChannel(uint srcChannel)
3532     {
3533         switch (srcChannel)
3534         {
3535         case 0:
3536             return VISA_3D_GATHER4_CHANNEL_R;
3537         case 1:
3538             return VISA_3D_GATHER4_CHANNEL_G;
3539         case 2:
3540             return VISA_3D_GATHER4_CHANNEL_B;
3541         case 3:
3542             return VISA_3D_GATHER4_CHANNEL_A;
3543         default:
3544             IGC_ASSERT_MESSAGE(0, "Wrong channel");
3545             return VISA_3D_GATHER4_CHANNEL_R;
3546         }
3547     }
3548 
BeginSubroutine(llvm::Function * F)3549     void CEncoder::BeginSubroutine(llvm::Function* F)
3550     {
3551         InitLabelMap(F);
3552         V(vKernel->AppendVISACFLabelInst(GetFuncLabel(F)));
3553     }
3554 
BeginStackFunction(llvm::Function * F)3555     void CEncoder::BeginStackFunction(llvm::Function* F)
3556     {
3557         InitLabelMap(F);
3558         // At this place, the vISA object is changed!
3559         vKernel = GetStackFunction(F);
3560         VISA_LabelOpnd* visaLabel = nullptr;
3561         V(vKernel->CreateVISALabelVar(visaLabel, F->getName().data(), LABEL_SUBROUTINE));
3562         V(vKernel->AppendVISACFLabelInst(visaLabel));
3563     }
3564 
BeginPayloadSection()3565     void CEncoder::BeginPayloadSection()
3566     {
3567         // Payload Section is created as a function and compiled separately
3568         // from the shader body
3569         VISAFunction* visaFunc = nullptr;
3570         V(vbuilder->AddPayloadSection(visaFunc, "PayloadSection"));
3571         vPayloadSection = visaFunc;
3572         CodeGenContext* context = m_program->GetContext();
3573         std::string asmName;
3574         if (m_enableVISAdump || context->m_instrTypes.hasDebugInfo)
3575         {
3576             asmName = GetDumpFileName("asm");
3577         }
3578         else
3579         {
3580             asmName = "kernel.asm";
3581         }
3582         V(vPayloadSection->AddKernelAttribute("OutputAsmPath", asmName.length(), asmName.c_str()));
3583 
3584         VISA_LabelOpnd* functionLabel = nullptr;
3585         V(vPayloadSection->CreateVISALabelVar(functionLabel, "payload", LABEL_SUBROUTINE));
3586         V(vPayloadSection->AppendVISACFLabelInst(functionLabel));
3587         vMainKernel = vPayloadSection;
3588     }
3589 
AddVISASymbol(std::string & symName,CVariable * cvar)3590     void CEncoder::AddVISASymbol(std::string& symName, CVariable* cvar)
3591     {
3592         SModifier mod;
3593         mod.init();
3594         VISA_VectorOpnd* visaSymAddr = GetDestinationOperand(cvar, mod);
3595         V(vKernel->AppendVISACFSymbolInst(symName, visaSymAddr));
3596     }
3597 
SaveOption(vISAOptions option,bool val)3598     void CEncoder::SaveOption(vISAOptions option, bool val)
3599     {
3600         OptionValue entry;
3601         entry.type = OpType::ET_BOOL;
3602         entry.vBool = val;
3603         m_visaUserOptions.push_back(std::make_pair(option, entry));
3604     }
SaveOption(vISAOptions option,uint32_t val)3605     void CEncoder::SaveOption(vISAOptions option, uint32_t val)
3606     {
3607         OptionValue entry;
3608         entry.type = OpType::ET_INT32;
3609         entry.vInt32 = val;
3610         m_visaUserOptions.push_back(std::make_pair(option, entry));
3611     }
SaveOption(vISAOptions option,const char * val)3612     void CEncoder::SaveOption(vISAOptions option, const char* val)
3613     {
3614         OptionValue entry;
3615         entry.type = OpType::ET_CSTR;
3616         entry.vCstr = val;
3617         m_visaUserOptions.push_back(std::make_pair(option, entry));
3618     }
SetBuilderOptions(VISABuilder * pbuilder)3619     void CEncoder::SetBuilderOptions(VISABuilder* pbuilder)
3620     {
3621         for (auto OP : m_visaUserOptions)
3622         {
3623             switch (OP.second.type)
3624             {
3625             case OpType::ET_BOOL:
3626                 pbuilder->SetOption(OP.first, OP.second.vBool);
3627                 break;
3628             case OpType::ET_INT32:
3629                 pbuilder->SetOption(OP.first, OP.second.vInt32);
3630                 break;
3631             case OpType::ET_CSTR:
3632                 pbuilder->SetOption(OP.first, OP.second.vCstr);
3633                 break;
3634             default:
3635                 IGC_ASSERT_MESSAGE(0, "Undefined user option type");
3636                 break;
3637             }
3638         }
3639     }
3640 
InitBuildParams(llvm::SmallVector<std::unique_ptr<char,std::function<void (char *)>>,10> & params)3641     void CEncoder::InitBuildParams(llvm::SmallVector<std::unique_ptr< char, std::function<void(char*)>>, 10>& params)
3642     {
3643         CodeGenContext* context = m_program->GetContext();
3644         bool isOptDisabled = context->getModuleMetaData()->compOpt.OptDisable;
3645         using param_uptr = std::unique_ptr<char, std::function<void(char*)>>;
3646         auto literal_deleter = [](char* val) {};
3647         auto dup_deleter = [](char* val) {free(val); };
3648         // create vbuilder->Compile() params
3649         if (IGC_IS_FLAG_ENABLED(EnableVISADotAll))
3650         {
3651             params.push_back(param_uptr("-dotAll", literal_deleter));
3652         }
3653         if (IGC_IS_FLAG_ENABLED(EnableVISADebug) || isOptDisabled)
3654         {
3655             params.push_back(param_uptr("-debug", literal_deleter));
3656         }
3657 
3658         if (context->getModuleMetaData()->compOpt.FastVISACompile)
3659         {
3660             params.push_back(param_uptr("-fasterRA", literal_deleter));
3661             params.push_back(param_uptr("-noLocalSplit", literal_deleter));
3662         }
3663         if (IGC_IS_FLAG_ENABLED(EnableGlobalStateBuffer))
3664         {
3665             params.push_back(param_uptr("-emitCrossThreadOffR0Reloc", literal_deleter));
3666         }
3667         // Ensure VISA_Opts has the same scope as CreateVISABuilder so that valid
3668         // strings are checked by vISA and freed out of this function.
3669         if (IGC_IS_FLAG_ENABLED(VISAOptions))
3670         {
3671             std::vector<std::string> VISA_Opts;
3672             const char* DELIMITERS = " \t\n\v\f\r,"; // isspace(c), and comma for igcstandalone
3673             std::string line(IGC_GET_REGKEYSTRING(VISAOptions));
3674             std::size_t pos = 0;
3675             std::size_t found;
3676             for (; (found = line.find_first_of(DELIMITERS, pos)) != std::string::npos; ++pos) {
3677                 // Skip consecutive whitespaces.
3678                 if (found == pos)
3679                     continue;
3680                 VISA_Opts.push_back(line.substr(pos, found - pos));
3681                 pos = found;
3682             }
3683             if (pos < line.length())
3684                 VISA_Opts.push_back(line.substr(pos));
3685             for (auto& opt : VISA_Opts) {
3686                 // note that the memory should be freed once
3687                 // params has been read, but since this is only for
3688                 // debugging, do not bother freeing memory.
3689                 params.push_back(param_uptr(_strdup(opt.c_str()), dup_deleter));
3690                 if (opt == "-output" || opt == "-binary" || opt == "-dumpvisa" || opt == "-dumpcommonisa")
3691                 {
3692                     m_enableVISAdump = true;
3693                 }
3694             }
3695         }
3696         if (IGC_IS_FLAG_DISABLED(ForceDisableShaderDebugHashCodeInKernel) &&
3697             (context->m_DriverInfo.EnableShaderDebugHashCodeInKernel() ||
3698              IGC_IS_FLAG_ENABLED(ShaderDebugHashCodeInKernel)))
3699         {
3700             auto addHash = [&](char* OptName, QWORD Hash)
3701             {
3702                 params.push_back(param_uptr(OptName, literal_deleter));
3703                 std::string Low = std::to_string((DWORD)Hash);
3704                 std::string High = std::to_string((DWORD)(Hash >> 32));
3705                 params.push_back(param_uptr(_strdup(Low.c_str()), dup_deleter));
3706                 params.push_back(param_uptr(_strdup(High.c_str()), dup_deleter));
3707             };
3708 
3709             QWORD AssemblyHash = context->hash.getAsmHash();
3710             addHash("-hashmovs", AssemblyHash);
3711 
3712             QWORD NosHash = context->hash.getNosHash();
3713             QWORD PsoHash = context->hash.getPsoHash();
3714             QWORD hashToUse = NosHash != 0 ? NosHash : PsoHash;
3715             if (hashToUse)
3716                 addHash("-hashmovs1", hashToUse);
3717             else if (context->hash.getPerShaderPsoHash() != 0)
3718                 addHash("-hashmovs1", context->hash.getPerShaderPsoHash());
3719         }
3720     }
InitVISABuilderOptions(TARGET_PLATFORM VISAPlatform,bool canAbortOnSpill,bool hasStackCall,bool enableVISA_IR)3721     void CEncoder::InitVISABuilderOptions(TARGET_PLATFORM VISAPlatform, bool canAbortOnSpill, bool hasStackCall, bool enableVISA_IR)
3722     {
3723         CodeGenContext* context = m_program->GetContext();
3724         bool KernelDebugEnable = false;
3725         bool ForceNonCoherentStatelessBti = false;
3726         bool AllowSpill = true;
3727         if (context->type == ShaderType::OPENCL_SHADER)
3728         {
3729             auto ClContext = static_cast<OpenCLProgramContext*>(context);
3730             KernelDebugEnable = ClContext->m_InternalOptions.KernelDebugEnable;
3731             ForceNonCoherentStatelessBti = ClContext->m_ShouldUseNonCoherentStatelessBTI;
3732             AllowSpill = !ClContext->m_InternalOptions.NoSpill;
3733 
3734             if (ClContext->m_InternalOptions.GTPinReRA)
3735             {
3736                 SaveOption(vISA_GTPinReRA, true);
3737                 SaveOption(vISA_ReRAPostSchedule, true);
3738             }
3739             if (ClContext->m_InternalOptions.GTPinGRFInfo)
3740             {
3741                 SaveOption(vISA_GetFreeGRFInfo, true);
3742             }
3743             if (ClContext->m_InternalOptions.GTPinScratchAreaSize)
3744             {
3745                 SaveOption(vISA_GTPinScratchAreaSize, ClContext->m_InternalOptions.GTPinScratchAreaSizeValue);
3746             }
3747         }
3748 
3749         bool EnableBarrierInstCounterBits = false;
3750         if (context->type == ShaderType::HULL_SHADER)
3751         {
3752             EnableBarrierInstCounterBits = true;
3753         }
3754         bool preserveR0 = false;
3755         if (context->type == ShaderType::PIXEL_SHADER)
3756         {
3757             preserveR0 = !static_cast<CPixelShader*>(m_program)->IsLastPhase();
3758         }
3759         bool isOptDisabled = context->getModuleMetaData()->compOpt.OptDisable;
3760 
3761         // Set up options. This must be done before creating any variable/instructions
3762         // since some of the options affect IR building.
3763         if (IGC_IS_FLAG_ENABLED(ForceNoFP64bRegioning))
3764         {
3765             SaveOption(vISA_forceNoFP64bRegioning, true);
3766         }
3767 
3768         if (IGC_IS_FLAG_ENABLED(DumpCompilerStats) || context->getModuleMetaData()->compOpt.CaptureCompilerStats)
3769         {
3770             SaveOption(vISA_EnableCompilerStats, true);
3771         }
3772 
3773         if (IGC_IS_FLAG_ENABLED(EnableSamplerSplit))
3774         {
3775             SaveOption(vISA_enableCloneSampleInst, true);
3776         }
3777 
3778 
3779         if (m_program->m_Platform->getWATable().Wa_14012760189 && IGC_IS_FLAG_ENABLED(EnableEvaluateSamplerSplit))
3780         {
3781             SaveOption(vISA_cloneEvaluateSampleInst, true);
3782         }
3783 
3784         if (IGC_IS_FLAG_ENABLED(ForceFFIDOverwrite)/*|| m_program->m_Platform->WaOverwriteFFID()*/)
3785         {
3786             unsigned int ffid[unsigned(ShaderType::END)] = {
3787                 0,
3788                 static_cast<unsigned>(context->isPOSH() ? FFID_VSR : FFID_VS),
3789                 FFID_HS,
3790                 FFID_DS,
3791                 FFID_GS,
3792                 FFID_PS,
3793                 FFID_GP,
3794                 FFID_GP
3795             };
3796             SaveOption(vISA_setFFID, ffid[unsigned(context->type)]);
3797         }
3798 
3799         SaveOption(vISA_hasRNEandDenorm, true);
3800 
3801         // need to fold ret into the previous RTWrite/URBWrite/etc
3802         if (context->type != ShaderType::OPENCL_SHADER && context->type != ShaderType::COMPUTE_SHADER)
3803         {
3804             {
3805                 SaveOption(vISA_foldEOTtoPrevSend, true);
3806             }
3807         }
3808 
3809         if (m_program->m_DriverInfo->clearScratchWriteBeforeEOT() &&
3810             (context->type == ShaderType::PIXEL_SHADER || context->type == ShaderType::OPENCL_SHADER))
3811         {
3812             SaveOption(vISA_clearScratchWritesBeforeEOT, true);
3813         }
3814 
3815         bool clearHDCWritesBeforeEOT = m_program->m_DriverInfo->UsesSparseAliasedResidency() &&
3816             context->platform.WaInsertHDCFenceBeforeEOTWhenSparseAliasedResources();
3817         clearHDCWritesBeforeEOT |= ((context->type == ShaderType::PIXEL_SHADER) ||
3818             (context->type == ShaderType::COMPUTE_SHADER) ||
3819             (context->type == ShaderType::OPENCL_SHADER)) &&
3820             context->platform.NeedsHDCFenceBeforeEOTInPixelShader();
3821         clearHDCWritesBeforeEOT |= IGC_IS_FLAG_ENABLED(ForceMemoryFenceBeforeEOT);
3822 
3823         if (clearHDCWritesBeforeEOT)
3824         {
3825             SaveOption(vISA_clearHDCWritesBeforeEOT, true);
3826         }
3827 
3828 
3829         // Disable multi-threaded latencies in the vISA scheduler when not in 3D
3830         if (context->type == ShaderType::OPENCL_SHADER)
3831         {
3832             if (m_program->m_Platform->singleThreadBasedInstScheduling())
3833             {
3834                 SaveOption(vISA_useMultiThreadedLatencies, false);
3835             }
3836         }
3837 
3838         auto enableScheduler = [=]() {
3839             // Check if preRA scheduler is disabled from input.
3840             if (isOptDisabled)
3841                 return false;
3842             if (context->type == ShaderType::OPENCL_SHADER) {
3843                 auto ClContext = static_cast<OpenCLProgramContext*>(context);
3844                 if (!ClContext->m_InternalOptions.IntelEnablePreRAScheduling)
3845                     return false;
3846             }
3847 
3848             // Check reg-key or compiler input
3849             if (IGC_IS_FLAG_ENABLED(ForceVISAPreSched) || context->getModuleMetaData()->csInfo.forcedVISAPreRAScheduler)
3850                 return true;
3851 
3852             // API check.
3853             bool enableForRetey = m_program->m_DriverInfo->enableVISAPreRASchedulerForRetry() ||
3854                 context->m_retryManager.AllowVISAPreRAScheduler();
3855 
3856             if (IGC_IS_FLAG_ENABLED(EnableVISAPreSched) &&
3857                 m_program->m_DriverInfo->enableVISAPreRAScheduler() &&
3858                 enableForRetey)
3859                 return true;
3860 
3861             return false;
3862         };
3863 
3864         if (enableScheduler())
3865         {
3866             SaveOption(vISA_preRA_Schedule, true);
3867             if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAPreSchedCtrl))
3868             {
3869                 SaveOption(vISA_preRA_ScheduleCtrl, Val);
3870             }
3871             else
3872             {
3873                 uint32_t V = m_program->m_DriverInfo->getVISAPreRASchedulerCtrl();
3874                 if (m_program->GetHasDPAS())
3875                 {
3876                     V = 4; // register pressure only
3877                 }
3878                 SaveOption(vISA_preRA_ScheduleCtrl, V);
3879             }
3880 
3881             uint32_t VISAPreSchedVal = 0;
3882             if (context->type == ShaderType::COMPUTE_SHADER)
3883                 VISAPreSchedVal = context->getModuleMetaData()->csInfo.VISAPreSchedRPThreshold;
3884             else if (context->type == ShaderType::PIXEL_SHADER)
3885                 VISAPreSchedVal = context->getModuleMetaData()->compOpt.VISAPreSchedRPThreshold;
3886             // registry key setting has higher priority
3887             if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAPreSchedRPThreshold))
3888             {
3889                 SaveOption(vISA_preRA_ScheduleRPThreshold, Val);
3890             }
3891             else if (VISAPreSchedVal)
3892             {
3893                 SaveOption(vISA_preRA_ScheduleRPThreshold, VISAPreSchedVal);
3894             }
3895 
3896             if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAScheduleStartBBID))
3897             {
3898                 SaveOption(vISA_ScheduleStartBBID, Val);
3899             }
3900 
3901             if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAScheduleEndBBID))
3902             {
3903                 SaveOption(vISA_ScheduleEndBBID, Val);
3904             }
3905         }
3906         else
3907         {
3908             SaveOption(vISA_preRA_Schedule, false);
3909         }
3910 
3911         if (IGC_IS_FLAG_ENABLED(ReplaceIndirectCallWithJmpi))
3912         {
3913             SaveOption(vISA_replaceIndirectCallWithJmpi, true);
3914         }
3915 
3916         if (IGC_IS_FLAG_ENABLED(FastSpill))
3917         {
3918             SaveOption(vISA_FastSpill, true);
3919         }
3920 
3921 #ifdef _DEBUG
3922         // enable vISA verifier if we are generating vISA IR
3923         SaveOption(vISA_NoVerifyvISA, !enableVISA_IR);
3924 #else
3925         SaveOption(vISA_NoVerifyvISA, true);
3926 #endif
3927 
3928         if (context->m_instrTypes.hasDebugInfo)
3929         {
3930             SaveOption(vISA_GenerateDebugInfo, true);
3931 
3932             if (context->metrics.Enable())
3933             {
3934                 SaveOption(vISA_GenerateKernelInfo, true);
3935                 SaveOption(vISA_EmitLocation, true);
3936             }
3937         }
3938 
3939         if (canAbortOnSpill)
3940         {
3941             SaveOption(vISA_AbortOnSpill, true);
3942             if (AvoidRetryOnSmallSpill())
3943             {
3944                 // 2 means #spill/fill is roughly 1% of #inst
3945                 // ToDo: tune the threshold
3946                 if (m_program->m_dispatchSize == SIMDMode::SIMD8)
3947                     SaveOption(vISA_AbortOnSpillThreshold, IGC_GET_FLAG_VALUE(SIMD8_SpillThreshold) * 2);
3948 
3949                 else if (m_program->m_dispatchSize == SIMDMode::SIMD16)
3950                     SaveOption(vISA_AbortOnSpillThreshold, IGC_GET_FLAG_VALUE(SIMD16_SpillThreshold) * 2);
3951             }
3952         }
3953 
3954         if (context->type == ShaderType::OPENCL_SHADER && m_program->m_dispatchSize == SIMDMode::SIMD8)
3955         {
3956             // AllowSpill is set to false if -cl-intel-no-spill internal option was passed from OpenCL Runtime.
3957             // It has been implemented to avoid scratch space usage for scheduler kernel.
3958             if (AllowSpill)
3959             {
3960                 SaveOption(vISA_AbortOnSpillThreshold, IGC_GET_FLAG_VALUE(SIMD8_SpillThreshold) * 2);
3961             }
3962         }
3963 
3964         if ((context->type == ShaderType::OPENCL_SHADER || context->type == ShaderType::COMPUTE_SHADER) &&
3965             m_program->m_Platform->preemptionSupported() && IGC_IS_FLAG_ENABLED(EnablePreemption))
3966         {
3967             SaveOption(vISA_enablePreemption, true);
3968         }
3969 
3970         if (IGC_IS_FLAG_ENABLED(forceGlobalRA))
3971         {
3972             SaveOption(vISA_LocalRA, false);
3973             SaveOption(vISA_LocalBankConflictReduction, false);
3974         }
3975 
3976         if (IGC_IS_FLAG_ENABLED(disableVarSplit))
3977         {
3978             SaveOption(vISA_LocalDeclareSplitInGlobalRA, false);
3979         }
3980 
3981         if (IGC_IS_FLAG_ENABLED(disableRemat))
3982         {
3983             SaveOption(vISA_NoRemat, true);
3984         }
3985 
3986         if (ForceNonCoherentStatelessBti || IGC_IS_FLAG_ENABLED(ForceNonCoherentStatelessBTI))
3987         {
3988             SaveOption(vISA_noncoherentStateless, true);
3989         }
3990 
3991         if (IGC_IS_FLAG_ENABLED(DisableIfCvt))
3992         {
3993             SaveOption(vISA_ifCvt, false);
3994         }
3995 
3996         if (IGC_IS_FLAG_ENABLED(EnableVISAStructurizer) &&
3997             (m_program->m_Platform->hasSCF() || IGC_IS_FLAG_ENABLED(ForceVISAStructurizer)))
3998         {
3999             SaveOption(vISA_EnableStructurizer, true);
4000 
4001             if (IGC_GET_FLAG_VALUE(EnableVISAStructurizer) == FLAG_SCF_UCFOnly)
4002             {
4003                 // visa structurizer will generate UCF only.
4004                 SaveOption(vISA_StructurizerCF, false);
4005             }
4006         }
4007 
4008         if (IGC_IS_FLAG_DISABLED(EnableVISAJmpi))
4009         {
4010             SaveOption(vISA_EnableScalarJmp, false);
4011         }
4012 
4013         if (IGC_IS_FLAG_ENABLED(ForceNoMaskWA)) {
4014             SaveOption(vISA_forceNoMaskWA, true);
4015             // Turn off jmpi as there is no wa for jmpi
4016             SaveOption(vISA_EnableScalarJmp, false);
4017         }
4018 
4019         if (m_program->m_Platform->getWATable().Wa_1808850743 ||
4020             m_program->m_Platform->getWATable().Wa_1409909237)
4021         {
4022             SaveOption(vISA_noMaskWA, IGC_GET_FLAG_VALUE(NoMaskWA));
4023             if (IGC_GET_FLAG_VALUE(NoMaskWA) > 0)
4024             {
4025                 // Turn off jmpi as there is no wa for jmpi
4026                 SaveOption(vISA_EnableScalarJmp, false);
4027             }
4028         }
4029 
4030         if (m_program->m_Platform->hasFusedEU()
4031             && IGC_IS_FLAG_ENABLED(EnableCallWA)
4032             && (m_program->HasStackCalls() || m_program->IsIntelSymbolTableVoidProgram()))
4033         {
4034             SaveOption(vISA_fusedCallWA, true);
4035         }
4036 
4037         if (IGC_IS_FLAG_ENABLED(DisableCSEL))
4038         {
4039             SaveOption(vISA_enableCSEL, false);
4040         }
4041         if (IGC_IS_FLAG_ENABLED(DisableFlagOpt))
4042         {
4043             SaveOption(vISA_LocalFlagOpt, false);
4044         }
4045 
4046         if (IGC_IS_FLAG_ENABLED(EnableVISAOutput))
4047         {
4048             SaveOption(vISA_outputToFile, true);
4049             m_enableVISAdump = true;
4050         }
4051         if (IGC_IS_FLAG_ENABLED(EnableVISABinary))
4052         {
4053             SaveOption(vISA_GenerateBinary, true);
4054             m_enableVISAdump = true;
4055         }
4056         if (IGC_IS_FLAG_ENABLED(EnableVISADumpCommonISA))
4057         {
4058             SaveOption(vISA_DumpvISA, true);
4059             SaveOption(vISA_GenerateISAASM, true);
4060             m_enableVISAdump = true;
4061         }
4062         if (IGC_IS_FLAG_ENABLED(EnableVISANoSchedule))
4063         {
4064             SaveOption(vISA_LocalScheduling, false);
4065         }
4066         if (IGC_IS_FLAG_ENABLED(EnableVISANoBXMLEncoder))
4067         {
4068             SaveOption(vISA_BXMLEncoder, false);
4069         }
4070         if (IGC_IS_FLAG_ENABLED(DisableMixMode) ||
4071             context->getModuleMetaData()->disableMixMode)
4072         {
4073             SaveOption(vISA_DisableMixMode, true);
4074         }
4075         if (IGC_IS_FLAG_ENABLED(ForceMixMode))
4076         {
4077             SaveOption(vISA_ForceMixMode, true);
4078         }
4079         if (IGC_IS_FLAG_ENABLED(DisableHFMath))
4080         {
4081             SaveOption(vISA_DisableHFMath, true);
4082         }
4083 
4084         if (IGC_IS_FLAG_ENABLED(disableIGASyntax))
4085         {
4086             SaveOption(vISA_dumpNewSyntax, false);
4087         }
4088         if (IGC_IS_FLAG_ENABLED(disableCompaction))
4089         {
4090             SaveOption(vISA_Compaction, false);
4091         }
4092 
4093         if (auto *regex = IGC_GET_REGKEYSTRING(ShaderDumpFilter))
4094         {
4095             SaveOption(vISA_ShaderDumpFilter, regex);
4096         }
4097 
4098         // In Vulkan and OGL buffer variable memory reads and writes within
4099         // a single shader invocation must be processed in order.
4100         if (m_program->m_DriverInfo->DisableDpSendReordering())
4101         {
4102             SaveOption(vISA_ReorderDPSendToDifferentBti, false);
4103         }
4104 
4105         if (m_program->m_DriverInfo->UseALTMode())
4106         {
4107             SaveOption(vISA_ChangeMoveType, false);
4108             SaveOption(vISA_ALTMode, true);
4109         }
4110 
4111         if (IGC_IS_FLAG_ENABLED(DisableSendS))
4112         {
4113             SaveOption(vISA_UseSends, false);
4114         }
4115         if (m_program->m_DriverInfo->AllowUnsafeHalf())
4116         {
4117             SaveOption(vISA_enableUnsafeCP_DF, true);
4118         }
4119 
4120         if (IGC_GET_FLAG_VALUE(ReservedRegisterNum) != 0 && (IGC_GET_FLAG_VALUE(TotalGRFNum) != 0))
4121         {
4122             IGC_ASSERT_MESSAGE(0, "ReservedRegisterNum and TotalGRFNum registry keys cannot be used at the same time");
4123         }
4124 
4125         if (IGC_GET_FLAG_VALUE(ReservedRegisterNum) != 0)
4126         {
4127             SaveOption(vISA_ReservedGRFNum, IGC_GET_FLAG_VALUE(ReservedRegisterNum));
4128         }
4129         if (IGC_GET_FLAG_VALUE(GRFNumToUse) > 0)
4130         {
4131             SaveOption(vISA_GRFNumToUse, IGC_GET_FLAG_VALUE(GRFNumToUse));
4132         }
4133 
4134         if (IGC_GET_FLAG_VALUE(TotalGRFNum) != 0)
4135         {
4136             SaveOption(vISA_TotalGRFNum, IGC_GET_FLAG_VALUE(TotalGRFNum));
4137         }
4138         else if (context->type == ShaderType::COMPUTE_SHADER && IGC_GET_FLAG_VALUE(TotalGRFNum4CS) != 0)
4139         {
4140             SaveOption(vISA_TotalGRFNum, IGC_GET_FLAG_VALUE(TotalGRFNum4CS));
4141         }
4142         else
4143         {
4144             SaveOption(vISA_TotalGRFNum, context->getNumGRFPerThread());
4145         }
4146 
4147         //
4148         // Setting number of GRF and threads per EU is restricted to OCL only
4149         // Number of threads can be set by:
4150         //  1. User input through
4151         //    1.1 compiler option for entire module
4152         //    1.2 kernel annotation for a specific kernel function
4153         //  2. Compiler heuristics
4154         //
4155         if (context->type == ShaderType::OPENCL_SHADER)
4156         {
4157             auto ClContext = static_cast<OpenCLProgramContext*>(context);
4158             if (m_program->m_Platform->supportsStaticRegSharing())
4159             {
4160                 if (ClContext->getNumThreadsPerEU() > 0)
4161                 {
4162                     // Number of threads per EU is set per module (by compiler option)
4163                     SaveOption(vISA_HWThreadNumberPerEU, ClContext->getNumThreadsPerEU());
4164                 }
4165                 else if (m_program->getAnnotatedNumThreads() > 0)
4166                 {
4167                     // Number of threads per EU is set per kernel function (by user annotation)
4168                     SaveOption(vISA_HWThreadNumberPerEU, m_program->getAnnotatedNumThreads());
4169                 }
4170                 else if (m_program->m_Platform->supportsAutoGRFSelection() &&
4171                     context->m_DriverInfo.supportsAutoGRFSelection() &&
4172                     !IGC_IS_FLAG_ENABLED(DisableRegSharingHeuristics) &&
4173                     !ClContext->m_InternalOptions.Intel128GRFPerThread &&
4174                     !ClContext->m_InternalOptions.Intel256GRFPerThread)
4175                 {
4176                     // When user hasn't specified number of threads, we can rely on compiler heuristics
4177                     SaveOption(vISA_RegSharingHeuristics, true);
4178                 }
4179             }
4180         }
4181         if (IGC_GET_FLAG_VALUE(ForceHWThreadNumberPerEU) != 0)
4182         {
4183             SaveOption(vISA_ForceHWThreadNumberPerEU, IGC_GET_FLAG_VALUE(ForceHWThreadNumberPerEU));
4184         }
4185 
4186         if (IGC_IS_FLAG_ENABLED(EnableHashMovsAtPrologue))
4187         {
4188             SaveOption(vISA_HashMovsAtPrologue, true);
4189         }
4190 
4191         if (IGC_IS_FLAG_ENABLED(SystemThreadEnable))
4192         {
4193             /* Some tools only use 32bits hash, to maintain compatibility
4194             across lot of unknown tool chains doing Compare for only LowerPart
4195             */
4196             if (IGC_GET_FLAG_VALUE(ShaderDebugHashCode) == (DWORD)context->hash.getAsmHash())
4197             {
4198                 SaveOption(vISA_setStartBreakPoint, true);
4199             }
4200         }
4201         else if (KernelDebugEnable)
4202         {
4203             SaveOption(vISA_AddKernelID, true);
4204             SaveOption(vISA_setStartBreakPoint, true);
4205         }
4206 
4207         auto g4Subset = (uint32_t)IGC_GET_FLAG_VALUE(ShaderDumpEnableG4);
4208         if (g4Subset != 0)
4209             SaveOption(vISA_DumpPassesSubset, g4Subset);
4210 
4211         if (EnableBarrierInstCounterBits)
4212         {
4213             SaveOption(VISA_EnableBarrierInstCounterBits, true);
4214         }
4215         if (preserveR0)
4216         {
4217             SaveOption(vISA_ReserveR0, true);
4218         }
4219         if (IGC_IS_FLAG_ENABLED(InitializeRegistersEnable))
4220         {
4221             SaveOption(vISA_InitPayload, true);
4222         }
4223         if (IGC_IS_FLAG_ENABLED(UseOldSubRoutineAugIntf))
4224         {
4225             SaveOption(vISA_UseOldSubRoutineAugIntf, true);
4226         }
4227         if (IGC_IS_FLAG_ENABLED(FastCompileRA) && !hasStackCall)
4228         {
4229             SaveOption(vISA_FastCompileRA, true);
4230         }
4231         if (IGC_IS_FLAG_ENABLED(HybridRAWithSpill) && !hasStackCall)
4232         {
4233             SaveOption(vISA_HybridRAWithSpill, true);
4234         }
4235         if (IGC_IS_FLAG_ENABLED(DumpPayloadToScratch))
4236         {
4237             SaveOption(vISA_dumpPayload, true);
4238         }
4239         if (IGC_IS_FLAG_ENABLED(ExpandPlane))
4240         {
4241             SaveOption(vISA_expandPlane, true);
4242         }
4243         if (IGC_IS_FLAG_ENABLED(EnableBCR))
4244         {
4245             SaveOption(vISA_enableBCR, true);
4246         }
4247         if (IGC_IS_FLAG_ENABLED(ForceBCR))
4248         {
4249             SaveOption(vISA_forceBCR, true);
4250         }
4251         if (IGC_IS_FLAG_ENABLED(forceSamplerHeader))
4252         {
4253             SaveOption(vISA_forceSamplerHeader, true);
4254         }
4255         if (IGC_IS_FLAG_ENABLED(EnableIGAEncoder))
4256         {
4257             SaveOption(vISA_IGAEncoder, true);
4258         }
4259         else
4260         {
4261             SaveOption(vISA_IGAEncoder, false);
4262         }
4263 
4264         if (IGC_IS_FLAG_ENABLED(SetA0toTdrForSendc))
4265         {
4266             SaveOption(vISA_setA0toTdrForSendc, true);
4267         }
4268 
4269         if (IGC_IS_FLAG_ENABLED(AvoidDstSrcGRFOverlap))
4270         {
4271             SaveOption(vISA_DstSrcOverlapWA, true);
4272         }
4273 
4274         if (IGC_IS_FLAG_ENABLED(AvoidSrc1Src2Overlap))
4275         {
4276             SaveOption(vISA_Src1Src2OverlapWA, true);
4277         }
4278 
4279         if (IGC_IS_FLAG_ENABLED(UseLinearScanRA))
4280         {
4281             SaveOption(vISA_LinearScan, true);
4282         }
4283 
4284         if (IGC_IS_FLAG_ENABLED(EnableIGASWSB))
4285         {
4286             SaveOption(vISA_EnableIGASWSB, true);
4287         }
4288 
4289         if (IGC_IS_FLAG_ENABLED(EnableQuickTokenAlloc))
4290         {
4291             SaveOption(vISA_QuickTokenAllocation, true);
4292         }
4293 
4294         if (IGC_IS_FLAG_ENABLED(EnableSWSBStitch) ||
4295             (context->type == ShaderType::PIXEL_SHADER &&
4296              static_cast<CPixelShader*>(m_program)->GetPhase() == PSPHASE_PIXEL))
4297         {
4298             SaveOption(vISA_SWSBStitch, true);
4299         }
4300 
4301         if (IGC_IS_FLAG_ENABLED(DisableRegDistDep))
4302         {
4303             SaveOption(vISA_disableRegDistDep, true);
4304         }
4305 
4306         if (IGC_IS_FLAG_ENABLED(EnableForceDebugSWSB) ||
4307             IGC_IS_FLAG_ENABLED(EnableSWSBInstStall) ||
4308             IGC_IS_FLAG_ENABLED(EnableSWSBTokenBarrier))
4309         {
4310             if (IGC_IS_FLAG_ENABLED(EnableSWSBInstStall))
4311             {
4312                 SaveOption(vISA_SWSBInstStall, IGC_GET_FLAG_VALUE(EnableSWSBInstStall));
4313                 SaveOption(vISA_SWSBInstStallEnd, IGC_GET_FLAG_VALUE(EnableSWSBInstStallEnd));
4314             }
4315 
4316             if (IGC_IS_FLAG_ENABLED(EnableSWSBTokenBarrier))
4317             {
4318                 SaveOption(vISA_SWSBTokenBarrier, IGC_GET_FLAG_VALUE(EnableSWSBTokenBarrier));
4319             }
4320 
4321             if (IGC_IS_FLAG_ENABLED(EnableForceDebugSWSB))
4322             {
4323                 SaveOption(vISA_forceDebugSWSB, true);
4324             }
4325             SaveOption(vISA_Compaction, false);
4326         }
4327 
4328         if (IGC_IS_FLAG_ENABLED(EnableGatherWithImm))
4329         {
4330             SaveOption(vISA_EnableGatherWithImm, true);
4331         }
4332 
4333         if (IGC_IS_FLAG_ENABLED(EnableGroupScheduleForBC))
4334         {
4335             SaveOption(vISA_EnableGroupScheduleForBC, true);
4336         }
4337 
4338         if (VISAPlatform == XeHP_SDV && IGC_IS_FLAG_ENABLED(DPASTokenReduction))
4339         {
4340             SaveOption(vISA_EnableDPASTokenReduction, true);
4341         }
4342 
4343         if (IGC_IS_FLAG_ENABLED(DisableThreeALUPipes))
4344         {
4345             SaveOption(vISA_EnableALUThreePipes, false);
4346         }
4347 
4348         SaveOption(vISA_useInlineData, m_program->passNOSInlineData());
4349 
4350         if (m_program->m_Platform->supportLoadThreadPayloadForCompute())
4351         {
4352             SaveOption(vISA_loadThreadPayload, m_program->loadThreadPayload());
4353         }
4354         else
4355         {
4356             SaveOption(vISA_loadThreadPayload, false);
4357         }
4358 
4359 
4360 
4361         if (IGC_IS_FLAG_ENABLED(EnablerReadSuppressionWA) &&
4362             VISAPlatform >= GENX_TGLLP)
4363         {
4364             SaveOption(vISA_InsertDummyMovForHWRSWA, true);
4365             if (IGC_IS_FLAG_ENABLED(DPASReadSuppressionWA))
4366             {
4367                 SaveOption(vISA_InsertDummyMovForDPASRSWA, true);
4368             }
4369             if (IGC_GET_FLAG_VALUE(RSWARegNum) != 0)
4370             {
4371                 SaveOption(vISA_registerHWRSWA, IGC_GET_FLAG_VALUE(RSWARegNum));
4372             }
4373         }
4374 
4375         if (IGC_GET_FLAG_VALUE(SWSBTokenNum) != 0)
4376         {
4377             SaveOption(vISA_SWSBTokenNum, IGC_GET_FLAG_VALUE(SWSBTokenNum));
4378         }
4379 
4380         if (IGC_IS_FLAG_ENABLED(EnableAccSub))
4381         {
4382             SaveOption(vISA_accSubstitution, true);
4383             uint32_t numAcc = IGC_GET_FLAG_VALUE(NumGeneralAcc);
4384 
4385             IGC_ASSERT_MESSAGE(0 <= numAcc, "number of general acc should be [1-16] if set");
4386             IGC_ASSERT_MESSAGE(numAcc <= 16, "number of general acc should be [1-16] if set");
4387 
4388             if (numAcc > 0)
4389             {
4390                 SaveOption(vISA_numGeneralAcc, numAcc);
4391             }
4392 
4393             if (IGC_IS_FLAG_ENABLED(HasDoubleAcc))
4394             {
4395                 SaveOption(vISA_hasDoubleAcc, true);
4396             }
4397         }
4398         else
4399         {
4400             SaveOption(vISA_accSubstitution, false);
4401         }
4402 
4403         if (IGC_IS_FLAG_ENABLED(GlobalSendVarSplit))
4404         {
4405             SaveOption(vISA_GlobalSendVarSplit, true);
4406         }
4407 
4408         if (m_program->m_Platform->canFuseTypedWrite())
4409         {
4410             SaveOption(vISA_FuseTypedWrites, true);
4411         }
4412 
4413         if (IGC_IS_FLAG_ENABLED(ShaderDumpEnable) && IGC_IS_FLAG_ENABLED(InterleaveSourceShader))
4414         {
4415             SaveOption(vISA_EmitLocation, true);
4416         }
4417 
4418         if (IGC_IS_FLAG_ENABLED(ShaderDumpEnable))
4419         {
4420             SaveOption(vISA_SBIDDepLoc, true);
4421         }
4422 
4423         // Enable SendFusion for SIMD8
4424         // TODO: Re-enable SendFusion when VMask is enabled. The hardware should support this, but
4425         //  more investigation needs to be done on whether simply replacing sr0.2 with sr0.3 is enough.
4426         if (IGC_IS_FLAG_ENABLED(EnableSendFusion) &&
4427             !(context->type == ShaderType::PIXEL_SHADER && static_cast<CPixelShader*>(m_program)->NeedVMask()) &&
4428             m_program->GetContext()->platform.supportSplitSend() &&
4429             m_program->m_dispatchSize == SIMDMode::SIMD8 &&
4430             (IGC_GET_FLAG_VALUE(EnableSendFusion) == FLAG_LEVEL_2 ||   // 2: force send fusion
4431                 context->m_DriverInfo.AllowSendFusion()))
4432         {
4433             SaveOption(vISA_EnableSendFusion, true);
4434             if (IGC_IS_FLAG_ENABLED(EnableAtomicFusion) &&
4435                 context->type == ShaderType::OPENCL_SHADER)
4436             {
4437                 SaveOption(vISA_EnableAtomicFusion, true);
4438             }
4439         }
4440 
4441         // With statelessToStatefull on, it is possible that two different BTI messages
4442         // (two kernel arguments) might refer to the same memory. To be safe, turn off
4443         // visa DPSend reordering.
4444         if (IGC_IS_FLAG_ENABLED(EnableStatelessToStatefull) &&
4445             context->type == ShaderType::OPENCL_SHADER)
4446         {
4447             SaveOption(vISA_ReorderDPSendToDifferentBti, false);
4448         }
4449 
4450         if (m_program->m_Platform->WaDisableSendSrcDstOverlap())
4451         {
4452             SaveOption(vISA_noSendSrcDstOverlap, true);
4453         }
4454 
4455         if (m_program->m_Platform->WaDisableSendSrcDstOverlap())
4456         {
4457             SaveOption(vISA_noSendSrcDstOverlap, true);
4458         }
4459 
4460         // Set to stitch all functions to all kernels in a VISABuidler
4461         SaveOption(vISA_noStitchExternFunc, false);
4462 
4463         // Turning off optimizations as much as possible to have the fastest compilation
4464         if ((IsStage1FastestCompile(context->m_CgFlag, context->m_StagingCtx) ||
4465              IGC_GET_FLAG_VALUE(ForceFastestSIMD)) &&
4466             (m_program->m_DriverInfo->SupportFastestStage1() ||
4467              IGC_IS_FLAG_ENABLED(EnableFastestForVulkan)))
4468         {
4469             if (IGC_GET_FLAG_VALUE(FastestS1Experiments) == FCEXP_NO_EXPRIMENT)
4470             {
4471                 SaveOption(vISA_LocalScheduling, false);
4472                 SaveOption(vISA_preRA_Schedule, false);
4473                 SaveOption(vISA_SpillSpaceCompression, false);
4474                 SaveOption(vISA_LVN, false);
4475                 SaveOption(vISA_QuickTokenAllocation, true);
4476                 if (IGC_IS_FLAG_DISABLED(FastestWALinearScanForCS) ||
4477                     context->type != ShaderType::COMPUTE_SHADER)
4478                 {
4479                     SaveOption(vISA_LinearScan, true);
4480                 }
4481             }
4482             else
4483             {
4484                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_FASTSPILL)
4485                     SaveOption(vISA_FastSpill, true);
4486 
4487                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_LOCAL_SCHEDULING)
4488                     SaveOption(vISA_LocalScheduling, false);
4489 
4490                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_PRERA_SCHEDULING)
4491                     SaveOption(vISA_preRA_Schedule, false);
4492 
4493                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_NO_REMAT)
4494                     SaveOption(vISA_NoRemat, true);
4495 
4496                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_SPILL_COMPRESSION)
4497                     SaveOption(vISA_SpillSpaceCompression, false);
4498 
4499                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_LOCAL_DECL_SPLIT_GLOBAL_RA)
4500                     SaveOption(vISA_LocalDeclareSplitInGlobalRA, false);
4501 
4502                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_DISABLE_LVN)
4503                     SaveOption(vISA_LVN, false);
4504                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_QUICKTOKEN_ALLOC)
4505                     SaveOption(vISA_QuickTokenAllocation, true);
4506                 if (((IGC_IS_FLAG_DISABLED(FastestWALinearScanForCS) ||
4507                      context->type != ShaderType::COMPUTE_SHADER)) &&
4508                     (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_LINEARSCAN))
4509                     SaveOption(vISA_LinearScan, true); // use linearScan
4510 
4511                 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_1PASSRA)
4512                     SaveOption(vISA_FastCompileRA, true); // use 1 iteration RA
4513             }
4514         }
4515 
4516     } // InitVISABuilderOptions
4517 
    // Get a unique label for inline asm instruction blocks at the module level.
4519     // For each call to asm("..."), user can input the "%=" string format to generate a unique label for that call.
4520     // In this case we would generate "__4_000" for the 1st usage of "%=" in an asm block in the 5th function of the module.
GetUniqueInlineAsmLabel()4521     std::string CEncoder::GetUniqueInlineAsmLabel()
4522     {
4523         std::stringstream ss;
4524         ss << GetCompilerLabelPrefix() << labelFunctionIndex << "_" <<
4525             std::setw(3) << std::setfill('0') << labelInlineAsmCounter++;
4526         return ss.str();
4527     }
4528 
4529     // Creates a module/program-unique label prefix.
4530     // E.g. the 3rd label of the 5th function would be
4531     // "__4_002".  Ugly, yes, but you shouldn't see it as this is the
4532     // fallback case.  Short, unique, and debuggable....
4533     // Release-internal/debug will have better names.
CreateShortLabel(unsigned labelIndex) const4534     std::string CEncoder::CreateShortLabel(unsigned labelIndex) const
4535     {
4536         std::stringstream ss;
4537         ss << GetCompilerLabelPrefix() << labelFunctionIndex << "_" <<
4538             std::setw(3) << std::setfill('0') << labelIndex;
4539         return ss.str();
4540     }
4541 
4542     // Converts an LLVM label L into a name appropriate for vISA's label rules
    //  * remove illegal characters for vISA
    //  * constrains the length while maintaining uniqueness
4545     // The format is something that contains both function index and the
4546     // label name passed in.
4547     //
4548     // If enabled the output will be:
4549     //  _[FUNCTION-INDEX]_[LABEL-INDEX](_[LLVM-NAME])?
4550     // i.e. if the LLVM name is empty we omit that whole suffix
CreateVisaLabelName(const llvm::StringRef & L)4551     CName CEncoder::CreateVisaLabelName(const llvm::StringRef &L)
4552     {
4553 #ifndef IGC_MAP_LLVM_NAMES_TO_VISA
4554         return CreateShortLabel(labelCounter++);
4555 #else // IGC_MAP_LLVM_NAMES_TO_VISA
4556         static const size_t MAX_LLVM_NAME = 250;
4557 
4558         auto sanitizeChar = [](char c) {
4559             return isalnum(c) || c == '_' ? c : '_';
4560         };
4561 
4562         // The vISA backend constrains this to around 256 characters.
4563         // (1) Function names can be extremely long (currFunctionName).
4564         //     DPC++ with template gunk can be hundreds of characters.
4565         //     If the names are too long, punt and use a function index.
4566         //     Functions cannot be integers, thus the function part cannot
4567         //     collide if we use this replacement.
4568         // (2) LLVM labels (L) can be extremely long. E.g. LLVM chains
4569         //     together names synthetically and can get to >900 chars.
4570         //     In this case, we prefix a label index and suffix as much of
4571         //     the LLVM label on as possible.
4572         std::stringstream lbl;
4573         lbl << GetCompilerLabelPrefix();
4574         if (!currFunctionName.empty() && currFunctionName.size() < 128) {
4575             const char *s = currFunctionName.getVisaCString();
4576             while (*s)
4577                 lbl << sanitizeChar(*s++);
4578         } else {
4579             lbl << std::setw(2) << std::setfill('0') << labelFunctionIndex;
4580         }
4581         // since the label name could be the empty string, and to keep things
4582         // simple, we unconditionally use the label counter (and increment it)
4583         lbl << "_" << std::setw(3) << std::setfill('0') <<
4584             labelCounter++;
4585 
4586         size_t charsLeft = MAX_LLVM_NAME - (size_t)lbl.tellp();
4587         size_t nLeft = std::min(charsLeft, L.size());
4588         if (L.size() > 0 && nLeft > 0) {
4589             // if not the empty string then add a separator
4590             lbl << "_";
4591             nLeft--;
4592         }
4593         // suffix as many characters of the label as we can
4594         for (size_t i = 0; i < nLeft; i++) {
4595             lbl << sanitizeChar(L[i]);
4596         }
4597 
4598         return lbl.str();
4599 #endif // IGC_MAP_LLVM_NAMES_TO_VISA
4600     }
4601 
InitLabelMap(const llvm::Function * F)4602     void CEncoder::InitLabelMap(const llvm::Function* F)
4603     {
4604         labelMap.clear();
4605         labelMap.resize(F->size(), nullptr);
4606         labelCounter = 0;
4607         labelInlineAsmCounter = 0;
4608         labelFunctionIndex++;
4609         currFunctionName = F->getName();
4610         labelNameMap.clear();
4611         labelNameMap.reserve(F->size());
4612         for (auto BI = F->begin(), BE = F->end(); BI != BE; BI++)
4613         {
4614             labelNameMap.emplace_back(CreateVisaLabelName(BI->getName()));
4615         }
4616     }
4617 
    // Prepares the encoder for emitting m_program->entry:
    //  1. resets the encoder state and the label maps,
    //  2. creates the vISA builder and hands it the build options,
    //  3. adds the kernel, links it to prevKernel and sets its attributes,
    //  4. emits the "_main" subroutine label and creates the dummy surface and
    //     sampler variables, then initializes the control register.
    //
    // canAbortOnSpill / hasStackCall are forwarded to InitVISABuilderOptions.
    // hasInlineAsmCall selects the asm-writer builder mode.
    // prevKernel is passed to the builder via SetPrevKernel (may be nullptr).
    void CEncoder::InitEncoder(bool canAbortOnSpill, bool hasStackCall, bool hasInlineAsmCall, VISAKernel* prevKernel)
    {
        // Reset per-kernel encoder state.
        m_aliasesMap.clear();
        m_encoderState.m_SubSpanDestination = false;
        CodeGenContext* context = m_program->GetContext();
        m_encoderState.m_secondHalf = false;
        m_encoderState.m_secondNibble = false;
        // NOTE(review): m_enableVISAdump is cleared here and read further down
        // when choosing the asm name; presumably InitVISABuilderOptions sets it
        // in between - confirm.
        m_enableVISAdump = false;
        m_nestLevelForcedNoMaskRegion = 0;
        m_hasInlineAsm = hasInlineAsmCall;

        InitLabelMap(m_program->entry);

        vbuilder = nullptr;
        vAsmTextBuilder = nullptr;
        TARGET_PLATFORM VISAPlatform = GetVISAPlatform(&(context->platform));

        SetVISAWaTable(m_program->m_Platform->getWATable());

        // params2 owns the parameter strings; params only holds the raw
        // pointers that are handed to CreateVISABuilder below.
        llvm::SmallVector<const char*, 10> params;
        llvm::SmallVector<std::unique_ptr< char, std::function<void(char*)>>, 10> params2;
        if (!m_hasInlineAsm)
        {
            // Asm text writer mode doesnt need dump params
            InitBuildParams(params2);
            for (size_t i = 0; i < params2.size(); i++)
            {
                params.push_back((params2[i].get()));
            }
        }

        COMPILER_TIME_START(m_program->GetContext(), TIME_CG_vISACompile);
        bool enableVISADump = IGC_IS_FLAG_ENABLED(EnableVISASlowpath) || IGC_IS_FLAG_ENABLED(ShaderDumpEnable);
        // Inline asm switches the builder to asm-writer mode; the builder option
        // is VISA_BUILDER_BOTH when dumping is requested or inline asm is
        // present, and VISA_BUILDER_GEN otherwise.
        auto builderMode = m_hasInlineAsm ? vISA_ASM_WRITER : vISA_DEFAULT;
        auto builderOpt = (enableVISADump || m_hasInlineAsm) ? VISA_BUILDER_BOTH : VISA_BUILDER_GEN;
        V(CreateVISABuilder(vbuilder, builderMode, builderOpt, VISAPlatform, params.size(), params.data(),
            &m_vISAWaTable));

        if (IsCodePatchCandidate())
        {
            SetHasPrevKernel(prevKernel != nullptr);
        }
        InitVISABuilderOptions(VISAPlatform, canAbortOnSpill, hasStackCall, builderOpt == VISA_BUILDER_BOTH);

        // Pass all build options to builder
        SetBuilderOptions(vbuilder);

        vKernel = nullptr;

        // Default kernel name is the name of the entry function.
        std::string kernelName = std::string(m_program->entry->getName());
        if (context->m_instrTypes.hasDebugInfo)
        {
            // This metadata node is added by TransformBlocks pass for device side
            // enqueue feature of OCL2.0+.
            // The problem is that for device side enqueue, kernel name used in
            // IGC differs the one used to create JIT kernel. This leads to different
            // kernel names in .elf file and .dbg file. So dbgmerge tool cannot
            // merge the two together. With this metadata node we create a mapping
            // between the two names and when debug info is enabled, make JIT use
            // same name as IGC.
            // Names earlier -
            // ParentKernel_dispatch_0 in dbg and
            // __ParentKernel_block_invoke in elf
            // when kernel name is ParentKernel
            //
            auto md = m_program->entry->getParent()->getNamedMetadata("igc.device.enqueue");
            if (md)
            {
                // Each operand is read as a pair: operand 1 is compared with the
                // current kernel name and, on a match, operand 0 replaces it.
                // The loop does not stop at the first match.
                for (unsigned int i = 0; i < md->getNumOperands(); i++)
                {
                    auto mdOpnd = md->getOperand(i);
                    auto first = dyn_cast_or_null<MDString>(mdOpnd->getOperand(1));
                    if (first &&
                        first->getString().equals(kernelName))
                    {
                        auto second = dyn_cast_or_null<MDString>(mdOpnd->getOperand(0));
                        if (second)
                        {
                            kernelName = second->getString().str();
                        }
                    }
                }
            }
        }

        std::string asmName;
        if (m_enableVISAdump || context->m_instrTypes.hasDebugInfo)
        {
            asmName = GetDumpFileName("asm");
        }
        else
        {
            // Neither dumping nor debug info: fall back to fixed generic names.
            kernelName = "kernel";
            asmName = "kernel.asm";
        }

        V(vbuilder->AddKernel(vKernel, kernelName.c_str()));
        V(vbuilder->SetPrevKernel(prevKernel));
        V(vKernel->AddKernelAttribute("OutputAsmPath", asmName.length(), asmName.c_str()));

        SetDispatchSimdSize();
        SetSpillMemOffset();

        // Remember the kernel created here as the main kernel.
        vMainKernel = vKernel;

        auto gtpin_init = context->gtpin_init;
        if (gtpin_init)
        {
            vKernel->SetGTPinInit(gtpin_init);
        }

        // Right now only 1 main function in the kernel
        VISA_LabelOpnd* functionLabel = nullptr;
        V(vKernel->CreateVISALabelVar(functionLabel, "_main", LABEL_SUBROUTINE));
        V(vKernel->AppendVISACFLabelInst(functionLabel));

        V(vKernel->CreateVISASurfaceVar(dummySurface, "", 1));

        V(vKernel->CreateVISASamplerVar(samplervar, "", 1));

        // Set float denorm modes and rounding modes as default
        initCR(vKernel);
    }
4741 
SetDispatchSimdSize()4742     void CEncoder::SetDispatchSimdSize()
4743     {
4744         IGC_ASSERT(nullptr != vKernel);
4745         uint8_t dispatchSIMD = (uint8_t)numLanes(m_program->m_dispatchSize);
4746         V(vKernel->AddKernelAttribute("SimdSize", 1, &dispatchSIMD));
4747     }
4748 
SetSpillMemOffset()4749     void CEncoder::SetSpillMemOffset()
4750     {
4751         IGC_ASSERT(nullptr != vKernel);
4752         uint scratchSpaceSizeTemp = m_program->m_ScratchSpaceSize;
4753 
4754 
4755         if (scratchSpaceSizeTemp > 0) {
4756             V(vKernel->AddKernelAttribute("SpillMemOffset", 4, &scratchSpaceSizeTemp));
4757         }
4758     }
4759 
SetStackFunctionArgSize(uint size)4760     void CEncoder::SetStackFunctionArgSize(uint size)
4761     {
4762         uint8_t sz = (uint8_t)size;
4763         IGC_ASSERT(nullptr != vKernel);
4764         V(vKernel->AddKernelAttribute("ArgSize", 1, &sz));
4765     }
4766 
SetStackFunctionRetSize(uint size)4767     void CEncoder::SetStackFunctionRetSize(uint size)
4768     {
4769         uint8_t sz = (uint8_t)size;
4770         IGC_ASSERT(nullptr != vKernel);
4771         V(vKernel->AddKernelAttribute("RetValSize", 1, &sz));
4772     }
4773 
SetExternFunctionFlag()4774     void CEncoder::SetExternFunctionFlag()
4775     {
4776         IGC_ASSERT(nullptr != vKernel);
4777         V(vKernel->AddKernelAttribute("Extern", 0, nullptr));
4778     }
4779 
CopyEncoderState()4780     SEncoderState CEncoder::CopyEncoderState()
4781     {
4782         return m_encoderState;
4783     }
4784 
SetEncoderState(SEncoderState & newState)4785     void CEncoder::SetEncoderState(SEncoderState& newState)
4786     {
4787         m_encoderState = newState;
4788     }
4789 
GetVISAAlign(CVariable * var)4790     VISA_Align CEncoder:: GetVISAAlign(CVariable* var)
4791     {
4792         VISA_Align align;
4793         switch (var->GetAlign())
4794         {
4795         case EALIGN_BYTE: align = ALIGN_BYTE;
4796             break;
4797         case EALIGN_WORD: align = ALIGN_WORD;
4798             break;
4799         case EALIGN_DWORD: align = ALIGN_DWORD;
4800             break;
4801         case EALIGN_QWORD: align = ALIGN_QWORD;
4802             break;
4803         case EALIGN_OWORD: align = ALIGN_OWORD;
4804             break;
4805         case EALIGN_HWORD: align = ALIGN_HWORD;
4806             break;
4807         case EALIGN_32WORD: align = ALIGN_32WORD;
4808             break;
4809         case EALIGN_64WORD: align = ALIGN_64WORD;
4810             break;
4811         default:
4812             align = ALIGN_UNDEF;
4813             IGC_ASSERT(0);
4814             break;
4815         }
4816         return align;
4817     }
4818 
GetVISAVariable(CVariable * var)4819     VISA_GenVar* CEncoder::GetVISAVariable(CVariable* var)
4820     {
4821         if (m_encoderState.m_secondHalf)
4822         {
4823             if (var->GetNumberInstance() == 2)
4824             {
4825                 return var->visaGenVariable[1];
4826             }
4827         }
4828         return var->visaGenVariable[0];
4829     }
4830 
GetVISAVariable(CVariable * var,e_instance instance)4831     VISA_GenVar* CEncoder::GetVISAVariable(CVariable* var, e_instance instance)
4832     {
4833         VISA_GenVar* result = GetVISAVariable(var);
4834 
4835         if (instance != EINSTANCE_UNSPECIFIED &&
4836             var->GetNumberInstance() == 2)
4837         {
4838             if (instance == EINSTANCE_FIRST_HALF)
4839             {
4840                 result = var->visaGenVariable[0];
4841             }
4842             else
4843             {
4844                 result = var->visaGenVariable[1];
4845             }
4846         }
4847         return result;
4848     }
4849 
GetVISAPredefinedVar(CVariable * pVar,PreDefined_Vars var)4850     void CEncoder::GetVISAPredefinedVar(CVariable* pVar, PreDefined_Vars var)
4851     {
4852         vKernel->GetPredefinedVar(pVar->visaGenVariable[0], var);
4853         switch (var) {
4854         case PREDEFINED_NULL:
4855         case PREDEFINED_TSC:
4856         case PREDEFINED_SR0:
4857         case PREDEFINED_CR0:
4858         case PREDEFINED_CE0:
4859         case PREDEFINED_DBG:
4860             // Creating alias to ARF is not allowed.
4861             return;
4862         default:
4863             break;
4864         }
4865 
4866         VISA_GenVar* pAliasGenVar = nullptr;
4867 
4868         // Create alias to the specified pre-defined variable to match the
4869         // requested types and elements..
4870         vKernel->CreateVISAGenVar(
4871             pAliasGenVar,
4872             pVar->getVisaCString(),
4873             pVar->GetNumberElement(),
4874             pVar->GetType(),
4875             ALIGN_HWORD,
4876             pVar->visaGenVariable[0],
4877             pVar->GetAliasOffset());
4878 
4879         pVar->visaGenVariable[0] = pAliasGenVar;
4880     }
4881 
CreateVISAVar(CVariable * var)4882     void CEncoder::CreateVISAVar(CVariable* var)
4883     {
4884         IGC_ASSERT(nullptr != var);
4885 
4886         if (var->GetAlias() != NULL)
4887         {
4888             var->ResolveAlias();
4889             // In case the alias is an exact copy or just a sub variable just re-use the variable
4890             if (var->GetAlias()->GetType() == var->GetType())
4891             {
4892                 for (uint i = 0; i < var->GetNumberInstance(); i++)
4893                 {
4894                     var->visaGenVariable[i] = var->GetAlias()->visaGenVariable[i];
4895                 }
4896             }
4897             else
4898             {
4899                 SAlias alias(var->GetAlias(), var->GetType());
4900                 auto aliasPair = m_aliasesMap.insert(std::pair<SAlias, CVariable*>(alias, var));
4901                 if (aliasPair.second == false)
4902                 {
4903                     for (uint i = 0; i < var->GetNumberInstance(); i++)
4904                     {
4905                         var->visaGenVariable[i] = aliasPair.first->second->visaGenVariable[i];
4906                     }
4907                 }
4908                 else
4909                 {
4910                     IGC_ASSERT_MESSAGE(var->GetType() != ISA_TYPE_BOOL, "boolean cannot have alias");
4911                     for (uint i = 0; i < var->GetNumberInstance(); i++)
4912                     {
4913                         // Since we no longer use the built-in alias offset mechanism,
4914                         // we have to create the aliases to be of at least the size of the
4915                         // original variable (in bytes)
4916                         // Otherwise, we may end up a situation where we have an alias with
4917                         // an offset (m_aliasOffset, that we don't notify vISA about),
4918                         // and make an out-of-bounds access.
4919                         // This is the opposite of the calculation that happens in
4920                         // CVariable::CVariable.
4921 
4922                         const unsigned int denominator = CEncoder::GetCISADataTypeSize(var->GetType());
4923                         IGC_ASSERT(denominator);
4924                         uint16_t nbElement =
4925                             var->GetAlias()->GetNumberElement() *
4926                             CEncoder::GetCISADataTypeSize(var->GetAlias()->GetType()) /
4927                             denominator;
4928 
4929                         V(vKernel->CreateVISAGenVar(
4930                             var->visaGenVariable[i],
4931                             var->getVisaCString(),
4932                             nbElement,
4933                             var->GetType(),
4934                             GetVISAAlign(var->GetAlias()), // Use parent's align as we create an alias of the parent.
4935                             var->GetAlias()->visaGenVariable[i],
4936                             0));
4937                     }
4938                 }
4939             }
4940         }
4941         else
4942         {
4943             uint num_elts = var->GetNumberElement();
4944             if (var->GetVarType() == EVARTYPE_GENERAL)
4945             {
4946                 var->visaGenVariable[0] = nullptr;
4947                 var->visaGenVariable[1] = nullptr;
4948                 IGC_ASSERT_MESSAGE(var->GetType() != ISA_TYPE_BOOL, "boolean cannot be general var");
4949                 for (uint i = 0; i < var->GetNumberInstance(); i++)
4950                 {
4951                     V(vKernel->CreateVISAGenVar(
4952                         var->visaGenVariable[i],
4953                         var->getVisaCString(),
4954                         num_elts,
4955                         var->GetType(),
4956                         GetVISAAlign(var)));
4957                 }
4958             }
4959             else if (var->GetVarType() == EVARTYPE_PREDICATE)
4960             {
4961                 unsigned short nb = int_cast<unsigned short>(num_elts) * var->GetNumberInstance();
4962                 V(vKernel->CreateVISAPredVar(
4963                     var->visaPredVariable,
4964                     "",
4965                     nb));
4966             }
4967             else
4968             {
4969                 // when both array and index are uniform so is the destination address variable
4970                 uint nb = (var->IsUniform() && var->IsVectorUniform()) ? 1 : var->GetNumberElement();
4971                 V(vKernel->CreateVISAAddrVar(var->visaAddrVariable, "", nb));
4972             }
4973         }
4974     }
4975 
DeclareInput(CVariable * var,uint offset,uint instance)4976     void CEncoder::DeclareInput(CVariable* var, uint offset, uint instance)
4977     {
4978         // Avoid declaring more inputs/outputs than available registers
4979         if (offset + var->GetSize() >= vKernel->getNumRegTotal() * getGRFSize())
4980             return;
4981         V(vKernel->CreateVISAInputVar(
4982             var->visaGenVariable[instance],
4983             int_cast<unsigned short>(offset),
4984             int_cast<unsigned short>(var->GetSize())));
4985     }
4986 
MarkAsOutput(CVariable * var)4987     void CEncoder::MarkAsOutput(CVariable* var)
4988     {
4989         for (unsigned int i = 0; i < var->GetNumberInstance(); i++)
4990         {
4991             V(vKernel->AddAttributeToVar(var->visaGenVariable[i], "Output", 0, nullptr));
4992         }
4993     }
4994 
MarkAsPayloadLiveOut(CVariable * var)4995     void CEncoder::MarkAsPayloadLiveOut(CVariable* var)
4996     {
4997         for (unsigned int i = 0; i < var->GetNumberInstance(); i++)
4998         {
4999             V(vKernel->AddAttributeToVar(var->visaGenVariable[i], "PayloadLiveOut", 0, nullptr));
5000         }
5001     }
5002 
AvoidRetryOnSmallSpill() const5003     bool CEncoder::AvoidRetryOnSmallSpill() const
5004     {
5005         CodeGenContext* context = m_program->GetContext();
5006         return context->type == ShaderType::PIXEL_SHADER &&
5007             (m_program->m_dispatchSize == SIMDMode::SIMD8 || m_program->m_dispatchSize == SIMDMode::SIMD16) &&
5008             context->m_retryManager.IsFirstTry();
5009     }
5010 
CreateKernelSymbol(const std::string & kernelName,unsigned offset,unsigned size,SProgramOutput::ZEBinFuncSymbolTable & symbols)5011     void CEncoder::CreateKernelSymbol(const std::string& kernelName, unsigned offset,
5012         unsigned size, SProgramOutput::ZEBinFuncSymbolTable& symbols)
5013     {
5014         // kernel symbols are local symbols
5015         symbols.local.emplace_back(vISA::GenSymType::S_KERNEL, offset, size, kernelName);
5016     }
5017 
CreateSymbolTable(ValueToSymbolList & symbolTableList)5018     void CEncoder::CreateSymbolTable(ValueToSymbolList& symbolTableList)
5019     {
5020         Module* pModule = m_program->GetContext()->getModule();
5021         ModuleMetaData* modMD = m_program->GetContext()->getModuleMetaData();
5022 
5023         for (auto& F : pModule->getFunctionList())
5024         {
5025             // Find all variant function declarations
5026             if (F.isDeclaration() && F.hasFnAttribute("variant-function-decl"))
5027             {
5028                 // Parse the function name string
5029                 auto [symStr, fName, vecLen] = IGC::ParseVectorVariantFunctionString(F.getName());
5030 
5031                 Function* VFDef = pModule->getFunction(fName);
5032                 if (VFDef && numLanes(m_program->m_dispatchSize) == vecLen)
5033                 {
5034                     auto Iter = stackFuncMap.find(VFDef);
5035                     IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found");
5036 
5037                     vISA::GenSymEntry fEntry;
5038                     IGC_ASSERT(F.getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH);
5039                     strcpy_s(fEntry.s_name, vISA::MAX_SYMBOL_NAME_LENGTH, F.getName().str().c_str());
5040 
5041                     // Query vISA for the function's byte offset within the compiled module
5042                     // The actual binary offset data should point to the function definition
5043                     VISAFunction* visaFunc = Iter->second;
5044                     fEntry.s_type = vISA::GenSymType::S_FUNC;
5045                     fEntry.s_offset = (uint32_t)visaFunc->getGenOffset();
5046                     fEntry.s_size = (uint32_t)visaFunc->getGenSize();
5047 
5048                     symbolTableList.push_back(std::make_pair(&F, fEntry));
5049                 }
5050             }
5051             // Ignore variant function definitions
5052             else if (F.hasFnAttribute("variant-function-def"))
5053             {
5054                 IGC_ASSERT_MESSAGE(F.use_empty(), "This function should never be accessed directly");
5055                 continue;
5056             }
5057             // Find all functions in the module we need to export as symbols
5058             else if (F.hasFnAttribute("referenced-indirectly") && (!F.isDeclaration() || !F.use_empty()))
5059             {
5060                 vISA::GenSymEntry fEntry;
5061                 IGC_ASSERT(F.getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH);
5062                 strcpy_s(fEntry.s_name, vISA::MAX_SYMBOL_NAME_LENGTH, F.getName().str().c_str());
5063 
5064                 bool isTrue = false;
5065                 if (F.isDeclaration() || isTrue)
5066                 {
5067                     // If the function is only declared, set as undefined type
5068                     fEntry.s_type = vISA::GenSymType::S_UNDEF;
5069                     fEntry.s_offset = 0;
5070                     fEntry.s_size = 0;
5071                 }
5072                 else
5073                 {
5074                     auto Iter = stackFuncMap.find(&F);
5075                     IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found");
5076 
5077                     // Query vISA for the function's byte offset within the compiled module
5078                     VISAFunction* visaFunc = Iter->second;
5079                     fEntry.s_type = vISA::GenSymType::S_FUNC;
5080                     fEntry.s_offset = (uint32_t)visaFunc->getGenOffset();
5081                     fEntry.s_size = (uint32_t)visaFunc->getGenSize();
5082                 }
5083                 symbolTableList.push_back(std::make_pair(&F, fEntry));
5084             }
5085         }
5086 
5087         // Export global symbols
5088         for (auto global : modMD->inlineProgramScopeOffsets)
5089         {
5090             GlobalVariable* pGlobal = global.first;
5091 
5092             // Export the symbol if global is external/common linkage, or has uses in the module
5093             bool needSymbol = pGlobal->use_empty()
5094                 ? (modMD->compOpt.EnableTakeGlobalAddress && (pGlobal->hasCommonLinkage() || pGlobal->hasExternalLinkage()))
5095                 : true;
5096 
5097             if (needSymbol)
5098             {
5099                 StringRef name = pGlobal->getName();
5100                 unsigned addrSpace = pGlobal->getType()->getAddressSpace();
5101                 IGC_ASSERT(name.size() <= vISA::MAX_SYMBOL_NAME_LENGTH);
5102 
5103                 vISA::GenSymEntry sEntry;
5104                 memset(sEntry.s_name, '0', vISA::MAX_SYMBOL_NAME_LENGTH);
5105                 strcpy_s(sEntry.s_name, vISA::MAX_SYMBOL_NAME_LENGTH, name.str().c_str());
5106                 MDNode* md = pGlobal->getMetadata("ConstSampler");
5107                 if (md)
5108                 {
5109                     // Constant Sampler: s_offset contains the sampler ID
5110                     sEntry.s_type = vISA::GenSymType::S_CONST_SAMPLER;
5111                     sEntry.s_size = 0;
5112                     sEntry.s_offset = static_cast<uint32_t>(global.second);
5113                 }
5114                 else
5115                 {
5116                     sEntry.s_type = (addrSpace == ADDRESS_SPACE_GLOBAL) ? vISA::GenSymType::S_GLOBAL_VAR : vISA::GenSymType::S_GLOBAL_VAR_CONST;
5117                     sEntry.s_size = int_cast<uint32_t>(pModule->getDataLayout().getTypeAllocSize(pGlobal->getType()->getPointerElementType()));
5118                     sEntry.s_offset = static_cast<uint32_t>(global.second);
5119                 }
5120                 symbolTableList.push_back(std::make_pair(pGlobal, sEntry));
5121             }
5122         }
5123     }
5124 
CreateSymbolTable(void * & buffer,unsigned & bufferSize,unsigned & tableEntries)5125     void CEncoder::CreateSymbolTable(void*& buffer, unsigned& bufferSize, unsigned& tableEntries)
5126     {
5127         buffer = nullptr;
5128         bufferSize = 0;
5129         tableEntries = 0;
5130 
5131         ValueToSymbolList symbolTableList;
5132         CreateSymbolTable(symbolTableList);
5133 
5134         // Get the data for patch token
5135         if (!symbolTableList.empty())
5136         {
5137             std::vector<vISA::GenSymEntry> tempBufferData;
5138             // Collect the data just for the symbol table entries
5139             for (auto I : symbolTableList)
5140             {
5141                 auto symbolEntry = I.second;
5142                 tempBufferData.push_back(symbolEntry);
5143             }
5144 
5145             tableEntries = tempBufferData.size();
5146             bufferSize = tableEntries * sizeof(vISA::GenSymEntry);
5147             buffer = malloc(bufferSize);
5148             IGC_ASSERT_MESSAGE(nullptr != buffer, "Symbol table cannot be allocated");
5149             memcpy_s(buffer, bufferSize, tempBufferData.data(), bufferSize);
5150         }
5151     }
5152 
CreateSymbolTable(SProgramOutput::ZEBinFuncSymbolTable & funcSyms,SOpenCLProgramInfo::ZEBinProgramSymbolTable & programSyms)5153     void CEncoder::CreateSymbolTable(SProgramOutput::ZEBinFuncSymbolTable& funcSyms,
5154         SOpenCLProgramInfo::ZEBinProgramSymbolTable& programSyms)
5155     {
5156         ValueToSymbolList symbolTableList;
5157         CreateSymbolTable(symbolTableList);
5158 
5159         // Get the data for zebin
5160         for (auto I : symbolTableList)
5161         {
5162             Value* symbolValue = I.first;
5163             auto symbolEntry = I.second;
5164 
5165             if (Function* F = dyn_cast<Function>(symbolValue))
5166             {
5167                 funcSyms.function.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, F->getName().str());
5168             }
5169             else if (GlobalVariable* G = dyn_cast<GlobalVariable>(symbolValue))
5170             {
5171                 // const sampler
5172                 if (symbolEntry.s_type == vISA::GenSymType::S_CONST_SAMPLER) {
5173                     funcSyms.sampler.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str());
5174                 }
5175                 // global variables
5176                 else if (symbolEntry.s_type == vISA::GenSymType::S_GLOBAL_VAR) {
5177                     programSyms.global.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str());
5178                 }
5179                 // global constants and string literals
5180                 else {
5181                     Constant* initializer = G->getInitializer();
5182                     ConstantDataSequential* cds = dyn_cast<ConstantDataSequential>(initializer);
5183                     if (cds && (cds->isCString() || cds->isString()))
5184                         programSyms.globalStringConst.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str());
5185                     else
5186                         programSyms.globalConst.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str());
5187                 }
5188             }
5189             else
5190             {
5191                 IGC_ASSERT(0);
5192             }
5193         }
5194     }
5195 
CreateRelocationTable(void * & buffer,unsigned & bufferSize,unsigned & tableEntries)5196     void CEncoder::CreateRelocationTable(void*& buffer, unsigned& bufferSize, unsigned& tableEntries)
5197     {
5198         // for patch-token-based binary format
5199         buffer = nullptr;
5200         bufferSize = 0;
5201         tableEntries = 0;
5202 
5203         // vISA will directly return the buffer with GenRelocEntry layout
5204         IGC_ASSERT(nullptr != vMainKernel);
5205         V(vMainKernel->GetGenRelocEntryBuffer(buffer, bufferSize, tableEntries));
5206         IGC_ASSERT((sizeof(vISA::GenRelocEntry) * tableEntries) == bufferSize);
5207     }
5208 
CreateRelocationTable(SProgramOutput::RelocListTy & relocations)5209     void CEncoder::CreateRelocationTable(SProgramOutput::RelocListTy& relocations)
5210     {
5211         // for ZEBinary format
5212         IGC_ASSERT(nullptr != vMainKernel);
5213         V(vMainKernel->GetRelocations(relocations));
5214     }
5215 
CreateFuncAttributeTable(void * & buffer,unsigned & bufferSize,unsigned & tableEntries,SProgramOutput::FuncAttrListTy & attrs)5216     void CEncoder::CreateFuncAttributeTable(void*& buffer, unsigned& bufferSize,
5217         unsigned& tableEntries, SProgramOutput::FuncAttrListTy& attrs)
5218     {
5219         buffer = nullptr;
5220         bufferSize = 0;
5221         tableEntries = 0;
5222 
5223         std::vector<vISA::GenFuncAttribEntry> attribTable;
5224         for (auto it : funcAttributeMap)
5225         {
5226             vISA::GenFuncAttribEntry entry;
5227             Function* F = it.first;
5228             IGC_ASSERT(nullptr != F);
5229             IGC_ASSERT(F->getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH);
5230             strcpy_s(entry.f_name, vISA::MAX_SYMBOL_NAME_LENGTH, F->getName().str().c_str());
5231             entry.f_isKernel = it.second.isKernel ? 1 : 0;
5232             entry.f_hasBarrier = it.second.hasBarrier ? 1 : 0;
5233             entry.f_privateMemPerThread = (uint32_t) (it.second.argumentStackSize + it.second.allocaStackSize);
5234 
5235             // Get spill mem usage from visa
5236             VISAKernel* visaFunc = nullptr;
5237             if (it.second.isKernel)
5238             {
5239                 visaFunc = vMainKernel;
5240             }
5241             else
5242             {
5243                 auto Iter = stackFuncMap.find(F);
5244                 IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found");
5245                 visaFunc = Iter->second;
5246             }
5247             FINALIZER_INFO* jitInfo;
5248             visaFunc->GetJitInfo(jitInfo);
5249             entry.f_spillMemPerThread = jitInfo->spillMemUsed;
5250 
5251             attrs.emplace_back(entry.f_isKernel, entry.f_hasBarrier, entry.f_privateMemPerThread,
5252                 entry.f_spillMemPerThread, F->getName().str());
5253             attribTable.push_back(entry);
5254         }
5255 
5256         if (!attribTable.empty())
5257         {
5258             tableEntries = attribTable.size();
5259             bufferSize = tableEntries * sizeof(vISA::GenFuncAttribEntry);
5260             buffer = malloc(bufferSize);
5261             IGC_ASSERT_MESSAGE(nullptr != buffer, "Table cannot be allocated");
5262             memcpy_s(buffer, bufferSize, attribTable.data(), bufferSize);
5263         }
5264     }
5265 
Compile(bool hasSymbolTable)5266     void CEncoder::Compile(bool hasSymbolTable)
5267     {
5268         IGC_ASSERT(nullptr != m_program);
5269         CodeGenContext* const context = m_program->GetContext();
5270         SProgramOutput* const pOutput = m_program->ProgramOutput();
5271 
5272         if (m_program->m_dispatchSize == SIMDMode::SIMD8)
5273         {
5274             MEM_SNAPSHOT(IGC::SMS_AFTER_CISACreateDestroy_SIMD8);
5275         }
5276         else if (m_program->m_dispatchSize == SIMDMode::SIMD16)
5277         {
5278             MEM_SNAPSHOT(IGC::SMS_AFTER_CISACreateDestroy_SIMD16);
5279         }
5280         else if (m_program->m_dispatchSize == SIMDMode::SIMD32)
5281         {
5282             MEM_SNAPSHOT(IGC::SMS_AFTER_CISACreateDestroy_SIMD32);
5283         }
5284 
5285         int vIsaCompile = 0;
5286         VISAKernel* pMainKernel = nullptr;
5287 
5288         // ShaderOverride for .visaasm files
5289         std::vector<std::string> visaOverrideFiles;
5290         bool visaAsmOverride = false;
5291         std::string kernelName;
5292         if (IGC_IS_FLAG_ENABLED(ShaderOverride))
5293         {
5294             // Kernel count is one per visaBuilder
5295             // Function count depends on stackFuncMap size
5296             int kernelCount = 1;
5297             int functionCount = stackFuncMap.size();
5298             int count = kernelCount + functionCount;
5299             IGC::Debug::OutputFolderName folder = IGC::Debug::GetShaderOverridePath();
5300             Debug::DumpName name = IGC::Debug::GetDumpNameObj(m_program, "visaasm");
5301             kernelName = name.GetKernelName();
5302 
5303             visaOverrideFiles.push_back(name.AbsolutePath(folder));
5304 
5305             for (int i = 0; i < functionCount; i++)
5306             {
5307                 std::string tmpVisaFile = name.AbsolutePath(folder);
5308                 std::string::size_type asmNameEnd = tmpVisaFile.find_last_of('.');
5309                 tmpVisaFile = tmpVisaFile.substr(0, asmNameEnd);
5310                 std::stringstream asmName;
5311                 asmName << tmpVisaFile;
5312                 asmName << "_f";
5313                 asmName << i;
5314                 asmName << ".visaasm";
5315                 visaOverrideFiles.push_back(asmName.str());
5316             }
5317 
5318             if (visaOverrideFiles.size() == count)
5319             {
5320                 for (const std::string& file : visaOverrideFiles)
5321                 {
5322                     FILE*  tempFile = fopen(file.c_str(), "r");
5323                     if (tempFile)
5324                     {
5325                         visaAsmOverride = true;
5326                         fclose(tempFile);
5327                     }
5328                     else
5329                     {
5330                         visaAsmOverride = false;
5331                         if (functionCount > 0)
5332                         {
5333                             std::string message = "Cannot open overridden file! Put all .visaasm files in ShaderOverride dir.";
5334                             appendToShaderOverrideLogFile(message, "WARNING: ");
5335                         }
5336                         break;
5337 
5338                     }
5339                 }
5340             }
5341         }
5342 
5343         // Compile generated VISA text string for inlineAsm
5344         if (m_hasInlineAsm || visaAsmOverride)
5345         {
5346             llvm::SmallVector<const char*, 10> params;
5347             llvm::SmallVector<std::unique_ptr< char, std::function<void(char*)>>, 10> params2;
5348             InitBuildParams(params2);
5349             for (const auto &ptr : params2)
5350             {
5351                 params.push_back(ptr.get());
5352             }
5353 
5354             // Create a new builder for parsing the visaasm
5355             TARGET_PLATFORM VISAPlatform = GetVISAPlatform(&(context->platform));
5356             V(CreateVISABuilder(vAsmTextBuilder, vISA_ASM_READER, VISA_BUILDER_BOTH, VISAPlatform,
5357                 params.size(), params.data(), &m_vISAWaTable));
5358             // Use the same build options as before, except that we enable vISA verifier to catch
5359             // potential errors in user inline assembly
5360             SetBuilderOptions(vAsmTextBuilder);
5361             vAsmTextBuilder->SetOption(vISA_NoVerifyvISA, false);
5362 
5363             bool vISAAsmParseError = false;
5364             // Parse the generated VISA text
5365             if (visaAsmOverride)
5366             {
5367                 for (const std::string& tmpVisaFile : visaOverrideFiles)
5368                 {
5369                     llvm::SmallVector<char, 1024> visaAsmNameVector;
5370                     std::string visaAsmName = GetDumpFileName("");
5371 
5372                     StringRef visaAsmNameRef(visaAsmName.c_str());
5373                     StringRef tmpVisaFileRef(tmpVisaFile.c_str());
5374                     StringRef directory = llvm::sys::path::parent_path(visaAsmNameRef);
5375                     StringRef filename = llvm::sys::path::filename(tmpVisaFileRef);
5376 
5377                     llvm::sys::path::append(visaAsmNameVector, directory, filename);
5378                     visaAsmName = std::string(visaAsmNameVector.begin(), visaAsmNameVector.end());
5379 
5380                     auto result = vAsmTextBuilder->ParseVISAText(tmpVisaFile.c_str());
5381                     appendToShaderOverrideLogFile(visaAsmName, "OVERRIDEN: ");
5382                     vISAAsmParseError = (result != 0);
5383                     if (vISAAsmParseError) {
5384                         IGC_ASSERT_MESSAGE(0, "visaasm file parse error!");
5385                         break;
5386                     }
5387                 }
5388                 // After call to ParseVISAText, we have new VISAKernel, which don't have asm path set.
5389                 // So we need to set the OutputAsmPath attribute of overridden kernel,
5390                 // otherwise, we will not get .visaasm dump and .asm file dump
5391                 auto kernelName = IGC::Debug::GetDumpNameObj(m_program, "").GetKernelName();
5392                 std::string asmName = GetDumpFileName("asm");
5393                 auto overriddenKernel = vAsmTextBuilder->GetVISAKernel(kernelName);
5394                 overriddenKernel->AddKernelAttribute("OutputAsmPath", asmName.length(), asmName.c_str());
5395 
5396                 // We need to update stackFuncMap for the symbol table for the overridden object,
5397                 // because stackFuncMap contains information about functions for original object.
5398                 // Only the IndirectlyCalled functions should be updated,
5399                 // because these functions can be used in CreateSymbolTable.
5400                 // Other normal stack call functions aren't used in CreateSymbolTable.
5401                 if (hasSymbolTable && stackFuncMap.size() > 0)
5402                 {
5403                     Module* pModule = m_program->GetContext()->getModule();
5404                     for (auto& F : pModule->getFunctionList())
5405                     {
5406                         if (F.hasFnAttribute("referenced-indirectly") && (!F.isDeclaration() || !F.use_empty()))
5407                         {
5408                             auto Iter = stackFuncMap.find(&F);
5409                             IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found");
5410 
5411                             VISAFunction* original = Iter->second;
5412                             stackFuncMap[&F] = static_cast<VISAFunction*>(vAsmTextBuilder->GetVISAKernel(original->getFunctionName()));
5413                         }
5414                     }
5415                 }
5416             }
5417             else
5418             {
5419                 std::string parseTextFile = GetDumpFileName("inline.visaasm");
5420                 auto result = vAsmTextBuilder->ParseVISAText(vbuilder->GetAsmTextStream().str(), parseTextFile);
5421                 if (result != 0)
5422                 {
5423                     std::string output;
5424                     raw_string_ostream S(output);
5425                     S << "parsing vISA inline assembly failed:\n" << vAsmTextBuilder->GetCriticalMsg();
5426                     S.flush();
5427                     context->EmitError(output.c_str(), nullptr);
5428                     vISAAsmParseError = true;
5429                 }
5430             }
5431 
5432             if (vISAAsmParseError)
5433             {
5434                 COMPILER_TIME_END(m_program->GetContext(), TIME_CG_vISACompile);
5435                 return;
5436             }
5437             else
5438             {
5439                 if (!visaAsmOverride)
5440                 {
5441                     // vISA verifier is already invoked in ParseVISAText earlier
5442                     vAsmTextBuilder->SetOption(vISA_NoVerifyvISA, true);
5443                 }
5444                 pMainKernel = vAsmTextBuilder->GetVISAKernel(kernelName);
5445                 std::stringstream ss;
5446                 vIsaCompile = vAsmTextBuilder->Compile(
5447                     m_enableVISAdump ? GetDumpFileName("isa").c_str() : "",
5448                     (context->m_compileToVISAOnly) ? &ss : nullptr,
5449                     context->m_compileToVISAOnly);
5450             }
5451         }
5452         //Compile to generate the V-ISA binary
5453         else
5454         {
5455             pMainKernel = vMainKernel;
5456             std::stringstream ss;
5457             vIsaCompile = vbuilder->Compile(
5458                 m_enableVISAdump ? GetDumpFileName("isa").c_str() : "",
5459                 (context->m_compileToVISAOnly) ? &ss : nullptr,
5460                 context->m_compileToVISAOnly);
5461         }
5462 
5463         COMPILER_TIME_END(m_program->GetContext(), TIME_CG_vISACompile);
5464 
5465 #if GET_TIME_STATS
5466         // handle the vISA time counters differently here
5467         if (context->m_compilerTimeStats)
5468         {
5469             context->m_compilerTimeStats->recordVISATimers();
5470         }
5471 #endif
5472         KERNEL_INFO* vISAstats;
5473         pMainKernel->GetKernelInfo(vISAstats);
5474         // Collect metrics from vISA
5475         context->metrics.CollectRegStats(vISAstats);
5476 
5477         FINALIZER_INFO* jitInfo = nullptr;
5478         pMainKernel->GetJitInfo(jitInfo);
5479 
5480         // Depend on vISA information about barriers presence to make sure that it's
5481         // always set properly, even if a barrier is used as a part of Inline vISA code only.
5482         if (jitInfo->usesBarrier)
5483         {
5484             m_program->SetHasBarrier();
5485         }
5486 
5487         if (jitInfo->isSpill)
5488         {
5489             context->m_retryManager.SetSpillSize(jitInfo->numGRFSpillFill);
5490             m_program->m_spillSize = jitInfo->numGRFSpillFill;
5491             m_program->m_spillCost =
5492                 float(jitInfo->numGRFSpillFill) / jitInfo->numAsmCount;
5493 
5494             context->m_retryManager.numInstructions = jitInfo->numAsmCount;
5495         }
5496 
5497         if (IGC_IS_FLAG_ENABLED(DumpCompilerStats))
5498         {
5499             CompilerStats CompilerStats;
5500             pMainKernel->GetCompilerStats(CompilerStats);
5501             CompilerStatsUtils::RecordCodeGenCompilerStats(context, m_program->m_dispatchSize, CompilerStats, jitInfo);
5502         }
5503 
5504         if (vIsaCompile == -1)
5505         {
5506             IGC_ASSERT_MESSAGE(0, "CM failure in vbuilder->Compile()");
5507         }
5508         else if (vIsaCompile == -2)
5509         {
5510             IGC_ASSERT_MESSAGE(0, "CM user error in vbuilder->Compile()");
5511         }
5512         else if (vIsaCompile == -3) // CM early terminates on spill
5513         {
5514 #if (GET_SHADER_STATS)
5515             if (m_program->m_dispatchSize == SIMDMode::SIMD8)
5516             {
5517                 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_EARLYEXIT8, 1);
5518             }
5519             else if (m_program->m_dispatchSize == SIMDMode::SIMD16)
5520             {
5521                 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_EARLYEXIT16, 1);
5522             }
5523             else if (m_program->m_dispatchSize == SIMDMode::SIMD32)
5524             {
5525                 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_EARLYEXIT32, 1);
5526             }
5527 #endif
5528             context->SetSIMDInfo(SIMD_SKIP_SPILL, m_program->m_dispatchSize, m_program->m_ShaderDispatchMode);
5529             return;
5530         }
5531 
5532         if (m_program->m_dispatchSize == SIMDMode::SIMD8)
5533         {
5534             MEM_SNAPSHOT(IGC::SMS_AFTER_vISACompile_SIMD8);
5535             SimdSize8++;
5536         }
5537         else if (m_program->m_dispatchSize == SIMDMode::SIMD16)
5538         {
5539             MEM_SNAPSHOT(IGC::SMS_AFTER_vISACompile_SIMD16);
5540             SimdSize16++;
5541         }
5542         else if (m_program->m_dispatchSize == SIMDMode::SIMD32)
5543         {
5544             MEM_SNAPSHOT(IGC::SMS_AFTER_vISACompile_SIMD32);
5545             SimdSize32++;
5546         }
5547 
5548         if (m_program->m_dispatchSize == SIMDMode::SIMD16)
5549         {
5550             uint sendStallCycle = 0;
5551             uint staticCycle = 0;
5552             for (uint i = 0; i < jitInfo->BBNum; i++)
5553             {
5554                 sendStallCycle += jitInfo->BBInfo[i].sendStallCycle;
5555                 staticCycle += jitInfo->BBInfo[i].staticCycle;
5556             }
5557             m_program->m_sendStallCycle = sendStallCycle;
5558             m_program->m_staticCycle = staticCycle;
5559         }
5560 
5561         if (jitInfo->isSpill && (AvoidRetryOnSmallSpill() || jitInfo->avoidRetry))
5562         {
5563             context->m_retryManager.Disable();
5564         }
5565 
5566 #if (GET_SHADER_STATS && !PRINT_DETAIL_SHADER_STATS)
5567         if (m_program->m_dispatchSize == SIMDMode::SIMD8)
5568         {
5569             COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_INST_COUNT, jitInfo->numAsmCount);
5570             COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_SPILL8, (int)jitInfo->isSpill);
5571         }
5572         else if (m_program->m_dispatchSize == SIMDMode::SIMD16)
5573         {
5574             COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_INST_COUNT_SIMD16, jitInfo->numAsmCount);
5575             COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_SPILL16, (int)jitInfo->isSpill);
5576         }
5577         else if (m_program->m_dispatchSize == SIMDMode::SIMD32)
5578         {
5579             COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_INST_COUNT_SIMD32, jitInfo->numAsmCount);
5580             COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_SPILL32, (int)jitInfo->isSpill);
5581         }
5582 #endif
5583 
5584         if (context->m_compileToVISAOnly) {
5585             return;
5586         }
5587 
5588         void* genxbin = nullptr;
5589         int size = 0, binSize = 0;
5590         bool binOverride = false;
5591 
5592         V(pMainKernel->GetGenxBinary(genxbin, binSize));
5593         if (IGC_IS_FLAG_ENABLED(ShaderOverride))
5594         {
5595             Debug::DumpName name = IGC::Debug::GetDumpNameObj(m_program, "asm");
5596             std::string binFileName = name.overridePath();
5597 
5598             overrideShaderIGA(context->platform.getPlatformInfo(), genxbin, binSize, binFileName, binOverride);
5599 
5600             if (!binOverride)
5601             {
5602                 name = IGC::Debug::GetDumpNameObj(m_program, "dat");
5603                 binFileName = name.overridePath();
5604                 overrideShaderBinary(genxbin, binSize, binFileName, binOverride);
5605             }
5606 
5607         }
5608 
5609         IGC_ASSERT(genxbin != nullptr);
5610         size = binSize;
5611 
5612         // the kernel has to be padded to have a size aligned on 64 bytes
5613         size_t padding = iSTD::GetAlignmentOffset(size, 64);//m_program->m_Platform->getKernelPointerAlignSize());
5614         void* kernel = nullptr;
5615         if (size!=0)
5616         {
5617             kernel = IGC::aligned_malloc(size + padding, 16 /* sizeof(DQWORD) */);
5618             memcpy_s(kernel, size + padding, genxbin, binSize);
5619             // pad out the rest with 0s
5620             memset(static_cast<char*>(kernel) + size, 0, padding);
5621         }
5622         if (binOverride)
5623         {
5624             free(genxbin);
5625         }
5626         else
5627         {
5628             freeBlock(genxbin);
5629         }
5630 
5631         void* dbgInfo = nullptr;
5632         unsigned int dbgSize = 0;
5633         if (context->m_instrTypes.hasDebugInfo || m_enableVISAdump)
5634         {
5635             void* genxdbgInfo = nullptr;
5636             V(pMainKernel->GetGenxDebugInfo(genxdbgInfo, dbgSize));
5637             if (m_enableVISAdump)
5638             {
5639                 // passing VISAOptions: -generateDebugInfo should
5640                 // cause dbg file to be generated, even when
5641                 // hasDebugInfo = false.
5642                 if (context->m_instrTypes.hasDebugInfo)
5643                 {
5644                     // assertion check makes sense only if debug info
5645                     // is present in input.
5646                     IGC_ASSERT(nullptr != genxdbgInfo);
5647                     IGC_ASSERT(0 < dbgSize);
5648                 }
5649                 if (dbgSize > 0)
5650                 {
5651                     // dump dbg file only if it not empty
5652                     std::string debugFileNameStr = IGC::Debug::GetDumpName(m_program, "dbg");
5653                     FILE* const dbgFile = fopen(debugFileNameStr.c_str(), "wb+");
5654                     if (nullptr != dbgFile)
5655                     {
5656                         fwrite(genxdbgInfo, dbgSize, 1, dbgFile);
5657                         fclose(dbgFile);
5658                     }
5659                 }
5660             }
5661 
5662             dbgInfo = IGC::aligned_malloc(dbgSize, sizeof(void*));
5663 
5664             memcpy_s(dbgInfo, dbgSize, genxdbgInfo, dbgSize);
5665 
5666             freeBlock(genxdbgInfo);
5667         }
5668 
5669         pOutput->m_programBin = kernel;
5670         pOutput->m_programSize = size + padding;
5671         pOutput->m_unpaddedProgramSize = size;
5672         pOutput->m_scratchSpaceUsedBySpills = 0; // initializing
5673         pOutput->m_debugDataGenISA = dbgInfo;
5674         pOutput->m_debugDataGenISASize = dbgSize;
5675         pOutput->m_InstructionCount = jitInfo->numAsmCount;
5676         pOutput->m_BasicBlockCount = jitInfo->BBNum;
5677         if (context->getModuleMetaData()->compOpt.CaptureCompilerStats)
5678         {
5679             ReportCompilerStatistics(pMainKernel, pOutput);
5680         }
5681 
5682         pMainKernel->GetGTPinBuffer(pOutput->m_gtpinBuffer, pOutput->m_gtpinBufferSize);
5683 
5684         bool ZEBinEnabled = IGC_IS_FLAG_ENABLED(EnableZEBinary) || context->getCompilerOption().EnableZEBinary;
5685 
5686         if (hasSymbolTable)
5687         {
5688             if (ZEBinEnabled)
5689             {
5690                 // we can only support zebin symbols for OPENCL_SHADER for now
5691                 IGC_ASSERT(context->type == ShaderType::OPENCL_SHADER);
5692                 auto cl_context = static_cast<OpenCLProgramContext*>(context);
5693                 CreateSymbolTable(pOutput->m_symbols,
5694                     cl_context->m_programInfo.m_zebinSymbolTable);
5695             }
5696             else
5697             {
5698                 CreateSymbolTable(pOutput->m_funcSymbolTable,
5699                     pOutput->m_funcSymbolTableSize,
5700                     pOutput->m_funcSymbolTableEntries);
5701             }
5702         }
5703 
5704         if (ZEBinEnabled)
5705         {
5706             // create symbols for kernel.
5707             // The kernel Symbol has the same name as the kernel, and offset
5708             // pointed to 0.
5709             CreateKernelSymbol(m_program->entry->getName().str(), 0,
5710                 (unsigned)pMainKernel->getGenSize(), pOutput->m_symbols);
5711 
5712             // Emit symbol "_entry' as the actual kernel start. Maybe we can
5713             // consider to use the value of the _main label in this case. Now
5714             // set the symbol value as the max offset next to the per-thread
5715             // prolog, the cross-thread prolog, or the compute-FFID prolog.
5716             unsigned actual_kernel_start_off =
5717                 std::max(std::max(jitInfo->offsetToSkipPerThreadDataLoad,
5718                                   jitInfo->offsetToSkipCrossThreadDataLoad),
5719                          jitInfo->offsetToSkipSetFFIDGP1);
5720             CreateKernelSymbol("_entry", actual_kernel_start_off,
5721                 (unsigned)pMainKernel->getGenSize() - actual_kernel_start_off, pOutput->m_symbols);
5722         }
5723 
5724         if (ZEBinEnabled)
5725         {
5726             CreateRelocationTable(pOutput->m_relocs);
5727         }
5728         else
5729         {
5730             CreateRelocationTable(pOutput->m_funcRelocationTable,
5731                 pOutput->m_funcRelocationTableSize,
5732                 pOutput->m_funcRelocationTableEntries);
5733         }
5734 
5735         if (IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
5736         {
5737             CreateFuncAttributeTable(pOutput->m_funcAttributeTable,
5738                 pOutput->m_funcAttributeTableSize,
5739                 pOutput->m_funcAttributeTableEntries,
5740                 pOutput->m_funcAttrs);
5741         }
5742 
5743         if (jitInfo->isSpill == true)
5744         {
5745             pOutput->m_scratchSpaceUsedBySpills = jitInfo->spillMemUsed;
5746         }
5747 
5748         pOutput->setScratchSpaceUsedByShader(m_program->m_ScratchSpaceSize);
5749 
5750         pOutput->m_scratchSpaceUsedByGtpin = jitInfo->numBytesScratchGtpin;
5751 
5752         pOutput->m_offsetToSkipPerThreadDataLoad = jitInfo->offsetToSkipPerThreadDataLoad;
5753 
5754         pOutput->m_offsetToSkipSetFFIDGP = jitInfo->offsetToSkipSetFFIDGP;
5755 
5756         pOutput->m_numGRFTotal = jitInfo->numGRFTotal;
5757     }
5758 
DestroyVISABuilder()5759     void CEncoder::DestroyVISABuilder()
5760     {
5761         if (vAsmTextBuilder != nullptr)
5762         {
5763             V(::DestroyVISABuilder(vAsmTextBuilder));
5764             vAsmTextBuilder = nullptr;
5765         }
5766         V(::DestroyVISABuilder(vbuilder));
5767         vbuilder = nullptr;
5768     }
5769 
Copy(CVariable * dst,CVariable * src)5770     void CEncoder::Copy(CVariable* dst, CVariable* src)
5771     {
5772         IGC_ASSERT(nullptr != dst);
5773         IGC_ASSERT(nullptr != src);
5774         // undef value are not copied
5775         if (!src->IsUndef() || IGC_IS_FLAG_ENABLED(InitializeUndefValueEnable))
5776         {
5777             CVariable* rawDst = dst;
5778             IGC_ASSERT(GetCISADataTypeSize(src->GetType()) == GetCISADataTypeSize(dst->GetType()));
5779             bool isVecImm = src->IsImmediate() && (src->GetType() == ISA_TYPE_UV ||
5780                 src->GetType() == ISA_TYPE_V ||
5781                 src->GetType() == ISA_TYPE_VF);
5782             if (src->GetType() != dst->GetType() && !isVecImm)
5783             {
5784                 rawDst = m_program->BitCast(dst, src->GetType());
5785             }
5786             DataMov(ISA_MOV, rawDst, src);
5787         }
5788     }
5789 
BoolToInt(CVariable * dst,CVariable * src)5790     void CEncoder::BoolToInt(CVariable* dst, CVariable* src)
5791     {
5792         IGC_ASSERT(nullptr != dst);
5793         IGC_ASSERT(nullptr != src);
5794         IGC_ASSERT(src->GetType() == ISA_TYPE_BOOL);
5795 
5796         VISA_Type dstType = dst->GetType();
5797         IGC_ASSERT((dstType == ISA_TYPE_UD) || (dstType == ISA_TYPE_D) || (dstType == ISA_TYPE_UB) || (dstType == ISA_TYPE_B) || (dstType == ISA_TYPE_UW) || (dstType == ISA_TYPE_W));
5798 
5799         // undef value are not copied
5800         if (!src->IsUndef() || IGC_IS_FLAG_ENABLED(InitializeUndefValueEnable)) {
5801             // Casting 'dst' to BOOL is unnecessary.
5802             DataMov(ISA_MOV, dst, src);
5803         }
5804     }
5805 
GatherA64(CVariable * dst,CVariable * offset,unsigned elemSize,unsigned numElems)5806     void CEncoder::GatherA64(
5807         CVariable* dst,
5808         CVariable* offset,
5809         unsigned elemSize,
5810         unsigned numElems)
5811     {
5812         IGC_ASSERT_MESSAGE((elemSize == 8) || (elemSize == 32) || (elemSize == 64),
5813             "Only B/DW/QW-sized elements are supported!");
5814         IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4) || ((numElems == 8) && ((elemSize == 32) || m_program->m_Platform->has8ByteA64ByteScatteredMessage())),
5815             "Only 1/2/4/8 elements are supported!");
5816 
5817         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
5818         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
5819         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
5820 
5821         SIMDMode thisSM = offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize;
5822         if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && thisSM == SIMDMode::SIMD16)
5823         {
5824             // BDW A64 gather does not support SIMD16, split it into 2 SIMD8
5825             VISA_EMask_Ctrl execMask = GetAluEMask(offset);
5826             VISA_Exec_Size fromExecSize = EXEC_SIZE_16;
5827             VISA_Exec_Size toExecSize = EXEC_SIZE_8;
5828 
5829             if (numElems == 1 || elemSize == 8)
5830             {   // No mov instructions (for packing) are needed.
5831                 for (unsigned p = 0; p < 2; ++p)
5832                 {
5833                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
5834                     dstOpnd = GetRawDestination(dst, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, dst));
5835 
5836                     V(vKernel->AppendVISASvmGatherInst(
5837                         predOpnd,
5838                         SplitEMask(fromExecSize, toExecSize, p, execMask),
5839                         toExecSize,
5840                         visaBlockType(elemSize),
5841                         visaBlockNum(numElems),
5842                         addressOpnd, dstOpnd));
5843                 }
5844             }
5845             else
5846             {
5847                 // Do two SIMD8 gather and then merge (pack) the two simd8 results
5848                 // to form the single simd16 payload.
5849                 CVariable* V0, * V1;
5850                 uint16_t newNumElems = (uint16_t)8 * numElems;
5851                 V0 = m_program->GetNewVariable(
5852                     newNumElems,
5853                     dst->GetType(),
5854                     dst->GetAlign(),
5855                     dst->IsUniform(),
5856                     dst->getName());
5857                 V1 = m_program->GetNewVariable(
5858                     newNumElems,
5859                     dst->GetType(),
5860                     dst->GetAlign(),
5861                     dst->IsUniform(),
5862                     dst->getName());
5863 
5864                 for (unsigned p = 0; p < 2; ++p)
5865                 {
5866                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
5867                     dstOpnd = GetRawDestination(p == 0 ? V0 : V1);
5868 
5869                     V(vKernel->AppendVISASvmGatherInst(
5870                         predOpnd,
5871                         SplitEMask(fromExecSize, toExecSize, p, execMask),
5872                         toExecSize,
5873                         visaBlockType(elemSize),
5874                         visaBlockNum(numElems),
5875                         addressOpnd, dstOpnd));
5876                 }
5877 
5878                 uint32_t dstOfstBytes = dst->GetAliasOffset() + m_encoderState.m_dstOperand.subVar * getGRFSize();
5879                 MergePayloadToHigherSIMD(V0, V1, numElems, dst, dstOfstBytes, 16);
5880             }
5881             return;
5882         }
5883 
5884         V(vKernel->AppendVISASvmGatherInst(predOpnd, GetAluEMask(offset),
5885             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize),
5886             visaBlockType(elemSize),
5887             visaBlockNum(numElems),
5888             addressOpnd, dstOpnd));
5889     }
5890 
ScatterA64(CVariable * src,CVariable * offset,unsigned elemSize,unsigned numElems)5891     void CEncoder::ScatterA64(CVariable* src,
5892         CVariable* offset,
5893         unsigned elemSize,
5894         unsigned numElems) {
5895         IGC_ASSERT_MESSAGE((elemSize == 8) || (elemSize == 32) || (elemSize == 64),
5896             "Only B/DW/QW-sized elements are supported!");
5897         IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4) || ((numElems == 8) && ((elemSize == 32) || m_program->m_Platform->has8ByteA64ByteScatteredMessage())),
5898             "Only 1/2/4/8 elements are supported!");
5899 
5900         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
5901         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
5902         VISA_RawOpnd* srcOpnd = GetRawSource(src);
5903 
5904         SIMDMode thisSM = offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize;
5905         if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && thisSM == SIMDMode::SIMD16)
5906         {
5907             // BDW A64 scatter does not support SIMD16, split it into 2 SIMD8
5908             VISA_EMask_Ctrl execMask = GetAluEMask(offset);
5909             VISA_Exec_Size fromExecSize = EXEC_SIZE_16;
5910             VISA_Exec_Size toExecSize = EXEC_SIZE_8;
5911 
5912             if (numElems == 1 || elemSize == 8)
5913             {   // No unpacking (mov instructions) are needed.
5914                 for (unsigned p = 0; p < 2; ++p)
5915                 {
5916                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
5917                     srcOpnd = GetRawSource(src, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, src));
5918                     V(vKernel->AppendVISASvmScatterInst(
5919                         predOpnd,
5920                         SplitEMask(fromExecSize, toExecSize, p, execMask),
5921                         toExecSize,
5922                         visaBlockType(elemSize),
5923                         visaBlockNum(numElems),
5924                         addressOpnd, srcOpnd));
5925                 }
5926             }
5927             else
5928             {
5929                 // Unpacking the original simd16 data payload to form the two simd8
5930                 // data payload by splitting the original simd16 data payload.
5931                 CVariable* V0, * V1;
5932                 uint16_t newNumElems = (uint16_t)8 * numElems;
5933                 V0 = m_program->GetNewVariable(
5934                     newNumElems,
5935                     src->GetType(),
5936                     src->GetAlign(),
5937                     src->IsUniform(),
5938                     CName::NONE);
5939                 V1 = m_program->GetNewVariable(
5940                     newNumElems,
5941                     src->GetType(),
5942                     src->GetAlign(),
5943                     src->IsUniform(),
5944                     CName::NONE);
5945                 // Starting offset is calculated from AliasOffset only (subVar not used).
5946                 uint32_t srcOfstBytes = src->GetAliasOffset();
5947                 SplitPayloadToLowerSIMD(src, srcOfstBytes, numElems, V0, V1, 16);
5948 
5949                 for (unsigned p = 0; p < 2; ++p)
5950                 {
5951                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
5952                     srcOpnd = GetRawSource(p == 0 ? V0 : V1);
5953 
5954                     V(vKernel->AppendVISASvmScatterInst(
5955                         predOpnd,
5956                         SplitEMask(fromExecSize, toExecSize, p, execMask),
5957                         toExecSize,
5958                         visaBlockType(elemSize),
5959                         visaBlockNum(numElems),
5960                         addressOpnd, srcOpnd));
5961                 }
5962             }
5963             return;
5964         }
5965 
5966         V(vKernel->AppendVISASvmScatterInst(predOpnd, GetAluEMask(offset),
5967             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize),
5968             visaBlockType(elemSize),
5969             visaBlockNum(numElems),
5970             addressOpnd, srcOpnd));
5971         this->m_program->IncStatelessWritesCount();
5972     }
5973 
ByteGather(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)5974     void CEncoder::ByteGather(
5975         CVariable* dst,
5976         const ResourceDescriptor& resource,
5977         CVariable* offset,
5978         unsigned elemSize,
5979         unsigned numElems) {
5980         IGC_ASSERT_MESSAGE(elemSize == 8, "Only BYTE element is supported!");
5981         IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4),
5982             "Only 1/2/4 elements are supported!");
5983 
5984         // Extend the offset to 64bits and use the A64 gather message if needed
5985         if ((resource.m_surfaceType == ESURFACE_STATELESS) &&
5986             (m_program->m_DriverInfo->NeedWAToTransformA32MessagesToA64()) &&
5987             (m_program->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages != 0))
5988         {
5989             SEncoderState gatherState = CopyEncoderState();
5990             Push();
5991 
5992             CVariable* offset64 = m_program->GetNewVariable(
5993                 offset->GetNumberElement(),
5994                 ISA_TYPE_UQ,
5995                 EALIGN_GRF,
5996                 offset->IsUniform(),
5997                 offset->GetNumberInstance(),
5998                 CName(offset->getName(), "_64b"));
5999 
6000             CVariable* offset32UD = m_program->BitCast(offset, ISA_TYPE_UD);
6001 
6002             if (offset->IsUniform())
6003             {
6004                 uint elements = offset->GetNumberElement();
6005                 SetUniformSIMDSize(lanesToSIMDMode(elements));
6006                 SetNoMask();
6007                 SetSrcRegion(0, elements, elements, 1);
6008             }
6009 
6010             Cast(offset64, offset32UD);
6011             Push();
6012 
6013             SetEncoderState(gatherState);
6014             GatherA64(dst, offset64, elemSize, numElems);
6015             return;
6016 
6017         }
6018 
6019         VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource);
6020         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6021         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6022         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
6023 
6024         VISA_VectorOpnd* globalOffsetOpnd = 0;
6025         int val = 0;
6026         V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD));
6027 
6028         V(vKernel->AppendVISASurfAccessScatterScaledInst(ISA_GATHER_SCALED,
6029             predOpnd,
6030             GetAluEMask(offset),
6031             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) :
6032                 m_encoderState.m_simdSize),
6033             visaBlockNum(numElems),
6034             surfaceOpnd,
6035             globalOffsetOpnd,
6036             addressOpnd, dstOpnd));
6037     }
6038 
ByteScatter(CVariable * src,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)6039     void CEncoder::ByteScatter(
6040         CVariable* src,
6041         const ResourceDescriptor& resource,
6042         CVariable* offset,
6043         unsigned elemSize,
6044         unsigned numElems)
6045     {
6046         IGC_ASSERT_MESSAGE(elemSize == 8, "Only BYTE element is supported!");
6047         IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4),
6048             "Only 1/2/4 elements are supported!");
6049 
6050         // Extend the offset to 64bits and use the A64 gather message if needed
6051         if ((resource.m_surfaceType == ESURFACE_STATELESS) &&
6052             (m_program->m_DriverInfo->NeedWAToTransformA32MessagesToA64()) &&
6053             (m_program->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages != 0))
6054         {
6055             SEncoderState gatherState = CopyEncoderState();
6056             Push();
6057 
6058             CVariable* offset64 = m_program->GetNewVariable(
6059                 offset->GetNumberElement(),
6060                 ISA_TYPE_UQ,
6061                 EALIGN_GRF,
6062                 offset->IsUniform(),
6063                 offset->GetNumberInstance(),
6064                 CName(offset->getName(), "_64b"));
6065 
6066             CVariable* offset32UD = m_program->BitCast(offset, ISA_TYPE_UD);
6067 
6068             if (offset->IsUniform())
6069             {
6070                 uint elements = offset->GetNumberElement();
6071                 SetUniformSIMDSize(lanesToSIMDMode(elements));
6072                 SetNoMask();
6073                 SetSrcRegion(0, elements, elements, 1);
6074             }
6075 
6076             Cast(offset64, offset32UD);
6077             Push();
6078 
6079             SetEncoderState(gatherState);
6080             ScatterA64(src, offset64, elemSize, numElems);
6081             return;
6082 
6083         }
6084 
6085         VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource);
6086         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6087         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6088         VISA_RawOpnd* srcOpnd = GetRawSource(src);
6089 
6090         VISA_VectorOpnd* globalOffsetOpnd = 0;
6091         int val = 0;
6092         V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD));
6093 
6094         V(vKernel->AppendVISASurfAccessScatterScaledInst(ISA_SCATTER_SCALED,
6095             predOpnd,
6096             GetAluEMask(offset),
6097             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) :
6098                 m_encoderState.m_simdSize),
6099             visaBlockNum(numElems),
6100             surfaceOpnd,
6101             globalOffsetOpnd,
6102             addressOpnd, srcOpnd));
6103     }
6104 
Gather4ScaledNd(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset,unsigned nd)6105     void CEncoder::Gather4ScaledNd(CVariable* dst,
6106         const ResourceDescriptor& resource,
6107         CVariable* offset,
6108         unsigned nd) {
6109 
6110         VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource);
6111         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6112         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6113         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
6114 
6115         VISA_VectorOpnd* globalOffsetOpnd = 0;
6116         int val = 0;
6117         V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD));
6118 
6119         V(vKernel->AppendVISASurfAccessGather4Scatter4ScaledInst(
6120             ISA_GATHER4_SCALED,
6121             predOpnd,
6122             GetAluEMask(dst),
6123             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) :
6124                 m_encoderState.m_simdSize),
6125             ConvertChannelMaskToVisaType(BIT(nd) - 1),
6126             surfaceOpnd,
6127             globalOffsetOpnd,
6128             addressOpnd, dstOpnd));
6129     }
6130 
getNumChannels(CVariable * var) const6131     uint32_t CEncoder::getNumChannels(CVariable* var) const
6132     {
6133         IGC_ASSERT(nullptr != var);
6134         unsigned nd = var->GetSize();
6135         if (var->IsUniform())
6136         {
6137             IGC_ASSERT_MESSAGE(nd <= getGRFSize(), "Unknown Variable Size!");
6138             return 1;
6139         }
6140         else
6141         {
6142             static_assert(0 < SIZE_DWORD);
6143 
6144             switch (m_encoderState.m_simdSize)
6145             {
6146             case SIMDMode::SIMD8:
6147                 return nd / (8 * SIZE_DWORD);
6148             case SIMDMode::SIMD16:
6149                 return nd / (16 * SIZE_DWORD);
6150             case SIMDMode::SIMD32:
6151                 return nd / (32 * SIZE_DWORD);
6152             default:
6153                 IGC_ASSERT_MESSAGE(0, "Unknown SIMD size!");
6154                 return 1;
6155             }
6156         }
6157         return 1;
6158     }
6159 
Gather4Scaled(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset)6160     void CEncoder::Gather4Scaled(CVariable* dst,
6161         const ResourceDescriptor& resource,
6162         CVariable* offset)
6163     {
6164         unsigned nd = getNumChannels(dst);
6165         Gather4ScaledNd(dst, resource, offset, nd);
6166     }
6167 
Scatter4Scaled(CVariable * src,const ResourceDescriptor & resource,CVariable * offset)6168     void CEncoder::Scatter4Scaled(CVariable* src,
6169         const ResourceDescriptor& resource,
6170         CVariable* offset) {
6171         unsigned nd = getNumChannels(src);
6172 
6173         VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource);
6174         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6175         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6176         VISA_RawOpnd* srcOpnd = GetRawSource(src);
6177 
6178         VISA_VectorOpnd* globalOffsetOpnd = 0;
6179         int val = 0;
6180         V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD));
6181 
6182         V(vKernel->AppendVISASurfAccessGather4Scatter4ScaledInst(
6183             ISA_SCATTER4_SCALED,
6184             predOpnd,
6185             GetAluEMask(src),
6186             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) :
6187                 m_encoderState.m_simdSize),
6188             ConvertChannelMaskToVisaType(BIT(nd) - 1),
6189             surfaceOpnd,
6190             globalOffsetOpnd,
6191             addressOpnd, srcOpnd));
6192         if (ESURFACE_STATELESS == resource.m_surfaceType)
6193         {
6194             this->m_program->IncStatelessWritesCount();
6195         }
6196     }
6197 
Gather4A64(CVariable * dst,CVariable * offset)6198     void CEncoder::Gather4A64(CVariable* dst, CVariable* offset) {
6199         IGC_ASSERT(nullptr != dst);
6200         IGC_ASSERT_MESSAGE(dst->GetElemSize() == 4, "Gather4 must have 4-byte element");
6201 
6202         uint32_t dstOfstBytes = m_encoderState.m_dstOperand.subVar * getGRFSize() + dst->GetAliasOffset();
6203         unsigned nd = dst->GetSize();
6204         switch (m_encoderState.m_simdSize) {
6205         case SIMDMode::SIMD8:
6206             nd = nd / (8 * SIZE_DWORD);
6207             break;
6208         case SIMDMode::SIMD16:
6209             nd = nd / (16 * SIZE_DWORD);
6210             break;
6211         case SIMDMode::SIMD32:
6212             nd = nd / (32 * SIZE_DWORD);
6213             break;
6214         default:
6215             IGC_ASSERT_MESSAGE(0, "Unknown SIMD size!");
6216             return;
6217         }
6218 
6219         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6220         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6221         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
6222 
6223         VISA_VectorOpnd* globalOffsetOpnd = 0;
6224         int val = 0;
6225         V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD));
6226 
6227         if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && m_encoderState.m_simdSize == SIMDMode::SIMD16)
6228         {
6229             // BDW A64 untyped does not support SIMD16, split it into 2 SIMD8
6230             VISA_EMask_Ctrl execMask = GetAluEMask(offset);
6231             VISA_Exec_Size fromExecSize = EXEC_SIZE_16;
6232             VISA_Exec_Size toExecSize = EXEC_SIZE_8;
6233 
6234             if (nd == 1)
6235             {
6236                 // No packing is needed.
6237                 for (unsigned p = 0; p < 2; ++p)
6238                 {
6239                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
6240                     dstOpnd = GetRawDestination(dst, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, dst));
6241 
6242                     V(vKernel->AppendVISASvmGather4ScaledInst(
6243                         predOpnd,
6244                         SplitEMask(fromExecSize, toExecSize, p, execMask),
6245                         toExecSize,
6246                         ConvertChannelMaskToVisaType(BIT(nd) - 1),
6247                         globalOffsetOpnd,
6248                         addressOpnd, dstOpnd));
6249                 }
6250             }
6251             else
6252             {
6253                 // Packing the two SIMD8 data payload to form the SIMD16 data payload
6254                 // by merging the two simd8 data payload.
6255                 CVariable* V0, * V1;
6256                 uint16_t newNumElems = (uint16_t)8 * nd;
6257                 V0 = m_program->GetNewVariable(
6258                     newNumElems,
6259                     dst->GetType(),
6260                     dst->GetAlign(),
6261                     dst->IsUniform(),
6262                     CName(dst->getName(),"_M0"));
6263                 V1 = m_program->GetNewVariable(
6264                     newNumElems,
6265                     dst->GetType(),
6266                     dst->GetAlign(),
6267                     dst->IsUniform(),
6268                     CName(dst->getName(),"_M8"));
6269 
6270                 for (unsigned p = 0; p < 2; ++p)
6271                 {
6272                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
6273                     dstOpnd = GetRawDestination(p == 0 ? V0 : V1);
6274 
6275                     V(vKernel->AppendVISASvmGather4ScaledInst(
6276                         predOpnd,
6277                         SplitEMask(fromExecSize, toExecSize, p, execMask),
6278                         toExecSize,
6279                         ConvertChannelMaskToVisaType(BIT(nd) - 1),
6280                         globalOffsetOpnd,
6281                         addressOpnd, dstOpnd));
6282                 }
6283 
6284                 MergePayloadToHigherSIMD(V0, V1, nd, dst, dstOfstBytes, 16);
6285             }
6286             return;
6287         }
6288 
6289         V(vKernel->AppendVISASvmGather4ScaledInst(
6290             predOpnd,
6291             GetAluEMask(dst),
6292             visaExecSize(m_encoderState.m_simdSize),
6293             ConvertChannelMaskToVisaType(BIT(nd) - 1),
6294             globalOffsetOpnd,
6295             addressOpnd, dstOpnd));
6296     }
6297 
Scatter4A64(CVariable * src,CVariable * offset)6298     void CEncoder::Scatter4A64(CVariable* src, CVariable* offset) {
6299         IGC_ASSERT(nullptr != src);
6300         IGC_ASSERT_MESSAGE(src->GetElemSize() == 4, "scatter4 must have 4-byte element");
6301 
6302         uint32_t srcOfstBytes = src->GetAliasOffset();
6303         unsigned nd = src->GetSize();
6304         switch (m_encoderState.m_simdSize) {
6305         case SIMDMode::SIMD8:
6306             nd = nd / (8 * SIZE_DWORD);
6307             break;
6308         case SIMDMode::SIMD16:
6309             nd = nd / (16 * SIZE_DWORD);
6310             break;
6311         case SIMDMode::SIMD32:
6312             nd = nd / (32 * SIZE_DWORD);
6313             break;
6314         default:
6315             IGC_ASSERT_MESSAGE(0, "unknown SIMD size");
6316             return;
6317         }
6318 
6319         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6320         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6321         VISA_RawOpnd* srcOpnd = GetRawSource(src);
6322 
6323         VISA_VectorOpnd* globalOffsetOpnd = 0;
6324         int val = 0;
6325         V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD));
6326 
6327         if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && m_encoderState.m_simdSize == SIMDMode::SIMD16)
6328         {
6329             // BDW A64 untyped does not support SIMD16, split it into 2 SIMD8
6330             VISA_EMask_Ctrl execMask = GetAluEMask(src);
6331             VISA_Exec_Size fromExecSize = EXEC_SIZE_16;
6332             VISA_Exec_Size toExecSize = EXEC_SIZE_8;
6333 
6334             if (nd == 1)
6335             {
6336                 // No need to do unpacking
6337                 for (unsigned p = 0; p < 2; ++p)
6338                 {
6339                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
6340                     srcOpnd = GetRawSource(src, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, src));
6341 
6342                     V(vKernel->AppendVISASvmScatter4ScaledInst(
6343                         predOpnd,
6344                         SplitEMask(fromExecSize, toExecSize, p, execMask),
6345                         toExecSize,
6346                         ConvertChannelMaskToVisaType(BIT(nd) - 1),
6347                         globalOffsetOpnd,
6348                         addressOpnd, srcOpnd));
6349                 }
6350             }
6351             else
6352             {
6353                 // Unpacking is needed from the original SIMD16 data payload to form
6354                 // two SIMD8 data payload by spliting the original simd16 data payload.
6355                 CVariable* V0, * V1;
6356                 uint16_t newNumElems = (uint16_t)8 * nd;
6357                 V0 = m_program->GetNewVariable(
6358                     newNumElems,
6359                     src->GetType(),
6360                     src->GetAlign(),
6361                     src->IsUniform(),
6362                     CName(src->getName(),"_M0"));
6363                 V1 = m_program->GetNewVariable(
6364                     newNumElems,
6365                     src->GetType(),
6366                     src->GetAlign(),
6367                     src->IsUniform(),
6368                     CName(src->getName(),"_M8"));
6369 
6370                 SplitPayloadToLowerSIMD(src, srcOfstBytes, nd, V0, V1, 16);
6371 
6372                 for (unsigned p = 0; p < 2; ++p)
6373                 {
6374                     addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset));
6375                     srcOpnd = GetRawSource(p == 0 ? V0 : V1);
6376 
6377                     V(vKernel->AppendVISASvmScatter4ScaledInst(
6378                         predOpnd,
6379                         SplitEMask(fromExecSize, toExecSize, p, execMask),
6380                         toExecSize,
6381                         ConvertChannelMaskToVisaType(BIT(nd) - 1),
6382                         globalOffsetOpnd,
6383                         addressOpnd, srcOpnd));
6384                 }
6385             }
6386             return;
6387         }
6388 
6389         V(vKernel->AppendVISASvmScatter4ScaledInst(
6390             predOpnd,
6391             GetAluEMask(src),
6392             visaExecSize(m_encoderState.m_simdSize),
6393             ConvertChannelMaskToVisaType(BIT(nd) - 1),
6394             globalOffsetOpnd,
6395             addressOpnd, srcOpnd));
6396     }
6397 
AtomicRawA64(AtomicOp atomic_op,const ResourceDescriptor & resource,CVariable * dst,CVariable * offset,CVariable * src0,CVariable * src1,unsigned short bitwidth)6398     void CEncoder::AtomicRawA64(
6399         AtomicOp atomic_op,
6400         const ResourceDescriptor& resource,
6401         CVariable* dst,
6402         CVariable* offset,
6403         CVariable* src0,
6404         CVariable* src1,
6405         unsigned short bitwidth)
6406     {
6407         // For cmpxchg, we have to change the order of arguments.
6408         if (atomic_op == EATOMIC_CMPXCHG) {
6409             std::swap(src0, src1);
6410         }
6411 
6412         VISAAtomicOps atomicOpcode = convertAtomicOpEnumToVisa(atomic_op);
6413 
6414         if (m_encoderState.m_simdSize == SIMDMode::SIMD16)
6415         {
6416             // Split SIMD16 atomic ops into two SIMD8 ones.
6417             VISA_EMask_Ctrl execMask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask);
6418             VISA_Exec_Size fromExecSize = visaExecSize(m_encoderState.m_simdSize);
6419             VISA_Exec_Size toExecSize = SplitExecSize(fromExecSize, 2);
6420 
6421             for (unsigned thePart = 0; thePart != 2; ++thePart)
6422             {
6423                 CVariable* rawOpndVar = nullptr;
6424                 uint32_t rawOpndOffset = 0;
6425                 bool isFirstHalf = thePart == 0;
6426 
6427                 std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(offset, isFirstHalf, execMask);
6428                 VISA_RawOpnd* addressOpnd = GetRawSource(rawOpndVar, rawOpndOffset);
6429                 std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(src0, isFirstHalf, execMask);
6430                 VISA_RawOpnd* src0Opnd = GetRawSource(rawOpndVar, rawOpndOffset);
6431                 std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(src1, isFirstHalf, execMask);
6432                 VISA_RawOpnd* src1Opnd = GetRawSource(rawOpndVar, rawOpndOffset);
6433 
6434                 // dst needs special handling since its move has to come after the send
6435                 VISA_RawOpnd* dstOpnd = nullptr;
6436                 bool needsTmpDst = !isFirstHalf && dst && (dst->GetElemSize() * 8) % getGRFSize() != 0;
6437                 if (!needsTmpDst)
6438                 {
6439                     std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(dst, isFirstHalf, execMask);
6440                     dstOpnd = GetRawDestination(rawOpndVar, rawOpndOffset);
6441                 }
6442                 else
6443                 {
6444                     rawOpndVar = m_program->GetNewVariable(
6445                         8,
6446                         dst->GetType(),
6447                         CVariable::getAlignment(getGRFSize()),
6448                         CName(dst->getName(), "_RET"));
6449                     dstOpnd = GetRawDestination(rawOpndVar, 0);
6450                 }
6451 
6452                 V(vKernel->AppendVISASvmAtomicInst(GetFlagOperand(m_encoderState.m_flag),
6453                     SplitEMask(fromExecSize, toExecSize, thePart, execMask),
6454                     toExecSize, atomicOpcode, bitwidth,
6455                     addressOpnd, src0Opnd, src1Opnd, dstOpnd));
6456                 this->m_program->IncStatelessWritesCount();
6457 
6458                 if (needsTmpDst)
6459                 {
6460                     SModifier mod;
6461                     mod.init();
6462                     mod.subReg = 8;
6463                     auto dstOpnd = GetDestinationOperand(dst, mod);
6464 
6465                     mod.init();
6466                     auto srcOpnd = GetSourceOperand(rawOpndVar, mod);
6467 
6468                     V(vKernel->AppendVISADataMovementInst(
6469                         ISA_MOV, nullptr, false,
6470                         SplitEMask(EXEC_SIZE_16, EXEC_SIZE_8, 1, execMask),
6471                         EXEC_SIZE_8, dstOpnd, srcOpnd));
6472                 }
6473             }
6474 
6475             return;
6476         }
6477 
6478         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6479         VISA_RawOpnd* src0Opnd = GetRawSource(src0);
6480         VISA_RawOpnd* src1Opnd = GetRawSource(src1);
6481         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
6482 
6483         V(vKernel->AppendVISASvmAtomicInst(GetFlagOperand(m_encoderState.m_flag),
6484             ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask),
6485             visaExecSize(m_encoderState.m_simdSize),
6486             atomicOpcode,
6487             bitwidth,
6488             addressOpnd,
6489             src0Opnd,
6490             src1Opnd,
6491             dstOpnd));
6492         this->m_program->IncStatelessWritesCount();
6493     }
6494 
Wait()6495     void CEncoder::Wait()
6496     {
6497         V(vKernel->AppendVISAWaitInst(nullptr));
6498     }
6499 
SendVmeIme(CVariable * bindingTableIndex,unsigned char streamMode,unsigned char searchControlMode,CVariable * uniInputVar,CVariable * imeInputVar,CVariable * ref0Var,CVariable * ref1Var,CVariable * costCenterVar,CVariable * outputVar)6500     void CEncoder::SendVmeIme(CVariable* bindingTableIndex,
6501         unsigned char streamMode,
6502         unsigned char searchControlMode,
6503         CVariable* uniInputVar,
6504         CVariable* imeInputVar,
6505         CVariable* ref0Var,
6506         CVariable* ref1Var,
6507         CVariable* costCenterVar,
6508         CVariable* outputVar) {
6509 
6510         VISA_StateOpndHandle* surface = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex);
6511         VISA_RawOpnd* uniInput = GetRawSource(uniInputVar);
6512         VISA_RawOpnd* imeInput = GetRawSource(imeInputVar);
6513         VISA_RawOpnd* ref0 = GetRawSource(ref0Var);
6514         VISA_RawOpnd* ref1 = GetRawSource(ref1Var);
6515         VISA_RawOpnd* costCenter = GetRawSource(costCenterVar);
6516         VISA_RawOpnd* output = GetRawDestination(outputVar);
6517         V(vKernel->AppendVISAMiscVME_IME(surface, streamMode, searchControlMode, uniInput, imeInput, ref0, ref1, costCenter, output));
6518     }
6519 
SendVmeFbr(CVariable * bindingTableIndex,CVariable * uniInputVar,CVariable * fbrInputVar,CVariable * FBRMbModeVar,CVariable * FBRSubMbShapeVar,CVariable * FBRSubPredModeVar,CVariable * outputVar)6520     void CEncoder::SendVmeFbr(CVariable* bindingTableIndex,
6521         CVariable* uniInputVar,
6522         CVariable* fbrInputVar,
6523         CVariable* FBRMbModeVar,
6524         CVariable* FBRSubMbShapeVar,
6525         CVariable* FBRSubPredModeVar,
6526         CVariable* outputVar) {
6527         VISA_StateOpndHandle* surface = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex);
6528         VISA_RawOpnd* UNIInput = GetRawSource(uniInputVar);
6529         VISA_RawOpnd* FBRInput = GetRawSource(fbrInputVar);
6530         VISA_VectorOpnd* FBRMbMode = GetSourceOperand(FBRMbModeVar, m_encoderState.m_srcOperand[0]);
6531         VISA_VectorOpnd* FBRSubMbShape = GetSourceOperand(FBRSubMbShapeVar, m_encoderState.m_srcOperand[1]);
6532         VISA_VectorOpnd* FBRSubPredMode = GetSourceOperand(FBRSubPredModeVar, m_encoderState.m_srcOperand[2]);
6533         VISA_RawOpnd* output = GetRawDestination(outputVar);
6534 
6535         V(vKernel->AppendVISAMiscVME_FBR(surface, UNIInput, FBRInput, FBRMbMode, FBRSubMbShape, FBRSubPredMode, output));
6536     }
6537 
SendVmeSic(CVariable * bindingTableIndex,CVariable * uniInputVar,CVariable * sicInputVar,CVariable * outputVar)6538     void CEncoder::SendVmeSic(
6539         CVariable* bindingTableIndex,
6540         CVariable* uniInputVar,
6541         CVariable* sicInputVar,
6542         CVariable* outputVar)
6543     {
6544         VISA_StateOpndHandle* surface = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex);
6545         VISA_RawOpnd* UNIInput = GetRawSource(uniInputVar);
6546         VISA_RawOpnd* SICInput = GetRawSource(sicInputVar);
6547         VISA_RawOpnd* output = GetRawDestination(outputVar);
6548 
6549         V(vKernel->AppendVISAMiscVME_SIC(surface, UNIInput, SICInput, output));
6550     }
6551 
SendVideoAnalytic(llvm::GenIntrinsicInst * inst,CVariable * vaResult,CVariable * coords,CVariable * size,CVariable * srcImg,CVariable * sampler)6552     void CEncoder::SendVideoAnalytic(
6553         llvm::GenIntrinsicInst* inst,
6554         CVariable* vaResult,
6555         CVariable* coords,
6556         CVariable* size,
6557         CVariable* srcImg,
6558         CVariable* sampler)
6559     {
6560         VISA_RawOpnd* vaOutput = GetRawDestination(vaResult);
6561 
6562         SModifier mod0 = m_encoderState.m_srcOperand[0];
6563         SModifier mod1 = m_encoderState.m_srcOperand[1];
6564 
6565         mod0.specialRegion = mod1.specialRegion = true;
6566         mod0.region[0] = mod1.region[0] = 0;
6567         mod0.region[1] = mod1.region[1] = 1;
6568         mod0.region[2] = mod1.region[2] = 0;
6569         mod0.subReg = 0;
6570         mod0.subVar = 0;
6571 
6572         if (coords->IsUniform())
6573         {
6574             mod1.subReg = 1;
6575             mod1.subVar = 0;
6576         }
6577         else
6578         {
6579             mod1.subReg = 0;
6580             mod1.subVar = 2;
6581         }
6582 
6583         VISA_VectorOpnd* uOffset = GetSourceOperand(coords, mod0);
6584         VISA_VectorOpnd* vOffset = GetSourceOperand(coords, mod1);
6585 
6586         if (size && size->IsUniform())
6587         {
6588             mod1.subReg = 1;
6589             mod1.subVar = 0;
6590         }
6591         else
6592         {
6593             mod1.subReg = 0;
6594             mod1.subVar = 2;
6595         }
6596 
6597         VISA_VectorOpnd* wSize = (size ? GetSourceOperand(size, mod0) : NULL);
6598         VISA_VectorOpnd* hSize = (size ? GetSourceOperand(size, mod1) : NULL);
6599 
6600         // So far we support only one VA function per kernel, and other sample
6601         // messages are not supported when there is VA function within the kernel.
6602         // So, for now it should be fine to always use bti 0 for VA functions.
6603         DWORD btiIndex = 0;
6604         DWORD mmfMode = 0;
6605 
6606         VISA_StateOpndHandle* surfaceOpnd = GetBTIOperand(btiIndex);
6607         VISA_StateOpndHandle* samplerHnd = GetSamplerOperand(sampler);
6608         VISA_VectorOpnd* mmModeOpnd = NULL;
6609 
6610         EDMode erodeDilateMode = VA_DILATE;
6611         EDExecMode execMode = VA_ED_64x4;
6612         bool isBigKernel = true;
6613 
6614         if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE)
6615         {
6616             isBigKernel = false;
6617         }
6618 
6619         switch (inst->getIntrinsicID())
6620         {
6621         case GenISAIntrinsic::GenISA_vaErode:
6622             erodeDilateMode = VA_ERODE;
6623         case GenISAIntrinsic::GenISA_vaDilate:
6624             V(vKernel->AppendVISAVAErodeDilate(erodeDilateMode, samplerHnd, surfaceOpnd, uOffset, vOffset, execMode, vaOutput));
6625             break;
6626         case GenISAIntrinsic::GenISA_vaMinMaxFilter:
6627             V(vKernel->CreateVISAImmediate(mmModeOpnd, &mmfMode, ISA_TYPE_UD));
6628             V(vKernel->AppendVISAVAMinMaxFilter(samplerHnd, surfaceOpnd, uOffset, vOffset, AVS_16_FULL, VA_MMF_16x4, mmModeOpnd, vaOutput));
6629             break;
6630         case GenISAIntrinsic::GenISA_vaConvolveGRF_16x1:
6631             V(vKernel->AppendVISAVAConvolve(samplerHnd, surfaceOpnd, uOffset, vOffset, VA_CONV_16x1, isBigKernel, vaOutput));
6632             break;
6633         case GenISAIntrinsic::GenISA_vaConvolve:
6634         case GenISAIntrinsic::GenISA_vaConvolveGRF_16x4:
6635             V(vKernel->AppendVISAVAConvolve(samplerHnd, surfaceOpnd, uOffset, vOffset, VA_CONV_16x4, isBigKernel, vaOutput));
6636             break;
6637         case GenISAIntrinsic::GenISA_vaMinMax:
6638             V(vKernel->CreateVISAImmediate(mmModeOpnd, &mmfMode, ISA_TYPE_UD));
6639             V(vKernel->AppendVISAVAMinMax(surfaceOpnd, uOffset, vOffset, mmModeOpnd, vaOutput));
6640             break;
6641         case GenISAIntrinsic::GenISA_vaCentroid:
6642             V(vKernel->AppendVISAVACentroid(surfaceOpnd, uOffset, vOffset, wSize, vaOutput));
6643             break;
6644         case GenISAIntrinsic::GenISA_vaBoolCentroid:
6645         case GenISAIntrinsic::GenISA_vaBoolSum:
6646             V(vKernel->AppendVISAVABooleanCentroid(surfaceOpnd, uOffset, vOffset, wSize, hSize, vaOutput));
6647             break;
6648         default:
6649             IGC_ASSERT_MESSAGE(0, "Trying to emit unrecognized video analytic instruction (listed above)");
6650             break;
6651         };
6652     }
6653 
SetVISAWaTable(WA_TABLE const & waTable)6654     void CEncoder::SetVISAWaTable(WA_TABLE const& waTable)
6655     {
6656         // Copy from driver WA table to VISA WA table,
6657         // then update the conditional W/A
6658         m_vISAWaTable = waTable;
6659 
6660         if (m_program->GetShaderType() != ShaderType::PIXEL_SHADER &&
6661             m_program->GetShaderType() != ShaderType::COMPUTE_SHADER &&
6662             m_program->GetShaderType() != ShaderType::OPENCL_SHADER)
6663         {
6664             m_vISAWaTable.WaClearTDRRegBeforeEOTForNonPS = waTable.WaClearTDRRegBeforeEOTForNonPS;
6665         }
6666         else
6667         {
6668             m_vISAWaTable.WaClearTDRRegBeforeEOTForNonPS = false;
6669         }
6670 
6671         if (IGC_IS_FLAG_DISABLED(ForceSendsSupportOnSKLA0))
6672         {
6673             m_vISAWaTable.WaDisableSendsSrc0DstOverlap = waTable.WaDisableSendsSrc0DstOverlap;
6674         }
6675         else
6676         {
6677             m_vISAWaTable.WaDisableSendsSrc0DstOverlap = false;
6678         }
6679 
6680         TODO("Limit this C0 WA as required to only Compute , as it causes hangs in some 3D Workloads");
6681         if (IGC_IS_FLAG_DISABLED(DisableWaSendSEnableIndirectMsgDesc) &&
6682             (m_program->GetShaderType() == ShaderType::COMPUTE_SHADER ||
6683                 m_program->GetShaderType() == ShaderType::OPENCL_SHADER))
6684         {
6685             m_vISAWaTable.WaSendSEnableIndirectMsgDesc = waTable.WaSendSEnableIndirectMsgDesc;
6686         }
6687         else
6688         {
6689             m_vISAWaTable.WaSendSEnableIndirectMsgDesc = false;
6690         }
6691 
6692         if (IGC_IS_FLAG_DISABLED(DisableWaDisableSIMD16On3SrcInstr))
6693         {
6694             m_vISAWaTable.WaDisableSIMD16On3SrcInstr = waTable.WaDisableSIMD16On3SrcInstr;
6695         }
6696         else
6697         {
6698             m_vISAWaTable.WaDisableSIMD16On3SrcInstr = false;
6699         }
6700     }
6701 
GetRowAndColOffset(CVariable * var,unsigned int subVar,unsigned int subReg,unsigned char & rowOff,unsigned char & colOff)6702     void CEncoder::GetRowAndColOffset(CVariable* var, unsigned int subVar, unsigned int subReg, unsigned char& rowOff, unsigned char& colOff)
6703     {
6704         IGC_ASSERT(nullptr != var);
6705         unsigned int varTypeSize = GetCISADataTypeSize(var->GetType());
6706         unsigned int offset = var->GetAliasOffset() + subVar * getGRFSize() + subReg * varTypeSize;
6707         IGC_ASSERT(0 < getGRFSize());
6708         IGC_ASSERT(0 < varTypeSize);
6709         IGC_ASSERT_MESSAGE((offset % getGRFSize()) % varTypeSize == 0, "offset has to be aligned on element size");
6710         rowOff = int_cast<unsigned char>(offset / getGRFSize());
6711         colOff = int_cast<unsigned char>((offset % getGRFSize()) / varTypeSize);
6712     }
6713 
Loc(unsigned int line)6714     void CEncoder::Loc(unsigned int line)
6715     {
6716         V(vKernel->AppendVISAMiscLOC(line));
6717     }
6718 
File(std::string & s)6719     void CEncoder::File(std::string& s)
6720     {
6721         V(vKernel->AppendVISAMiscFileInst(s.c_str()));
6722     }
6723 
Lifetime(VISAVarLifetime StartOrEnd,CVariable * dst)6724     void CEncoder::Lifetime(VISAVarLifetime StartOrEnd, CVariable* dst)
6725     {
6726         SModifier noMod; // Default is no mod.
6727         noMod.init();
6728         VISA_VectorOpnd* srcOpnd = GetSourceOperand(dst, noMod);
6729         V(vKernel->AppendVISALifetime(StartOrEnd, srcOpnd));
6730     }
6731 
DebugLinePlaceholder()6732     void CEncoder::DebugLinePlaceholder()
6733     {
6734         V(vKernel->AppendVISADebugLinePlaceholder());
6735     }
6736 
ConvertPrecisionToVisaType(PrecisionType P)6737     GenPrecision ConvertPrecisionToVisaType(PrecisionType P)
6738     {
6739         switch (P) {
6740         default: break;
6741         case PrecisionType::S2: return GenPrecision::S2;
6742         case PrecisionType::S4: return GenPrecision::S4;
6743         case PrecisionType::S8: return GenPrecision::S8;
6744         case PrecisionType::U2: return GenPrecision::U2;
6745         case PrecisionType::U4: return GenPrecision::U4;
6746         case PrecisionType::U8: return GenPrecision::U8;
6747         case PrecisionType::BF16: return GenPrecision::BF16;
6748         case PrecisionType::FP16: return GenPrecision::FP16;
6749         }
6750 
6751         return GenPrecision::INVALID;
6752     }
6753 
6754 
dpas(CVariable * dst,CVariable * input,CVariable * weight,PrecisionType weight_precision,CVariable * activation,PrecisionType activation_precision,uint8_t systolicDepth,uint8_t repeatCount,bool IsDpasw)6755     void CEncoder::dpas(
6756         CVariable* dst, CVariable* input,
6757         CVariable* weight, PrecisionType weight_precision,
6758         CVariable* activation, PrecisionType activation_precision,
6759         uint8_t systolicDepth, uint8_t repeatCount,
6760         bool IsDpasw)
6761     {
6762         SModifier noMod; // Default is no mod.
6763         noMod.init();
6764         // PrecisionType to GenPrecision
6765         GenPrecision src1Precision = ConvertPrecisionToVisaType(weight_precision);
6766         GenPrecision src2Precision = ConvertPrecisionToVisaType(activation_precision);
6767 
6768         VISA_EMask_Ctrl execMask = GetAluEMask(dst);
6769         VISA_Exec_Size execSize = EXEC_SIZE_8;
6770         {
6771             VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
6772             VISA_RawOpnd* srcOpnd0 = GetRawSource(input);
6773             VISA_RawOpnd* srcOpnd1 = GetRawSource(weight);
6774             VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(activation, noMod);
6775             V(vKernel->AppendVISADpasInst(
6776                 IsDpasw ? ISA_DPASW : ISA_DPAS,
6777                 execMask,
6778                 execSize,
6779                 dstOpnd,
6780                 srcOpnd0,
6781                 srcOpnd1,
6782                 srcOpnd2,
6783                 src2Precision,
6784                 src1Precision,
6785                 systolicDepth,
6786                 repeatCount));
6787         }
6788     }
6789 
QWGather(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)6790     void CEncoder::QWGather(CVariable* dst,
6791         const ResourceDescriptor& resource,
6792         CVariable* offset,
6793         unsigned elemSize,
6794         unsigned numElems)
6795     {
6796         IGC_ASSERT_MESSAGE(elemSize == 64, "Only QWord element is supported!");
6797         IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4),
6798             "Only 1/2/4 elements are supported!");
6799 
6800         VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource);
6801         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6802         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6803         VISA_RawOpnd* dstOpnd = GetRawDestination(dst);
6804 
6805         V(vKernel->AppendVISAQwordGatherInst(
6806             predOpnd,
6807             GetAluEMask(offset),
6808             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) :
6809                 m_encoderState.m_simdSize),
6810             visaBlockNum(numElems),
6811             surfaceOpnd,
6812             addressOpnd, dstOpnd));
6813     }
6814 
QWScatter(CVariable * src,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)6815     void CEncoder::QWScatter(CVariable* src,
6816         const ResourceDescriptor& resource,
6817         CVariable* offset,
6818         unsigned elemSize,
6819         unsigned numElems)
6820     {
6821         IGC_ASSERT_MESSAGE(elemSize == 64, "Only QWord element is supported");
6822         IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4),
6823             "Only 1/2/4 elements are supported!");
6824 
6825         VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource);
6826         VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag);
6827         VISA_RawOpnd* addressOpnd = GetRawSource(offset);
6828         VISA_RawOpnd* srcOpnd = GetRawSource(src);
6829 
6830         V(vKernel->AppendVISAQwordScatterInst(
6831             predOpnd,
6832             GetAluEMask(offset),
6833             visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) :
6834                 m_encoderState.m_simdSize),
6835             visaBlockNum(numElems),
6836             surfaceOpnd,
6837             addressOpnd, srcOpnd));
6838     }
6839 
6840 
GetVariableName(CVariable * var)6841     std::string CEncoder::GetVariableName(CVariable* var)
6842     {
6843         IGC_ASSERT(nullptr != var);
6844         if (var->IsImmediate())
6845         {
6846             std::stringstream temp;
6847             temp << "0x" << std::hex << var->GetImmediateValue() << ":" << CISATypeTable[var->GetType()].typeName;
6848             return temp.str();
6849         }
6850         switch (var->GetVarType())
6851         {
6852         case EVARTYPE_GENERAL:
6853             return vKernel->getVarName(GetVISAVariable(var));
6854         case EVARTYPE_PREDICATE:
6855             return vKernel->getVarName(var->visaPredVariable);
6856         case EVARTYPE_ADDRESS:
6857             return vKernel->getVarName(var->visaAddrVariable);
6858         case EVARTYPE_SURFACE:
6859             return vKernel->getVarName(var->visaSurfVariable);
6860         case EVARTYPE_SAMPLER:
6861             return vKernel->getVarName(var->visaSamplerVariable);
6862         default:
6863             IGC_ASSERT_MESSAGE(0, "Unknown var type");
6864             return "";
6865         }
6866     }
6867 
GetDumpFileName(std::string extension)6868     std::string CEncoder::GetDumpFileName(std::string extension)
6869     {
6870         std::string filename = IGC::Debug::GetDumpName(m_program, extension.c_str());
6871         return filename;
6872     }
6873 
6874 
ReportCompilerStatistics(VISAKernel * pMainKernel,SProgramOutput * pOutput)6875     void CEncoder::ReportCompilerStatistics(VISAKernel* pMainKernel, SProgramOutput* pOutput)
6876     {
6877         CompilerStats compilerStats;
6878         pMainKernel->GetCompilerStats(compilerStats);
6879         int simdsize = GetThreadCount(m_program->m_dispatchSize);
6880 
6881 
6882         // set optional statistics
6883         if (compilerStats.Find(CompilerStats::numCyclesStr()))
6884         {
6885             pOutput->m_NumCycles.emplace(compilerStats.GetI64(CompilerStats::numCyclesStr(), simdsize));
6886         }
6887 
6888         if (compilerStats.Find(CompilerStats::numGRFFillStr()))
6889         {
6890             pOutput->m_NumGRFFill.emplace(compilerStats.GetI64(CompilerStats::numGRFFillStr(), simdsize));
6891         }
6892 
6893         if (compilerStats.Find(CompilerStats::numGRFSpillStr()))
6894         {
6895             pOutput->m_NumGRFSpill.emplace(compilerStats.GetI64(CompilerStats::numGRFSpillStr(), simdsize));
6896         }
6897 
6898         if (compilerStats.Find(CompilerStats::numSendStr()))
6899         {
6900             pOutput->m_NumSends.emplace(compilerStats.GetI64(CompilerStats::numSendStr(), simdsize));
6901         }
6902         FINALIZER_INFO* jitInfo = nullptr;
6903         if (0 == pMainKernel->GetJitInfo(jitInfo))
6904         {
6905             uint sendStallCycle = 0;
6906             for (uint i = 0; i < jitInfo->BBNum; i++)
6907             {
6908                 sendStallCycle += jitInfo->BBInfo[i].sendStallCycle;
6909             }
6910             pOutput->m_NumSendStallCycles.emplace(sendStallCycle);
6911         }
6912     }
6913 
GetThreadCount(SIMDMode simdMode)6914     int CEncoder::GetThreadCount(SIMDMode simdMode)
6915     {
6916         int simdsize = 0;
6917         switch (m_program->m_dispatchSize)
6918         {
6919         case SIMDMode::SIMD8:
6920             simdsize = 8;
6921             break;
6922         case SIMDMode::SIMD16:
6923             simdsize = 16;
6924             break;
6925         case SIMDMode::SIMD32:
6926             simdsize = 32;
6927             break;
6928         default:
6929             break;
6930         }
6931         return simdsize;
6932     }
6933 
6934 }
6935