1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "common/LLVMWarningsPush.hpp"
10 #include <llvm/Support/ScaledNumber.h>
11 #include "llvm/IR/DataLayout.h"
12 #include "llvm/ADT/StringExtras.h"
13 #include "common/LLVMWarningsPop.hpp"
14 #include "AdaptorCommon/ImplicitArgs.hpp"
15 #include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
16 #include "Compiler/CISACodeGen/OpenCLKernelCodeGen.hpp"
17 #include "Compiler/CISACodeGen/messageEncoding.hpp"
18 #include "Compiler/Optimizer/OpenCLPasses/ResourceAllocator/ResourceAllocator.hpp"
19 #include "Compiler/Optimizer/OpenCLPasses/ProgramScopeConstants/ProgramScopeConstantAnalysis.hpp"
20 #include "Compiler/Optimizer/OpenCLPasses/LocalBuffers/InlineLocalsResolution.hpp"
21 #include "Compiler/Optimizer/OpenCLPasses/KernelArgs.hpp"
22 #include "Compiler/CISACodeGen/EmitVISAPass.hpp"
23 #include "Compiler/Optimizer/OCLBIUtils.h"
24 #include "AdaptorOCL/OCL/KernelAnnotations.hpp"
25 #include "common/allocator.h"
26 #include "common/igc_regkeys.hpp"
27 #include "common/Stats.hpp"
28 #include "common/SystemThread.h"
29 #include "common/secure_mem.h"
30 #include "common/MDFrameWork.h"
31 #include <iStdLib/utility.h>
32 #include "Probe/Assertion.h"
33 #include "ZEBinWriter/zebin/source/ZEELFObjectBuilder.hpp"
34 
35 /***********************************************************************************
36 This file contains the code specific to opencl kernels
37 ************************************************************************************/
38 
39 using namespace llvm;
40 using namespace IGC;
41 using namespace IGC::IGCMD;
42 
43 namespace IGC
44 {
45 
getLocalIdBufferSize(SIMDMode mode)46     unsigned int getLocalIdBufferSize(SIMDMode mode)
47     {
48         auto simdSize = numLanes(mode);
49         IGC_ASSERT(simdSize != 0);
50 
51         // as per spec, size of local id buffer depends on simd size
52         // simd size * size/elem * #dims
53         unsigned int allocSize = simdSize * 2 * 3;
54 
55         // simd8 version has some reserved fields
56         if (simdSize == 8)
57             allocSize *= 2;
58 
59         // field to hold pointer to local id buffer
60         allocSize += 8;
61 
62         return allocSize;
63     }
64 
COpenCLKernel(const OpenCLProgramContext * ctx,Function * pFunc,CShaderProgram * pProgram)65     COpenCLKernel::COpenCLKernel(const OpenCLProgramContext* ctx, Function* pFunc, CShaderProgram* pProgram) :
66         CComputeShaderBase(pFunc, pProgram)
67     {
68         m_HasTID = false;
69         m_HasGlobalSize = false;
70         m_disableMidThreadPreemption = false;
71         m_perWIStatelessPrivateMemSize = 0;
72         m_Context = const_cast<OpenCLProgramContext*>(ctx);
73         m_localOffsetsMap.clear();
74         m_pBtiLayout = &(ctx->btiLayout);
75         m_Platform = &(ctx->platform);
76         m_DriverInfo = &(ctx->m_DriverInfo);
77 
78     }
79 
~COpenCLKernel()80     COpenCLKernel::~COpenCLKernel()
81     {
82         ClearKernelInfo();
83         m_simdProgram.Destroy();
84     }
85 
ClearKernelInfo()86     void COpenCLKernel::ClearKernelInfo()
87     {
88         // Global pointer arguments
89         m_kernelInfo.m_pointerArgument.clear();
90 
91         // Non-argument pointer inputs
92         m_kernelInfo.m_pointerInput.clear();
93 
94         // Local pointer arguments
95         m_kernelInfo.m_localPointerArgument.clear();
96 
97         // Sampler inputs
98         m_kernelInfo.m_samplerInput.clear();
99 
100         // Sampler arguments
101         m_kernelInfo.m_samplerArgument.clear();
102 
103         // Scalar inputs
104         m_kernelInfo.m_constantInputAnnotation.clear();
105 
106         // Scalar arguments
107         m_kernelInfo.m_constantArgumentAnnotation.clear();
108 
109         // Image arguments
110         m_kernelInfo.m_imageInputAnnotations.clear();
111 
112         // Kernel Arg Reflection Info
113         m_kernelInfo.m_kernelArgInfo.clear();
114 
115         // Printf strings
116         m_kernelInfo.m_printfStringAnnotations.clear();
117 
118         // Argument to BTI/Sampler index map
119         m_kernelInfo.m_argIndexMap.clear();
120     }
121 
PreCompile()122     void COpenCLKernel::PreCompile()
123     {
124         ClearKernelInfo();
125         CreateImplicitArgs();
126         //We explicitly want this to be GRF-sized, without relation to simd width
127 
128         RecomputeBTLayout();
129 
130         ModuleMetaData* modMD = m_Context->getModuleMetaData();
131         auto funcIter = modMD->FuncMD.find(entry);
132 
133         // Initialize the table of offsets for GlobalVariables representing locals
134         if (funcIter != modMD->FuncMD.end())
135         {
136             auto loIter = funcIter->second.localOffsets.begin();
137             auto loEnd = funcIter->second.localOffsets.end();
138             for (; loIter != loEnd; ++loIter)
139             {
140                 LocalOffsetMD loHandle = *loIter;
141                 m_localOffsetsMap[loHandle.m_Var] = loHandle.m_Offset;
142             }
143         }
144     }
145 
hasWorkGroupWalkOrder()146     bool COpenCLKernel::hasWorkGroupWalkOrder()
147     {
148         const CodeGenContext* pCtx = GetContext();
149         const ModuleMetaData* MMD = pCtx->getModuleMetaData();
150         if (auto I = MMD->FuncMD.find(entry); I != MMD->FuncMD.end())
151         {
152             auto& FMD = I->second;
153             auto& Order = FMD.workGroupWalkOrder;
154             if (Order.dim0 != 0 || Order.dim1 != 0 || Order.dim2 != 0)
155                 return true;
156         }
157 
158         return false;
159     }
160 
getResourceInfo(int argNo)161     SOpenCLKernelInfo::SResourceInfo COpenCLKernel::getResourceInfo(int argNo)
162     {
163         CodeGenContext* pCtx = GetContext();
164         ModuleMetaData* modMD = pCtx->getModuleMetaData();
165         FunctionMetaData* funcMD = &modMD->FuncMD[entry];
166         ResourceAllocMD* resAllocMD = &funcMD->resAllocMD;
167         IGC_ASSERT_MESSAGE(resAllocMD->argAllocMDList.size() > 0, "ArgAllocMD List Out of Bounds");
168         ArgAllocMD* argAlloc = &resAllocMD->argAllocMDList[argNo];
169 
170         SOpenCLKernelInfo::SResourceInfo resInfo;
171         ResourceTypeEnum type = (ResourceTypeEnum)argAlloc->type;
172 
173         if (type == ResourceTypeEnum::UAVResourceType ||
174             type == ResourceTypeEnum::BindlessUAVResourceType)
175         {
176             resInfo.Type = SOpenCLKernelInfo::SResourceInfo::RES_UAV;
177         }
178         else if (type == ResourceTypeEnum::SRVResourceType)
179         {
180             resInfo.Type = SOpenCLKernelInfo::SResourceInfo::RES_SRV;
181         }
182         else
183         {
184             resInfo.Type = SOpenCLKernelInfo::SResourceInfo::RES_OTHER;
185         }
186         resInfo.Index = argAlloc->indexType;
187         return resInfo;
188     }
189 
getExtensionInfo(int argNo)190     ResourceExtensionTypeEnum COpenCLKernel::getExtensionInfo(int argNo)
191     {
192         CodeGenContext* pCtx = GetContext();
193         ModuleMetaData* modMD = pCtx->getModuleMetaData();
194         FunctionMetaData* funcMD = &modMD->FuncMD[entry];
195         ResourceAllocMD* resAllocMD = &funcMD->resAllocMD;
196         IGC_ASSERT_MESSAGE(resAllocMD->argAllocMDList.size() > 0, "ArgAllocMD List Out of Bounds");
197         ArgAllocMD* argAlloc = &resAllocMD->argAllocMDList[argNo];
198         return (ResourceExtensionTypeEnum)argAlloc->extensionType;
199     }
200 
CreateInlineSamplerAnnotations()201     void COpenCLKernel::CreateInlineSamplerAnnotations()
202     {
203         if (m_Context->getModuleMetaData()->FuncMD.find(entry) != m_Context->getModuleMetaData()->FuncMD.end())
204         {
205             FunctionMetaData funcMD = m_Context->getModuleMetaData()->FuncMD.find(entry)->second;
206 
207             ResourceAllocMD resAllocMD = funcMD.resAllocMD;
208 
209             for (const auto &inlineSamplerMD : resAllocMD.inlineSamplersMD)
210             {
211                 auto samplerInput = std::make_unique<iOpenCL::SamplerInputAnnotation>();
212 
213                 samplerInput->SamplerType = iOpenCL::SAMPLER_OBJECT_TEXTURE;
214                 samplerInput->SamplerTableIndex = inlineSamplerMD.index;
215 
216                 samplerInput->TCXAddressMode = iOpenCL::SAMPLER_TEXTURE_ADDRESS_MODE(inlineSamplerMD.TCXAddressMode);
217                 samplerInput->TCYAddressMode = iOpenCL::SAMPLER_TEXTURE_ADDRESS_MODE(inlineSamplerMD.TCYAddressMode);
218                 samplerInput->TCZAddressMode = iOpenCL::SAMPLER_TEXTURE_ADDRESS_MODE(inlineSamplerMD.TCZAddressMode);
219                 samplerInput->NormalizedCoords = inlineSamplerMD.NormalizedCoords != 0 ? true : false;
220 
221                 samplerInput->MagFilterType = iOpenCL::SAMPLER_MAPFILTER_TYPE(inlineSamplerMD.MagFilterType);
222                 samplerInput->MinFilterType = iOpenCL::SAMPLER_MAPFILTER_TYPE(inlineSamplerMD.MinFilterType);
223                 samplerInput->MipFilterType = iOpenCL::SAMPLER_MIPFILTER_TYPE(inlineSamplerMD.MipFilterType);
224                 samplerInput->CompareFunc = iOpenCL::SAMPLER_COMPARE_FUNC_TYPE(inlineSamplerMD.CompareFunc);
225 
226                 samplerInput->BorderColorR = inlineSamplerMD.BorderColorR;
227                 samplerInput->BorderColorG = inlineSamplerMD.BorderColorG;
228                 samplerInput->BorderColorB = inlineSamplerMD.BorderColorB;
229                 samplerInput->BorderColorA = inlineSamplerMD.BorderColorA;
230 
231                 m_kernelInfo.m_samplerInput.push_back(std::move(samplerInput));
232             }
233 
234             m_kernelInfo.m_HasInlineVmeSamplers = funcMD.hasInlineVmeSamplers;
235         }
236     }
237 
CreateKernelArgInfo()238     void COpenCLKernel::CreateKernelArgInfo()
239     {
240         FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry);
241 
242         uint count = 0;
243         if (m_Context->getModuleMetaData()->FuncMD.find(entry) != m_Context->getModuleMetaData()->FuncMD.end())
244         {
245             FunctionMetaData* funcMD = &m_Context->getModuleMetaData()->FuncMD[entry];
246             count = funcMD->m_OpenCLArgAccessQualifiers.size();
247         }
248 
249         for (uint i = 0; i < count; ++i)
250         {
251             auto kernelArgInfo = std::make_unique<iOpenCL::KernelArgumentInfoAnnotation>();
252             FunctionMetaData* funcMD = &m_Context->getModuleMetaData()->FuncMD[entry];
253 
254             // Format the strings the way the OpenCL runtime expects them
255 
256             // The access qualifier is expected to have a "__" prefix,
257             // or an upper-case "NONE" if there is no qualifier
258             kernelArgInfo->AccessQualifier = funcMD->m_OpenCLArgAccessQualifiers[i];
259             if (kernelArgInfo->AccessQualifier == "none" || kernelArgInfo->AccessQualifier == "")
260             {
261                 kernelArgInfo->AccessQualifier = "NONE";
262             }
263             else if (kernelArgInfo->AccessQualifier[0] != '_')
264             {
265                 kernelArgInfo->AccessQualifier = "__" + kernelArgInfo->AccessQualifier;
266             }
267 
268             // The address space is expected to have a __ prefix
269             switch (funcMD->m_OpenCLArgAddressSpaces[i])
270             {
271             case ADDRESS_SPACE_CONSTANT:
272                 kernelArgInfo->AddressQualifier = "__constant";
273                 break;
274             case ADDRESS_SPACE_GLOBAL:
275                 kernelArgInfo->AddressQualifier = "__global";
276                 break;
277             case ADDRESS_SPACE_LOCAL:
278                 kernelArgInfo->AddressQualifier = "__local";
279                 break;
280             case ADDRESS_SPACE_PRIVATE:
281                 kernelArgInfo->AddressQualifier = "__private";
282                 break;
283             default:
284                 m_Context->EmitError("Generic pointers are not allowed as kernel argument storage class!", nullptr);
285                 IGC_ASSERT_MESSAGE(0, "Unexpected address space");
286                 break;
287             }
288 
289             // ArgNames is not guaranteed to be present if -cl-kernel-arg-info
290             // is not passed in.
291             if (funcMD->m_OpenCLArgNames.size() > i)
292             {
293                 kernelArgInfo->ArgumentName = funcMD->m_OpenCLArgNames[i];
294             }
295 
296             // The type name is expected to also have the type size, appended after a ";"
297             kernelArgInfo->TypeName = funcMD->m_OpenCLArgTypes[i] + ";";
298 
299             // Unfortunately, unlike SPIR, legacy OCL uses an ABI that has byval pointers.
300             // So, if the parameter is a byval pointer, look at the contained type
301             {
302                 Function::arg_iterator argumentIter = entry->arg_begin();
303                 std::advance(argumentIter, i);
304 
305                 Type* argType = entry->getFunctionType()->getParamType(i);
306                 if (argumentIter->hasByValAttr())
307                 {
308                     argType = argType->getContainedType(0);
309                 }
310 
311                 kernelArgInfo->TypeName += utostr(m_DL->getTypeAllocSize(argType));
312             }
313 
314             // If there are no type qualifiers, "NONE" is expected
315             kernelArgInfo->TypeQualifier = funcMD->m_OpenCLArgTypeQualifiers[i];
316             if (kernelArgInfo->TypeQualifier == "")
317             {
318                 kernelArgInfo->TypeQualifier = "NONE";
319             }
320 
321             m_kernelInfo.m_kernelArgInfo.push_back(std::move(kernelArgInfo));
322         }
323     }
324 
CreateKernelAttributeInfo()325     void COpenCLKernel::CreateKernelAttributeInfo()
326     {
327         FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry);
328 
329         // We need to concatenate 2 things:
330         // (a) LLVM attributes, except nounwind. Why? Because that's how IGIL does it.
331         // (b) The attributes that get translated into SPIR metadata:
332         //     (*) vec_type_hint
333         //     (*) reqd_work_group_size
334         //     (*) work_group_size_hint
335         //
336 
337         // Get LLVM function attributes, and erase "nounwind" if necessary
338         m_kernelInfo.m_kernelAttributeInfo = entry->getAttributes().getAsString(-1);
339         size_t nounwindLoc = m_kernelInfo.m_kernelAttributeInfo.find("nounwind");
340         if (nounwindLoc != std::string::npos)
341         {
342             //8 is the length of "nounwind".
343             //If this is not the first attribute, it has a leading space, which we also want to delete.
344             int eraseLen = 8;
345             if (nounwindLoc != 0)
346             {
347                 nounwindLoc--;
348                 eraseLen++;
349             }
350             m_kernelInfo.m_kernelAttributeInfo.erase(nounwindLoc, eraseLen);
351         }
352 
353         // Now fill in the special OCL attributes from the MD
354         VectorTypeHintMetaDataHandle vecTypeHintInfo = funcInfoMD->getOpenCLVectorTypeHint();
355         if (vecTypeHintInfo->hasValue())
356         {
357             m_kernelInfo.m_kernelAttributeInfo += " " + getVecTypeHintString(vecTypeHintInfo);
358         }
359         SubGroupSizeMetaDataHandle subGroupSize = funcInfoMD->getSubGroupSize();
360         if (subGroupSize->hasValue())
361         {
362             m_kernelInfo.m_kernelAttributeInfo += " " + getSubGroupSizeString(subGroupSize);
363         }
364 
365         auto it = m_Context->getModuleMetaData()->FuncMD.find(entry);
366         if (it != m_Context->getModuleMetaData()->FuncMD.end())
367         {
368             WorkGroupWalkOrderMD workgroupWalkOrder = it->second.workGroupWalkOrder;
369             if (workgroupWalkOrder.dim0 || workgroupWalkOrder.dim1 || workgroupWalkOrder.dim2)
370             {
371                 m_kernelInfo.m_kernelAttributeInfo += " " + getWorkgroupWalkOrderString(workgroupWalkOrder);
372             }
373         }
374 
375         ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize();
376         if (threadGroupSize->hasValue())
377         {
378             m_kernelInfo.m_kernelAttributeInfo += " " + getThreadGroupSizeString(threadGroupSize, false);
379         }
380 
381         ThreadGroupSizeMetaDataHandle threadGroupSizeHint = funcInfoMD->getThreadGroupSizeHint();
382         if (threadGroupSizeHint->hasValue())
383         {
384             m_kernelInfo.m_kernelAttributeInfo += " " + getThreadGroupSizeString(threadGroupSizeHint, true);
385         }
386     }
387 
getThreadGroupSizeString(ThreadGroupSizeMetaDataHandle & threadGroupSize,bool isHint)388     std::string COpenCLKernel::getThreadGroupSizeString(ThreadGroupSizeMetaDataHandle& threadGroupSize, bool isHint)
389     {
390         std::string threadGroupSizeString = "";
391         if (isHint)
392         {
393             threadGroupSizeString = "work_group_size_hint(";
394         }
395         else
396         {
397             threadGroupSizeString = "reqd_work_group_size(";
398         }
399 
400         threadGroupSizeString += utostr(threadGroupSize->getXDim()) + ",";
401         threadGroupSizeString += utostr(threadGroupSize->getYDim()) + ",";
402         threadGroupSizeString += utostr(threadGroupSize->getZDim());
403 
404         threadGroupSizeString += ")";
405         return threadGroupSizeString;
406     }
getSubGroupSizeString(SubGroupSizeMetaDataHandle & subGroupSize)407     std::string COpenCLKernel::getSubGroupSizeString(SubGroupSizeMetaDataHandle& subGroupSize)
408     {
409         std::string subTypeString = "intel_reqd_sub_group_size(";
410         subTypeString += utostr(subGroupSize->getSIMD_size());
411         subTypeString += ")";
412         return subTypeString;
413     }
getWorkgroupWalkOrderString(const IGC::WorkGroupWalkOrderMD & workgroupWalkOrder)414     std::string COpenCLKernel::getWorkgroupWalkOrderString(const IGC::WorkGroupWalkOrderMD& workgroupWalkOrder)
415     {
416         std::string subTypeString = "intel_reqd_workgroup_walk_order(";
417         subTypeString += utostr(workgroupWalkOrder.dim0) + ",";
418         subTypeString += utostr(workgroupWalkOrder.dim1) + ",";
419         subTypeString += utostr(workgroupWalkOrder.dim2) + ",";
420         subTypeString += ")";
421         return subTypeString;
422     }
getVecTypeHintString(VectorTypeHintMetaDataHandle & vecTypeHintInfo)423     std::string COpenCLKernel::getVecTypeHintString(VectorTypeHintMetaDataHandle& vecTypeHintInfo)
424     {
425         std::string vecTypeString = "vec_type_hint(";
426 
427         // Get the information about the type
428         Type* baseType = vecTypeHintInfo->getVecType()->getType();
429         unsigned int numElements = 1;
430         if (baseType->isVectorTy())
431         {
432             numElements = (unsigned)cast<IGCLLVM::FixedVectorType>(baseType)->getNumElements();
433             baseType = cast<VectorType>(baseType)->getElementType();
434         }
435 
436         // Integer types need to be qualified with a "u" if they are unsigned
437         if (baseType->isIntegerTy())
438         {
439             std::string signString = vecTypeHintInfo->getSign() ? "" : "u";
440             vecTypeString += signString;
441         }
442 
443         switch (baseType->getTypeID())
444         {
445         case Type::IntegerTyID:
446             switch (baseType->getIntegerBitWidth())
447             {
448             case 8:
449                 vecTypeString += "char";
450                 break;
451             case 16:
452                 vecTypeString += "short";
453                 break;
454             case 32:
455                 vecTypeString += "int";
456                 break;
457             case 64:
458                 vecTypeString += "long";
459                 break;
460             default:
461                 IGC_ASSERT_MESSAGE(0, "Unexpected data type in vec_type_hint");
462                 break;
463             }
464             break;
465         case Type::DoubleTyID:
466             vecTypeString += "double";
467             break;
468         case Type::FloatTyID:
469             vecTypeString += "float";
470             break;
471         case Type::HalfTyID:
472             vecTypeString += "half";
473             break;
474         default:
475             IGC_ASSERT_MESSAGE(0, "Unexpected data type in vec_type_hint");
476             break;
477         }
478 
479         if (numElements != 1)
480         {
481             vecTypeString += utostr(numElements);
482         }
483 
484         vecTypeString += ")";
485 
486         return vecTypeString;
487     }
488 
CreatePrintfStringAnnotations()489     void COpenCLKernel::CreatePrintfStringAnnotations()
490     {
491         auto printfStrings = GetPrintfStrings(*entry->getParent());
492 
493         for (const auto& printfString : printfStrings)
494         {
495             auto printfAnnotation = std::make_unique<iOpenCL::PrintfStringAnnotation>();
496             printfAnnotation->Index = printfString.first;
497             printfAnnotation->StringSize = printfString.second.size() + 1;
498             printfAnnotation->StringData = new char[printfAnnotation->StringSize + 1];
499 
500             memcpy_s(printfAnnotation->StringData, printfAnnotation->StringSize, printfString.second.c_str(), printfAnnotation->StringSize);
501             printfAnnotation->StringData[printfAnnotation->StringSize - 1] = '\0';
502 
503             m_kernelInfo.m_printfStringAnnotations.push_back(std::move(printfAnnotation));
504         }
505     }
506 
CreateZEPayloadArguments(IGC::KernelArg * kernelArg,uint payloadPosition)507     bool COpenCLKernel::CreateZEPayloadArguments(IGC::KernelArg* kernelArg, uint payloadPosition)
508     {
509         switch (kernelArg->getArgType()) {
510 
511         case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER:{
512             // PayloadHeader contains global work offset x,y,z and local size x,y,z
513             // global work offset, size is int32x3
514             uint cur_pos = payloadPosition;
515             uint32_t size = iOpenCL::DATA_PARAMETER_DATA_SIZE * 3;
516             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
517                 zebin::PreDefinedAttrGetter::ArgType::global_id_offset, cur_pos, size);
518             cur_pos += size;
519             // local size, size is int32x3, the same as above
520             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
521                 zebin::PreDefinedAttrGetter::ArgType::local_size, cur_pos, size);
522             break;
523         }
524         case KernelArg::ArgType::IMPLICIT_PRIVATE_BASE:
525             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
526                 zebin::PreDefinedAttrGetter::ArgType::private_base_stateless,
527                 payloadPosition, kernelArg->getAllocateSize());
528             break;
529 
530         case KernelArg::ArgType::IMPLICIT_NUM_GROUPS:
531             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
532                 zebin::PreDefinedAttrGetter::ArgType::group_count,
533                 payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3);
534             break;
535 
536         case KernelArg::ArgType::IMPLICIT_LOCAL_SIZE:
537             // FIXME: duplicated information as KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER?
538             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
539                 zebin::PreDefinedAttrGetter::ArgType::local_size,
540                 payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3);
541             break;
542 
543          case KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE:
544              zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
545                  zebin::PreDefinedAttrGetter::ArgType::enqueued_local_size,
546                  payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3);
547              break;
548 
549          case KernelArg::ArgType::IMPLICIT_GLOBAL_SIZE:
550              zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
551                  zebin::PreDefinedAttrGetter::ArgType::global_size,
552                  payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3);
553              break;
554 
555          case KernelArg::ArgType::IMPLICIT_WORK_DIM:
556              zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
557                  zebin::PreDefinedAttrGetter::ArgType::work_dimensions,
558                  payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE);
559              break;
560 
561         // pointer args
562         case KernelArg::ArgType::PTR_GLOBAL:
563         case KernelArg::ArgType::PTR_CONSTANT: {
564             uint32_t arg_idx = kernelArg->getAssociatedArgNo();
565 
566             // Add BTI argument if being promoted
567             // FIXME: do not set bti if the number is 0xffffffff (?)
568             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(arg_idx);
569             uint32_t bti_idx = getBTI(resInfo);
570             if (bti_idx != 0xffffffff) {
571                 // add BTI argument with addr_mode set to stateful
572                 // promoted arg has 0 offset and 0 size
573                 zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs,
574                     0, 0, arg_idx,
575                     zebin::PreDefinedAttrGetter::ArgAddrMode::stateful,
576                     (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)?
577                       zebin::PreDefinedAttrGetter::ArgAddrSpace::global :
578                       zebin::PreDefinedAttrGetter::ArgAddrSpace::constant,
579                     (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)?
580                       zebin::PreDefinedAttrGetter::ArgAccessType::readwrite :
581                       zebin::PreDefinedAttrGetter::ArgAccessType::readonly
582                 );
583                 // add the corresponding BTI table index
584                 zebin::ZEInfoBuilder::addBindingTableIndex(m_kernelInfo.m_zeBTIArgs,
585                     bti_idx, arg_idx);
586             }
587             // FIXME: check if all reference are promoted, if it is, we can skip
588             // creating non-bti payload arg
589             /*
590             bool is_bti_only =
591                 IGC_IS_FLAG_ENABLED(EnableStatelessToStatefull) &&
592                 IGC_IS_FLAG_ENABLED(EnableStatefulToken) &&
593                 m_DriverInfo->SupportStatefulToken() &&
594                 kernelArg->getArg() &&
595                 ((kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL &&
596                 (kernelArg->getArg()->use_empty() || !GetHasGlobalStatelessAccess())) ||
597                     (kernelArg->getArgType() == KernelArg::ArgType::PTR_CONSTANT &&
598                     (kernelArg->getArg()->use_empty() || !GetHasConstantStatelessAccess())));
599             // no need to add normal argument if all use are promoted
600             if (is_bti_only)
601                 break;
602              */
603             ResourceAllocMD& resAllocMD = GetContext()->getModuleMetaData()->FuncMD[entry].resAllocMD;
604             IGC_ASSERT_MESSAGE(resAllocMD.argAllocMDList.size() > 0, "ArgAllocMDList is empty.");
605 
606             ArgAllocMD& argAlloc = resAllocMD.argAllocMDList[arg_idx];
607 
608             zebin::PreDefinedAttrGetter::ArgAddrMode addr_mode =
609                 zebin::PreDefinedAttrGetter::ArgAddrMode::stateless;
610             if (argAlloc.type == ResourceTypeEnum::BindlessUAVResourceType)
611                 addr_mode = zebin::PreDefinedAttrGetter::ArgAddrMode::bindless;
612 
613             zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs,
614                 payloadPosition, kernelArg->getAllocateSize(), arg_idx, addr_mode,
615                 (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)?
616                   zebin::PreDefinedAttrGetter::ArgAddrSpace::global :
617                   zebin::PreDefinedAttrGetter::ArgAddrSpace::constant,
618                 (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)?
619                   zebin::PreDefinedAttrGetter::ArgAccessType::readwrite :
620                   zebin::PreDefinedAttrGetter::ArgAccessType::readonly
621                 );
622             break;
623         }
624         case KernelArg::ArgType::PTR_LOCAL:
625             zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs,
626                 payloadPosition, kernelArg->getAllocateSize(),
627                 kernelArg->getAssociatedArgNo(),
628                 zebin::PreDefinedAttrGetter::ArgAddrMode::slm,
629                 zebin::PreDefinedAttrGetter::ArgAddrSpace::local,
630                 zebin::PreDefinedAttrGetter::ArgAccessType::readwrite);
631             break;
632         // by value arguments
633         case KernelArg::ArgType::CONSTANT_REG:
634             zebin::ZEInfoBuilder::addPayloadArgumentByValue(m_kernelInfo.m_zePayloadArgs,
635                 payloadPosition, kernelArg->getAllocateSize(),
636                 kernelArg->getAssociatedArgNo());
637             break;
638 
639         // Local ids are supported in per-thread payload arguments
640         case KernelArg::ArgType::IMPLICIT_LOCAL_IDS:
641             break;
642 
643         // Images
644         case KernelArg::ArgType::IMAGE_1D:
645         case KernelArg::ArgType::BINDLESS_IMAGE_1D:
646         case KernelArg::ArgType::IMAGE_1D_BUFFER:
647         case KernelArg::ArgType::BINDLESS_IMAGE_1D_BUFFER:
648         case KernelArg::ArgType::IMAGE_2D:
649         case KernelArg::ArgType::BINDLESS_IMAGE_2D:
650         case KernelArg::ArgType::IMAGE_3D:
651         case KernelArg::ArgType::BINDLESS_IMAGE_3D:
652         case KernelArg::ArgType::IMAGE_CUBE:
653         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE:
654         case KernelArg::ArgType::IMAGE_CUBE_DEPTH:
655         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH:
656         case KernelArg::ArgType::IMAGE_1D_ARRAY:
657         case KernelArg::ArgType::BINDLESS_IMAGE_1D_ARRAY:
658         case KernelArg::ArgType::IMAGE_2D_ARRAY:
659         case KernelArg::ArgType::BINDLESS_IMAGE_2D_ARRAY:
660         case KernelArg::ArgType::IMAGE_2D_DEPTH:
661         case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH:
662         case KernelArg::ArgType::IMAGE_2D_DEPTH_ARRAY:
663         case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH_ARRAY:
664         case KernelArg::ArgType::IMAGE_2D_MSAA:
665         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA:
666         case KernelArg::ArgType::IMAGE_2D_MSAA_ARRAY:
667         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_ARRAY:
668         case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH:
669         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH:
670         case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH_ARRAY:
671         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH_ARRAY:
672         case KernelArg::ArgType::IMAGE_CUBE_ARRAY:
673         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_ARRAY:
674         case KernelArg::ArgType::IMAGE_CUBE_DEPTH_ARRAY:
675         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY:
676         {
677             int arg_idx = kernelArg->getAssociatedArgNo();
678             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(arg_idx);
679 
680             // check if the image is writeable
681             bool writeable = false;
682             if (resInfo.Type == SOpenCLKernelInfo::SResourceInfo::RES_UAV &&
683                 kernelArg->getAccessQual() != IGC::KernelArg::AccessQual::READ_ONLY)
684                 writeable = true;
685             IGC_ASSERT_MESSAGE(resInfo.Type == SOpenCLKernelInfo::SResourceInfo::RES_UAV ||
686                 resInfo.Type == SOpenCLKernelInfo::SResourceInfo::RES_SRV, "Unknown resource type");
687 
688             // The image arg is either bindless or stateful; "kernelArg->needsAllocation()" tells which.
689             // For a stateful image argument, the arg has 0 offset and 0 size
690             zebin::PreDefinedAttrGetter::ArgAddrMode arg_addrmode =
691                 zebin::PreDefinedAttrGetter::ArgAddrMode::stateful;
692             uint arg_off = 0;
693             uint arg_size = 0;
694 
695             if (kernelArg->needsAllocation()) {
696                 // set to bindless
697                 arg_addrmode =
698                     zebin::PreDefinedAttrGetter::ArgAddrMode::bindless;
699                 arg_off = payloadPosition;
700                 arg_size = kernelArg->getAllocateSize();
701             } else {
702                 // add bti index for this arg if it's stateful
703                 zebin::ZEInfoBuilder::addBindingTableIndex(m_kernelInfo.m_zeBTIArgs,
704                     getBTI(resInfo), arg_idx);
705             }
706 
707             // add the payload argument
708             zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs,
709                 arg_off, arg_size, arg_idx, arg_addrmode,
710                   zebin::PreDefinedAttrGetter::ArgAddrSpace::image,
711                 writeable ?
712                   zebin::PreDefinedAttrGetter::ArgAccessType::readwrite :
713                   zebin::PreDefinedAttrGetter::ArgAccessType::readonly
714             );
715         }
716         break;
717 
718         // sampler
719         case KernelArg::ArgType::SAMPLER:
720         case KernelArg::ArgType::BINDLESS_SAMPLER:
721         {
722             // The sampler arg is either bindless or stateful; "kernelArg->needsAllocation()" tells which.
723             // For a stateful sampler argument, the arg has 0 offset and 0 size
724             // NOTE: we only have stateful samplers now
725             zebin::PreDefinedAttrGetter::ArgAddrMode arg_addrmode =
726                 zebin::PreDefinedAttrGetter::ArgAddrMode::stateful;
727             uint arg_off = 0;
728             uint arg_size = 0;
729             if (kernelArg->needsAllocation()) {
730                 // set to bindless
731                 arg_addrmode =
732                     zebin::PreDefinedAttrGetter::ArgAddrMode::bindless;
733                 arg_off = payloadPosition;
734                 arg_size = kernelArg->getAllocateSize();
735             }
736 
737             int arg_idx = kernelArg->getAssociatedArgNo();
738             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(arg_idx);
739             // add the payload argument
740             zebin::ZEInfoBuilder::addPayloadArgumentSampler(m_kernelInfo.m_zePayloadArgs,
741                 arg_off, arg_size, arg_idx, resInfo.Index, arg_addrmode,
742                 zebin::PreDefinedAttrGetter::ArgAccessType::readwrite);
743         }
744         break;
745 
746         case KernelArg::ArgType::IMPLICIT_BUFFER_OFFSET:
747         {
748             zebin::zeInfoPayloadArgument& arg = zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
749                 zebin::PreDefinedAttrGetter::ArgType::buffer_offset,
750                 payloadPosition, kernelArg->getAllocateSize());
751             arg.arg_index = kernelArg->getAssociatedArgNo();
752         }
753         break;
754 
755         case KernelArg::ArgType::IMPLICIT_PRINTF_BUFFER:
756             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
757                 zebin::PreDefinedAttrGetter::ArgType::printf_buffer,
758                 payloadPosition, kernelArg->getAllocateSize());
759             break;
760 
761         case KernelArg::ArgType::IMPLICIT_ARG_BUFFER:
762             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
763                 zebin::PreDefinedAttrGetter::ArgType::implicit_arg_buffer,
764                 payloadPosition, kernelArg->getAllocateSize());
765             break;
766 
767         case KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER:
768             zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
769                 zebin::PreDefinedAttrGetter::ArgType::implicit_local_id_buffer,
770                 payloadPosition, kernelArg->getAllocateSize());
771             break;
772 
773         // We don't need these in ZEBinary, can safely skip them
774         case KernelArg::ArgType::IMPLICIT_R0:
775         case KernelArg::ArgType::R1:
776         case KernelArg::ArgType::STRUCT:
777         // FIXME: this implicit arg is not used nowadays, should remove it completely
778         case KernelArg::ArgType::IMPLICIT_SAMPLER_SNAP_WA:
779             break;
780 
781         // FIXME: should these be supported?
782         // CONSTANT_BASE and GLOBAL_BASE are not required: when ZEBinary is
783         // enabled, we should export all globals and constants and let the
784         // runtime relocate them
785         case KernelArg::ArgType::IMPLICIT_CONSTANT_BASE:
786         case KernelArg::ArgType::IMPLICIT_GLOBAL_BASE:
787         case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN:
788         case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE:
789         default:
790             return false;
791             break;
792         } // end switch (kernelArg->getArgType())
793 
794         return true;
795     }
796 
CreateAnnotations(KernelArg * kernelArg,uint payloadPosition)797     void COpenCLKernel::CreateAnnotations(KernelArg* kernelArg, uint payloadPosition)
798     {
799         KernelArg::ArgType type = kernelArg->getArgType();
800 
801         DWORD constantType = iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN;
802         iOpenCL::POINTER_ADDRESS_SPACE addressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID;
803         FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry);
804 
805         static const DWORD DEFAULT_ARG_NUM = 0;
806         const llvm::Argument* arg = kernelArg->getArg();
807 
808         switch (type) {
809 
810         case KernelArg::ArgType::IMPLICIT_R0:
811             for (Value::const_user_iterator U = arg->user_begin(), UE = arg->user_end(); U != UE; ++U)
812             {
813                 const ExtractElementInst* EEI = dyn_cast<ExtractElementInst>(*U);
814 
815                 if (EEI)
816                 {
817                     const ConstantInt* index = dyn_cast<ConstantInt>(EEI->getIndexOperand());
818                     if (index)
819                     {
820                         uint64_t value = index->getZExtValue();
821                         if (value == 1 || value == 6 || value == 7)
822                         {
823                             // group ids x/y/z
824                             ModuleMetaData* modMD = m_Context->getModuleMetaData();
825                             auto it = modMD->FuncMD.find(entry);
826                             if (it != modMD->FuncMD.end())
827                             {
828                                 if (it->second.groupIDPresent == true)
829                                     m_kernelInfo.m_threadPayload.HasGroupID = true;
830                             }
831                             break;
832                         }
833                     }
834                 }
835             }
836             break;
837 
838         case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER:
839             // PayloadHeader contains global work offset x,y,z and local size x,y,z -->
840             // total of 6 annotations, 3 of each type
841             for (int i = 0; i < 6; ++i)
842             {
843                 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
844 
845                 DWORD sizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE;
846 
847                 constInput->ConstantType = (i < 3 ?
848                     iOpenCL::DATA_PARAMETER_GLOBAL_WORK_OFFSET :
849                     iOpenCL::DATA_PARAMETER_LOCAL_WORK_SIZE);
850                 constInput->Offset = (i % 3) * sizeInBytes;
851                 constInput->PayloadPosition = payloadPosition;
852                 constInput->PayloadSizeInBytes = sizeInBytes;
853                 constInput->ArgumentNumber = DEFAULT_ARG_NUM;
854                 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput));
855 
856                 payloadPosition += sizeInBytes;
857             }
858 
859             for (Value::const_user_iterator U = arg->user_begin(), UE = arg->user_end(); U != UE; ++U)
860             {
861                 const ExtractElementInst* EEI = dyn_cast<ExtractElementInst>(*U);
862 
863                 if (EEI)
864                 {
865                     const ConstantInt* index = dyn_cast<ConstantInt>(EEI->getIndexOperand());
866                     if (index)
867                     {
868                         uint64_t value = index->getZExtValue();
869                         if (value == 0 || value == 1 || value == 2)
870                         {
871                             // global offset x/y/z
872                             ModuleMetaData* modMD = m_Context->getModuleMetaData();
873                             auto it = modMD->FuncMD.find(entry);
874                             if (it != modMD->FuncMD.end())
875                             {
876                                 if (it->second.globalIDPresent)
877                                     m_kernelInfo.m_threadPayload.HasGlobalIDOffset = true;
878                             }
879                             break;
880                         }
881                     }
882                 }
883             }
884             break;
885 
886         case KernelArg::ArgType::IMPLICIT_BINDLESS_OFFSET:
887             {
888                 int argNo = kernelArg->getAssociatedArgNo();
889                 std::shared_ptr<iOpenCL::PointerArgumentAnnotation> ptrAnnotation = m_kernelInfo.m_argOffsetMap[argNo];
890                 ptrAnnotation->BindingTableIndex = payloadPosition;
891             }
892             break;
893 
894         case KernelArg::ArgType::PTR_GLOBAL:
895             if (addressSpace == iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID) {
896                 addressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_GLOBAL;
897             }
898             // Intentional fall-through into PTR_CONSTANT
899         case KernelArg::ArgType::PTR_CONSTANT:
900             if (addressSpace == iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID) {
901                 addressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_CONSTANT;
902             }
903             // may reach here from PTR_GLOBAL, PTR_CONSTANT
904             IGC_ASSERT(addressSpace != iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID);
905 
906             {
907                 int argNo = kernelArg->getAssociatedArgNo();
908                 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
909                 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
910                 CodeGenContext* pCtx = GetContext();
911                 ModuleMetaData* modMD = pCtx->getModuleMetaData();
912                 FunctionMetaData* funcMD = &modMD->FuncMD[entry];
913                 ResourceAllocMD* resAllocMD = &funcMD->resAllocMD;
914                 IGC_ASSERT_MESSAGE(resAllocMD->argAllocMDList.size() > 0, "ArgAllocMDList is empty.");
915                 ArgAllocMD* argAlloc = &resAllocMD->argAllocMDList[argNo];
916 
917                 auto ptrAnnotation = std::make_shared<iOpenCL::PointerArgumentAnnotation>();
918 
919                 if (argAlloc->type == ResourceTypeEnum::BindlessUAVResourceType)
920                 {
921                     ptrAnnotation->IsStateless = false;
922                     ptrAnnotation->IsBindlessAccess = true;
923                 }
924                 else
925                 {
926                     ptrAnnotation->IsStateless = true;
927                     ptrAnnotation->IsBindlessAccess = false;
928                 }
929 
930                 m_kernelInfo.m_argOffsetMap[argNo] = ptrAnnotation;
931 
932                 ptrAnnotation->AddressSpace = addressSpace;
933                 ptrAnnotation->ArgumentNumber = argNo;
934                 ptrAnnotation->BindingTableIndex = getBTI(resInfo);
935                 ptrAnnotation->PayloadPosition = payloadPosition;
936                 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
937                 ptrAnnotation->LocationIndex = kernelArg->getLocationIndex();
938                 ptrAnnotation->LocationCount = kernelArg->getLocationCount();
939                 ptrAnnotation->IsEmulationArgument = kernelArg->isEmulationArgument();
940                 m_kernelInfo.m_pointerArgument.push_back(ptrAnnotation);
941             }
942             break;
943 
944         case KernelArg::ArgType::PTR_LOCAL:
945         {
946             auto locAnnotation = std::make_unique<iOpenCL::LocalArgumentAnnotation>();
947 
948             locAnnotation->Alignment = (DWORD)kernelArg->getAlignment();
949             locAnnotation->PayloadPosition = payloadPosition;
950             locAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
951             locAnnotation->ArgumentNumber = kernelArg->getAssociatedArgNo();
952             locAnnotation->LocationIndex = kernelArg->getLocationIndex();
953             locAnnotation->LocationCount = kernelArg->getLocationCount();
954             m_kernelInfo.m_localPointerArgument.push_back(std::move(locAnnotation));
955         }
956         break;
957 
958         case KernelArg::ArgType::PTR_DEVICE_QUEUE:
959         {
960             m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = true;
961             unsigned int argNo = kernelArg->getAssociatedArgNo();
962             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
963             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
964 
965             auto ptrAnnotation = std::make_shared<iOpenCL::PointerArgumentAnnotation>();
966 
967             ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_DEVICE_QUEUE;
968             ptrAnnotation->ArgumentNumber = argNo;
969             ptrAnnotation->BindingTableIndex = getBTI(resInfo);
970             ptrAnnotation->IsStateless = true;
971             ptrAnnotation->PayloadPosition = payloadPosition;
972             ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
973             m_kernelInfo.m_pointerArgument.push_back(ptrAnnotation);
974         }
975         break;
976         case KernelArg::ArgType::CONSTANT_REG:
977         {
978             uint sourceOffsetBase = 0;
979 
980             // aggregate arguments may have additional source offsets
981             if (kernelArg->getStructArgOffset() != -1)
982             {
983                 sourceOffsetBase = kernelArg->getStructArgOffset();
984             }
985 
986             auto constInput = std::make_unique<iOpenCL::ConstantArgumentAnnotation>();
987 
988             DWORD sizeInBytes = kernelArg->getAllocateSize();
989 
990             constInput->Offset = sourceOffsetBase;
991             constInput->PayloadPosition = payloadPosition;
992             constInput->PayloadSizeInBytes = sizeInBytes;
993             constInput->ArgumentNumber = kernelArg->getAssociatedArgNo();
994             constInput->LocationIndex = kernelArg->getLocationIndex();
995             constInput->LocationCount = kernelArg->getLocationCount();
996             constInput->IsEmulationArgument = kernelArg->isEmulationArgument();
997             m_kernelInfo.m_constantArgumentAnnotation.push_back(std::move(constInput));
998 
999             payloadPosition += sizeInBytes;
1000         }
1001         break;
1002 
1003         case KernelArg::ArgType::IMPLICIT_CONSTANT_BASE:
1004         {
1005             int argNo = kernelArg->getAssociatedArgNo();
1006             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1007             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1008 
1009             auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>();
1010             ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_CONSTANT;
1011             ptrAnnotation->BindingTableIndex = 0xffffffff;
1012             ptrAnnotation->IsStateless = true;
1013             ptrAnnotation->PayloadPosition = payloadPosition;
1014             ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
1015             ptrAnnotation->ArgumentNumber = argNo;
1016             m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation));
1017         }
1018         break;
1019 
1020         case KernelArg::ArgType::IMPLICIT_GLOBAL_BASE:
1021         {
1022             int argNo = kernelArg->getAssociatedArgNo();
1023             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1024             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1025 
1026             auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>();
1027             ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_GLOBAL;
1028             ptrAnnotation->BindingTableIndex = 0xffffffff;
1029             ptrAnnotation->IsStateless = true;
1030             ptrAnnotation->PayloadPosition = payloadPosition;
1031             ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
1032             ptrAnnotation->ArgumentNumber = argNo;
1033             m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation));
1034         }
1035         break;
1036 
1037         case KernelArg::ArgType::IMPLICIT_PRIVATE_BASE:
1038         {
1039             int argNo = kernelArg->getAssociatedArgNo();
1040             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1041             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1042 
1043             auto ptrAnnotation = std::make_unique<iOpenCL::PrivateInputAnnotation>();
1044 
1045             ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_PRIVATE;
1046             ptrAnnotation->ArgumentNumber = argNo;
1047             // PerThreadPrivateMemorySize is defined as "Total private memory requirements for each OpenCL work-item."
1048             ptrAnnotation->PerThreadPrivateMemorySize = m_perWIStatelessPrivateMemSize;
1049             ptrAnnotation->BindingTableIndex = getBTI(resInfo);
1050             ptrAnnotation->IsStateless = true;
1051             ptrAnnotation->PayloadPosition = payloadPosition;
1052             ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
1053             m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation));
1054         }
1055         break;
1056 
1057         case KernelArg::ArgType::IMPLICIT_ARG_BUFFER:
1058         case KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER:
1059         {
1060             constantType = kernelArg->getDataParamToken();
1061             IGC_ASSERT(constantType != iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN);
1062 
1063             auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
1064 
1065             DWORD sizeInBytes = kernelArg->getAllocateSize();
1066             if (type == KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER)
1067             {
1068                 sizeInBytes = getLocalIdBufferSize(m_dispatchSize);
1069             }
1070 
1071             constInput->ConstantType = constantType;
1072             constInput->Offset = sizeInBytes;
1073             constInput->PayloadPosition = payloadPosition;
1074             constInput->PayloadSizeInBytes = sizeInBytes;
1075             constInput->ArgumentNumber = DEFAULT_ARG_NUM;
1076             m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput));
1077 
1078             break;
1079         }
1080 
1081         case KernelArg::ArgType::IMPLICIT_NUM_GROUPS:
1082         case KernelArg::ArgType::IMPLICIT_GLOBAL_SIZE:
1083         case KernelArg::ArgType::IMPLICIT_LOCAL_SIZE:
1084         case KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE:
1085         case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN:
1086         case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE:
1087 
1088             constantType = kernelArg->getDataParamToken();
1089             IGC_ASSERT(constantType != iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN);
1090 
1091             for (int i = 0; i < 3; ++i)
1092             {
1093                 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
1094 
1095                 DWORD sizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE;
1096 
1097                 constInput->ConstantType = constantType;
1098                 constInput->Offset = i * sizeInBytes;
1099                 constInput->PayloadPosition = payloadPosition;
1100                 constInput->PayloadSizeInBytes = sizeInBytes;
1101                 constInput->ArgumentNumber = DEFAULT_ARG_NUM;
1102                 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput));
1103 
1104                 payloadPosition += sizeInBytes;
1105             }
1106 
1107             if (type == KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN)
1108                 m_kernelInfo.m_threadPayload.HasStageInGridOrigin = true;
1109             else if (type == KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE)
1110                 m_kernelInfo.m_threadPayload.HasStageInGridSize = true;
1111 
1112             break;
1113 
1114         case KernelArg::ArgType::IMPLICIT_IMAGE_HEIGHT:
1115         case KernelArg::ArgType::IMPLICIT_IMAGE_WIDTH:
1116         case KernelArg::ArgType::IMPLICIT_IMAGE_DEPTH:
1117         case KernelArg::ArgType::IMPLICIT_IMAGE_NUM_MIP_LEVELS:
1118         case KernelArg::ArgType::IMPLICIT_IMAGE_CHANNEL_DATA_TYPE:
1119         case KernelArg::ArgType::IMPLICIT_IMAGE_CHANNEL_ORDER:
1120         case KernelArg::ArgType::IMPLICIT_IMAGE_SRGB_CHANNEL_ORDER:
1121         case KernelArg::ArgType::IMPLICIT_IMAGE_ARRAY_SIZE:
1122         case KernelArg::ArgType::IMPLICIT_IMAGE_NUM_SAMPLES:
1123         case KernelArg::ArgType::IMPLICIT_SAMPLER_ADDRESS:
1124         case KernelArg::ArgType::IMPLICIT_SAMPLER_NORMALIZED:
1125         case KernelArg::ArgType::IMPLICIT_SAMPLER_SNAP_WA:
1126         case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_BASEOFFSET:
1127         case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_HEIGHT:
1128         case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_WIDTH:
1129         case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_PITCH:
1130         case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_DATA_PARAMETER_OBJECT_ID:
1131         case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_DISPATCHER_SIMD_SIZE:
1132         case KernelArg::ArgType::IMPLICIT_BUFFER_OFFSET:
1133             constantType = kernelArg->getDataParamToken();
1134             IGC_ASSERT(constantType != iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN);
1135             {
1136                 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
1137 
1138                 constInput->ConstantType = constantType;
1139                 constInput->Offset = 0;
1140                 constInput->PayloadPosition = payloadPosition;
1141                 constInput->PayloadSizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE;
1142                 constInput->ArgumentNumber = kernelArg->getAssociatedArgNo();
1143                 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput));
1144             }
1145             break;
1146 
1147         case KernelArg::ArgType::IMAGE_1D:
1148         case KernelArg::ArgType::BINDLESS_IMAGE_1D:
1149         case KernelArg::ArgType::IMAGE_1D_BUFFER:
1150         case KernelArg::ArgType::BINDLESS_IMAGE_1D_BUFFER:
1151         case KernelArg::ArgType::IMAGE_2D:
1152         case KernelArg::ArgType::BINDLESS_IMAGE_2D:
1153         case KernelArg::ArgType::IMAGE_3D:
1154         case KernelArg::ArgType::BINDLESS_IMAGE_3D:
1155         case KernelArg::ArgType::IMAGE_CUBE:
1156         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE:
1157         case KernelArg::ArgType::IMAGE_CUBE_DEPTH:
1158         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH:
1159         case KernelArg::ArgType::IMAGE_1D_ARRAY:
1160         case KernelArg::ArgType::BINDLESS_IMAGE_1D_ARRAY:
1161         case KernelArg::ArgType::IMAGE_2D_ARRAY:
1162         case KernelArg::ArgType::BINDLESS_IMAGE_2D_ARRAY:
1163         case KernelArg::ArgType::IMAGE_2D_DEPTH:
1164         case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH:
1165         case KernelArg::ArgType::IMAGE_2D_DEPTH_ARRAY:
1166         case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH_ARRAY:
1167         case KernelArg::ArgType::IMAGE_2D_MSAA:
1168         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA:
1169         case KernelArg::ArgType::IMAGE_2D_MSAA_ARRAY:
1170         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_ARRAY:
1171         case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH:
1172         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH:
1173         case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH_ARRAY:
1174         case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH_ARRAY:
1175         case KernelArg::ArgType::IMAGE_CUBE_ARRAY:
1176         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_ARRAY:
1177         case KernelArg::ArgType::IMAGE_CUBE_DEPTH_ARRAY:
1178         case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY:
1179         {
1180             int argNo = kernelArg->getAssociatedArgNo();
1181             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1182             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1183 
1184             auto imageInput = std::make_unique<iOpenCL::ImageArgumentAnnotation>();
1185 
1186             imageInput->ArgumentNumber = argNo;
1187             imageInput->IsFixedBindingTableIndex = true;
1188             imageInput->BindingTableIndex = getBTI(resInfo);
1189             imageInput->ImageType = getImageTypeFromKernelArg(*kernelArg);
1190             IGC_ASSERT(imageInput->ImageType != iOpenCL::IMAGE_MEMORY_OBJECT_INVALID);
1191             imageInput->LocationIndex = kernelArg->getLocationIndex();
1192             imageInput->LocationCount = kernelArg->getLocationCount();
1193             imageInput->IsEmulationArgument = kernelArg->isEmulationArgument();
1194 
1195             imageInput->AccessedByFloatCoords = kernelArg->getImgAccessedFloatCoords();
1196             imageInput->AccessedByIntCoords = kernelArg->getImgAccessedIntCoords();
1197             imageInput->IsBindlessAccess = kernelArg->needsAllocation();
1198             imageInput->PayloadPosition = payloadPosition;
1199 
1200             switch (resInfo.Type)
1201             {
1202             case SOpenCLKernelInfo::SResourceInfo::RES_UAV:
1203                 if (kernelArg->getAccessQual() == IGC::KernelArg::AccessQual::READ_ONLY)
1204                     imageInput->Writeable = false;
1205                 else
1206                     imageInput->Writeable = true;
1207                 break;
1208             case SOpenCLKernelInfo::SResourceInfo::RES_SRV:
1209                 imageInput->Writeable = false;
1210                 break;
1211             default:
1212                 IGC_ASSERT_MESSAGE(0, "Unknown resource type");
1213                 break;
1214             }
1215             m_kernelInfo.m_imageInputAnnotations.push_back(std::move(imageInput));
1216 
1217             if (kernelArg->getAccessQual() == IGC::KernelArg::AccessQual::READ_WRITE)
1218             {
1219                 m_kernelInfo.m_executionEnivronment.HasReadWriteImages = true;
1220             }
1221         }
1222         break;
1223 
1224         case KernelArg::ArgType::SAMPLER:
1225         case KernelArg::ArgType::BINDLESS_SAMPLER:
1226         {
1227             int argNo = kernelArg->getAssociatedArgNo();
1228             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1229             m_kernelInfo.m_argIndexMap[argNo] = resInfo.Index;
1230 
1231             iOpenCL::SAMPLER_OBJECT_TYPE samplerType;
1232             if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerType) {
1233                 samplerType = iOpenCL::SAMPLER_OBJECT_VME;
1234             }
1235             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeConvolve) {
1236                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_2DCONVOLVE;
1237             }
1238             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeErode) {
1239                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_ERODE;
1240             }
1241             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeDilate) {
1242                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_DILATE;
1243             }
1244             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeMinMaxFilter) {
1245                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_MINMAXFILTER;
1246             }
1247             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeMinMax) {
1248                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_MINMAX;
1249             }
1250             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeCentroid) {
1251                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_CENTROID;
1252             }
1253             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeBoolCentroid) {
1254                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_BOOL_CENTROID;
1255             }
1256             else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeBoolSum) {
1257                 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_BOOL_SUM;
1258             }
1259             else {
1260                 samplerType = iOpenCL::SAMPLER_OBJECT_TEXTURE;
1261             }
1262 
1263             auto samplerArg = std::make_unique<iOpenCL::SamplerArgumentAnnotation>();
1264             samplerArg->SamplerType = samplerType;
1265             samplerArg->ArgumentNumber = argNo;
1266             samplerArg->SamplerTableIndex = resInfo.Index;
1267             samplerArg->LocationIndex = kernelArg->getLocationIndex();
1268             samplerArg->LocationCount = kernelArg->getLocationCount();
1269             samplerArg->IsBindlessAccess = kernelArg->needsAllocation();
1270             samplerArg->IsEmulationArgument = kernelArg->isEmulationArgument();
1271             samplerArg->PayloadPosition = payloadPosition;
1272 
1273             m_kernelInfo.m_samplerArgument.push_back(std::move(samplerArg));
1274         }
1275         break;
1276 
1277         case KernelArg::ArgType::IMPLICIT_LOCAL_IDS:
1278         {
1279             m_kernelInfo.m_threadPayload.HasLocalIDx = true;
1280             m_kernelInfo.m_threadPayload.HasLocalIDy = true;
1281             m_kernelInfo.m_threadPayload.HasLocalIDz = true;
1282 
1283             ModuleMetaData* modMD = m_Context->getModuleMetaData();
1284             auto it = modMD->FuncMD.find(entry);
1285             if (it != modMD->FuncMD.end())
1286             {
1287                 if (it->second.localIDPresent == true)
1288                     m_kernelInfo.m_threadPayload.HasLocalID = true;
1289             }
1290         }
1291         break;
1292         case KernelArg::ArgType::R1:
1293             m_kernelInfo.m_threadPayload.UnusedPerThreadConstantPresent = true;
1294             break;
1295 
1296         case KernelArg::ArgType::IMPLICIT_SYNC_BUFFER:
1297         {
1298             int argNo = kernelArg->getAssociatedArgNo();
1299             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1300             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1301 
1302             auto syncBuffer = std::make_unique<iOpenCL::SyncBufferAnnotation>();
1303 
1304             syncBuffer->ArgumentNumber = argNo;
1305             syncBuffer->PayloadPosition = payloadPosition;
1306             syncBuffer->DataSize = kernelArg->getAllocateSize();
1307 
1308             m_kernelInfo.m_syncBufferAnnotation = std::move(syncBuffer);
1309         }
1310         break;
1311 
1312         case KernelArg::ArgType::IMPLICIT_PRINTF_BUFFER:
1313         {
1314             int argNo = kernelArg->getAssociatedArgNo();
1315             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1316             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1317 
1318             auto printfBuffer = std::make_unique<iOpenCL::PrintfBufferAnnotation>();
1319 
1320             printfBuffer->ArgumentNumber = argNo;
1321             printfBuffer->PayloadPosition = payloadPosition;
1322             printfBuffer->DataSize = kernelArg->getAllocateSize();
1323             printfBuffer->Index = 0; // This value is not used by Runtime.
1324 
1325             m_kernelInfo.m_printfBufferAnnotation = std::move(printfBuffer);
1326         }
1327         break;
1328 
1329         case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_DEFAULT_DEVICE_QUEUE:
1330         {
1331             m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = true;
1332             int argNo = kernelArg->getAssociatedArgNo();
1333             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1334             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1335 
1336             auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>();
1337 
1338             ptrAnnotation->AddressSpace = iOpenCL::ADDRESS_SPACE_INTERNAL_DEFAULT_DEVICE_QUEUE;
1339             ptrAnnotation->BindingTableIndex = getBTI(resInfo);
1340             ptrAnnotation->IsStateless = true;
1341             ptrAnnotation->PayloadPosition = payloadPosition;
1342             ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
1343             ptrAnnotation->ArgumentNumber = argNo;
1344             m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation));
1345         }
1346         break;
1347 
1348         case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_EVENT_POOL:
1349         {
1350             m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = true;
1351             int argNo = kernelArg->getAssociatedArgNo();
1352             SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo);
1353             m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo);
1354 
1355             auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>();
1356             ptrAnnotation->AddressSpace = iOpenCL::ADDRESS_SPACE_INTERNAL_EVENT_POOL;
1357             ptrAnnotation->BindingTableIndex = getBTI(resInfo);
1358             ptrAnnotation->IsStateless = true;
1359             ptrAnnotation->PayloadPosition = payloadPosition;
1360             ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize();
1361             ptrAnnotation->ArgumentNumber = argNo;
1362             m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation));
1363         }
1364         break;
1365 
1366         case KernelArg::ArgType::IMPLICIT_WORK_DIM:
1367         case KernelArg::ArgType::IMPLICIT_VME_MB_BLOCK_TYPE:
1368         case KernelArg::ArgType::IMPLICIT_VME_SUBPIXEL_MODE:
1369         case KernelArg::ArgType::IMPLICIT_VME_SAD_ADJUST_MODE:
1370         case KernelArg::ArgType::IMPLICIT_VME_SEARCH_PATH_TYPE:
1371         case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_MAX_WORKGROUP_SIZE:
1372         case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_PARENT_EVENT:
1373         case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_PREFERED_WORKGROUP_MULTIPLE:
1374             constantType = kernelArg->getDataParamToken();
1375             {
1376                 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
1377 
1378                 DWORD sizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE;
1379 
1380                 constInput->ConstantType = constantType;
1381                 constInput->Offset = 0;
1382                 constInput->PayloadPosition = payloadPosition;
1383                 constInput->PayloadSizeInBytes = sizeInBytes;
1384                 constInput->ArgumentNumber = DEFAULT_ARG_NUM;
1385                 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput));
1386 
1387                 payloadPosition += sizeInBytes;
1388             }
1389             break;
1390         case KernelArg::ArgType::IMPLICIT_LOCAL_MEMORY_STATELESS_WINDOW_START_ADDRESS:
1391         {
1392             auto GASStart = std::make_unique<iOpenCL::StartGASAnnotation>();
1393             GASStart->Offset = payloadPosition;
1394             GASStart->gpuPointerSizeInBytes = kernelArg->getAllocateSize();
1395             m_kernelInfo.m_startGAS = std::move(GASStart);
1396         }
1397         break;
1398         case KernelArg::ArgType::IMPLICIT_LOCAL_MEMORY_STATELESS_WINDOW_SIZE:
1399         {
1400             auto winSizeGAS = std::make_unique<iOpenCL::WindowSizeGASAnnotation>();
1401 
1402             winSizeGAS->Offset = payloadPosition;
1403             m_kernelInfo.m_WindowSizeGAS = std::move(winSizeGAS);
1404         }
1405         break;
1406         case KernelArg::ArgType::IMPLICIT_PRIVATE_MEMORY_STATELESS_SIZE:
1407         {
1408             auto privateMemSize = std::make_unique<iOpenCL::PrivateMemSizeAnnotation>();
1409 
1410             privateMemSize->Offset = payloadPosition;
1411             m_kernelInfo.m_PrivateMemSize = std::move(privateMemSize);
1412         }
1413         break;
1414         default:
1415             // Do nothing
1416             break;
1417         }
1418 
1419 
1420         // DATA_PARAMETER_BUFFER_STATEFUL
1421         //   ( SPatchDataParameterBuffer for this token only uses one field: ArgumentNumber )
1422         //   Used to indicate that all memory references via a global/constant ptr argument are
1423         //   converted to stateful (by StatelessToStateful optimization). Thus, the ptr itself
1424         //   is no longer referenced at all.
1425         //
1426         if (IGC_IS_FLAG_ENABLED(EnableStatelessToStatefull) &&
1427             IGC_IS_FLAG_ENABLED(EnableStatefulToken) &&
1428             m_DriverInfo->SupportStatefulToken() &&
1429             arg &&
1430             ((type == KernelArg::ArgType::PTR_GLOBAL &&
1431             (arg->use_empty() || !GetHasGlobalStatelessAccess())) ||
1432                 (type == KernelArg::ArgType::PTR_CONSTANT &&
1433                 (arg->use_empty() || !GetHasConstantStatelessAccess()))))
1434         {
1435             auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
1436 
1437             constInput->ConstantType = iOpenCL::DATA_PARAMETER_BUFFER_STATEFUL;
1438             constInput->Offset = 0;
1439             constInput->PayloadPosition = payloadPosition;
1440             constInput->PayloadSizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE;
1441             constInput->ArgumentNumber = kernelArg->getAssociatedArgNo(); // used only for this token.
1442             m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput));
1443         }
1444     }
1445 
getImageTypeFromKernelArg(const KernelArg & kernelArg)1446    iOpenCL::IMAGE_MEMORY_OBJECT_TYPE COpenCLKernel::getImageTypeFromKernelArg(const KernelArg& kernelArg)
1447    {
1448        switch(kernelArg.getArgType()) {
1449            case KernelArg::ArgType::IMAGE_1D:
1450            case KernelArg::ArgType::BINDLESS_IMAGE_1D:
1451                return iOpenCL::IMAGE_MEMORY_OBJECT_1D;
1452 
1453            case KernelArg::ArgType::IMAGE_1D_BUFFER:
1454            case KernelArg::ArgType::BINDLESS_IMAGE_1D_BUFFER:
1455                return iOpenCL::IMAGE_MEMORY_OBJECT_BUFFER;
1456 
1457            case KernelArg::ArgType::IMAGE_2D:
1458            case KernelArg::ArgType::BINDLESS_IMAGE_2D:
1459                if (getExtensionInfo(kernelArg.getAssociatedArgNo()) == ResourceExtensionTypeEnum::MediaResourceType)
1460                    return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA;
1461                else if (getExtensionInfo(kernelArg.getAssociatedArgNo()) == ResourceExtensionTypeEnum::MediaResourceBlockType)
1462                    return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA_BLOCK;
1463                return iOpenCL::IMAGE_MEMORY_OBJECT_2D;
1464 
1465            case KernelArg::ArgType::IMAGE_3D:
1466            case KernelArg::ArgType::BINDLESS_IMAGE_3D:
1467                return iOpenCL::IMAGE_MEMORY_OBJECT_3D;
1468 
1469            case KernelArg::ArgType::IMAGE_CUBE:
1470            case KernelArg::ArgType::BINDLESS_IMAGE_CUBE:
1471                return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE;
1472 
1473            case KernelArg::ArgType::IMAGE_CUBE_DEPTH:
1474            case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH:
1475                // Use regular cube texture for depth:
1476                return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE;
1477 
1478            case KernelArg::ArgType::IMAGE_1D_ARRAY:
1479            case KernelArg::ArgType::BINDLESS_IMAGE_1D_ARRAY:
1480                return iOpenCL::IMAGE_MEMORY_OBJECT_1D_ARRAY;
1481 
1482            case KernelArg::ArgType::IMAGE_2D_ARRAY:
1483            case KernelArg::ArgType::BINDLESS_IMAGE_2D_ARRAY:
1484                return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY;
1485 
1486 
1487            case KernelArg::ArgType::IMAGE_2D_DEPTH:
1488            case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH:
1489                return iOpenCL::IMAGE_MEMORY_OBJECT_2D_DEPTH;
1490 
1491            case KernelArg::ArgType::IMAGE_2D_DEPTH_ARRAY:
1492            case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH_ARRAY:
1493                return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY_DEPTH;
1494 
1495            case KernelArg::ArgType::IMAGE_2D_MSAA:
1496            case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA:
1497                return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MSAA;
1498 
1499            case KernelArg::ArgType::IMAGE_2D_MSAA_ARRAY:
1500            case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_ARRAY:
1501                return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY_MSAA;
1502 
1503            case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH:
1504            case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH:
1505                return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MSAA_DEPTH;
1506 
1507            case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH_ARRAY:
1508            case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH_ARRAY:
1509                return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY_MSAA_DEPTH;
1510 
1511            case KernelArg::ArgType::IMAGE_CUBE_ARRAY:
1512            case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_ARRAY:
1513                return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE_ARRAY;
1514 
1515            case KernelArg::ArgType::IMAGE_CUBE_DEPTH_ARRAY:
1516            case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY:
1517                // Use regular cube texture array for depth
1518                return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE_ARRAY;
1519 
1520            default:
1521                break;
1522        }
1523        return iOpenCL::IMAGE_MEMORY_OBJECT_INVALID;
1524    }
1525 
ParseShaderSpecificOpcode(llvm::Instruction * inst)1526     void COpenCLKernel::ParseShaderSpecificOpcode(llvm::Instruction* inst)
1527     {
1528         auto setStatelessAccess = [&](unsigned AS) {
1529             if (AS == ADDRESS_SPACE_GLOBAL ||
1530                 AS == ADDRESS_SPACE_GENERIC ||
1531                 AS == ADDRESS_SPACE_GLOBAL_OR_PRIVATE)
1532             {
1533                 SetHasGlobalStatelessAccess();
1534             }
1535 
1536             if (AS == ADDRESS_SPACE_CONSTANT)
1537             {
1538                 SetHasConstantStatelessAccess();
1539             }
1540         };
1541 
1542         // Currently we see data corruption when we have IEEE macros and midthread preemption enabled.
1543         // Adding a temporary work around to disable mid thread preemption when we see IEEE Macros.
1544         switch (inst->getOpcode())
1545         {
1546         case Instruction::FDiv:
1547             if (inst->getType()->isDoubleTy())
1548             {
1549                 SetDisableMidthreadPreemption();
1550             }
1551             break;
1552         case Instruction::Call:
1553             if (inst->getType()->isDoubleTy())
1554             {
1555                 if (GetOpCode(inst) == llvm_sqrt)
1556                 {
1557                     SetDisableMidthreadPreemption();
1558                 }
1559             }
1560             break;
1561         case Instruction::Load:
1562         {
1563             unsigned AS = cast<LoadInst>(inst)->getPointerAddressSpace();
1564             setStatelessAccess(AS);
1565             break;
1566         }
1567         case Instruction::Store:
1568         {
1569             unsigned AS = cast<StoreInst>(inst)->getPointerAddressSpace();
1570             setStatelessAccess(AS);
1571             break;
1572         }
1573         default:
1574             break;
1575         }
1576 
1577         if (CallInst * CallI = dyn_cast<CallInst>(inst))
1578         {
1579             bool mayHasMemoryAccess = true;  // for checking stateless access
1580             if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(CallI))
1581             {
1582                 GenISAIntrinsic::ID id = GII->getIntrinsicID();
1583                 switch (id)
1584                 {
1585                 default:
1586                     break;
1587                 case GenISAIntrinsic::GenISA_dpas:
1588                 case GenISAIntrinsic::GenISA_sub_group_dpas:
1589                     SetHasDPAS();
1590                     break;
1591                 case GenISAIntrinsic::GenISA_ptr_to_pair:
1592                 case GenISAIntrinsic::GenISA_pair_to_ptr:
1593                     mayHasMemoryAccess = false;
1594                     break;
1595                 } // End of switch
1596             }
1597 
1598             if (mayHasMemoryAccess)
1599             {
1600                 // Checking stateless access info
1601                 if (!isa<IntrinsicInst>(CallI) && !isa<GenIntrinsicInst>(CallI)) {
1602                     // function/subroutine call. Give up
1603                     SetHasConstantStatelessAccess();
1604                     SetHasGlobalStatelessAccess();
1605                 }
1606                 else
1607                 {
1608                     for (int i = 0, e = (int)CallI->getNumArgOperands(); i < e; ++i)
1609                     {
1610                         Value* arg = CallI->getArgOperand(i);
1611                         PointerType* PTy = dyn_cast<PointerType>(arg->getType());
1612                         if (!PTy)
1613                             continue;
1614                         unsigned AS = PTy->getAddressSpace();
1615                         setStatelessAccess(AS);
1616                     }
1617                 }
1618             }
1619         }
1620     }
1621 
AllocatePayload()1622     void COpenCLKernel::AllocatePayload()
1623     {
1624         IGC_ASSERT(m_Context);
1625 
1626         bool loadThreadPayload = false;
1627 
1628         loadThreadPayload = m_Platform->supportLoadThreadPayloadForCompute();
1629 
1630         // SKL defaults to indirect thread payload storage.
1631         // BDW needs CURBE payload. Spec says:
1632         // "CURBE should be used for the payload when using indirect dispatch rather than indirect payload".
1633         m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage = true;
1634         if (IGC_IS_FLAG_ENABLED(DisableGPGPUIndirectPayload) ||
1635             m_Context->platform.getWATable().WaDisableIndirectDataForIndirectDispatch)
1636         {
1637             m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage = false;
1638         }
1639         if (loadThreadPayload)
1640         {
1641             m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage = true;
1642         }
1643         m_kernelInfo.m_threadPayload.HasFlattenedLocalID = false;
1644         m_kernelInfo.m_threadPayload.HasLocalIDx = false;
1645         m_kernelInfo.m_threadPayload.HasLocalIDy = false;
1646         m_kernelInfo.m_threadPayload.HasLocalIDz = false;
1647         m_kernelInfo.m_threadPayload.HasGlobalIDOffset = false;
1648         m_kernelInfo.m_threadPayload.HasGroupID = false;
1649         m_kernelInfo.m_threadPayload.HasLocalID = false;
1650         m_kernelInfo.m_threadPayload.UnusedPerThreadConstantPresent = false;
1651         m_kernelInfo.m_printfBufferAnnotation = nullptr;
1652         m_kernelInfo.m_syncBufferAnnotation = nullptr;
1653         m_kernelInfo.m_threadPayload.HasStageInGridOrigin = false;
1654         m_kernelInfo.m_threadPayload.HasStageInGridSize = false;
1655 
1656         // Set the amount of the private memory used by the kernel
1657         // Set only if the private memory metadata actually exists and we don't use
1658         // scratch space for private memory.
1659         bool noScratchSpacePrivMem = !m_Context->getModuleMetaData()->compOpt.UseScratchSpacePrivateMemory;
1660 
1661         auto funcMD = m_Context->getModuleMetaData()->FuncMD.find(entry);
1662         if (noScratchSpacePrivMem && (funcMD != m_Context->getModuleMetaData()->FuncMD.end()) && funcMD->second.privateMemoryPerWI)
1663         {
1664             m_perWIStatelessPrivateMemSize = funcMD->second.privateMemoryPerWI;
1665         }
1666 
1667 
1668         m_ConstantBufferLength = 0;
1669         m_NOSBufferSize = 0;
1670 
1671         uint offset = 0;
1672 
1673         uint constantBufferStart = 0;
1674         bool constantBufferStartSet = false;
1675 
1676         uint prevOffset = 0;
1677         bool nosBufferAllocated = false;
1678 
1679         KernelArgsOrder::InputType layout =
1680             m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage ?
1681             KernelArgsOrder::InputType::INDIRECT :
1682             KernelArgsOrder::InputType::CURBE;
1683 
1684         KernelArgs kernelArgs(*entry, m_DL, m_pMdUtils, m_ModuleMetadata, getGRFSize(), layout);
1685 
1686         if (layout == KernelArgsOrder::InputType::INDIRECT && !loadThreadPayload)
1687         {
1688             kernelArgs.checkForZeroPerThreadData();
1689         }
1690 
1691         for (KernelArgs::const_iterator i = kernelArgs.begin(), e = kernelArgs.end(); i != e; ++i)
1692         {
1693             KernelArg arg = *i;
1694             prevOffset = offset;
1695 
1696             // skip unused arguments
1697             bool IsUnusedArg = (arg.getArgType() == KernelArg::ArgType::IMPLICIT_BUFFER_OFFSET) &&
1698                 arg.getArg()->use_empty();
1699 
1700             // Runtime Values should not be processed any further. No annotations shall be created for them.
1701             // Only added to KernelArgs to enforce correct allocation order.
1702             bool isRuntimeValue = (arg.getArgType() == KernelArg::ArgType::RUNTIME_VALUE);
1703 
1704             if (!constantBufferStartSet && arg.isConstantBuf())
1705             {
1706                 constantBufferStart = offset;
1707                 constantBufferStartSet = true;
1708             }
1709 
1710             if (!nosBufferAllocated && isRuntimeValue) {
1711                 IGC_ASSERT_MESSAGE(arg.isConstantBuf(), "RuntimeValues must be marked as isConstantBuf");
1712                 AllocateNOSConstants(offset);
1713                 nosBufferAllocated = true;
1714             }
1715 
1716             // Local IDs are non-uniform and may have two instances in SIMD32 mode
1717             int numAllocInstances = arg.getArgType() == KernelArg::ArgType::IMPLICIT_LOCAL_IDS ? m_numberInstance : 1;
1718 
1719             auto allocSize = arg.getAllocateSize();
1720 
1721             if (arg.getArgType() == KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER)
1722             {
1723                 allocSize = getLocalIdBufferSize(m_dispatchSize);
1724             }
1725 
1726             if (!IsUnusedArg && !isRuntimeValue)
1727             {
1728                 if (arg.needsAllocation())
1729                 {
1730                     // Align on the desired alignment for this argument
1731                     auto alignment = arg.getAlignment();
1732 
1733                     offset = iSTD::Align(offset, alignment);
1734 
1735                     // Arguments larger than a GRF must be at least GRF-aligned.
1736                     // Arguments smaller than a GRF may not cross GRF boundaries.
1737                     // This means that arguments that cross a GRF boundary
1738                     // must be GRF aligned.
1739                     // Note that this is done AFTER we align on the base alignment,
1740                     // because of edge cases where aligning on the base alignment
1741                     // is what causes the "overflow".
1742                     unsigned int startGRF = offset / getGRFSize();
1743                     unsigned int endGRF = (offset + allocSize - 1) / getGRFSize();
1744                     if (startGRF != endGRF)
1745                     {
1746                         offset = iSTD::Align(offset, getGRFSize());
1747                     }
1748 
1749                     // And now actually tell vISA we need this space.
1750                     // (Except for r0, which is a predefined variable, and should never be allocated as input!)
1751                     const llvm::Argument* A = arg.getArg();
1752                     if (A != nullptr && arg.getArgType() != KernelArg::ArgType::IMPLICIT_R0)
1753                     {
1754                         CVariable* var = GetSymbol(const_cast<Argument*>(A));
1755                         for (int i = 0; i < numAllocInstances; ++i)
1756                         {
1757                             uint totalOffset = offset + (allocSize * i);
1758                             if ((totalOffset / getGRFSize()) >= m_Context->getNumGRFPerThread())
1759                             {
1760                                 m_Context->EmitError("Kernel inputs exceed total register size!", A);
1761                                 return;
1762                             }
1763                             AllocateInput(var, totalOffset, i);
1764                         }
1765                     }
1766                     // or else we would just need to increase an offset
1767                 }
1768 
1769                 // Create annotations for the kernel argument
1770                 // If an arg is unused, don't generate patch token for it.
1771                 CreateAnnotations(&arg, offset - constantBufferStart);
1772                 if (IGC_IS_FLAG_ENABLED(EnableZEBinary) ||
1773                     m_Context->getCompilerOption().EnableZEBinary) {
1774                     // FIXME: once we transit to zebin completely, we don't need to do
1775                     // CreateAnnotations above. Only CreateZEPayloadArguments is required
1776 
1777                     // During the transition, we disable ZEBinary if there are unsupported
1778                     // arguments
1779                     bool success = CreateZEPayloadArguments(&arg, offset - constantBufferStart);
1780                     if (!success) {
1781                         // assertion tests if we force to EnableZEBinary but encounter unsupported features
1782                         IGC_ASSERT_MESSAGE(!IGC_IS_FLAG_ENABLED(EnableZEBinary),
1783                             "ZEBin: unsupported KernelArg Type");
1784 
1785                         // fall back to patch-token if ZEBinary is enabled by CodeGenContext::CompOptions
1786                         if (m_Context->getCompilerOption().EnableZEBinary)
1787                             m_Context->getCompilerOption().EnableZEBinary = false;
1788                     }
1789                 }
1790                 if (arg.needsAllocation())
1791                 {
1792                     for (int i = 0; i < numAllocInstances; ++i)
1793                     {
1794                         offset += allocSize;
1795                     }
1796                 }
1797             }
1798 
1799             if (arg.isConstantBuf())
1800             {
1801                 m_ConstantBufferLength += offset - prevOffset;
1802             }
1803         }
1804 
1805         // ToDo: we should avoid passing all three dimensions of local id
1806         if (m_kernelInfo.m_threadPayload.HasLocalIDx ||
1807             m_kernelInfo.m_threadPayload.HasLocalIDy ||
1808             m_kernelInfo.m_threadPayload.HasLocalIDz)
1809         {
1810             if (loadThreadPayload)
1811             {
1812                 uint perThreadInputSize = SIZE_WORD * 3 * (m_dispatchSize == SIMDMode::SIMD32 ? 32 : 16);
1813                 if (m_dispatchSize == SIMDMode::SIMD16 && getGRFSize() == 64)
1814                 {
1815                     perThreadInputSize *= 2;
1816                 }
1817                 encoder.GetVISAKernel()->AddKernelAttribute("PerThreadInputSize", sizeof(uint16_t), &perThreadInputSize);
1818             }
1819         }
1820 
1821         m_kernelInfo.m_threadPayload.OffsetToSkipPerThreadDataLoad = 0;
1822         m_kernelInfo.m_threadPayload.OffsetToSkipSetFFIDGP = 0;
1823 
1824         m_ConstantBufferLength = iSTD::Align(m_ConstantBufferLength, getGRFSize());
1825 
1826         CreateInlineSamplerAnnotations();
1827         // Currently we can't support inline sampler in zebin
1828         // assertion tests if we force to EnableZEBinary but encounter inline sampler
1829         bool hasInlineSampler = m_kernelInfo.m_HasInlineVmeSamplers || !m_kernelInfo.m_samplerInput.empty();
1830         IGC_ASSERT_MESSAGE(!IGC_IS_FLAG_ENABLED(EnableZEBinary) || !hasInlineSampler,
1831             "ZEBin: Inline sampler unsupported");
1832         // fall back to patch-token if ZEBinary is enabled by CodeGenContext::CompOptions
1833         if (m_Context->getCompilerOption().EnableZEBinary && hasInlineSampler)
1834             m_Context->getCompilerOption().EnableZEBinary = false;
1835 
1836         // Handle kernel reflection
1837         CreateKernelArgInfo();
1838         CreateKernelAttributeInfo();
1839 
1840         // Create annotations for printf string.
1841         CreatePrintfStringAnnotations();
1842     }
1843 
1844 
GetGlobalMappingValue(llvm::Value * c)1845     unsigned int COpenCLKernel::GetGlobalMappingValue(llvm::Value* c)
1846     {
1847         unsigned int val = 0;
1848         auto localIter = m_localOffsetsMap.find(c);
1849         if (localIter != m_localOffsetsMap.end())
1850         {
1851             val = localIter->second;
1852         }
1853         else
1854         {
1855             IGC_ASSERT_MESSAGE(0, "Trying to access a GlobalVariable not in locals map");
1856         }
1857         return val;
1858     }
1859 
GetGlobalMapping(llvm::Value * c)1860     CVariable* COpenCLKernel::GetGlobalMapping(llvm::Value* c)
1861     {
1862         unsigned int val = GetGlobalMappingValue(c);
1863 
1864         VISA_Type type = GetType(c->getType());
1865         return ImmToVariable(val, type);
1866     }
1867 
getSumFixedTGSMSizes(Function * F)1868     unsigned int COpenCLKernel::getSumFixedTGSMSizes(Function* F)
1869     {
1870         // Find whether we have size information for this kernel.
1871         // If not, then the total TGSM is 0, otherwise pull it from the MD
1872         ModuleMetaData* modMD = m_Context->getModuleMetaData();
1873         auto funcMD = modMD->FuncMD.find(F);
1874         if (funcMD == modMD->FuncMD.end())
1875         {
1876             return 0;
1877         }
1878         return funcMD->second.localSize;
1879     }
1880 
FillKernel()1881     void COpenCLKernel::FillKernel()
1882     {
1883         m_kernelInfo.m_executionEnivronment.PerThreadScratchSpace = ProgramOutput()->getScratchSpaceUsageInSlot0();
1884         m_kernelInfo.m_executionEnivronment.PerThreadScratchSpaceSlot1 = ProgramOutput()->getScratchSpaceUsageInSlot1();
1885         m_kernelInfo.m_executionEnivronment.PerThreadPrivateOnStatelessSize = m_perWIStatelessPrivateMemSize;
1886         m_kernelInfo.m_kernelProgram.NOSBufferSize = m_NOSBufferSize / getGRFSize(); // in 256 bits
1887         m_kernelInfo.m_kernelProgram.ConstantBufferLength = m_ConstantBufferLength / getGRFSize(); // in 256 bits
1888         m_kernelInfo.m_kernelProgram.MaxNumberOfThreads = m_Platform->getMaxGPGPUShaderThreads() / GetShaderThreadUsageRate();
1889 
1890         m_kernelInfo.m_executionEnivronment.SumFixedTGSMSizes = getSumFixedTGSMSizes(entry);
1891         m_kernelInfo.m_executionEnivronment.HasBarriers = this->GetHasBarrier();
1892         m_kernelInfo.m_executionEnivronment.DisableMidThreadPreemption = GetDisableMidThreadPreemption();
1893         m_kernelInfo.m_executionEnivronment.SubgroupIndependentForwardProgressRequired =
1894             m_Context->getModuleMetaData()->compOpt.SubgroupIndependentForwardProgressRequired;
1895         m_kernelInfo.m_executionEnivronment.CompiledForGreaterThan4GBBuffers =
1896             m_Context->getModuleMetaData()->compOpt.GreaterThan4GBBufferRequired;
1897         IGC_ASSERT(gatherMap.size() == 0);
1898         m_kernelInfo.m_kernelProgram.gatherMapSize = 0;
1899         m_kernelInfo.m_kernelProgram.bindingTableEntryCount = 0;
1900 
1901         m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = false;
1902         m_kernelInfo.m_executionEnivronment.IsSingleProgramFlow = false;
1903         //m_kernelInfo.m_executionEnivronment.PerSIMDLanePrivateMemorySize = m_perWIStatelessPrivateMemSize;
1904         m_kernelInfo.m_executionEnivronment.HasFixedWorkGroupSize = false;
1905         m_kernelInfo.m_kernelName = entry->getName().str();
1906         m_kernelInfo.m_ShaderHashCode = m_Context->hash.getAsmHash();
1907 
1908         FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry);
1909         ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize();
1910         SubGroupSizeMetaDataHandle subGroupSize = funcInfoMD->getSubGroupSize();
1911 
1912         if (threadGroupSize->hasValue())
1913         {
1914             m_kernelInfo.m_executionEnivronment.HasFixedWorkGroupSize = true;
1915             m_kernelInfo.m_executionEnivronment.FixedWorkgroupSize[0] = threadGroupSize->getXDim();
1916             m_kernelInfo.m_executionEnivronment.FixedWorkgroupSize[1] = threadGroupSize->getYDim();
1917             m_kernelInfo.m_executionEnivronment.FixedWorkgroupSize[2] = threadGroupSize->getZDim();
1918         }
1919         if (subGroupSize->hasValue())
1920         {
1921             m_kernelInfo.m_executionEnivronment.CompiledSIMDSize = subGroupSize->getSIMD_size();
1922         }
1923 
1924         auto& FuncMap = m_Context->getModuleMetaData()->FuncMD;
1925         auto FuncIter = FuncMap.find(entry);
1926         if (FuncIter != FuncMap.end())
1927         {
1928             IGC::FunctionMetaData funcMD = FuncIter->second;
1929             WorkGroupWalkOrderMD workGroupWalkOrder = funcMD.workGroupWalkOrder;
1930 
1931             if (workGroupWalkOrder.dim0 || workGroupWalkOrder.dim1 || workGroupWalkOrder.dim2)
1932             {
1933                 m_kernelInfo.m_executionEnivronment.WorkgroupWalkOrder[0] = workGroupWalkOrder.dim0;
1934                 m_kernelInfo.m_executionEnivronment.WorkgroupWalkOrder[1] = workGroupWalkOrder.dim1;
1935                 m_kernelInfo.m_executionEnivronment.WorkgroupWalkOrder[2] = workGroupWalkOrder.dim2;
1936             }
1937 
1938             m_kernelInfo.m_executionEnivronment.IsInitializer = funcMD.IsInitializer;
1939             m_kernelInfo.m_executionEnivronment.IsFinalizer = funcMD.IsFinalizer;
1940 
1941             m_kernelInfo.m_executionEnivronment.CompiledSubGroupsNumber = funcMD.CompiledSubGroupsNumber;
1942 
1943         }
1944 
1945         m_kernelInfo.m_executionEnivronment.HasGlobalAtomics = GetHasGlobalAtomics();
1946         m_kernelInfo.m_threadPayload.OffsetToSkipPerThreadDataLoad = ProgramOutput()->m_offsetToSkipPerThreadDataLoad;
1947         m_kernelInfo.m_threadPayload.OffsetToSkipSetFFIDGP = ProgramOutput()->m_offsetToSkipSetFFIDGP;
1948 
1949         m_kernelInfo.m_executionEnivronment.NumGRFRequired = ProgramOutput()->m_numGRFTotal;
1950 
1951 
1952         m_kernelInfo.m_executionEnivronment.UseBindlessMode = m_Context->m_InternalOptions.UseBindlessMode;
1953         m_kernelInfo.m_executionEnivronment.HasStackCalls = HasStackCalls();
1954     }
1955 
RecomputeBTLayout()1956     void COpenCLKernel::RecomputeBTLayout()
1957     {
1958         CodeGenContext* pCtx = GetContext();
1959         ModuleMetaData* modMD = pCtx->getModuleMetaData();
1960         FunctionMetaData* funcMD = &modMD->FuncMD[entry];
1961         ResourceAllocMD* resAllocMD = &funcMD->resAllocMD;
1962         // Get the number of UAVs and Resources from MD.
1963         int numUAVs = resAllocMD->uavsNumType;
1964         int numResources = resAllocMD->srvsNumType;
1965 
1966         // Now, update the layout information
1967         USC::SShaderStageBTLayout* layout = ((COCLBTILayout*)m_pBtiLayout)->getModifiableLayout();
1968 
1969         // The BT layout contains the minimum and the maximum number BTI for each kind
1970         // of resource. E.g. UAVs may be mapped to BTIs 0..3, SRVs to 4..5, and the scratch
1971         // surface to 6.
1972         // Note that the names are somewhat misleading. They are used for the sake of consistency
1973         // with the ICBE sources.
1974 
1975         // Some fields are always 0 for OCL.
1976         layout->resourceNullBoundOffset = 0;
1977         layout->immediateConstantBufferOffset = 0;
1978         layout->interfaceConstantBufferOffset = 0;
1979         layout->constantBufferNullBoundOffset = 0;
1980         layout->JournalIdx = 0;
1981         layout->JournalCounterIdx = 0;
1982 
1983         // And TGSM (aka SLM) is always 254.
1984         layout->TGSMIdx = 254;
1985 
1986         int index = 0;
1987 
1988         // First, allocate BTI for debug surface
1989         if (m_Context->m_InternalOptions.KernelDebugEnable)
1990         {
1991             layout->systemThreadIdx = index++;
1992         }
1993 
1994         // Now, allocate BTIs for all the SRVs.
1995         layout->minResourceIdx = index;
1996         if (numResources)
1997         {
1998             index += numResources - 1;
1999             layout->maxResourceIdx = index++;
2000         }
2001         else
2002         {
2003             layout->maxResourceIdx = index;
2004         }
2005 
2006         // Now, ConstantBuffers - used as a placeholder for the inline constants, if present.
2007         layout->minConstantBufferIdx = index;
2008         layout->maxConstantBufferIdx = index;
2009 
2010         // Now, the UAVs
2011         layout->minUAVIdx = index;
2012         if (numUAVs)
2013         {
2014             index += numUAVs - 1;
2015             layout->maxUAVIdx = index++;
2016         }
2017         else
2018         {
2019             layout->maxUAVIdx = index;
2020         }
2021 
2022         // And finally, the scratch surface
2023         layout->surfaceScratchIdx = index++;
2024 
2025         // Overall number of used BT entries, not including TGSM.
2026         layout->maxBTsize = index;
2027     }
2028 
HasFullDispatchMask()2029     bool COpenCLKernel::HasFullDispatchMask()
2030     {
2031         unsigned int groupSize = IGCMetaDataHelper::getThreadGroupSize(*m_pMdUtils, entry);
2032         if (groupSize != 0)
2033         {
2034             if (groupSize % numLanes(m_dispatchSize) == 0)
2035             {
2036                 return true;
2037             }
2038         }
2039         return false;
2040     }
2041 
getBTI(SOpenCLKernelInfo::SResourceInfo & resInfo)2042     unsigned int COpenCLKernel::getBTI(SOpenCLKernelInfo::SResourceInfo& resInfo)
2043     {
2044         switch (resInfo.Type)
2045         {
2046         case SOpenCLKernelInfo::SResourceInfo::RES_UAV:
2047             return m_pBtiLayout->GetUavIndex(resInfo.Index);
2048         case SOpenCLKernelInfo::SResourceInfo::RES_SRV:
2049             return m_pBtiLayout->GetTextureIndex(resInfo.Index);
2050         default:
2051             return 0xffffffff;
2052         }
2053     }
2054 
CollectProgramInfo(OpenCLProgramContext * ctx)2055     void CollectProgramInfo(OpenCLProgramContext* ctx)
2056     {
2057         MetaDataUtils mdUtils(ctx->getModule());
2058         ModuleMetaData* modMD = ctx->getModuleMetaData();
2059 
2060         if (!modMD->inlineConstantBuffers.empty())
2061         {
2062             // For ZeBin, constants are mantained in two separate buffers
2063             // the first is for general constants, and the second for string literals
2064 
2065             // General constants
2066             auto ipsbMDHandle = modMD->inlineConstantBuffers[0];
2067             std::unique_ptr<iOpenCL::InitConstantAnnotation> initConstant(new iOpenCL::InitConstantAnnotation());
2068             initConstant->Alignment = ipsbMDHandle.alignment;
2069             initConstant->AllocSize = ipsbMDHandle.allocSize;
2070 
2071             size_t bufferSize = (ipsbMDHandle.Buffer).size();
2072             initConstant->InlineData.resize(bufferSize);
2073             memcpy_s(initConstant->InlineData.data(), bufferSize, ipsbMDHandle.Buffer.data(), bufferSize);
2074 
2075             ctx->m_programInfo.m_initConstantAnnotation = std::move(initConstant);
2076 
2077             if (IGC_IS_FLAG_ENABLED(EnableZEBinary) ||
2078                 modMD->compOpt.EnableZEBinary)
2079             {
2080                 // String literals
2081                 auto ipsbStringMDHandle = modMD->inlineConstantBuffers[1];
2082                 std::unique_ptr<iOpenCL::InitConstantAnnotation> initStringConstant(new iOpenCL::InitConstantAnnotation());
2083                 initStringConstant->Alignment = ipsbStringMDHandle.alignment;
2084                 initStringConstant->AllocSize = ipsbStringMDHandle.allocSize;
2085 
2086                 bufferSize = (ipsbStringMDHandle.Buffer).size();
2087                 initStringConstant->InlineData.resize(bufferSize);
2088                 memcpy_s(initStringConstant->InlineData.data(), bufferSize, ipsbStringMDHandle.Buffer.data(), bufferSize);
2089 
2090                 ctx->m_programInfo.m_initConstantStringAnnotation = std::move(initStringConstant);
2091             }
2092         }
2093 
2094         if (!modMD->inlineGlobalBuffers.empty())
2095         {
2096             auto ipsbMDHandle = modMD->inlineGlobalBuffers[0];
2097 
2098             std::unique_ptr<iOpenCL::InitGlobalAnnotation> initGlobal(new iOpenCL::InitGlobalAnnotation());
2099             initGlobal->Alignment = ipsbMDHandle.alignment;
2100             initGlobal->AllocSize = ipsbMDHandle.allocSize;
2101 
2102             size_t bufferSize = (ipsbMDHandle.Buffer).size();
2103             initGlobal->InlineData.resize(bufferSize);
2104             memcpy_s(initGlobal->InlineData.data(), bufferSize, ipsbMDHandle.Buffer.data(), bufferSize);
2105 
2106             ctx->m_programInfo.m_initGlobalAnnotation = std::move(initGlobal);
2107         }
2108 
2109         {
2110             auto& FuncMap = ctx->getModuleMetaData()->FuncMD;
2111             for (const auto& i : FuncMap)
2112             {
2113                 std::unique_ptr<iOpenCL::KernelTypeProgramBinaryInfo> initConstant(new iOpenCL::KernelTypeProgramBinaryInfo());
2114                 initConstant->KernelName = i.first->getName().str();
2115                 if (i.second.IsFinalizer)
2116                 {
2117 
2118                     initConstant->Type = iOpenCL::PROGRAM_SCOPE_KERNEL_DESTRUCTOR;
2119                     ctx->m_programInfo.m_initKernelTypeAnnotation.push_back(std::move(initConstant));
2120                 }
2121                 else if (i.second.IsInitializer)
2122                 {
2123                     initConstant->Type = iOpenCL::PROGRAM_SCOPE_KERNEL_CONSTRUCTOR;
2124                     ctx->m_programInfo.m_initKernelTypeAnnotation.push_back(std::move(initConstant));
2125                 }
2126 
2127             }
2128         }
2129 
2130         for (const auto& globPtrInfo : modMD->GlobalPointerProgramBinaryInfos)
2131         {
2132             auto initGlobalPointer = std::make_unique<iOpenCL::GlobalPointerAnnotation>();
2133             initGlobalPointer->PointeeAddressSpace = globPtrInfo.PointeeAddressSpace;
2134             initGlobalPointer->PointeeBufferIndex = globPtrInfo.PointeeBufferIndex;
2135             initGlobalPointer->PointerBufferIndex = globPtrInfo.PointerBufferIndex;
2136             initGlobalPointer->PointerOffset = globPtrInfo.PointerOffset;
2137             ctx->m_programInfo.m_initGlobalPointerAnnotation.push_back(std::move(initGlobalPointer));
2138         }
2139 
2140         for (const auto& constPtrInfo : modMD->ConstantPointerProgramBinaryInfos)
2141         {
2142             auto  initConstantPointer = std::make_unique<iOpenCL::ConstantPointerAnnotation>();
2143             initConstantPointer->PointeeAddressSpace = constPtrInfo.PointeeAddressSpace;
2144             initConstantPointer->PointeeBufferIndex = constPtrInfo.PointeeBufferIndex;
2145             initConstantPointer->PointerBufferIndex = constPtrInfo.PointerBufferIndex;
2146             initConstantPointer->PointerOffset = constPtrInfo.PointerOffset;
2147 
2148             ctx->m_programInfo.m_initConstantPointerAnnotation.push_back(std::move(initConstantPointer));
2149         }
2150 
2151         // Pointer address relocation table data for GLOBAL buffer
2152         for (const auto& globalRelocEntry : modMD->GlobalBufferAddressRelocInfo)
2153         {
2154             ctx->m_programInfo.m_GlobalPointerAddressRelocAnnotation.globalReloc.emplace_back(
2155                 (globalRelocEntry.PointerSize == 8) ? vISA::GenRelocType::R_SYM_ADDR : vISA::GenRelocType::R_SYM_ADDR_32,
2156                 (uint32_t)globalRelocEntry.BufferOffset,
2157                 globalRelocEntry.Symbol);
2158         }
2159         // Pointer address relocation table data for CONST buffer
2160         for (const auto& constRelocEntry : modMD->ConstantBufferAddressRelocInfo)
2161         {
2162             ctx->m_programInfo.m_GlobalPointerAddressRelocAnnotation.globalConstReloc.emplace_back(
2163                 (constRelocEntry.PointerSize == 8) ? vISA::GenRelocType::R_SYM_ADDR : vISA::GenRelocType::R_SYM_ADDR_32,
2164                 (uint32_t)constRelocEntry.BufferOffset,
2165                 constRelocEntry.Symbol);
2166         }
2167     }
2168 
GatherDataForDriver(OpenCLProgramContext * ctx,COpenCLKernel * pShader,CShaderProgram * pKernel,Function * pFunc,MetaDataUtils * pMdUtils)2169     void GatherDataForDriver(OpenCLProgramContext* ctx, COpenCLKernel* pShader, CShaderProgram* pKernel, Function* pFunc, MetaDataUtils* pMdUtils)
2170     {
2171         IGC_ASSERT(pShader != nullptr);
2172         pShader->FillKernel();
2173         SProgramOutput* pOutput = pShader->ProgramOutput();
2174 
2175         //  Need a better heuristic for NoRetry
2176         FunctionInfoMetaDataHandle funcInfoMD = pMdUtils->getFunctionsInfoItem(pFunc);
2177         int subGrpSize = funcInfoMD->getSubGroupSize()->getSIMD_size();
2178         bool noRetry = ((subGrpSize > 0 || pOutput->m_scratchSpaceUsedBySpills < 1000) &&
2179             ctx->m_instrTypes.mayHaveIndirectOperands);
2180 
2181         bool optDisable = false;
2182         if (ctx->getModuleMetaData()->compOpt.OptDisable)
2183         {
2184             optDisable = true;
2185         }
2186 
2187         if (pOutput->m_scratchSpaceUsedBySpills == 0 ||
2188             noRetry ||
2189             ctx->m_retryManager.IsLastTry() ||
2190             optDisable)
2191         {
2192             // Save the shader program to the state processor to be handled later
2193             if (ctx->m_programOutput.m_ShaderProgramList.size() == 0 ||
2194                 ctx->m_programOutput.m_ShaderProgramList.back() != pKernel)
2195             {
2196                 ctx->m_programOutput.m_ShaderProgramList.push_back(pKernel);
2197             }
2198             COMPILER_SHADER_STATS_PRINT(pKernel->m_shaderStats, ShaderType::OPENCL_SHADER, ctx->hash, pFunc->getName().str());
2199             COMPILER_SHADER_STATS_SUM(ctx->m_sumShaderStats, pKernel->m_shaderStats, ShaderType::OPENCL_SHADER);
2200             COMPILER_SHADER_STATS_DEL(pKernel->m_shaderStats);
2201         }
2202         else
2203         {
2204             ctx->m_retryManager.kernelSet.insert(pShader->m_kernelInfo.m_kernelName);
2205         }
2206     }
2207 
SetKernelProgram(OpenCLProgramContext * ctx,COpenCLKernel * shader,DWORD simdMode)2208     static bool SetKernelProgram(OpenCLProgramContext* ctx, COpenCLKernel* shader, DWORD simdMode)
2209     {
2210         if (shader && (shader->ProgramOutput()->m_programSize > 0 ||
2211                       (ctx->m_compileToVISAOnly && !shader->ProgramOutput()->m_VISAAsm.empty())))
2212         {
2213             if (simdMode == 32)
2214             {
2215                 //why do we need this? we will get all output in GatherDataForDriver(...)
2216                 //remove it to avoid messy logics
2217                 //shader->m_kernelInfo.m_executionEnivronment.PerThreadSpillFillSize =
2218                 //    shader->ProgramOutput()->m_scratchSpaceUsedBySpills;
2219                 shader->m_kernelInfo.m_kernelProgram.simd32 = *shader->ProgramOutput();
2220                 ctx->SetSIMDInfo(SIMD_SELECTED, SIMDMode::SIMD32, ShaderDispatchMode::NOT_APPLICABLE);
2221             }
2222             else if (simdMode == 16)
2223             {
2224                 shader->m_kernelInfo.m_kernelProgram.simd16 = *shader->ProgramOutput();
2225                 ctx->SetSIMDInfo(SIMD_SELECTED, SIMDMode::SIMD16, ShaderDispatchMode::NOT_APPLICABLE);
2226             }
2227             else if (simdMode == 8)
2228             {
2229                 shader->m_kernelInfo.m_kernelProgram.simd8 = *shader->ProgramOutput();
2230                 ctx->SetSIMDInfo(SIMD_SELECTED, SIMDMode::SIMD8, ShaderDispatchMode::NOT_APPLICABLE);
2231             }
2232             shader->m_kernelInfo.m_executionEnivronment.CompiledSIMDSize = simdMode;
2233             shader->m_kernelInfo.m_executionEnivronment.SIMDInfo = ctx->GetSIMDInfo();
2234             return true;
2235         }
2236         return false;
2237     }
2238 
CodeGen(OpenCLProgramContext * ctx)2239     void CodeGen(OpenCLProgramContext* ctx)
2240     {
2241         // Do program-wide code generation.
2242         // Currently, this just creates the program-scope patch stream.
2243         if (ctx->m_retryManager.IsFirstTry())
2244         {
2245             CollectProgramInfo(ctx);
2246             if (IGC_IS_FLAG_DISABLED(EnableZEBinary) &&
2247                 !ctx->getCompilerOption().EnableZEBinary)
2248             {
2249                 ctx->m_programOutput.CreateProgramScopePatchStream(ctx->m_programInfo);
2250             }
2251         }
2252 
2253         MetaDataUtils* pMdUtils = ctx->getMetaDataUtils();
2254 
2255         //Clear spill parameters of retry manager in the very begining of code gen
2256         ctx->m_retryManager.ClearSpillParams();
2257 
2258         CShaderProgram::KernelShaderMap shaders;
2259         CodeGen(ctx, shaders);
2260 
2261         if (ctx->m_programOutput.m_pSystemThreadKernelOutput == nullptr)
2262         {
2263             const auto options = ctx->m_InternalOptions;
2264             if (options.IncludeSIPCSR ||
2265                 options.IncludeSIPKernelDebug ||
2266                 options.IncludeSIPKernelDebugWithLocalMemory ||
2267                 options.KernelDebugEnable)
2268             {
2269                 DWORD systemThreadMode = 0;
2270 
2271                 if (options.IncludeSIPCSR)
2272                 {
2273                     systemThreadMode |= USC::SYSTEM_THREAD_MODE_CSR;
2274                 }
2275 
2276                 if (options.KernelDebugEnable ||
2277                     options.IncludeSIPKernelDebug)
2278                 {
2279                     systemThreadMode |= USC::SYSTEM_THREAD_MODE_DEBUG;
2280                 }
2281 
2282                 if (options.IncludeSIPKernelDebugWithLocalMemory)
2283                 {
2284                     systemThreadMode |= USC::SYSTEM_THREAD_MODE_DEBUG_LOCAL;
2285                 }
2286 
2287                 bool success = SIP::CSystemThread::CreateSystemThreadKernel(
2288                     ctx->platform,
2289                     (USC::SYSTEM_THREAD_MODE)systemThreadMode,
2290                     ctx->m_programOutput.m_pSystemThreadKernelOutput);
2291 
2292                 if (!success)
2293                 {
2294                     ctx->EmitError("System thread kernel could not be created!", nullptr);
2295                 }
2296             }
2297         }
2298 
2299         ctx->m_retryManager.kernelSet.clear();
2300 
2301         // gather data to send back to the driver
2302         for (const auto& k : shaders)
2303         {
2304             Function* pFunc = k.first;
2305             CShaderProgram* pKernel = static_cast<CShaderProgram*>(k.second);
2306             COpenCLKernel* simd8Shader = static_cast<COpenCLKernel*>(pKernel->GetShader(SIMDMode::SIMD8));
2307             COpenCLKernel* simd16Shader = static_cast<COpenCLKernel*>(pKernel->GetShader(SIMDMode::SIMD16));
2308             COpenCLKernel* simd32Shader = static_cast<COpenCLKernel*>(pKernel->GetShader(SIMDMode::SIMD32));
2309 
2310             if ((ctx->m_DriverInfo.sendMultipleSIMDModes() || ctx->m_enableSimdVariantCompilation)
2311                 && (ctx->getModuleMetaData()->csInfo.forcedSIMDSize == 0))
2312             {
2313                 //Gather the kernel binary for each compiled kernel
2314                 if (SetKernelProgram(ctx, simd32Shader, 32))
2315                     GatherDataForDriver(ctx, simd32Shader, pKernel, pFunc, pMdUtils);
2316                 if (SetKernelProgram(ctx, simd16Shader, 16))
2317                     GatherDataForDriver(ctx, simd16Shader, pKernel, pFunc, pMdUtils);
2318                 if (SetKernelProgram(ctx, simd8Shader, 8))
2319                     GatherDataForDriver(ctx, simd8Shader, pKernel, pFunc, pMdUtils);
2320             }
2321             else
2322             {
2323                 //Gather the kernel binary only for 1 SIMD mode of the kernel
2324                 if (SetKernelProgram(ctx, simd32Shader, 32))
2325                     GatherDataForDriver(ctx, simd32Shader, pKernel, pFunc, pMdUtils);
2326                 else if (SetKernelProgram(ctx, simd16Shader, 16))
2327                     GatherDataForDriver(ctx, simd16Shader, pKernel, pFunc, pMdUtils);
2328                 else if (SetKernelProgram(ctx, simd8Shader, 8))
2329                     GatherDataForDriver(ctx, simd8Shader, pKernel, pFunc, pMdUtils);
2330             }
2331         }
2332     }
2333 
hasReadWriteImage(llvm::Function & F)2334     bool COpenCLKernel::hasReadWriteImage(llvm::Function& F)
2335     {
2336         if (!isEntryFunc(m_pMdUtils, &F))
2337         {
2338             // Ignore read/write flags for subroutines for now.
2339             // TODO: get access types for subroutines without using kernel args
2340             return false;
2341         }
2342 
2343         KernelArgs kernelArgs(F, m_DL, m_pMdUtils, m_ModuleMetadata, getGRFSize(), KernelArgsOrder::InputType::INDEPENDENT);
2344         for (const auto& KA : kernelArgs)
2345         {
2346             // RenderScript annotation sets "read_write" qualifier
2347             // for any applicable kernel argument, not only for kernel arguments
2348             // that are images, so we should check if kernel argument is an image.
2349             if (KA.getAccessQual() == KernelArg::AccessQual::READ_WRITE &&
2350                 KA.getArgType() >= KernelArg::ArgType::IMAGE_1D &&
2351                 KA.getArgType() <= KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY)
2352             {
2353                 return true;
2354             }
2355         }
2356         return false;
2357     }
2358 
CompileSIMDSize(SIMDMode simdMode,EmitPass & EP,llvm::Function & F)2359     bool COpenCLKernel::CompileSIMDSize(SIMDMode simdMode, EmitPass& EP, llvm::Function& F)
2360     {
2361         if (!CompileSIMDSizeInCommon(simdMode))
2362             return false;
2363 
2364         {
2365             // If stack calls are present, disable simd32 in order to do wa in visa
2366             bool needCallWA = (IGC_IS_FLAG_ENABLED(EnableCallWA) && m_Context->platform.hasFusedEU());
2367             if (needCallWA && simdMode == SIMDMode::SIMD32  && HasStackCalls())
2368             {
2369                 return false;
2370             }
2371         }
2372 
2373         if (!m_Context->m_retryManager.IsFirstTry())
2374         {
2375             m_Context->ClearSIMDInfo(simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2376             m_Context->SetSIMDInfo(SIMD_RETRY, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2377         }
2378 
2379 
2380         //If forced SIMD Mode (by driver or regkey), then:
2381         // 1. Compile only that SIMD mode and nothing else
2382         // 2. Compile that SIMD mode even if it is not profitable, i.e. even if compileThisSIMD() returns false for it.
2383         //    So, don't bother checking profitability for it
2384         if (m_Context->getModuleMetaData()->csInfo.forcedSIMDSize != 0)
2385         {
2386             // Entered here means driver has requested a specific SIMD mode, which was forced in the regkey ForceOCLSIMDWidth.
2387             // We return the condition can we compile the given forcedSIMDSize with this simdMode?
2388             return (
2389                 // These statements are basically equivalent to (simdMode == forcedSIMDSize)
2390                 (simdMode == SIMDMode::SIMD8 && m_Context->getModuleMetaData()->csInfo.forcedSIMDSize == 8)   ||
2391                 (simdMode == SIMDMode::SIMD16 && m_Context->getModuleMetaData()->csInfo.forcedSIMDSize == 16) ||
2392                 (simdMode == SIMDMode::SIMD32 && m_Context->getModuleMetaData()->csInfo.forcedSIMDSize == 32)
2393             );
2394         }
2395 
2396         SIMDStatus simdStatus = checkSIMDCompileConds(simdMode, EP, F);
2397 
2398 
2399         // Func and Perf checks pass, compile this SIMD
2400         if (simdStatus == SIMDStatus::SIMD_PASS)
2401             return true;
2402 
2403         // Functional failure, skip compiling this SIMD
2404         if (simdStatus == SIMDStatus::SIMD_FUNC_FAIL)
2405             return false;
2406 
2407         IGC_ASSERT(simdStatus == SIMDStatus::SIMD_PERF_FAIL);
2408         //not profitable
2409         if (m_Context->m_DriverInfo.sendMultipleSIMDModes())
2410         {
2411             if (EP.m_canAbortOnSpill)
2412                 return false; //not the first functionally correct SIMD, exit
2413             else
2414                 return true; //is the first functionally correct SIMD, compile
2415         }
2416         return simdStatus == SIMDStatus::SIMD_PASS;
2417     }
2418 
2419 
checkSIMDCompileConds(SIMDMode simdMode,EmitPass & EP,llvm::Function & F)2420     SIMDStatus COpenCLKernel::checkSIMDCompileConds(SIMDMode simdMode, EmitPass& EP, llvm::Function& F)
2421     {
2422         CShader* simd8Program = m_parent->GetShader(SIMDMode::SIMD8);
2423         CShader* simd16Program = m_parent->GetShader(SIMDMode::SIMD16);
2424         CShader* simd32Program = m_parent->GetShader(SIMDMode::SIMD32);
2425 
2426         CodeGenContext* pCtx = GetContext();
2427 
2428         bool compileFunctionVariants = pCtx->m_enableSimdVariantCompilation &&
2429             (m_FGA && IGC::isIntelSymbolTableVoidProgram(m_FGA->getGroupHead(&F)));
2430 
2431         // Here we see if we have compiled a size for this shader already
2432         if ((simd8Program && simd8Program->ProgramOutput()->m_programSize > 0) ||
2433             (simd16Program && simd16Program->ProgramOutput()->m_programSize > 0) ||
2434             (simd32Program && simd32Program->ProgramOutput()->m_programSize > 0))
2435         {
2436             bool canCompileMultipleSIMD = pCtx->m_DriverInfo.sendMultipleSIMDModes() || compileFunctionVariants;
2437             if (!(canCompileMultipleSIMD && (pCtx->getModuleMetaData()->csInfo.forcedSIMDSize == 0)))
2438                 return SIMDStatus::SIMD_FUNC_FAIL;
2439         }
2440 
2441         // Next we check if there is a required sub group size specified
2442         MetaDataUtils* pMdUtils = EP.getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
2443         ModuleMetaData* modMD = pCtx->getModuleMetaData();
2444         FunctionInfoMetaDataHandle funcInfoMD = pMdUtils->getFunctionsInfoItem(&F);
2445         int simd_size = funcInfoMD->getSubGroupSize()->getSIMD_size();
2446 
2447         // Finds the kernel and get the group simd size from the kernel
2448         if (m_FGA)
2449         {
2450             llvm::Function* Kernel = &F;
2451             auto FG = m_FGA->getGroup(&F);
2452             Kernel = FG->getHead();
2453             funcInfoMD = pMdUtils->getFunctionsInfoItem(Kernel);
2454             simd_size = funcInfoMD->getSubGroupSize()->getSIMD_size();
2455         }
2456 
2457         // For simd variant functions, detect which SIMD sizes are needed
2458         if (compileFunctionVariants && F.hasFnAttribute("variant-function-def"))
2459         {
2460             bool canCompile = true;
2461             if (simdMode == SIMDMode::SIMD16)
2462                 canCompile = F.hasFnAttribute("CompileSIMD16");
2463             else if (simdMode == SIMDMode::SIMD8)
2464                 canCompile = F.hasFnAttribute("CompileSIMD8");
2465 
2466             if (!canCompile)
2467             {
2468                 pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2469                 return SIMDStatus::SIMD_FUNC_FAIL;
2470             }
2471         }
2472 
2473         // Cannot compile simd32 for function calls due to slicing
2474         if (m_FGA && m_FGA->getGroup(&F) && (!m_FGA->getGroup(&F)->isSingle() || m_FGA->getGroup(&F)->hasStackCall()))
2475         {
2476             // Fail on SIMD32 for all groups with function calls
2477             if (simdMode == SIMDMode::SIMD32)
2478             {
2479                 pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2480                 return SIMDStatus::SIMD_FUNC_FAIL;
2481             }
2482             // Group has no stackcalls, is not the SymbolTable dummy kernel, and subgroup size is not set
2483             // Just subroutines, default to SIMD8
2484             if (!m_FGA->getGroup(&F)->hasStackCall() &&
2485                 !IGC::isIntelSymbolTableVoidProgram(m_FGA->getGroupHead(&F)) &&
2486                 simd_size == 0 &&
2487                 simdMode != SIMDMode::SIMD8)
2488             {
2489                 pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2490                 return SIMDStatus::SIMD_FUNC_FAIL;
2491             }
2492         }
2493 
2494         uint32_t groupSize = 0;
2495         if (modMD->csInfo.maxWorkGroupSize)
2496         {
2497             groupSize = modMD->csInfo.maxWorkGroupSize;
2498         }
2499         else
2500         {
2501             groupSize = IGCMetaDataHelper::getThreadGroupSize(*pMdUtils, &F);
2502         }
2503 
2504         if (groupSize == 0)
2505         {
2506             groupSize = IGCMetaDataHelper::getThreadGroupSizeHint(*pMdUtils, &F);
2507         }
2508 
2509         if (simd_size)
2510         {
2511             switch (simd_size)
2512             {
2513                 // Apparently the only possible simdModes here are SIMD8, SIMD16, SIMD32
2514             case 8:
2515                 if (simdMode != SIMDMode::SIMD8)
2516                 {
2517                     pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2518                     return SIMDStatus::SIMD_FUNC_FAIL;
2519                 }
2520                 break;
2521             case 16:
2522                 if (simdMode != SIMDMode::SIMD16)
2523                 {
2524                     pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2525                     return SIMDStatus::SIMD_FUNC_FAIL;
2526                 }
2527                 EP.m_canAbortOnSpill = false;
2528                 break;
2529             case 32:
2530                 if (simdMode != SIMDMode::SIMD32)
2531                 {
2532                     pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2533                     return SIMDStatus::SIMD_FUNC_FAIL;
2534                 }
2535                 else {
2536                     EP.m_canAbortOnSpill = false;
2537                 }
2538                 break;
2539             default:
2540                 IGC_ASSERT_MESSAGE(0, "Unsupported required sub group size");
2541                 break;
2542             }
2543         }
2544         else
2545         {
2546             // Checking registry/flag here. Note that if ForceOCLSIMDWidth is set to
2547             // 8/16/32, only corresponding EnableOCLSIMD<N> is set to true. Therefore,
2548             // if any of EnableOCLSIMD<N> is disabled, ForceOCLSIMDWidth must set to
2549             // a value other than <N> if set. See igc_regkeys.cpp for detail.
2550             if ((simdMode == SIMDMode::SIMD32 && IGC_IS_FLAG_DISABLED(EnableOCLSIMD32)) ||
2551                 (simdMode == SIMDMode::SIMD16 && IGC_IS_FLAG_DISABLED(EnableOCLSIMD16)))
2552             {
2553                 return SIMDStatus::SIMD_FUNC_FAIL;
2554             }
2555 
2556             // Check if we force code generation for the current SIMD size.
2557             // Note that for SIMD8, we always force it!
2558             //ATTN: This check is redundant!
2559             if (numLanes(simdMode) == pCtx->getModuleMetaData()->csInfo.forcedSIMDSize ||
2560                 simdMode == SIMDMode::SIMD8)
2561             {
2562                 return SIMDStatus::SIMD_PASS;
2563             }
2564 
2565 
2566             if (groupSize != 0 && groupSize <= 16)
2567             {
2568                 if (simdMode == SIMDMode::SIMD32 ||
2569                     (groupSize <= 8 && simdMode != SIMDMode::SIMD8))
2570                 {
2571                     pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2572                     return SIMDStatus::SIMD_FUNC_FAIL;
2573                 }
2574             }
2575 
2576             // Here we check profitablility, etc.
2577             if (simdMode == SIMDMode::SIMD16)
2578             {
2579                 bool optDisable = this->GetContext()->getModuleMetaData()->compOpt.OptDisable;
2580 
2581                 if (optDisable)
2582                 {
2583                     return SIMDStatus::SIMD_FUNC_FAIL;
2584                 }
2585 
2586                 // bail out of SIMD16 if it's not profitable.
2587                 Simd32ProfitabilityAnalysis& PA = EP.getAnalysis<Simd32ProfitabilityAnalysis>();
2588                 if (!PA.isSimd16Profitable())
2589                 {
2590                     pCtx->SetSIMDInfo(SIMD_SKIP_PERF, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2591                     return SIMDStatus::SIMD_PERF_FAIL;
2592                 }
2593             }
2594             if (simdMode == SIMDMode::SIMD32)
2595             {
2596                 bool optDisable = this->GetContext()->getModuleMetaData()->compOpt.OptDisable;
2597 
2598                 if (optDisable)
2599                 {
2600                     return SIMDStatus::SIMD_FUNC_FAIL;
2601                 }
2602 
2603                 // bail out of SIMD32 if it's not profitable.
2604                 Simd32ProfitabilityAnalysis& PA = EP.getAnalysis<Simd32ProfitabilityAnalysis>();
2605                 if (!PA.isSimd32Profitable())
2606                 {
2607                     pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE);
2608                     return SIMDStatus::SIMD_PERF_FAIL;
2609                 }
2610             }
2611         }
2612 
2613         return SIMDStatus::SIMD_PASS;
2614     }
2615 
2616 }
2617