1 /*========================== begin_copyright_notice ============================ 2 3 Copyright (C) 2017-2021 Intel Corporation 4 5 SPDX-License-Identifier: MIT 6 7 ============================= end_copyright_notice ===========================*/ 8 9 #include "common/LLVMWarningsPush.hpp" 10 #include <llvm/Support/ScaledNumber.h> 11 #include "llvm/IR/DataLayout.h" 12 #include "llvm/ADT/StringExtras.h" 13 #include "common/LLVMWarningsPop.hpp" 14 #include "AdaptorCommon/ImplicitArgs.hpp" 15 #include "Compiler/CISACodeGen/ShaderCodeGen.hpp" 16 #include "Compiler/CISACodeGen/OpenCLKernelCodeGen.hpp" 17 #include "Compiler/CISACodeGen/messageEncoding.hpp" 18 #include "Compiler/Optimizer/OpenCLPasses/ResourceAllocator/ResourceAllocator.hpp" 19 #include "Compiler/Optimizer/OpenCLPasses/ProgramScopeConstants/ProgramScopeConstantAnalysis.hpp" 20 #include "Compiler/Optimizer/OpenCLPasses/LocalBuffers/InlineLocalsResolution.hpp" 21 #include "Compiler/Optimizer/OpenCLPasses/KernelArgs.hpp" 22 #include "Compiler/CISACodeGen/EmitVISAPass.hpp" 23 #include "Compiler/Optimizer/OCLBIUtils.h" 24 #include "AdaptorOCL/OCL/KernelAnnotations.hpp" 25 #include "common/allocator.h" 26 #include "common/igc_regkeys.hpp" 27 #include "common/Stats.hpp" 28 #include "common/SystemThread.h" 29 #include "common/secure_mem.h" 30 #include "common/MDFrameWork.h" 31 #include <iStdLib/utility.h> 32 #include "Probe/Assertion.h" 33 #include "ZEBinWriter/zebin/source/ZEELFObjectBuilder.hpp" 34 35 /*********************************************************************************** 36 This file contains the code specific to opencl kernels 37 ************************************************************************************/ 38 39 using namespace llvm; 40 using namespace IGC; 41 using namespace IGC::IGCMD; 42 43 namespace IGC 44 { 45 getLocalIdBufferSize(SIMDMode mode)46 unsigned int getLocalIdBufferSize(SIMDMode mode) 47 { 48 auto simdSize = numLanes(mode); 49 IGC_ASSERT(simdSize != 0); 50 51 // as per spec, size of local id buffer depends on simd size 52 // simd size * size/elem * #dims 53 unsigned int allocSize = simdSize * 2 * 3; 54 55 // simd8 version has some reserved fields 56 if (simdSize == 8) 57 allocSize *= 2; 58 59 // field to hold pointer to local id buffer 60 allocSize += 8; 61 62 return allocSize; 63 } 64 COpenCLKernel(const OpenCLProgramContext * ctx,Function * pFunc,CShaderProgram * pProgram)65 COpenCLKernel::COpenCLKernel(const OpenCLProgramContext* ctx, Function* pFunc, CShaderProgram* pProgram) : 66 CComputeShaderBase(pFunc, pProgram) 67 { 68 m_HasTID = false; 69 m_HasGlobalSize = false; 70 m_disableMidThreadPreemption = false; 71 m_perWIStatelessPrivateMemSize = 0; 72 m_Context = const_cast<OpenCLProgramContext*>(ctx); 73 m_localOffsetsMap.clear(); 74 m_pBtiLayout = &(ctx->btiLayout); 75 m_Platform = &(ctx->platform); 76 m_DriverInfo = &(ctx->m_DriverInfo); 77 78 } 79 ~COpenCLKernel()80 COpenCLKernel::~COpenCLKernel() 81 { 82 ClearKernelInfo(); 83 m_simdProgram.Destroy(); 84 } 85 ClearKernelInfo()86 void COpenCLKernel::ClearKernelInfo() 87 { 88 // Global pointer arguments 89 m_kernelInfo.m_pointerArgument.clear(); 90 91 // Non-argument pointer inputs 92 m_kernelInfo.m_pointerInput.clear(); 93 94 // Local pointer arguments 95 m_kernelInfo.m_localPointerArgument.clear(); 96 97 // Sampler inputs 98 m_kernelInfo.m_samplerInput.clear(); 99 100 // Sampler arguments 101 m_kernelInfo.m_samplerArgument.clear(); 102 103 // Scalar inputs 104 m_kernelInfo.m_constantInputAnnotation.clear(); 105 106 // Scalar arguments 107 m_kernelInfo.m_constantArgumentAnnotation.clear(); 108 109 // Image arguments 110 m_kernelInfo.m_imageInputAnnotations.clear(); 111 112 // Kernel Arg Reflection Info 113 m_kernelInfo.m_kernelArgInfo.clear(); 114 115 // Printf strings 116 m_kernelInfo.m_printfStringAnnotations.clear(); 117 118 // Argument to BTI/Sampler index map 119 m_kernelInfo.m_argIndexMap.clear(); 120 } 121 PreCompile()122 void COpenCLKernel::PreCompile() 123 { 124 ClearKernelInfo(); 125 CreateImplicitArgs(); 126 //We explicitly want this to be GRF-sized, without relation to simd width 127 128 RecomputeBTLayout(); 129 130 ModuleMetaData* modMD = m_Context->getModuleMetaData(); 131 auto funcIter = modMD->FuncMD.find(entry); 132 133 // Initialize the table of offsets for GlobalVariables representing locals 134 if (funcIter != modMD->FuncMD.end()) 135 { 136 auto loIter = funcIter->second.localOffsets.begin(); 137 auto loEnd = funcIter->second.localOffsets.end(); 138 for (; loIter != loEnd; ++loIter) 139 { 140 LocalOffsetMD loHandle = *loIter; 141 m_localOffsetsMap[loHandle.m_Var] = loHandle.m_Offset; 142 } 143 } 144 } 145 hasWorkGroupWalkOrder()146 bool COpenCLKernel::hasWorkGroupWalkOrder() 147 { 148 const CodeGenContext* pCtx = GetContext(); 149 const ModuleMetaData* MMD = pCtx->getModuleMetaData(); 150 if (auto I = MMD->FuncMD.find(entry); I != MMD->FuncMD.end()) 151 { 152 auto& FMD = I->second; 153 auto& Order = FMD.workGroupWalkOrder; 154 if (Order.dim0 != 0 || Order.dim1 != 0 || Order.dim2 != 0) 155 return true; 156 } 157 158 return false; 159 } 160 getResourceInfo(int argNo)161 SOpenCLKernelInfo::SResourceInfo COpenCLKernel::getResourceInfo(int argNo) 162 { 163 CodeGenContext* pCtx = GetContext(); 164 ModuleMetaData* modMD = pCtx->getModuleMetaData(); 165 FunctionMetaData* funcMD = &modMD->FuncMD[entry]; 166 ResourceAllocMD* resAllocMD = &funcMD->resAllocMD; 167 IGC_ASSERT_MESSAGE(resAllocMD->argAllocMDList.size() > 0, "ArgAllocMD List Out of Bounds"); 168 ArgAllocMD* argAlloc = &resAllocMD->argAllocMDList[argNo]; 169 170 SOpenCLKernelInfo::SResourceInfo resInfo; 171 ResourceTypeEnum type = (ResourceTypeEnum)argAlloc->type; 172 173 if (type == ResourceTypeEnum::UAVResourceType || 174 type == ResourceTypeEnum::BindlessUAVResourceType) 175 { 176 resInfo.Type = SOpenCLKernelInfo::SResourceInfo::RES_UAV; 177 } 178 else if (type == ResourceTypeEnum::SRVResourceType) 179 { 180 resInfo.Type = SOpenCLKernelInfo::SResourceInfo::RES_SRV; 181 } 182 else 183 { 184 resInfo.Type = SOpenCLKernelInfo::SResourceInfo::RES_OTHER; 185 } 186 resInfo.Index = argAlloc->indexType; 187 return resInfo; 188 } 189 getExtensionInfo(int argNo)190 ResourceExtensionTypeEnum COpenCLKernel::getExtensionInfo(int argNo) 191 { 192 CodeGenContext* pCtx = GetContext(); 193 ModuleMetaData* modMD = pCtx->getModuleMetaData(); 194 FunctionMetaData* funcMD = &modMD->FuncMD[entry]; 195 ResourceAllocMD* resAllocMD = &funcMD->resAllocMD; 196 IGC_ASSERT_MESSAGE(resAllocMD->argAllocMDList.size() > 0, "ArgAllocMD List Out of Bounds"); 197 ArgAllocMD* argAlloc = &resAllocMD->argAllocMDList[argNo]; 198 return (ResourceExtensionTypeEnum)argAlloc->extensionType; 199 } 200 CreateInlineSamplerAnnotations()201 void COpenCLKernel::CreateInlineSamplerAnnotations() 202 { 203 if (m_Context->getModuleMetaData()->FuncMD.find(entry) != m_Context->getModuleMetaData()->FuncMD.end()) 204 { 205 FunctionMetaData funcMD = m_Context->getModuleMetaData()->FuncMD.find(entry)->second; 206 207 ResourceAllocMD resAllocMD = funcMD.resAllocMD; 208 209 for (const auto &inlineSamplerMD : resAllocMD.inlineSamplersMD) 210 { 211 auto samplerInput = std::make_unique<iOpenCL::SamplerInputAnnotation>(); 212 213 samplerInput->SamplerType = iOpenCL::SAMPLER_OBJECT_TEXTURE; 214 samplerInput->SamplerTableIndex = inlineSamplerMD.index; 215 216 samplerInput->TCXAddressMode = iOpenCL::SAMPLER_TEXTURE_ADDRESS_MODE(inlineSamplerMD.TCXAddressMode); 217 samplerInput->TCYAddressMode = iOpenCL::SAMPLER_TEXTURE_ADDRESS_MODE(inlineSamplerMD.TCYAddressMode); 218 samplerInput->TCZAddressMode = iOpenCL::SAMPLER_TEXTURE_ADDRESS_MODE(inlineSamplerMD.TCZAddressMode); 219 samplerInput->NormalizedCoords = inlineSamplerMD.NormalizedCoords != 0 ? true : false; 220 221 samplerInput->MagFilterType = iOpenCL::SAMPLER_MAPFILTER_TYPE(inlineSamplerMD.MagFilterType); 222 samplerInput->MinFilterType = iOpenCL::SAMPLER_MAPFILTER_TYPE(inlineSamplerMD.MinFilterType); 223 samplerInput->MipFilterType = iOpenCL::SAMPLER_MIPFILTER_TYPE(inlineSamplerMD.MipFilterType); 224 samplerInput->CompareFunc = iOpenCL::SAMPLER_COMPARE_FUNC_TYPE(inlineSamplerMD.CompareFunc); 225 226 samplerInput->BorderColorR = inlineSamplerMD.BorderColorR; 227 samplerInput->BorderColorG = inlineSamplerMD.BorderColorG; 228 samplerInput->BorderColorB = inlineSamplerMD.BorderColorB; 229 samplerInput->BorderColorA = inlineSamplerMD.BorderColorA; 230 231 m_kernelInfo.m_samplerInput.push_back(std::move(samplerInput)); 232 } 233 234 m_kernelInfo.m_HasInlineVmeSamplers = funcMD.hasInlineVmeSamplers; 235 } 236 } 237 CreateKernelArgInfo()238 void COpenCLKernel::CreateKernelArgInfo() 239 { 240 FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry); 241 242 uint count = 0; 243 if (m_Context->getModuleMetaData()->FuncMD.find(entry) != m_Context->getModuleMetaData()->FuncMD.end()) 244 { 245 FunctionMetaData* funcMD = &m_Context->getModuleMetaData()->FuncMD[entry]; 246 count = funcMD->m_OpenCLArgAccessQualifiers.size(); 247 } 248 249 for (uint i = 0; i < count; ++i) 250 { 251 auto kernelArgInfo = std::make_unique<iOpenCL::KernelArgumentInfoAnnotation>(); 252 FunctionMetaData* funcMD = &m_Context->getModuleMetaData()->FuncMD[entry]; 253 254 // Format the strings the way the OpenCL runtime expects them 255 256 // The access qualifier is expected to have a "__" prefix, 257 // or an upper-case "NONE" if there is no qualifier 258 kernelArgInfo->AccessQualifier = funcMD->m_OpenCLArgAccessQualifiers[i]; 259 if (kernelArgInfo->AccessQualifier == "none" || kernelArgInfo->AccessQualifier == "") 260 { 261 kernelArgInfo->AccessQualifier = "NONE"; 262 } 263 else if (kernelArgInfo->AccessQualifier[0] != '_') 264 { 265 kernelArgInfo->AccessQualifier = "__" + kernelArgInfo->AccessQualifier; 266 } 267 268 // The address space is expected to have a __ prefix 269 switch (funcMD->m_OpenCLArgAddressSpaces[i]) 270 { 271 case ADDRESS_SPACE_CONSTANT: 272 kernelArgInfo->AddressQualifier = "__constant"; 273 break; 274 case ADDRESS_SPACE_GLOBAL: 275 kernelArgInfo->AddressQualifier = "__global"; 276 break; 277 case ADDRESS_SPACE_LOCAL: 278 kernelArgInfo->AddressQualifier = "__local"; 279 break; 280 case ADDRESS_SPACE_PRIVATE: 281 kernelArgInfo->AddressQualifier = "__private"; 282 break; 283 default: 284 m_Context->EmitError("Generic pointers are not allowed as kernel argument storage class!", nullptr); 285 IGC_ASSERT_MESSAGE(0, "Unexpected address space"); 286 break; 287 } 288 289 // ArgNames is not guaranteed to be present if -cl-kernel-arg-info 290 // is not passed in. 291 if (funcMD->m_OpenCLArgNames.size() > i) 292 { 293 kernelArgInfo->ArgumentName = funcMD->m_OpenCLArgNames[i]; 294 } 295 296 // The type name is expected to also have the type size, appended after a ";" 297 kernelArgInfo->TypeName = funcMD->m_OpenCLArgTypes[i] + ";"; 298 299 // Unfortunately, unlike SPIR, legacy OCL uses an ABI that has byval pointers. 300 // So, if the parameter is a byval pointer, look at the contained type 301 { 302 Function::arg_iterator argumentIter = entry->arg_begin(); 303 std::advance(argumentIter, i); 304 305 Type* argType = entry->getFunctionType()->getParamType(i); 306 if (argumentIter->hasByValAttr()) 307 { 308 argType = argType->getContainedType(0); 309 } 310 311 kernelArgInfo->TypeName += utostr(m_DL->getTypeAllocSize(argType)); 312 } 313 314 // If there are no type qualifiers, "NONE" is expected 315 kernelArgInfo->TypeQualifier = funcMD->m_OpenCLArgTypeQualifiers[i]; 316 if (kernelArgInfo->TypeQualifier == "") 317 { 318 kernelArgInfo->TypeQualifier = "NONE"; 319 } 320 321 m_kernelInfo.m_kernelArgInfo.push_back(std::move(kernelArgInfo)); 322 } 323 } 324 CreateKernelAttributeInfo()325 void COpenCLKernel::CreateKernelAttributeInfo() 326 { 327 FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry); 328 329 // We need to concatenate 2 things: 330 // (a) LLVM attributes, except nounwind. Why? Because that's how IGIL does it. 331 // (b) The attributes that get translated into SPIR metadata: 332 // (*) vec_type_hint 333 // (*) reqd_work_group_size 334 // (*) work_group_size_hint 335 // 336 337 // Get LLVM function attributes, and erase "nounwind" if necessary 338 m_kernelInfo.m_kernelAttributeInfo = entry->getAttributes().getAsString(-1); 339 size_t nounwindLoc = m_kernelInfo.m_kernelAttributeInfo.find("nounwind"); 340 if (nounwindLoc != std::string::npos) 341 { 342 //8 is the length of "nounwind". 343 //If this is not the first attribute, it has a leading space, which we also want to delete. 344 int eraseLen = 8; 345 if (nounwindLoc != 0) 346 { 347 nounwindLoc--; 348 eraseLen++; 349 } 350 m_kernelInfo.m_kernelAttributeInfo.erase(nounwindLoc, eraseLen); 351 } 352 353 // Now fill in the special OCL attributes from the MD 354 VectorTypeHintMetaDataHandle vecTypeHintInfo = funcInfoMD->getOpenCLVectorTypeHint(); 355 if (vecTypeHintInfo->hasValue()) 356 { 357 m_kernelInfo.m_kernelAttributeInfo += " " + getVecTypeHintString(vecTypeHintInfo); 358 } 359 SubGroupSizeMetaDataHandle subGroupSize = funcInfoMD->getSubGroupSize(); 360 if (subGroupSize->hasValue()) 361 { 362 m_kernelInfo.m_kernelAttributeInfo += " " + getSubGroupSizeString(subGroupSize); 363 } 364 365 auto it = m_Context->getModuleMetaData()->FuncMD.find(entry); 366 if (it != m_Context->getModuleMetaData()->FuncMD.end()) 367 { 368 WorkGroupWalkOrderMD workgroupWalkOrder = it->second.workGroupWalkOrder; 369 if (workgroupWalkOrder.dim0 || workgroupWalkOrder.dim1 || workgroupWalkOrder.dim2) 370 { 371 m_kernelInfo.m_kernelAttributeInfo += " " + getWorkgroupWalkOrderString(workgroupWalkOrder); 372 } 373 } 374 375 ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize(); 376 if (threadGroupSize->hasValue()) 377 { 378 m_kernelInfo.m_kernelAttributeInfo += " " + getThreadGroupSizeString(threadGroupSize, false); 379 } 380 381 ThreadGroupSizeMetaDataHandle threadGroupSizeHint = funcInfoMD->getThreadGroupSizeHint(); 382 if (threadGroupSizeHint->hasValue()) 383 { 384 m_kernelInfo.m_kernelAttributeInfo += " " + getThreadGroupSizeString(threadGroupSizeHint, true); 385 } 386 } 387 getThreadGroupSizeString(ThreadGroupSizeMetaDataHandle & threadGroupSize,bool isHint)388 std::string COpenCLKernel::getThreadGroupSizeString(ThreadGroupSizeMetaDataHandle& threadGroupSize, bool isHint) 389 { 390 std::string threadGroupSizeString = ""; 391 if (isHint) 392 { 393 threadGroupSizeString = "work_group_size_hint("; 394 } 395 else 396 { 397 threadGroupSizeString = "reqd_work_group_size("; 398 } 399 400 threadGroupSizeString += utostr(threadGroupSize->getXDim()) + ","; 401 threadGroupSizeString += utostr(threadGroupSize->getYDim()) + ","; 402 threadGroupSizeString += utostr(threadGroupSize->getZDim()); 403 404 threadGroupSizeString += ")"; 405 return threadGroupSizeString; 406 } getSubGroupSizeString(SubGroupSizeMetaDataHandle & subGroupSize)407 std::string COpenCLKernel::getSubGroupSizeString(SubGroupSizeMetaDataHandle& subGroupSize) 408 { 409 std::string subTypeString = "intel_reqd_sub_group_size("; 410 subTypeString += utostr(subGroupSize->getSIMD_size()); 411 subTypeString += ")"; 412 return subTypeString; 413 } getWorkgroupWalkOrderString(const IGC::WorkGroupWalkOrderMD & workgroupWalkOrder)414 std::string COpenCLKernel::getWorkgroupWalkOrderString(const IGC::WorkGroupWalkOrderMD& workgroupWalkOrder) 415 { 416 std::string subTypeString = "intel_reqd_workgroup_walk_order("; 417 subTypeString += utostr(workgroupWalkOrder.dim0) + ","; 418 subTypeString += utostr(workgroupWalkOrder.dim1) + ","; 419 subTypeString += utostr(workgroupWalkOrder.dim2) + ","; 420 subTypeString += ")"; 421 return subTypeString; 422 } getVecTypeHintString(VectorTypeHintMetaDataHandle & vecTypeHintInfo)423 std::string COpenCLKernel::getVecTypeHintString(VectorTypeHintMetaDataHandle& vecTypeHintInfo) 424 { 425 std::string vecTypeString = "vec_type_hint("; 426 427 // Get the information about the type 428 Type* baseType = vecTypeHintInfo->getVecType()->getType(); 429 unsigned int numElements = 1; 430 if (baseType->isVectorTy()) 431 { 432 numElements = (unsigned)cast<IGCLLVM::FixedVectorType>(baseType)->getNumElements(); 433 baseType = cast<VectorType>(baseType)->getElementType(); 434 } 435 436 // Integer types need to be qualified with a "u" if they are unsigned 437 if (baseType->isIntegerTy()) 438 { 439 std::string signString = vecTypeHintInfo->getSign() ? "" : "u"; 440 vecTypeString += signString; 441 } 442 443 switch (baseType->getTypeID()) 444 { 445 case Type::IntegerTyID: 446 switch (baseType->getIntegerBitWidth()) 447 { 448 case 8: 449 vecTypeString += "char"; 450 break; 451 case 16: 452 vecTypeString += "short"; 453 break; 454 case 32: 455 vecTypeString += "int"; 456 break; 457 case 64: 458 vecTypeString += "long"; 459 break; 460 default: 461 IGC_ASSERT_MESSAGE(0, "Unexpected data type in vec_type_hint"); 462 break; 463 } 464 break; 465 case Type::DoubleTyID: 466 vecTypeString += "double"; 467 break; 468 case Type::FloatTyID: 469 vecTypeString += "float"; 470 break; 471 case Type::HalfTyID: 472 vecTypeString += "half"; 473 break; 474 default: 475 IGC_ASSERT_MESSAGE(0, "Unexpected data type in vec_type_hint"); 476 break; 477 } 478 479 if (numElements != 1) 480 { 481 vecTypeString += utostr(numElements); 482 } 483 484 vecTypeString += ")"; 485 486 return vecTypeString; 487 } 488 CreatePrintfStringAnnotations()489 void COpenCLKernel::CreatePrintfStringAnnotations() 490 { 491 auto printfStrings = GetPrintfStrings(*entry->getParent()); 492 493 for (const auto& printfString : printfStrings) 494 { 495 auto printfAnnotation = std::make_unique<iOpenCL::PrintfStringAnnotation>(); 496 printfAnnotation->Index = printfString.first; 497 printfAnnotation->StringSize = printfString.second.size() + 1; 498 printfAnnotation->StringData = new char[printfAnnotation->StringSize + 1]; 499 500 memcpy_s(printfAnnotation->StringData, printfAnnotation->StringSize, printfString.second.c_str(), printfAnnotation->StringSize); 501 printfAnnotation->StringData[printfAnnotation->StringSize - 1] = '\0'; 502 503 m_kernelInfo.m_printfStringAnnotations.push_back(std::move(printfAnnotation)); 504 } 505 } 506 CreateZEPayloadArguments(IGC::KernelArg * kernelArg,uint payloadPosition)507 bool COpenCLKernel::CreateZEPayloadArguments(IGC::KernelArg* kernelArg, uint payloadPosition) 508 { 509 switch (kernelArg->getArgType()) { 510 511 case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER:{ 512 // PayloadHeader contains global work offset x,y,z and local size x,y,z 513 // global work offset, size is int32x3 514 uint cur_pos = payloadPosition; 515 uint32_t size = iOpenCL::DATA_PARAMETER_DATA_SIZE * 3; 516 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 517 zebin::PreDefinedAttrGetter::ArgType::global_id_offset, cur_pos, size); 518 cur_pos += size; 519 // local size, size is int32x3, the same as above 520 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 521 zebin::PreDefinedAttrGetter::ArgType::local_size, cur_pos, size); 522 break; 523 } 524 case KernelArg::ArgType::IMPLICIT_PRIVATE_BASE: 525 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 526 zebin::PreDefinedAttrGetter::ArgType::private_base_stateless, 527 payloadPosition, kernelArg->getAllocateSize()); 528 break; 529 530 case KernelArg::ArgType::IMPLICIT_NUM_GROUPS: 531 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 532 zebin::PreDefinedAttrGetter::ArgType::group_count, 533 payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3); 534 break; 535 536 case KernelArg::ArgType::IMPLICIT_LOCAL_SIZE: 537 // FIXME: duplicated information as KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER? 538 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 539 zebin::PreDefinedAttrGetter::ArgType::local_size, 540 payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3); 541 break; 542 543 case KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE: 544 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 545 zebin::PreDefinedAttrGetter::ArgType::enqueued_local_size, 546 payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3); 547 break; 548 549 case KernelArg::ArgType::IMPLICIT_GLOBAL_SIZE: 550 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 551 zebin::PreDefinedAttrGetter::ArgType::global_size, 552 payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE * 3); 553 break; 554 555 case KernelArg::ArgType::IMPLICIT_WORK_DIM: 556 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 557 zebin::PreDefinedAttrGetter::ArgType::work_dimensions, 558 payloadPosition, iOpenCL::DATA_PARAMETER_DATA_SIZE); 559 break; 560 561 // pointer args 562 case KernelArg::ArgType::PTR_GLOBAL: 563 case KernelArg::ArgType::PTR_CONSTANT: { 564 uint32_t arg_idx = kernelArg->getAssociatedArgNo(); 565 566 // Add BTI argument if being promoted 567 // FIXME: do not set bti if the number is 0xffffffff (?) 568 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(arg_idx); 569 uint32_t bti_idx = getBTI(resInfo); 570 if (bti_idx != 0xffffffff) { 571 // add BTI argument with addr_mode set to stateful 572 // promoted arg has 0 offset and 0 size 573 zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs, 574 0, 0, arg_idx, 575 zebin::PreDefinedAttrGetter::ArgAddrMode::stateful, 576 (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)? 577 zebin::PreDefinedAttrGetter::ArgAddrSpace::global : 578 zebin::PreDefinedAttrGetter::ArgAddrSpace::constant, 579 (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)? 580 zebin::PreDefinedAttrGetter::ArgAccessType::readwrite : 581 zebin::PreDefinedAttrGetter::ArgAccessType::readonly 582 ); 583 // add the corresponding BTI table index 584 zebin::ZEInfoBuilder::addBindingTableIndex(m_kernelInfo.m_zeBTIArgs, 585 bti_idx, arg_idx); 586 } 587 // FIXME: check if all reference are promoted, if it is, we can skip 588 // creating non-bti payload arg 589 /* 590 bool is_bti_only = 591 IGC_IS_FLAG_ENABLED(EnableStatelessToStatefull) && 592 IGC_IS_FLAG_ENABLED(EnableStatefulToken) && 593 m_DriverInfo->SupportStatefulToken() && 594 kernelArg->getArg() && 595 ((kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL && 596 (kernelArg->getArg()->use_empty() || !GetHasGlobalStatelessAccess())) || 597 (kernelArg->getArgType() == KernelArg::ArgType::PTR_CONSTANT && 598 (kernelArg->getArg()->use_empty() || !GetHasConstantStatelessAccess()))); 599 // no need to add normal argument if all use are promoted 600 if (is_bti_only) 601 break; 602 */ 603 ResourceAllocMD& resAllocMD = GetContext()->getModuleMetaData()->FuncMD[entry].resAllocMD; 604 IGC_ASSERT_MESSAGE(resAllocMD.argAllocMDList.size() > 0, "ArgAllocMDList is empty."); 605 606 ArgAllocMD& argAlloc = resAllocMD.argAllocMDList[arg_idx]; 607 608 zebin::PreDefinedAttrGetter::ArgAddrMode addr_mode = 609 zebin::PreDefinedAttrGetter::ArgAddrMode::stateless; 610 if (argAlloc.type == ResourceTypeEnum::BindlessUAVResourceType) 611 addr_mode = zebin::PreDefinedAttrGetter::ArgAddrMode::bindless; 612 613 zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs, 614 payloadPosition, kernelArg->getAllocateSize(), arg_idx, addr_mode, 615 (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)? 616 zebin::PreDefinedAttrGetter::ArgAddrSpace::global : 617 zebin::PreDefinedAttrGetter::ArgAddrSpace::constant, 618 (kernelArg->getArgType() == KernelArg::ArgType::PTR_GLOBAL)? 619 zebin::PreDefinedAttrGetter::ArgAccessType::readwrite : 620 zebin::PreDefinedAttrGetter::ArgAccessType::readonly 621 ); 622 break; 623 } 624 case KernelArg::ArgType::PTR_LOCAL: 625 zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs, 626 payloadPosition, kernelArg->getAllocateSize(), 627 kernelArg->getAssociatedArgNo(), 628 zebin::PreDefinedAttrGetter::ArgAddrMode::slm, 629 zebin::PreDefinedAttrGetter::ArgAddrSpace::local, 630 zebin::PreDefinedAttrGetter::ArgAccessType::readwrite); 631 break; 632 // by value arguments 633 case KernelArg::ArgType::CONSTANT_REG: 634 zebin::ZEInfoBuilder::addPayloadArgumentByValue(m_kernelInfo.m_zePayloadArgs, 635 payloadPosition, kernelArg->getAllocateSize(), 636 kernelArg->getAssociatedArgNo()); 637 break; 638 639 // Local ids are supported in per-thread payload arguments 640 case KernelArg::ArgType::IMPLICIT_LOCAL_IDS: 641 break; 642 643 // Images 644 case KernelArg::ArgType::IMAGE_1D: 645 case KernelArg::ArgType::BINDLESS_IMAGE_1D: 646 case KernelArg::ArgType::IMAGE_1D_BUFFER: 647 case KernelArg::ArgType::BINDLESS_IMAGE_1D_BUFFER: 648 case KernelArg::ArgType::IMAGE_2D: 649 case KernelArg::ArgType::BINDLESS_IMAGE_2D: 650 case KernelArg::ArgType::IMAGE_3D: 651 case KernelArg::ArgType::BINDLESS_IMAGE_3D: 652 case KernelArg::ArgType::IMAGE_CUBE: 653 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE: 654 case KernelArg::ArgType::IMAGE_CUBE_DEPTH: 655 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH: 656 case KernelArg::ArgType::IMAGE_1D_ARRAY: 657 case KernelArg::ArgType::BINDLESS_IMAGE_1D_ARRAY: 658 case KernelArg::ArgType::IMAGE_2D_ARRAY: 659 case KernelArg::ArgType::BINDLESS_IMAGE_2D_ARRAY: 660 case KernelArg::ArgType::IMAGE_2D_DEPTH: 661 case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH: 662 case KernelArg::ArgType::IMAGE_2D_DEPTH_ARRAY: 663 case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH_ARRAY: 664 case KernelArg::ArgType::IMAGE_2D_MSAA: 665 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA: 666 case KernelArg::ArgType::IMAGE_2D_MSAA_ARRAY: 667 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_ARRAY: 668 case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH: 669 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH: 670 case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH_ARRAY: 671 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH_ARRAY: 672 case KernelArg::ArgType::IMAGE_CUBE_ARRAY: 673 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_ARRAY: 674 case KernelArg::ArgType::IMAGE_CUBE_DEPTH_ARRAY: 675 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY: 676 { 677 int arg_idx = kernelArg->getAssociatedArgNo(); 678 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(arg_idx); 679 680 // check if the image is writeable 681 bool writeable = false; 682 if (resInfo.Type == SOpenCLKernelInfo::SResourceInfo::RES_UAV && 683 kernelArg->getAccessQual() != IGC::KernelArg::AccessQual::READ_ONLY) 684 writeable = true; 685 IGC_ASSERT_MESSAGE(resInfo.Type == SOpenCLKernelInfo::SResourceInfo::RES_UAV || 686 resInfo.Type == SOpenCLKernelInfo::SResourceInfo::RES_SRV, "Unknown resource type"); 687 688 // the image arg is either bindless or stateful. check from "kernelArg->needsAllocation()" 689 // For statefull image argument, the arg has 0 offset and 0 size 690 zebin::PreDefinedAttrGetter::ArgAddrMode arg_addrmode = 691 zebin::PreDefinedAttrGetter::ArgAddrMode::stateful; 692 uint arg_off = 0; 693 uint arg_size = 0; 694 695 if (kernelArg->needsAllocation()) { 696 // set to bindless 697 arg_addrmode = 698 zebin::PreDefinedAttrGetter::ArgAddrMode::bindless; 699 arg_off = payloadPosition; 700 arg_size = kernelArg->getAllocateSize(); 701 } else { 702 // add bti index for this arg if it's stateful 703 zebin::ZEInfoBuilder::addBindingTableIndex(m_kernelInfo.m_zeBTIArgs, 704 getBTI(resInfo), arg_idx); 705 } 706 707 // add the payload argument 708 zebin::ZEInfoBuilder::addPayloadArgumentByPointer(m_kernelInfo.m_zePayloadArgs, 709 arg_off, arg_size, arg_idx, arg_addrmode, 710 zebin::PreDefinedAttrGetter::ArgAddrSpace::image, 711 writeable ? 712 zebin::PreDefinedAttrGetter::ArgAccessType::readwrite : 713 zebin::PreDefinedAttrGetter::ArgAccessType::readonly 714 ); 715 } 716 break; 717 718 // sampler 719 case KernelArg::ArgType::SAMPLER: 720 case KernelArg::ArgType::BINDLESS_SAMPLER: 721 { 722 // the sampler arg is either bindless or stateful. check from "kernelArg->needsAllocation()" 723 // For statefull image argument, the arg has 0 offset and 0 size 724 // NOTE: we only have statefull sampler now 725 zebin::PreDefinedAttrGetter::ArgAddrMode arg_addrmode = 726 zebin::PreDefinedAttrGetter::ArgAddrMode::stateful; 727 uint arg_off = 0; 728 uint arg_size = 0; 729 if (kernelArg->needsAllocation()) { 730 // set to bindless 731 arg_addrmode = 732 zebin::PreDefinedAttrGetter::ArgAddrMode::bindless; 733 arg_off = payloadPosition; 734 arg_size = kernelArg->getAllocateSize(); 735 } 736 737 int arg_idx = kernelArg->getAssociatedArgNo(); 738 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(arg_idx); 739 // add the payload argument 740 zebin::ZEInfoBuilder::addPayloadArgumentSampler(m_kernelInfo.m_zePayloadArgs, 741 arg_off, arg_size, arg_idx, resInfo.Index, arg_addrmode, 742 zebin::PreDefinedAttrGetter::ArgAccessType::readwrite); 743 } 744 break; 745 746 case KernelArg::ArgType::IMPLICIT_BUFFER_OFFSET: 747 { 748 zebin::zeInfoPayloadArgument& arg = zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 749 zebin::PreDefinedAttrGetter::ArgType::buffer_offset, 750 payloadPosition, kernelArg->getAllocateSize()); 751 arg.arg_index = kernelArg->getAssociatedArgNo(); 752 } 753 break; 754 755 case KernelArg::ArgType::IMPLICIT_PRINTF_BUFFER: 756 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 757 zebin::PreDefinedAttrGetter::ArgType::printf_buffer, 758 payloadPosition, kernelArg->getAllocateSize()); 759 break; 760 761 case KernelArg::ArgType::IMPLICIT_ARG_BUFFER: 762 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 763 zebin::PreDefinedAttrGetter::ArgType::implicit_arg_buffer, 764 payloadPosition, kernelArg->getAllocateSize()); 765 break; 766 767 case KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER: 768 zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs, 769 zebin::PreDefinedAttrGetter::ArgType::implicit_local_id_buffer, 770 payloadPosition, kernelArg->getAllocateSize()); 771 break; 772 773 // We don't need these in ZEBinary, can safely skip them 774 case KernelArg::ArgType::IMPLICIT_R0: 775 case KernelArg::ArgType::R1: 776 case KernelArg::ArgType::STRUCT: 777 // FIXME: this implicit arg is not used nowadays, should remove it completely 778 case KernelArg::ArgType::IMPLICIT_SAMPLER_SNAP_WA: 779 break; 780 781 // FIXME: should these be supported? 782 // CONSTANT_BASE and GLOBAL_BASE are not required that we should export 783 // all globals and constants and let the runtime relocate them when enabling 784 // ZEBinary 785 case KernelArg::ArgType::IMPLICIT_CONSTANT_BASE: 786 case KernelArg::ArgType::IMPLICIT_GLOBAL_BASE: 787 case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN: 788 case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE: 789 default: 790 return false; 791 break; 792 } // end switch (kernelArg->getArgType()) 793 794 return true; 795 } 796 CreateAnnotations(KernelArg * kernelArg,uint payloadPosition)797 void COpenCLKernel::CreateAnnotations(KernelArg* kernelArg, uint payloadPosition) 798 { 799 KernelArg::ArgType type = kernelArg->getArgType(); 800 801 DWORD constantType = iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN; 802 iOpenCL::POINTER_ADDRESS_SPACE addressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID; 803 FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry); 804 805 static const DWORD DEFAULT_ARG_NUM = 0; 806 const llvm::Argument* arg = kernelArg->getArg(); 807 808 switch (type) { 809 810 case KernelArg::ArgType::IMPLICIT_R0: 811 for (Value::const_user_iterator U = arg->user_begin(), UE = arg->user_end(); U != UE; ++U) 812 { 813 const ExtractElementInst* EEI = dyn_cast<ExtractElementInst>(*U); 814 815 if (EEI) 816 { 817 const ConstantInt* index = dyn_cast<ConstantInt>(EEI->getIndexOperand()); 818 if (index) 819 { 820 uint64_t value = index->getZExtValue(); 821 if (value == 1 || value == 6 || value == 7) 822 { 823 // group ids x/y/z 824 ModuleMetaData* modMD = m_Context->getModuleMetaData(); 825 auto it = modMD->FuncMD.find(entry); 826 if (it != modMD->FuncMD.end()) 827 { 828 if (it->second.groupIDPresent == true) 829 m_kernelInfo.m_threadPayload.HasGroupID = true; 830 } 831 break; 832 } 833 } 834 } 835 } 836 break; 837 838 case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER: 839 // PayloadHeader contains global work offset x,y,z and local size x,y,z --> 840 // total of 6 annotations, 3 of each type 841 for (int i = 0; i < 6; ++i) 842 { 843 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>(); 844 845 DWORD sizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE; 846 847 constInput->ConstantType = (i < 3 ? 848 iOpenCL::DATA_PARAMETER_GLOBAL_WORK_OFFSET : 849 iOpenCL::DATA_PARAMETER_LOCAL_WORK_SIZE); 850 constInput->Offset = (i % 3) * sizeInBytes; 851 constInput->PayloadPosition = payloadPosition; 852 constInput->PayloadSizeInBytes = sizeInBytes; 853 constInput->ArgumentNumber = DEFAULT_ARG_NUM; 854 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput)); 855 856 payloadPosition += sizeInBytes; 857 } 858 859 for (Value::const_user_iterator U = arg->user_begin(), UE = arg->user_end(); U != UE; ++U) 860 { 861 const ExtractElementInst* EEI = dyn_cast<ExtractElementInst>(*U); 862 863 if (EEI) 864 { 865 const ConstantInt* index = dyn_cast<ConstantInt>(EEI->getIndexOperand()); 866 if (index) 867 { 868 uint64_t value = index->getZExtValue(); 869 if (value == 0 || value == 1 || value == 2) 870 { 871 // global offset x/y/z 872 ModuleMetaData* modMD = m_Context->getModuleMetaData(); 873 auto it = modMD->FuncMD.find(entry); 874 if (it != modMD->FuncMD.end()) 875 { 876 if (it->second.globalIDPresent) 877 m_kernelInfo.m_threadPayload.HasGlobalIDOffset = true; 878 } 879 break; 880 } 881 } 882 } 883 } 884 break; 885 886 case KernelArg::ArgType::IMPLICIT_BINDLESS_OFFSET: 887 { 888 int argNo = kernelArg->getAssociatedArgNo(); 889 std::shared_ptr<iOpenCL::PointerArgumentAnnotation> ptrAnnotation = m_kernelInfo.m_argOffsetMap[argNo]; 890 ptrAnnotation->BindingTableIndex = payloadPosition; 891 } 892 break; 893 894 case KernelArg::ArgType::PTR_GLOBAL: 895 if (addressSpace == iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID) { 896 addressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_GLOBAL; 897 } 898 // Fall through until PTR_CONSTANT 899 case KernelArg::ArgType::PTR_CONSTANT: 900 if (addressSpace == iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID) { 901 addressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_CONSTANT; 902 } 903 // may reach here from PTR_GLOBAL, PTR_CONSTANT 904 IGC_ASSERT(addressSpace != iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_INVALID); 905 906 { 907 int argNo = kernelArg->getAssociatedArgNo(); 908 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 909 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 910 CodeGenContext* pCtx = GetContext(); 911 ModuleMetaData* modMD = pCtx->getModuleMetaData(); 912 FunctionMetaData* funcMD = &modMD->FuncMD[entry]; 913 ResourceAllocMD* resAllocMD = &funcMD->resAllocMD; 914 IGC_ASSERT_MESSAGE(resAllocMD->argAllocMDList.size() > 0, "ArgAllocMDList is empty."); 915 ArgAllocMD* argAlloc = &resAllocMD->argAllocMDList[argNo]; 916 917 auto ptrAnnotation = std::make_shared<iOpenCL::PointerArgumentAnnotation>(); 918 919 if (argAlloc->type == ResourceTypeEnum::BindlessUAVResourceType) 920 { 921 ptrAnnotation->IsStateless = false; 922 ptrAnnotation->IsBindlessAccess = true; 923 } 924 else 925 { 926 ptrAnnotation->IsStateless = true; 927 ptrAnnotation->IsBindlessAccess = false; 928 } 929 930 m_kernelInfo.m_argOffsetMap[argNo] = ptrAnnotation; 931 932 ptrAnnotation->AddressSpace = addressSpace; 933 ptrAnnotation->ArgumentNumber = argNo; 934 ptrAnnotation->BindingTableIndex = getBTI(resInfo); 935 ptrAnnotation->PayloadPosition = payloadPosition; 936 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 937 ptrAnnotation->LocationIndex = kernelArg->getLocationIndex(); 938 ptrAnnotation->LocationCount = kernelArg->getLocationCount(); 939 ptrAnnotation->IsEmulationArgument = kernelArg->isEmulationArgument(); 940 m_kernelInfo.m_pointerArgument.push_back(ptrAnnotation); 941 } 942 break; 943 944 case KernelArg::ArgType::PTR_LOCAL: 945 { 946 auto locAnnotation = std::make_unique<iOpenCL::LocalArgumentAnnotation>(); 947 948 locAnnotation->Alignment = (DWORD)kernelArg->getAlignment(); 949 locAnnotation->PayloadPosition = payloadPosition; 950 locAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 951 locAnnotation->ArgumentNumber = kernelArg->getAssociatedArgNo(); 952 locAnnotation->LocationIndex = kernelArg->getLocationIndex(); 953 locAnnotation->LocationCount = kernelArg->getLocationCount(); 954 m_kernelInfo.m_localPointerArgument.push_back(std::move(locAnnotation)); 955 } 956 break; 957 958 case KernelArg::ArgType::PTR_DEVICE_QUEUE: 959 { 960 m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = true; 961 unsigned int argNo = kernelArg->getAssociatedArgNo(); 962 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 963 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 964 965 auto ptrAnnotation = std::make_shared<iOpenCL::PointerArgumentAnnotation>(); 966 967 ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_DEVICE_QUEUE; 968 ptrAnnotation->ArgumentNumber = argNo; 969 ptrAnnotation->BindingTableIndex = getBTI(resInfo); 970 ptrAnnotation->IsStateless = true; 971 ptrAnnotation->PayloadPosition = payloadPosition; 972 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 973 m_kernelInfo.m_pointerArgument.push_back(ptrAnnotation); 974 } 975 break; 976 case KernelArg::ArgType::CONSTANT_REG: 977 { 978 uint sourceOffsetBase = 0; 979 980 // aggregate arguments may have additional source offsets 981 if (kernelArg->getStructArgOffset() != -1) 982 { 983 sourceOffsetBase = kernelArg->getStructArgOffset(); 984 } 985 986 auto constInput = std::make_unique<iOpenCL::ConstantArgumentAnnotation>(); 987 988 DWORD sizeInBytes = kernelArg->getAllocateSize(); 989 990 constInput->Offset = sourceOffsetBase; 991 constInput->PayloadPosition = payloadPosition; 992 constInput->PayloadSizeInBytes = sizeInBytes; 993 constInput->ArgumentNumber = kernelArg->getAssociatedArgNo(); 994 constInput->LocationIndex = kernelArg->getLocationIndex(); 995 constInput->LocationCount = kernelArg->getLocationCount(); 996 constInput->IsEmulationArgument = kernelArg->isEmulationArgument(); 997 m_kernelInfo.m_constantArgumentAnnotation.push_back(std::move(constInput)); 998 999 payloadPosition += sizeInBytes; 1000 } 1001 break; 1002 1003 case KernelArg::ArgType::IMPLICIT_CONSTANT_BASE: 1004 { 1005 int argNo = kernelArg->getAssociatedArgNo(); 1006 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1007 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1008 1009 auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>(); 1010 ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_CONSTANT; 1011 ptrAnnotation->BindingTableIndex = 0xffffffff; 1012 ptrAnnotation->IsStateless = true; 1013 ptrAnnotation->PayloadPosition = payloadPosition; 1014 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 1015 ptrAnnotation->ArgumentNumber = argNo; 1016 m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation)); 1017 } 1018 break; 1019 1020 case KernelArg::ArgType::IMPLICIT_GLOBAL_BASE: 1021 { 1022 int argNo = kernelArg->getAssociatedArgNo(); 1023 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1024 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1025 1026 auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>(); 1027 ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_GLOBAL; 1028 ptrAnnotation->BindingTableIndex = 0xffffffff; 1029 ptrAnnotation->IsStateless = true; 1030 ptrAnnotation->PayloadPosition = payloadPosition; 1031 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 1032 ptrAnnotation->ArgumentNumber = argNo; 1033 m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation)); 1034 } 1035 break; 1036 1037 case KernelArg::ArgType::IMPLICIT_PRIVATE_BASE: 1038 { 1039 int argNo = kernelArg->getAssociatedArgNo(); 1040 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1041 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1042 1043 auto ptrAnnotation = std::make_unique<iOpenCL::PrivateInputAnnotation>(); 1044 1045 ptrAnnotation->AddressSpace = iOpenCL::KERNEL_ARGUMENT_ADDRESS_SPACE_PRIVATE; 1046 ptrAnnotation->ArgumentNumber = argNo; 1047 // PerThreadPrivateMemorySize is defined as "Total private memory requirements for each OpenCL work-item." 1048 ptrAnnotation->PerThreadPrivateMemorySize = m_perWIStatelessPrivateMemSize; 1049 ptrAnnotation->BindingTableIndex = getBTI(resInfo); 1050 ptrAnnotation->IsStateless = true; 1051 ptrAnnotation->PayloadPosition = payloadPosition; 1052 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 1053 m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation)); 1054 } 1055 break; 1056 1057 case KernelArg::ArgType::IMPLICIT_ARG_BUFFER: 1058 case KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER: 1059 { 1060 constantType = kernelArg->getDataParamToken(); 1061 IGC_ASSERT(constantType != iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN); 1062 1063 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>(); 1064 1065 DWORD sizeInBytes = kernelArg->getAllocateSize(); 1066 if (type == KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER) 1067 { 1068 sizeInBytes = getLocalIdBufferSize(m_dispatchSize); 1069 } 1070 1071 constInput->ConstantType = constantType; 1072 constInput->Offset = sizeInBytes; 1073 constInput->PayloadPosition = payloadPosition; 1074 constInput->PayloadSizeInBytes = sizeInBytes; 1075 constInput->ArgumentNumber = DEFAULT_ARG_NUM; 1076 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput)); 1077 1078 break; 1079 } 1080 1081 case KernelArg::ArgType::IMPLICIT_NUM_GROUPS: 1082 case KernelArg::ArgType::IMPLICIT_GLOBAL_SIZE: 1083 case KernelArg::ArgType::IMPLICIT_LOCAL_SIZE: 1084 case KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE: 1085 case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN: 1086 case KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE: 1087 1088 constantType = kernelArg->getDataParamToken(); 1089 IGC_ASSERT(constantType != iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN); 1090 1091 for (int i = 0; i < 3; ++i) 1092 { 1093 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>(); 1094 1095 DWORD sizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE; 1096 1097 constInput->ConstantType = constantType; 1098 constInput->Offset = i * sizeInBytes; 1099 constInput->PayloadPosition = payloadPosition; 1100 constInput->PayloadSizeInBytes = sizeInBytes; 1101 constInput->ArgumentNumber = DEFAULT_ARG_NUM; 1102 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput)); 1103 1104 payloadPosition += sizeInBytes; 1105 } 1106 1107 if (type == KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN) 1108 m_kernelInfo.m_threadPayload.HasStageInGridOrigin = true; 1109 else if (type == KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE) 1110 m_kernelInfo.m_threadPayload.HasStageInGridSize = true; 1111 1112 break; 1113 1114 case KernelArg::ArgType::IMPLICIT_IMAGE_HEIGHT: 1115 case KernelArg::ArgType::IMPLICIT_IMAGE_WIDTH: 1116 case KernelArg::ArgType::IMPLICIT_IMAGE_DEPTH: 1117 case KernelArg::ArgType::IMPLICIT_IMAGE_NUM_MIP_LEVELS: 1118 case KernelArg::ArgType::IMPLICIT_IMAGE_CHANNEL_DATA_TYPE: 1119 case KernelArg::ArgType::IMPLICIT_IMAGE_CHANNEL_ORDER: 1120 case KernelArg::ArgType::IMPLICIT_IMAGE_SRGB_CHANNEL_ORDER: 1121 case KernelArg::ArgType::IMPLICIT_IMAGE_ARRAY_SIZE: 1122 case KernelArg::ArgType::IMPLICIT_IMAGE_NUM_SAMPLES: 1123 case KernelArg::ArgType::IMPLICIT_SAMPLER_ADDRESS: 1124 case KernelArg::ArgType::IMPLICIT_SAMPLER_NORMALIZED: 1125 case KernelArg::ArgType::IMPLICIT_SAMPLER_SNAP_WA: 1126 case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_BASEOFFSET: 1127 case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_HEIGHT: 1128 case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_WIDTH: 1129 case KernelArg::ArgType::IMPLICIT_FLAT_IMAGE_PITCH: 1130 case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_DATA_PARAMETER_OBJECT_ID: 1131 case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_DISPATCHER_SIMD_SIZE: 1132 case KernelArg::ArgType::IMPLICIT_BUFFER_OFFSET: 1133 constantType = kernelArg->getDataParamToken(); 1134 IGC_ASSERT(constantType != iOpenCL::DATA_PARAMETER_TOKEN_UNKNOWN); 1135 { 1136 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>(); 1137 1138 constInput->ConstantType = constantType; 1139 constInput->Offset = 0; 1140 constInput->PayloadPosition = payloadPosition; 1141 constInput->PayloadSizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE; 1142 constInput->ArgumentNumber = kernelArg->getAssociatedArgNo(); 1143 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput)); 1144 } 1145 break; 1146 1147 case KernelArg::ArgType::IMAGE_1D: 1148 case KernelArg::ArgType::BINDLESS_IMAGE_1D: 1149 case KernelArg::ArgType::IMAGE_1D_BUFFER: 1150 case KernelArg::ArgType::BINDLESS_IMAGE_1D_BUFFER: 1151 case KernelArg::ArgType::IMAGE_2D: 1152 case KernelArg::ArgType::BINDLESS_IMAGE_2D: 1153 case KernelArg::ArgType::IMAGE_3D: 1154 case KernelArg::ArgType::BINDLESS_IMAGE_3D: 1155 case KernelArg::ArgType::IMAGE_CUBE: 1156 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE: 1157 case KernelArg::ArgType::IMAGE_CUBE_DEPTH: 1158 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH: 1159 case KernelArg::ArgType::IMAGE_1D_ARRAY: 1160 case KernelArg::ArgType::BINDLESS_IMAGE_1D_ARRAY: 1161 case KernelArg::ArgType::IMAGE_2D_ARRAY: 1162 case KernelArg::ArgType::BINDLESS_IMAGE_2D_ARRAY: 1163 case KernelArg::ArgType::IMAGE_2D_DEPTH: 1164 case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH: 1165 case KernelArg::ArgType::IMAGE_2D_DEPTH_ARRAY: 1166 case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH_ARRAY: 1167 case KernelArg::ArgType::IMAGE_2D_MSAA: 1168 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA: 1169 case KernelArg::ArgType::IMAGE_2D_MSAA_ARRAY: 1170 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_ARRAY: 1171 case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH: 1172 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH: 1173 case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH_ARRAY: 1174 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH_ARRAY: 1175 case KernelArg::ArgType::IMAGE_CUBE_ARRAY: 1176 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_ARRAY: 1177 case KernelArg::ArgType::IMAGE_CUBE_DEPTH_ARRAY: 1178 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY: 1179 { 1180 int argNo = kernelArg->getAssociatedArgNo(); 1181 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1182 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1183 1184 auto imageInput = std::make_unique<iOpenCL::ImageArgumentAnnotation>(); 1185 1186 imageInput->ArgumentNumber = argNo; 1187 imageInput->IsFixedBindingTableIndex = true; 1188 imageInput->BindingTableIndex = getBTI(resInfo); 1189 imageInput->ImageType = getImageTypeFromKernelArg(*kernelArg); 1190 IGC_ASSERT(imageInput->ImageType != iOpenCL::IMAGE_MEMORY_OBJECT_INVALID); 1191 imageInput->LocationIndex = kernelArg->getLocationIndex(); 1192 imageInput->LocationCount = kernelArg->getLocationCount(); 1193 imageInput->IsEmulationArgument = kernelArg->isEmulationArgument(); 1194 1195 imageInput->AccessedByFloatCoords = kernelArg->getImgAccessedFloatCoords(); 1196 imageInput->AccessedByIntCoords = kernelArg->getImgAccessedIntCoords(); 1197 imageInput->IsBindlessAccess = kernelArg->needsAllocation(); 1198 imageInput->PayloadPosition = payloadPosition; 1199 1200 switch (resInfo.Type) 1201 { 1202 case SOpenCLKernelInfo::SResourceInfo::RES_UAV: 1203 if (kernelArg->getAccessQual() == IGC::KernelArg::AccessQual::READ_ONLY) 1204 imageInput->Writeable = false; 1205 else 1206 imageInput->Writeable = true; 1207 break; 1208 case SOpenCLKernelInfo::SResourceInfo::RES_SRV: 1209 imageInput->Writeable = false; 1210 break; 1211 default: 1212 IGC_ASSERT_MESSAGE(0, "Unknown resource type"); 1213 break; 1214 } 1215 m_kernelInfo.m_imageInputAnnotations.push_back(std::move(imageInput)); 1216 1217 if (kernelArg->getAccessQual() == IGC::KernelArg::AccessQual::READ_WRITE) 1218 { 1219 m_kernelInfo.m_executionEnivronment.HasReadWriteImages = true; 1220 } 1221 } 1222 break; 1223 1224 case KernelArg::ArgType::SAMPLER: 1225 case KernelArg::ArgType::BINDLESS_SAMPLER: 1226 { 1227 int argNo = kernelArg->getAssociatedArgNo(); 1228 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1229 m_kernelInfo.m_argIndexMap[argNo] = resInfo.Index; 1230 1231 iOpenCL::SAMPLER_OBJECT_TYPE samplerType; 1232 if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerType) { 1233 samplerType = iOpenCL::SAMPLER_OBJECT_VME; 1234 } 1235 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeConvolve) { 1236 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_2DCONVOLVE; 1237 } 1238 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeErode) { 1239 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_ERODE; 1240 } 1241 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeDilate) { 1242 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_DILATE; 1243 } 1244 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeMinMaxFilter) { 1245 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_MINMAXFILTER; 1246 } 1247 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeMinMax) { 1248 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_MINMAX; 1249 } 1250 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeCentroid) { 1251 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_CENTROID; 1252 } 1253 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeBoolCentroid) { 1254 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_BOOL_CENTROID; 1255 } 1256 else if (getExtensionInfo(argNo) == ResourceExtensionTypeEnum::MediaSamplerTypeBoolSum) { 1257 samplerType = iOpenCL::SAMPLER_OBJECT_SAMPLE_8X8_BOOL_SUM; 1258 } 1259 else { 1260 samplerType = iOpenCL::SAMPLER_OBJECT_TEXTURE; 1261 } 1262 1263 auto samplerArg = std::make_unique<iOpenCL::SamplerArgumentAnnotation>(); 1264 samplerArg->SamplerType = samplerType; 1265 samplerArg->ArgumentNumber = argNo; 1266 samplerArg->SamplerTableIndex = resInfo.Index; 1267 samplerArg->LocationIndex = kernelArg->getLocationIndex(); 1268 samplerArg->LocationCount = kernelArg->getLocationCount(); 1269 samplerArg->IsBindlessAccess = kernelArg->needsAllocation(); 1270 samplerArg->IsEmulationArgument = kernelArg->isEmulationArgument(); 1271 samplerArg->PayloadPosition = payloadPosition; 1272 1273 m_kernelInfo.m_samplerArgument.push_back(std::move(samplerArg)); 1274 } 1275 break; 1276 1277 case KernelArg::ArgType::IMPLICIT_LOCAL_IDS: 1278 { 1279 m_kernelInfo.m_threadPayload.HasLocalIDx = true; 1280 m_kernelInfo.m_threadPayload.HasLocalIDy = true; 1281 m_kernelInfo.m_threadPayload.HasLocalIDz = true; 1282 1283 ModuleMetaData* modMD = m_Context->getModuleMetaData(); 1284 auto it = modMD->FuncMD.find(entry); 1285 if (it != modMD->FuncMD.end()) 1286 { 1287 if (it->second.localIDPresent == true) 1288 m_kernelInfo.m_threadPayload.HasLocalID = true; 1289 } 1290 } 1291 break; 1292 case KernelArg::ArgType::R1: 1293 m_kernelInfo.m_threadPayload.UnusedPerThreadConstantPresent = true; 1294 break; 1295 1296 case KernelArg::ArgType::IMPLICIT_SYNC_BUFFER: 1297 { 1298 int argNo = kernelArg->getAssociatedArgNo(); 1299 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1300 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1301 1302 auto syncBuffer = std::make_unique<iOpenCL::SyncBufferAnnotation>(); 1303 1304 syncBuffer->ArgumentNumber = argNo; 1305 syncBuffer->PayloadPosition = payloadPosition; 1306 syncBuffer->DataSize = kernelArg->getAllocateSize(); 1307 1308 m_kernelInfo.m_syncBufferAnnotation = std::move(syncBuffer); 1309 } 1310 break; 1311 1312 case KernelArg::ArgType::IMPLICIT_PRINTF_BUFFER: 1313 { 1314 int argNo = kernelArg->getAssociatedArgNo(); 1315 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1316 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1317 1318 auto printfBuffer = std::make_unique<iOpenCL::PrintfBufferAnnotation>(); 1319 1320 printfBuffer->ArgumentNumber = argNo; 1321 printfBuffer->PayloadPosition = payloadPosition; 1322 printfBuffer->DataSize = kernelArg->getAllocateSize(); 1323 printfBuffer->Index = 0; // This value is not used by Runtime. 1324 1325 m_kernelInfo.m_printfBufferAnnotation = std::move(printfBuffer); 1326 } 1327 break; 1328 1329 case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_DEFAULT_DEVICE_QUEUE: 1330 { 1331 m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = true; 1332 int argNo = kernelArg->getAssociatedArgNo(); 1333 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1334 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1335 1336 auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>(); 1337 1338 ptrAnnotation->AddressSpace = iOpenCL::ADDRESS_SPACE_INTERNAL_DEFAULT_DEVICE_QUEUE; 1339 ptrAnnotation->BindingTableIndex = getBTI(resInfo); 1340 ptrAnnotation->IsStateless = true; 1341 ptrAnnotation->PayloadPosition = payloadPosition; 1342 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 1343 ptrAnnotation->ArgumentNumber = argNo; 1344 m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation)); 1345 } 1346 break; 1347 1348 case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_EVENT_POOL: 1349 { 1350 m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = true; 1351 int argNo = kernelArg->getAssociatedArgNo(); 1352 SOpenCLKernelInfo::SResourceInfo resInfo = getResourceInfo(argNo); 1353 m_kernelInfo.m_argIndexMap[argNo] = getBTI(resInfo); 1354 1355 auto ptrAnnotation = std::make_unique<iOpenCL::PointerInputAnnotation>(); 1356 ptrAnnotation->AddressSpace = iOpenCL::ADDRESS_SPACE_INTERNAL_EVENT_POOL; 1357 ptrAnnotation->BindingTableIndex = getBTI(resInfo); 1358 ptrAnnotation->IsStateless = true; 1359 ptrAnnotation->PayloadPosition = payloadPosition; 1360 ptrAnnotation->PayloadSizeInBytes = kernelArg->getAllocateSize(); 1361 ptrAnnotation->ArgumentNumber = argNo; 1362 m_kernelInfo.m_pointerInput.push_back(std::move(ptrAnnotation)); 1363 } 1364 break; 1365 1366 case KernelArg::ArgType::IMPLICIT_WORK_DIM: 1367 case KernelArg::ArgType::IMPLICIT_VME_MB_BLOCK_TYPE: 1368 case KernelArg::ArgType::IMPLICIT_VME_SUBPIXEL_MODE: 1369 case KernelArg::ArgType::IMPLICIT_VME_SAD_ADJUST_MODE: 1370 case KernelArg::ArgType::IMPLICIT_VME_SEARCH_PATH_TYPE: 1371 case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_MAX_WORKGROUP_SIZE: 1372 case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_PARENT_EVENT: 1373 case KernelArg::ArgType::IMPLICIT_DEVICE_ENQUEUE_PREFERED_WORKGROUP_MULTIPLE: 1374 constantType = kernelArg->getDataParamToken(); 1375 { 1376 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>(); 1377 1378 DWORD sizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE; 1379 1380 constInput->ConstantType = constantType; 1381 constInput->Offset = 0; 1382 constInput->PayloadPosition = payloadPosition; 1383 constInput->PayloadSizeInBytes = sizeInBytes; 1384 constInput->ArgumentNumber = DEFAULT_ARG_NUM; 1385 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput)); 1386 1387 payloadPosition += sizeInBytes; 1388 } 1389 break; 1390 case KernelArg::ArgType::IMPLICIT_LOCAL_MEMORY_STATELESS_WINDOW_START_ADDRESS: 1391 { 1392 auto GASStart = std::make_unique<iOpenCL::StartGASAnnotation>(); 1393 GASStart->Offset = payloadPosition; 1394 GASStart->gpuPointerSizeInBytes = kernelArg->getAllocateSize(); 1395 m_kernelInfo.m_startGAS = std::move(GASStart); 1396 } 1397 break; 1398 case KernelArg::ArgType::IMPLICIT_LOCAL_MEMORY_STATELESS_WINDOW_SIZE: 1399 { 1400 auto winSizeGAS = std::make_unique<iOpenCL::WindowSizeGASAnnotation>(); 1401 1402 winSizeGAS->Offset = payloadPosition; 1403 m_kernelInfo.m_WindowSizeGAS = std::move(winSizeGAS); 1404 } 1405 break; 1406 case KernelArg::ArgType::IMPLICIT_PRIVATE_MEMORY_STATELESS_SIZE: 1407 { 1408 auto privateMemSize = std::make_unique<iOpenCL::PrivateMemSizeAnnotation>(); 1409 1410 privateMemSize->Offset = payloadPosition; 1411 m_kernelInfo.m_PrivateMemSize = std::move(privateMemSize); 1412 } 1413 break; 1414 default: 1415 // Do nothing 1416 break; 1417 } 1418 1419 1420 // DATA_PARAMETER_BUFFER_STATEFUL 1421 // ( SPatchDataParameterBuffer for this token only uses one field: ArgumentNumber ) 1422 // Used to indicate that all memory references via a gobal/constant ptr argument are 1423 // converted to stateful (by StatelessToStateful optimization). Thus, the ptr itself 1424 // is no longer referenced at all. 1425 // 1426 if (IGC_IS_FLAG_ENABLED(EnableStatelessToStatefull) && 1427 IGC_IS_FLAG_ENABLED(EnableStatefulToken) && 1428 m_DriverInfo->SupportStatefulToken() && 1429 arg && 1430 ((type == KernelArg::ArgType::PTR_GLOBAL && 1431 (arg->use_empty() || !GetHasGlobalStatelessAccess())) || 1432 (type == KernelArg::ArgType::PTR_CONSTANT && 1433 (arg->use_empty() || !GetHasConstantStatelessAccess())))) 1434 { 1435 auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>(); 1436 1437 constInput->ConstantType = iOpenCL::DATA_PARAMETER_BUFFER_STATEFUL; 1438 constInput->Offset = 0; 1439 constInput->PayloadPosition = payloadPosition; 1440 constInput->PayloadSizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE; 1441 constInput->ArgumentNumber = kernelArg->getAssociatedArgNo(); // used only for this token. 1442 m_kernelInfo.m_constantInputAnnotation.push_back(std::move(constInput)); 1443 } 1444 } 1445 getImageTypeFromKernelArg(const KernelArg & kernelArg)1446 iOpenCL::IMAGE_MEMORY_OBJECT_TYPE COpenCLKernel::getImageTypeFromKernelArg(const KernelArg& kernelArg) 1447 { 1448 switch(kernelArg.getArgType()) { 1449 case KernelArg::ArgType::IMAGE_1D: 1450 case KernelArg::ArgType::BINDLESS_IMAGE_1D: 1451 return iOpenCL::IMAGE_MEMORY_OBJECT_1D; 1452 1453 case KernelArg::ArgType::IMAGE_1D_BUFFER: 1454 case KernelArg::ArgType::BINDLESS_IMAGE_1D_BUFFER: 1455 return iOpenCL::IMAGE_MEMORY_OBJECT_BUFFER; 1456 1457 case KernelArg::ArgType::IMAGE_2D: 1458 case KernelArg::ArgType::BINDLESS_IMAGE_2D: 1459 if (getExtensionInfo(kernelArg.getAssociatedArgNo()) == ResourceExtensionTypeEnum::MediaResourceType) 1460 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA; 1461 else if (getExtensionInfo(kernelArg.getAssociatedArgNo()) == ResourceExtensionTypeEnum::MediaResourceBlockType) 1462 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA_BLOCK; 1463 return iOpenCL::IMAGE_MEMORY_OBJECT_2D; 1464 1465 case KernelArg::ArgType::IMAGE_3D: 1466 case KernelArg::ArgType::BINDLESS_IMAGE_3D: 1467 return iOpenCL::IMAGE_MEMORY_OBJECT_3D; 1468 1469 case KernelArg::ArgType::IMAGE_CUBE: 1470 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE: 1471 return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE; 1472 1473 case KernelArg::ArgType::IMAGE_CUBE_DEPTH: 1474 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH: 1475 // Use regular cube texture for depth: 1476 return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE; 1477 1478 case KernelArg::ArgType::IMAGE_1D_ARRAY: 1479 case KernelArg::ArgType::BINDLESS_IMAGE_1D_ARRAY: 1480 return iOpenCL::IMAGE_MEMORY_OBJECT_1D_ARRAY; 1481 1482 case KernelArg::ArgType::IMAGE_2D_ARRAY: 1483 case KernelArg::ArgType::BINDLESS_IMAGE_2D_ARRAY: 1484 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY; 1485 1486 1487 case KernelArg::ArgType::IMAGE_2D_DEPTH: 1488 case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH: 1489 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_DEPTH; 1490 1491 case KernelArg::ArgType::IMAGE_2D_DEPTH_ARRAY: 1492 case KernelArg::ArgType::BINDLESS_IMAGE_2D_DEPTH_ARRAY: 1493 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY_DEPTH; 1494 1495 case KernelArg::ArgType::IMAGE_2D_MSAA: 1496 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA: 1497 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MSAA; 1498 1499 case KernelArg::ArgType::IMAGE_2D_MSAA_ARRAY: 1500 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_ARRAY: 1501 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY_MSAA; 1502 1503 case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH: 1504 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH: 1505 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_MSAA_DEPTH; 1506 1507 case KernelArg::ArgType::IMAGE_2D_MSAA_DEPTH_ARRAY: 1508 case KernelArg::ArgType::BINDLESS_IMAGE_2D_MSAA_DEPTH_ARRAY: 1509 return iOpenCL::IMAGE_MEMORY_OBJECT_2D_ARRAY_MSAA_DEPTH; 1510 1511 case KernelArg::ArgType::IMAGE_CUBE_ARRAY: 1512 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_ARRAY: 1513 return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE_ARRAY; 1514 1515 case KernelArg::ArgType::IMAGE_CUBE_DEPTH_ARRAY: 1516 case KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY: 1517 // Use regular cube texture array for depth 1518 return iOpenCL::IMAGE_MEMORY_OBJECT_CUBE_ARRAY; 1519 1520 default: 1521 break; 1522 } 1523 return iOpenCL::IMAGE_MEMORY_OBJECT_INVALID; 1524 } 1525 ParseShaderSpecificOpcode(llvm::Instruction * inst)1526 void COpenCLKernel::ParseShaderSpecificOpcode(llvm::Instruction* inst) 1527 { 1528 auto setStatelessAccess = [&](unsigned AS) { 1529 if (AS == ADDRESS_SPACE_GLOBAL || 1530 AS == ADDRESS_SPACE_GENERIC || 1531 AS == ADDRESS_SPACE_GLOBAL_OR_PRIVATE) 1532 { 1533 SetHasGlobalStatelessAccess(); 1534 } 1535 1536 if (AS == ADDRESS_SPACE_CONSTANT) 1537 { 1538 SetHasConstantStatelessAccess(); 1539 } 1540 }; 1541 1542 // Currently we see data corruption when we have IEEE macros and midthread preemption enabled. 1543 // Adding a temporary work around to disable mid thread preemption when we see IEEE Macros. 1544 switch (inst->getOpcode()) 1545 { 1546 case Instruction::FDiv: 1547 if (inst->getType()->isDoubleTy()) 1548 { 1549 SetDisableMidthreadPreemption(); 1550 } 1551 break; 1552 case Instruction::Call: 1553 if (inst->getType()->isDoubleTy()) 1554 { 1555 if (GetOpCode(inst) == llvm_sqrt) 1556 { 1557 SetDisableMidthreadPreemption(); 1558 } 1559 } 1560 break; 1561 case Instruction::Load: 1562 { 1563 unsigned AS = cast<LoadInst>(inst)->getPointerAddressSpace(); 1564 setStatelessAccess(AS); 1565 break; 1566 } 1567 case Instruction::Store: 1568 { 1569 unsigned AS = cast<StoreInst>(inst)->getPointerAddressSpace(); 1570 setStatelessAccess(AS); 1571 break; 1572 } 1573 default: 1574 break; 1575 } 1576 1577 if (CallInst * CallI = dyn_cast<CallInst>(inst)) 1578 { 1579 bool mayHasMemoryAccess = true; // for checking stateless access 1580 if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(CallI)) 1581 { 1582 GenISAIntrinsic::ID id = GII->getIntrinsicID(); 1583 switch (id) 1584 { 1585 default: 1586 break; 1587 case GenISAIntrinsic::GenISA_dpas: 1588 case GenISAIntrinsic::GenISA_sub_group_dpas: 1589 SetHasDPAS(); 1590 break; 1591 case GenISAIntrinsic::GenISA_ptr_to_pair: 1592 case GenISAIntrinsic::GenISA_pair_to_ptr: 1593 mayHasMemoryAccess = false; 1594 break; 1595 } // End of switch 1596 } 1597 1598 if (mayHasMemoryAccess) 1599 { 1600 // Checking stateless access info 1601 if (!isa<IntrinsicInst>(CallI) && !isa<GenIntrinsicInst>(CallI)) { 1602 // function/subroutine call. Give up 1603 SetHasConstantStatelessAccess(); 1604 SetHasGlobalStatelessAccess(); 1605 } 1606 else 1607 { 1608 for (int i = 0, e = (int)CallI->getNumArgOperands(); i < e; ++i) 1609 { 1610 Value* arg = CallI->getArgOperand(i); 1611 PointerType* PTy = dyn_cast<PointerType>(arg->getType()); 1612 if (!PTy) 1613 continue; 1614 unsigned AS = PTy->getAddressSpace(); 1615 setStatelessAccess(AS); 1616 } 1617 } 1618 } 1619 } 1620 } 1621 AllocatePayload()1622 void COpenCLKernel::AllocatePayload() 1623 { 1624 IGC_ASSERT(m_Context); 1625 1626 bool loadThreadPayload = false; 1627 1628 loadThreadPayload = m_Platform->supportLoadThreadPayloadForCompute(); 1629 1630 // SKL defaults to indirect thread payload storage. 1631 // BDW needs CURBE payload. Spec says: 1632 // "CURBE should be used for the payload when using indirect dispatch rather than indirect payload". 1633 m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage = true; 1634 if (IGC_IS_FLAG_ENABLED(DisableGPGPUIndirectPayload) || 1635 m_Context->platform.getWATable().WaDisableIndirectDataForIndirectDispatch) 1636 { 1637 m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage = false; 1638 } 1639 if (loadThreadPayload) 1640 { 1641 m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage = true; 1642 } 1643 m_kernelInfo.m_threadPayload.HasFlattenedLocalID = false; 1644 m_kernelInfo.m_threadPayload.HasLocalIDx = false; 1645 m_kernelInfo.m_threadPayload.HasLocalIDy = false; 1646 m_kernelInfo.m_threadPayload.HasLocalIDz = false; 1647 m_kernelInfo.m_threadPayload.HasGlobalIDOffset = false; 1648 m_kernelInfo.m_threadPayload.HasGroupID = false; 1649 m_kernelInfo.m_threadPayload.HasLocalID = false; 1650 m_kernelInfo.m_threadPayload.UnusedPerThreadConstantPresent = false; 1651 m_kernelInfo.m_printfBufferAnnotation = nullptr; 1652 m_kernelInfo.m_syncBufferAnnotation = nullptr; 1653 m_kernelInfo.m_threadPayload.HasStageInGridOrigin = false; 1654 m_kernelInfo.m_threadPayload.HasStageInGridSize = false; 1655 1656 // Set the amount of the private memory used by the kernel 1657 // Set only if the private memory metadata actually exists and we don't use 1658 // scratch space for private memory. 1659 bool noScratchSpacePrivMem = !m_Context->getModuleMetaData()->compOpt.UseScratchSpacePrivateMemory; 1660 1661 auto funcMD = m_Context->getModuleMetaData()->FuncMD.find(entry); 1662 if (noScratchSpacePrivMem && (funcMD != m_Context->getModuleMetaData()->FuncMD.end()) && funcMD->second.privateMemoryPerWI) 1663 { 1664 m_perWIStatelessPrivateMemSize = funcMD->second.privateMemoryPerWI; 1665 } 1666 1667 1668 m_ConstantBufferLength = 0; 1669 m_NOSBufferSize = 0; 1670 1671 uint offset = 0; 1672 1673 uint constantBufferStart = 0; 1674 bool constantBufferStartSet = false; 1675 1676 uint prevOffset = 0; 1677 bool nosBufferAllocated = false; 1678 1679 KernelArgsOrder::InputType layout = 1680 m_kernelInfo.m_threadPayload.CompiledForIndirectPayloadStorage ? 1681 KernelArgsOrder::InputType::INDIRECT : 1682 KernelArgsOrder::InputType::CURBE; 1683 1684 KernelArgs kernelArgs(*entry, m_DL, m_pMdUtils, m_ModuleMetadata, getGRFSize(), layout); 1685 1686 if (layout == KernelArgsOrder::InputType::INDIRECT && !loadThreadPayload) 1687 { 1688 kernelArgs.checkForZeroPerThreadData(); 1689 } 1690 1691 for (KernelArgs::const_iterator i = kernelArgs.begin(), e = kernelArgs.end(); i != e; ++i) 1692 { 1693 KernelArg arg = *i; 1694 prevOffset = offset; 1695 1696 // skip unused arguments 1697 bool IsUnusedArg = (arg.getArgType() == KernelArg::ArgType::IMPLICIT_BUFFER_OFFSET) && 1698 arg.getArg()->use_empty(); 1699 1700 // Runtime Values should not be processed any further. No annotations shall be created for them. 1701 // Only added to KernelArgs to enforce correct allocation order. 1702 bool isRuntimeValue = (arg.getArgType() == KernelArg::ArgType::RUNTIME_VALUE); 1703 1704 if (!constantBufferStartSet && arg.isConstantBuf()) 1705 { 1706 constantBufferStart = offset; 1707 constantBufferStartSet = true; 1708 } 1709 1710 if (!nosBufferAllocated && isRuntimeValue) { 1711 IGC_ASSERT_MESSAGE(arg.isConstantBuf(), "RuntimeValues must be marked as isConstantBuf"); 1712 AllocateNOSConstants(offset); 1713 nosBufferAllocated = true; 1714 } 1715 1716 // Local IDs are non-uniform and may have two instances in SIMD32 mode 1717 int numAllocInstances = arg.getArgType() == KernelArg::ArgType::IMPLICIT_LOCAL_IDS ? m_numberInstance : 1; 1718 1719 auto allocSize = arg.getAllocateSize(); 1720 1721 if (arg.getArgType() == KernelArg::ArgType::IMPLICIT_LOCAL_ID_BUFFER) 1722 { 1723 allocSize = getLocalIdBufferSize(m_dispatchSize); 1724 } 1725 1726 if (!IsUnusedArg && !isRuntimeValue) 1727 { 1728 if (arg.needsAllocation()) 1729 { 1730 // Align on the desired alignment for this argument 1731 auto alignment = arg.getAlignment(); 1732 1733 offset = iSTD::Align(offset, alignment); 1734 1735 // Arguments larger than a GRF must be at least GRF-aligned. 1736 // Arguments smaller than a GRF may not cross GRF boundaries. 1737 // This means that arguments that cross a GRF boundary 1738 // must be GRF aligned. 1739 // Note that this is done AFTER we align on the base alignment, 1740 // because of edge cases where aligning on the base alignment 1741 // is what causes the "overflow". 1742 unsigned int startGRF = offset / getGRFSize(); 1743 unsigned int endGRF = (offset + allocSize - 1) / getGRFSize(); 1744 if (startGRF != endGRF) 1745 { 1746 offset = iSTD::Align(offset, getGRFSize()); 1747 } 1748 1749 // And now actually tell vISA we need this space. 1750 // (Except for r0, which is a predefined variable, and should never be allocated as input!) 1751 const llvm::Argument* A = arg.getArg(); 1752 if (A != nullptr && arg.getArgType() != KernelArg::ArgType::IMPLICIT_R0) 1753 { 1754 CVariable* var = GetSymbol(const_cast<Argument*>(A)); 1755 for (int i = 0; i < numAllocInstances; ++i) 1756 { 1757 uint totalOffset = offset + (allocSize * i); 1758 if ((totalOffset / getGRFSize()) >= m_Context->getNumGRFPerThread()) 1759 { 1760 m_Context->EmitError("Kernel inputs exceed total register size!", A); 1761 return; 1762 } 1763 AllocateInput(var, totalOffset, i); 1764 } 1765 } 1766 // or else we would just need to increase an offset 1767 } 1768 1769 // Create annotations for the kernel argument 1770 // If an arg is unused, don't generate patch token for it. 1771 CreateAnnotations(&arg, offset - constantBufferStart); 1772 if (IGC_IS_FLAG_ENABLED(EnableZEBinary) || 1773 m_Context->getCompilerOption().EnableZEBinary) { 1774 // FIXME: once we transit to zebin completely, we don't need to do 1775 // CreateAnnotations above. Only CreateZEPayloadArguments is required 1776 1777 // During the transition, we disable ZEBinary if there are unsupported 1778 // arguments 1779 bool success = CreateZEPayloadArguments(&arg, offset - constantBufferStart); 1780 if (!success) { 1781 // assertion tests if we force to EnableZEBinary but encounter unsupported features 1782 IGC_ASSERT_MESSAGE(!IGC_IS_FLAG_ENABLED(EnableZEBinary), 1783 "ZEBin: unsupported KernelArg Type"); 1784 1785 // fall back to patch-token if ZEBinary is enabled by CodeGenContext::CompOptions 1786 if (m_Context->getCompilerOption().EnableZEBinary) 1787 m_Context->getCompilerOption().EnableZEBinary = false; 1788 } 1789 } 1790 if (arg.needsAllocation()) 1791 { 1792 for (int i = 0; i < numAllocInstances; ++i) 1793 { 1794 offset += allocSize; 1795 } 1796 } 1797 } 1798 1799 if (arg.isConstantBuf()) 1800 { 1801 m_ConstantBufferLength += offset - prevOffset; 1802 } 1803 } 1804 1805 // ToDo: we should avoid passing all three dimensions of local id 1806 if (m_kernelInfo.m_threadPayload.HasLocalIDx || 1807 m_kernelInfo.m_threadPayload.HasLocalIDy || 1808 m_kernelInfo.m_threadPayload.HasLocalIDz) 1809 { 1810 if (loadThreadPayload) 1811 { 1812 uint perThreadInputSize = SIZE_WORD * 3 * (m_dispatchSize == SIMDMode::SIMD32 ? 32 : 16); 1813 if (m_dispatchSize == SIMDMode::SIMD16 && getGRFSize() == 64) 1814 { 1815 perThreadInputSize *= 2; 1816 } 1817 encoder.GetVISAKernel()->AddKernelAttribute("PerThreadInputSize", sizeof(uint16_t), &perThreadInputSize); 1818 } 1819 } 1820 1821 m_kernelInfo.m_threadPayload.OffsetToSkipPerThreadDataLoad = 0; 1822 m_kernelInfo.m_threadPayload.OffsetToSkipSetFFIDGP = 0; 1823 1824 m_ConstantBufferLength = iSTD::Align(m_ConstantBufferLength, getGRFSize()); 1825 1826 CreateInlineSamplerAnnotations(); 1827 // Currently we can't support inline sampler in zebin 1828 // assertion tests if we force to EnableZEBinary but encounter inline sampler 1829 bool hasInlineSampler = m_kernelInfo.m_HasInlineVmeSamplers || !m_kernelInfo.m_samplerInput.empty(); 1830 IGC_ASSERT_MESSAGE(!IGC_IS_FLAG_ENABLED(EnableZEBinary) || !hasInlineSampler, 1831 "ZEBin: Inline sampler unsupported"); 1832 // fall back to patch-token if ZEBinary is enabled by CodeGenContext::CompOptions 1833 if (m_Context->getCompilerOption().EnableZEBinary && hasInlineSampler) 1834 m_Context->getCompilerOption().EnableZEBinary = false; 1835 1836 // Handle kernel reflection 1837 CreateKernelArgInfo(); 1838 CreateKernelAttributeInfo(); 1839 1840 // Create annotations for printf string. 1841 CreatePrintfStringAnnotations(); 1842 } 1843 1844 GetGlobalMappingValue(llvm::Value * c)1845 unsigned int COpenCLKernel::GetGlobalMappingValue(llvm::Value* c) 1846 { 1847 unsigned int val = 0; 1848 auto localIter = m_localOffsetsMap.find(c); 1849 if (localIter != m_localOffsetsMap.end()) 1850 { 1851 val = localIter->second; 1852 } 1853 else 1854 { 1855 IGC_ASSERT_MESSAGE(0, "Trying to access a GlobalVariable not in locals map"); 1856 } 1857 return val; 1858 } 1859 GetGlobalMapping(llvm::Value * c)1860 CVariable* COpenCLKernel::GetGlobalMapping(llvm::Value* c) 1861 { 1862 unsigned int val = GetGlobalMappingValue(c); 1863 1864 VISA_Type type = GetType(c->getType()); 1865 return ImmToVariable(val, type); 1866 } 1867 getSumFixedTGSMSizes(Function * F)1868 unsigned int COpenCLKernel::getSumFixedTGSMSizes(Function* F) 1869 { 1870 // Find whether we have size information for this kernel. 1871 // If not, then the total TGSM is 0, otherwise pull it from the MD 1872 ModuleMetaData* modMD = m_Context->getModuleMetaData(); 1873 auto funcMD = modMD->FuncMD.find(F); 1874 if (funcMD == modMD->FuncMD.end()) 1875 { 1876 return 0; 1877 } 1878 return funcMD->second.localSize; 1879 } 1880 FillKernel()1881 void COpenCLKernel::FillKernel() 1882 { 1883 m_kernelInfo.m_executionEnivronment.PerThreadScratchSpace = ProgramOutput()->getScratchSpaceUsageInSlot0(); 1884 m_kernelInfo.m_executionEnivronment.PerThreadScratchSpaceSlot1 = ProgramOutput()->getScratchSpaceUsageInSlot1(); 1885 m_kernelInfo.m_executionEnivronment.PerThreadPrivateOnStatelessSize = m_perWIStatelessPrivateMemSize; 1886 m_kernelInfo.m_kernelProgram.NOSBufferSize = m_NOSBufferSize / getGRFSize(); // in 256 bits 1887 m_kernelInfo.m_kernelProgram.ConstantBufferLength = m_ConstantBufferLength / getGRFSize(); // in 256 bits 1888 m_kernelInfo.m_kernelProgram.MaxNumberOfThreads = m_Platform->getMaxGPGPUShaderThreads() / GetShaderThreadUsageRate(); 1889 1890 m_kernelInfo.m_executionEnivronment.SumFixedTGSMSizes = getSumFixedTGSMSizes(entry); 1891 m_kernelInfo.m_executionEnivronment.HasBarriers = this->GetHasBarrier(); 1892 m_kernelInfo.m_executionEnivronment.DisableMidThreadPreemption = GetDisableMidThreadPreemption(); 1893 m_kernelInfo.m_executionEnivronment.SubgroupIndependentForwardProgressRequired = 1894 m_Context->getModuleMetaData()->compOpt.SubgroupIndependentForwardProgressRequired; 1895 m_kernelInfo.m_executionEnivronment.CompiledForGreaterThan4GBBuffers = 1896 m_Context->getModuleMetaData()->compOpt.GreaterThan4GBBufferRequired; 1897 IGC_ASSERT(gatherMap.size() == 0); 1898 m_kernelInfo.m_kernelProgram.gatherMapSize = 0; 1899 m_kernelInfo.m_kernelProgram.bindingTableEntryCount = 0; 1900 1901 m_kernelInfo.m_executionEnivronment.HasDeviceEnqueue = false; 1902 m_kernelInfo.m_executionEnivronment.IsSingleProgramFlow = false; 1903 //m_kernelInfo.m_executionEnivronment.PerSIMDLanePrivateMemorySize = m_perWIStatelessPrivateMemSize; 1904 m_kernelInfo.m_executionEnivronment.HasFixedWorkGroupSize = false; 1905 m_kernelInfo.m_kernelName = entry->getName().str(); 1906 m_kernelInfo.m_ShaderHashCode = m_Context->hash.getAsmHash(); 1907 1908 FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(entry); 1909 ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize(); 1910 SubGroupSizeMetaDataHandle subGroupSize = funcInfoMD->getSubGroupSize(); 1911 1912 if (threadGroupSize->hasValue()) 1913 { 1914 m_kernelInfo.m_executionEnivronment.HasFixedWorkGroupSize = true; 1915 m_kernelInfo.m_executionEnivronment.FixedWorkgroupSize[0] = threadGroupSize->getXDim(); 1916 m_kernelInfo.m_executionEnivronment.FixedWorkgroupSize[1] = threadGroupSize->getYDim(); 1917 m_kernelInfo.m_executionEnivronment.FixedWorkgroupSize[2] = threadGroupSize->getZDim(); 1918 } 1919 if (subGroupSize->hasValue()) 1920 { 1921 m_kernelInfo.m_executionEnivronment.CompiledSIMDSize = subGroupSize->getSIMD_size(); 1922 } 1923 1924 auto& FuncMap = m_Context->getModuleMetaData()->FuncMD; 1925 auto FuncIter = FuncMap.find(entry); 1926 if (FuncIter != FuncMap.end()) 1927 { 1928 IGC::FunctionMetaData funcMD = FuncIter->second; 1929 WorkGroupWalkOrderMD workGroupWalkOrder = funcMD.workGroupWalkOrder; 1930 1931 if (workGroupWalkOrder.dim0 || workGroupWalkOrder.dim1 || workGroupWalkOrder.dim2) 1932 { 1933 m_kernelInfo.m_executionEnivronment.WorkgroupWalkOrder[0] = workGroupWalkOrder.dim0; 1934 m_kernelInfo.m_executionEnivronment.WorkgroupWalkOrder[1] = workGroupWalkOrder.dim1; 1935 m_kernelInfo.m_executionEnivronment.WorkgroupWalkOrder[2] = workGroupWalkOrder.dim2; 1936 } 1937 1938 m_kernelInfo.m_executionEnivronment.IsInitializer = funcMD.IsInitializer; 1939 m_kernelInfo.m_executionEnivronment.IsFinalizer = funcMD.IsFinalizer; 1940 1941 m_kernelInfo.m_executionEnivronment.CompiledSubGroupsNumber = funcMD.CompiledSubGroupsNumber; 1942 1943 } 1944 1945 m_kernelInfo.m_executionEnivronment.HasGlobalAtomics = GetHasGlobalAtomics(); 1946 m_kernelInfo.m_threadPayload.OffsetToSkipPerThreadDataLoad = ProgramOutput()->m_offsetToSkipPerThreadDataLoad; 1947 m_kernelInfo.m_threadPayload.OffsetToSkipSetFFIDGP = ProgramOutput()->m_offsetToSkipSetFFIDGP; 1948 1949 m_kernelInfo.m_executionEnivronment.NumGRFRequired = ProgramOutput()->m_numGRFTotal; 1950 1951 1952 m_kernelInfo.m_executionEnivronment.UseBindlessMode = m_Context->m_InternalOptions.UseBindlessMode; 1953 m_kernelInfo.m_executionEnivronment.HasStackCalls = HasStackCalls(); 1954 } 1955 RecomputeBTLayout()1956 void COpenCLKernel::RecomputeBTLayout() 1957 { 1958 CodeGenContext* pCtx = GetContext(); 1959 ModuleMetaData* modMD = pCtx->getModuleMetaData(); 1960 FunctionMetaData* funcMD = &modMD->FuncMD[entry]; 1961 ResourceAllocMD* resAllocMD = &funcMD->resAllocMD; 1962 // Get the number of UAVs and Resources from MD. 1963 int numUAVs = resAllocMD->uavsNumType; 1964 int numResources = resAllocMD->srvsNumType; 1965 1966 // Now, update the layout information 1967 USC::SShaderStageBTLayout* layout = ((COCLBTILayout*)m_pBtiLayout)->getModifiableLayout(); 1968 1969 // The BT layout contains the minimum and the maximum number BTI for each kind 1970 // of resource. E.g. UAVs may be mapped to BTIs 0..3, SRVs to 4..5, and the scratch 1971 // surface to 6. 1972 // Note that the names are somewhat misleading. They are used for the sake of consistency 1973 // with the ICBE sources. 1974 1975 // Some fields are always 0 for OCL. 1976 layout->resourceNullBoundOffset = 0; 1977 layout->immediateConstantBufferOffset = 0; 1978 layout->interfaceConstantBufferOffset = 0; 1979 layout->constantBufferNullBoundOffset = 0; 1980 layout->JournalIdx = 0; 1981 layout->JournalCounterIdx = 0; 1982 1983 // And TGSM (aka SLM) is always 254. 1984 layout->TGSMIdx = 254; 1985 1986 int index = 0; 1987 1988 // First, allocate BTI for debug surface 1989 if (m_Context->m_InternalOptions.KernelDebugEnable) 1990 { 1991 layout->systemThreadIdx = index++; 1992 } 1993 1994 // Now, allocate BTIs for all the SRVs. 1995 layout->minResourceIdx = index; 1996 if (numResources) 1997 { 1998 index += numResources - 1; 1999 layout->maxResourceIdx = index++; 2000 } 2001 else 2002 { 2003 layout->maxResourceIdx = index; 2004 } 2005 2006 // Now, ConstantBuffers - used as a placeholder for the inline constants, if present. 2007 layout->minConstantBufferIdx = index; 2008 layout->maxConstantBufferIdx = index; 2009 2010 // Now, the UAVs 2011 layout->minUAVIdx = index; 2012 if (numUAVs) 2013 { 2014 index += numUAVs - 1; 2015 layout->maxUAVIdx = index++; 2016 } 2017 else 2018 { 2019 layout->maxUAVIdx = index; 2020 } 2021 2022 // And finally, the scratch surface 2023 layout->surfaceScratchIdx = index++; 2024 2025 // Overall number of used BT entries, not including TGSM. 2026 layout->maxBTsize = index; 2027 } 2028 HasFullDispatchMask()2029 bool COpenCLKernel::HasFullDispatchMask() 2030 { 2031 unsigned int groupSize = IGCMetaDataHelper::getThreadGroupSize(*m_pMdUtils, entry); 2032 if (groupSize != 0) 2033 { 2034 if (groupSize % numLanes(m_dispatchSize) == 0) 2035 { 2036 return true; 2037 } 2038 } 2039 return false; 2040 } 2041 getBTI(SOpenCLKernelInfo::SResourceInfo & resInfo)2042 unsigned int COpenCLKernel::getBTI(SOpenCLKernelInfo::SResourceInfo& resInfo) 2043 { 2044 switch (resInfo.Type) 2045 { 2046 case SOpenCLKernelInfo::SResourceInfo::RES_UAV: 2047 return m_pBtiLayout->GetUavIndex(resInfo.Index); 2048 case SOpenCLKernelInfo::SResourceInfo::RES_SRV: 2049 return m_pBtiLayout->GetTextureIndex(resInfo.Index); 2050 default: 2051 return 0xffffffff; 2052 } 2053 } 2054 CollectProgramInfo(OpenCLProgramContext * ctx)2055 void CollectProgramInfo(OpenCLProgramContext* ctx) 2056 { 2057 MetaDataUtils mdUtils(ctx->getModule()); 2058 ModuleMetaData* modMD = ctx->getModuleMetaData(); 2059 2060 if (!modMD->inlineConstantBuffers.empty()) 2061 { 2062 // For ZeBin, constants are mantained in two separate buffers 2063 // the first is for general constants, and the second for string literals 2064 2065 // General constants 2066 auto ipsbMDHandle = modMD->inlineConstantBuffers[0]; 2067 std::unique_ptr<iOpenCL::InitConstantAnnotation> initConstant(new iOpenCL::InitConstantAnnotation()); 2068 initConstant->Alignment = ipsbMDHandle.alignment; 2069 initConstant->AllocSize = ipsbMDHandle.allocSize; 2070 2071 size_t bufferSize = (ipsbMDHandle.Buffer).size(); 2072 initConstant->InlineData.resize(bufferSize); 2073 memcpy_s(initConstant->InlineData.data(), bufferSize, ipsbMDHandle.Buffer.data(), bufferSize); 2074 2075 ctx->m_programInfo.m_initConstantAnnotation = std::move(initConstant); 2076 2077 if (IGC_IS_FLAG_ENABLED(EnableZEBinary) || 2078 modMD->compOpt.EnableZEBinary) 2079 { 2080 // String literals 2081 auto ipsbStringMDHandle = modMD->inlineConstantBuffers[1]; 2082 std::unique_ptr<iOpenCL::InitConstantAnnotation> initStringConstant(new iOpenCL::InitConstantAnnotation()); 2083 initStringConstant->Alignment = ipsbStringMDHandle.alignment; 2084 initStringConstant->AllocSize = ipsbStringMDHandle.allocSize; 2085 2086 bufferSize = (ipsbStringMDHandle.Buffer).size(); 2087 initStringConstant->InlineData.resize(bufferSize); 2088 memcpy_s(initStringConstant->InlineData.data(), bufferSize, ipsbStringMDHandle.Buffer.data(), bufferSize); 2089 2090 ctx->m_programInfo.m_initConstantStringAnnotation = std::move(initStringConstant); 2091 } 2092 } 2093 2094 if (!modMD->inlineGlobalBuffers.empty()) 2095 { 2096 auto ipsbMDHandle = modMD->inlineGlobalBuffers[0]; 2097 2098 std::unique_ptr<iOpenCL::InitGlobalAnnotation> initGlobal(new iOpenCL::InitGlobalAnnotation()); 2099 initGlobal->Alignment = ipsbMDHandle.alignment; 2100 initGlobal->AllocSize = ipsbMDHandle.allocSize; 2101 2102 size_t bufferSize = (ipsbMDHandle.Buffer).size(); 2103 initGlobal->InlineData.resize(bufferSize); 2104 memcpy_s(initGlobal->InlineData.data(), bufferSize, ipsbMDHandle.Buffer.data(), bufferSize); 2105 2106 ctx->m_programInfo.m_initGlobalAnnotation = std::move(initGlobal); 2107 } 2108 2109 { 2110 auto& FuncMap = ctx->getModuleMetaData()->FuncMD; 2111 for (const auto& i : FuncMap) 2112 { 2113 std::unique_ptr<iOpenCL::KernelTypeProgramBinaryInfo> initConstant(new iOpenCL::KernelTypeProgramBinaryInfo()); 2114 initConstant->KernelName = i.first->getName().str(); 2115 if (i.second.IsFinalizer) 2116 { 2117 2118 initConstant->Type = iOpenCL::PROGRAM_SCOPE_KERNEL_DESTRUCTOR; 2119 ctx->m_programInfo.m_initKernelTypeAnnotation.push_back(std::move(initConstant)); 2120 } 2121 else if (i.second.IsInitializer) 2122 { 2123 initConstant->Type = iOpenCL::PROGRAM_SCOPE_KERNEL_CONSTRUCTOR; 2124 ctx->m_programInfo.m_initKernelTypeAnnotation.push_back(std::move(initConstant)); 2125 } 2126 2127 } 2128 } 2129 2130 for (const auto& globPtrInfo : modMD->GlobalPointerProgramBinaryInfos) 2131 { 2132 auto initGlobalPointer = std::make_unique<iOpenCL::GlobalPointerAnnotation>(); 2133 initGlobalPointer->PointeeAddressSpace = globPtrInfo.PointeeAddressSpace; 2134 initGlobalPointer->PointeeBufferIndex = globPtrInfo.PointeeBufferIndex; 2135 initGlobalPointer->PointerBufferIndex = globPtrInfo.PointerBufferIndex; 2136 initGlobalPointer->PointerOffset = globPtrInfo.PointerOffset; 2137 ctx->m_programInfo.m_initGlobalPointerAnnotation.push_back(std::move(initGlobalPointer)); 2138 } 2139 2140 for (const auto& constPtrInfo : modMD->ConstantPointerProgramBinaryInfos) 2141 { 2142 auto initConstantPointer = std::make_unique<iOpenCL::ConstantPointerAnnotation>(); 2143 initConstantPointer->PointeeAddressSpace = constPtrInfo.PointeeAddressSpace; 2144 initConstantPointer->PointeeBufferIndex = constPtrInfo.PointeeBufferIndex; 2145 initConstantPointer->PointerBufferIndex = constPtrInfo.PointerBufferIndex; 2146 initConstantPointer->PointerOffset = constPtrInfo.PointerOffset; 2147 2148 ctx->m_programInfo.m_initConstantPointerAnnotation.push_back(std::move(initConstantPointer)); 2149 } 2150 2151 // Pointer address relocation table data for GLOBAL buffer 2152 for (const auto& globalRelocEntry : modMD->GlobalBufferAddressRelocInfo) 2153 { 2154 ctx->m_programInfo.m_GlobalPointerAddressRelocAnnotation.globalReloc.emplace_back( 2155 (globalRelocEntry.PointerSize == 8) ? vISA::GenRelocType::R_SYM_ADDR : vISA::GenRelocType::R_SYM_ADDR_32, 2156 (uint32_t)globalRelocEntry.BufferOffset, 2157 globalRelocEntry.Symbol); 2158 } 2159 // Pointer address relocation table data for CONST buffer 2160 for (const auto& constRelocEntry : modMD->ConstantBufferAddressRelocInfo) 2161 { 2162 ctx->m_programInfo.m_GlobalPointerAddressRelocAnnotation.globalConstReloc.emplace_back( 2163 (constRelocEntry.PointerSize == 8) ? vISA::GenRelocType::R_SYM_ADDR : vISA::GenRelocType::R_SYM_ADDR_32, 2164 (uint32_t)constRelocEntry.BufferOffset, 2165 constRelocEntry.Symbol); 2166 } 2167 } 2168 GatherDataForDriver(OpenCLProgramContext * ctx,COpenCLKernel * pShader,CShaderProgram * pKernel,Function * pFunc,MetaDataUtils * pMdUtils)2169 void GatherDataForDriver(OpenCLProgramContext* ctx, COpenCLKernel* pShader, CShaderProgram* pKernel, Function* pFunc, MetaDataUtils* pMdUtils) 2170 { 2171 IGC_ASSERT(pShader != nullptr); 2172 pShader->FillKernel(); 2173 SProgramOutput* pOutput = pShader->ProgramOutput(); 2174 2175 // Need a better heuristic for NoRetry 2176 FunctionInfoMetaDataHandle funcInfoMD = pMdUtils->getFunctionsInfoItem(pFunc); 2177 int subGrpSize = funcInfoMD->getSubGroupSize()->getSIMD_size(); 2178 bool noRetry = ((subGrpSize > 0 || pOutput->m_scratchSpaceUsedBySpills < 1000) && 2179 ctx->m_instrTypes.mayHaveIndirectOperands); 2180 2181 bool optDisable = false; 2182 if (ctx->getModuleMetaData()->compOpt.OptDisable) 2183 { 2184 optDisable = true; 2185 } 2186 2187 if (pOutput->m_scratchSpaceUsedBySpills == 0 || 2188 noRetry || 2189 ctx->m_retryManager.IsLastTry() || 2190 optDisable) 2191 { 2192 // Save the shader program to the state processor to be handled later 2193 if (ctx->m_programOutput.m_ShaderProgramList.size() == 0 || 2194 ctx->m_programOutput.m_ShaderProgramList.back() != pKernel) 2195 { 2196 ctx->m_programOutput.m_ShaderProgramList.push_back(pKernel); 2197 } 2198 COMPILER_SHADER_STATS_PRINT(pKernel->m_shaderStats, ShaderType::OPENCL_SHADER, ctx->hash, pFunc->getName().str()); 2199 COMPILER_SHADER_STATS_SUM(ctx->m_sumShaderStats, pKernel->m_shaderStats, ShaderType::OPENCL_SHADER); 2200 COMPILER_SHADER_STATS_DEL(pKernel->m_shaderStats); 2201 } 2202 else 2203 { 2204 ctx->m_retryManager.kernelSet.insert(pShader->m_kernelInfo.m_kernelName); 2205 } 2206 } 2207 SetKernelProgram(OpenCLProgramContext * ctx,COpenCLKernel * shader,DWORD simdMode)2208 static bool SetKernelProgram(OpenCLProgramContext* ctx, COpenCLKernel* shader, DWORD simdMode) 2209 { 2210 if (shader && (shader->ProgramOutput()->m_programSize > 0 || 2211 (ctx->m_compileToVISAOnly && !shader->ProgramOutput()->m_VISAAsm.empty()))) 2212 { 2213 if (simdMode == 32) 2214 { 2215 //why do we need this? we will get all output in GatherDataForDriver(...) 2216 //remove it to avoid messy logics 2217 //shader->m_kernelInfo.m_executionEnivronment.PerThreadSpillFillSize = 2218 // shader->ProgramOutput()->m_scratchSpaceUsedBySpills; 2219 shader->m_kernelInfo.m_kernelProgram.simd32 = *shader->ProgramOutput(); 2220 ctx->SetSIMDInfo(SIMD_SELECTED, SIMDMode::SIMD32, ShaderDispatchMode::NOT_APPLICABLE); 2221 } 2222 else if (simdMode == 16) 2223 { 2224 shader->m_kernelInfo.m_kernelProgram.simd16 = *shader->ProgramOutput(); 2225 ctx->SetSIMDInfo(SIMD_SELECTED, SIMDMode::SIMD16, ShaderDispatchMode::NOT_APPLICABLE); 2226 } 2227 else if (simdMode == 8) 2228 { 2229 shader->m_kernelInfo.m_kernelProgram.simd8 = *shader->ProgramOutput(); 2230 ctx->SetSIMDInfo(SIMD_SELECTED, SIMDMode::SIMD8, ShaderDispatchMode::NOT_APPLICABLE); 2231 } 2232 shader->m_kernelInfo.m_executionEnivronment.CompiledSIMDSize = simdMode; 2233 shader->m_kernelInfo.m_executionEnivronment.SIMDInfo = ctx->GetSIMDInfo(); 2234 return true; 2235 } 2236 return false; 2237 } 2238 CodeGen(OpenCLProgramContext * ctx)2239 void CodeGen(OpenCLProgramContext* ctx) 2240 { 2241 // Do program-wide code generation. 2242 // Currently, this just creates the program-scope patch stream. 2243 if (ctx->m_retryManager.IsFirstTry()) 2244 { 2245 CollectProgramInfo(ctx); 2246 if (IGC_IS_FLAG_DISABLED(EnableZEBinary) && 2247 !ctx->getCompilerOption().EnableZEBinary) 2248 { 2249 ctx->m_programOutput.CreateProgramScopePatchStream(ctx->m_programInfo); 2250 } 2251 } 2252 2253 MetaDataUtils* pMdUtils = ctx->getMetaDataUtils(); 2254 2255 //Clear spill parameters of retry manager in the very begining of code gen 2256 ctx->m_retryManager.ClearSpillParams(); 2257 2258 CShaderProgram::KernelShaderMap shaders; 2259 CodeGen(ctx, shaders); 2260 2261 if (ctx->m_programOutput.m_pSystemThreadKernelOutput == nullptr) 2262 { 2263 const auto options = ctx->m_InternalOptions; 2264 if (options.IncludeSIPCSR || 2265 options.IncludeSIPKernelDebug || 2266 options.IncludeSIPKernelDebugWithLocalMemory || 2267 options.KernelDebugEnable) 2268 { 2269 DWORD systemThreadMode = 0; 2270 2271 if (options.IncludeSIPCSR) 2272 { 2273 systemThreadMode |= USC::SYSTEM_THREAD_MODE_CSR; 2274 } 2275 2276 if (options.KernelDebugEnable || 2277 options.IncludeSIPKernelDebug) 2278 { 2279 systemThreadMode |= USC::SYSTEM_THREAD_MODE_DEBUG; 2280 } 2281 2282 if (options.IncludeSIPKernelDebugWithLocalMemory) 2283 { 2284 systemThreadMode |= USC::SYSTEM_THREAD_MODE_DEBUG_LOCAL; 2285 } 2286 2287 bool success = SIP::CSystemThread::CreateSystemThreadKernel( 2288 ctx->platform, 2289 (USC::SYSTEM_THREAD_MODE)systemThreadMode, 2290 ctx->m_programOutput.m_pSystemThreadKernelOutput); 2291 2292 if (!success) 2293 { 2294 ctx->EmitError("System thread kernel could not be created!", nullptr); 2295 } 2296 } 2297 } 2298 2299 ctx->m_retryManager.kernelSet.clear(); 2300 2301 // gather data to send back to the driver 2302 for (const auto& k : shaders) 2303 { 2304 Function* pFunc = k.first; 2305 CShaderProgram* pKernel = static_cast<CShaderProgram*>(k.second); 2306 COpenCLKernel* simd8Shader = static_cast<COpenCLKernel*>(pKernel->GetShader(SIMDMode::SIMD8)); 2307 COpenCLKernel* simd16Shader = static_cast<COpenCLKernel*>(pKernel->GetShader(SIMDMode::SIMD16)); 2308 COpenCLKernel* simd32Shader = static_cast<COpenCLKernel*>(pKernel->GetShader(SIMDMode::SIMD32)); 2309 2310 if ((ctx->m_DriverInfo.sendMultipleSIMDModes() || ctx->m_enableSimdVariantCompilation) 2311 && (ctx->getModuleMetaData()->csInfo.forcedSIMDSize == 0)) 2312 { 2313 //Gather the kernel binary for each compiled kernel 2314 if (SetKernelProgram(ctx, simd32Shader, 32)) 2315 GatherDataForDriver(ctx, simd32Shader, pKernel, pFunc, pMdUtils); 2316 if (SetKernelProgram(ctx, simd16Shader, 16)) 2317 GatherDataForDriver(ctx, simd16Shader, pKernel, pFunc, pMdUtils); 2318 if (SetKernelProgram(ctx, simd8Shader, 8)) 2319 GatherDataForDriver(ctx, simd8Shader, pKernel, pFunc, pMdUtils); 2320 } 2321 else 2322 { 2323 //Gather the kernel binary only for 1 SIMD mode of the kernel 2324 if (SetKernelProgram(ctx, simd32Shader, 32)) 2325 GatherDataForDriver(ctx, simd32Shader, pKernel, pFunc, pMdUtils); 2326 else if (SetKernelProgram(ctx, simd16Shader, 16)) 2327 GatherDataForDriver(ctx, simd16Shader, pKernel, pFunc, pMdUtils); 2328 else if (SetKernelProgram(ctx, simd8Shader, 8)) 2329 GatherDataForDriver(ctx, simd8Shader, pKernel, pFunc, pMdUtils); 2330 } 2331 } 2332 } 2333 hasReadWriteImage(llvm::Function & F)2334 bool COpenCLKernel::hasReadWriteImage(llvm::Function& F) 2335 { 2336 if (!isEntryFunc(m_pMdUtils, &F)) 2337 { 2338 // Ignore read/write flags for subroutines for now. 2339 // TODO: get access types for subroutines without using kernel args 2340 return false; 2341 } 2342 2343 KernelArgs kernelArgs(F, m_DL, m_pMdUtils, m_ModuleMetadata, getGRFSize(), KernelArgsOrder::InputType::INDEPENDENT); 2344 for (const auto& KA : kernelArgs) 2345 { 2346 // RenderScript annotation sets "read_write" qualifier 2347 // for any applicable kernel argument, not only for kernel arguments 2348 // that are images, so we should check if kernel argument is an image. 2349 if (KA.getAccessQual() == KernelArg::AccessQual::READ_WRITE && 2350 KA.getArgType() >= KernelArg::ArgType::IMAGE_1D && 2351 KA.getArgType() <= KernelArg::ArgType::BINDLESS_IMAGE_CUBE_DEPTH_ARRAY) 2352 { 2353 return true; 2354 } 2355 } 2356 return false; 2357 } 2358 CompileSIMDSize(SIMDMode simdMode,EmitPass & EP,llvm::Function & F)2359 bool COpenCLKernel::CompileSIMDSize(SIMDMode simdMode, EmitPass& EP, llvm::Function& F) 2360 { 2361 if (!CompileSIMDSizeInCommon(simdMode)) 2362 return false; 2363 2364 { 2365 // If stack calls are present, disable simd32 in order to do wa in visa 2366 bool needCallWA = (IGC_IS_FLAG_ENABLED(EnableCallWA) && m_Context->platform.hasFusedEU()); 2367 if (needCallWA && simdMode == SIMDMode::SIMD32 && HasStackCalls()) 2368 { 2369 return false; 2370 } 2371 } 2372 2373 if (!m_Context->m_retryManager.IsFirstTry()) 2374 { 2375 m_Context->ClearSIMDInfo(simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2376 m_Context->SetSIMDInfo(SIMD_RETRY, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2377 } 2378 2379 2380 //If forced SIMD Mode (by driver or regkey), then: 2381 // 1. Compile only that SIMD mode and nothing else 2382 // 2. Compile that SIMD mode even if it is not profitable, i.e. even if compileThisSIMD() returns false for it. 2383 // So, don't bother checking profitability for it 2384 if (m_Context->getModuleMetaData()->csInfo.forcedSIMDSize != 0) 2385 { 2386 // Entered here means driver has requested a specific SIMD mode, which was forced in the regkey ForceOCLSIMDWidth. 2387 // We return the condition can we compile the given forcedSIMDSize with this simdMode? 2388 return ( 2389 // These statements are basically equivalent to (simdMode == forcedSIMDSize) 2390 (simdMode == SIMDMode::SIMD8 && m_Context->getModuleMetaData()->csInfo.forcedSIMDSize == 8) || 2391 (simdMode == SIMDMode::SIMD16 && m_Context->getModuleMetaData()->csInfo.forcedSIMDSize == 16) || 2392 (simdMode == SIMDMode::SIMD32 && m_Context->getModuleMetaData()->csInfo.forcedSIMDSize == 32) 2393 ); 2394 } 2395 2396 SIMDStatus simdStatus = checkSIMDCompileConds(simdMode, EP, F); 2397 2398 2399 // Func and Perf checks pass, compile this SIMD 2400 if (simdStatus == SIMDStatus::SIMD_PASS) 2401 return true; 2402 2403 // Functional failure, skip compiling this SIMD 2404 if (simdStatus == SIMDStatus::SIMD_FUNC_FAIL) 2405 return false; 2406 2407 IGC_ASSERT(simdStatus == SIMDStatus::SIMD_PERF_FAIL); 2408 //not profitable 2409 if (m_Context->m_DriverInfo.sendMultipleSIMDModes()) 2410 { 2411 if (EP.m_canAbortOnSpill) 2412 return false; //not the first functionally correct SIMD, exit 2413 else 2414 return true; //is the first functionally correct SIMD, compile 2415 } 2416 return simdStatus == SIMDStatus::SIMD_PASS; 2417 } 2418 2419 checkSIMDCompileConds(SIMDMode simdMode,EmitPass & EP,llvm::Function & F)2420 SIMDStatus COpenCLKernel::checkSIMDCompileConds(SIMDMode simdMode, EmitPass& EP, llvm::Function& F) 2421 { 2422 CShader* simd8Program = m_parent->GetShader(SIMDMode::SIMD8); 2423 CShader* simd16Program = m_parent->GetShader(SIMDMode::SIMD16); 2424 CShader* simd32Program = m_parent->GetShader(SIMDMode::SIMD32); 2425 2426 CodeGenContext* pCtx = GetContext(); 2427 2428 bool compileFunctionVariants = pCtx->m_enableSimdVariantCompilation && 2429 (m_FGA && IGC::isIntelSymbolTableVoidProgram(m_FGA->getGroupHead(&F))); 2430 2431 // Here we see if we have compiled a size for this shader already 2432 if ((simd8Program && simd8Program->ProgramOutput()->m_programSize > 0) || 2433 (simd16Program && simd16Program->ProgramOutput()->m_programSize > 0) || 2434 (simd32Program && simd32Program->ProgramOutput()->m_programSize > 0)) 2435 { 2436 bool canCompileMultipleSIMD = pCtx->m_DriverInfo.sendMultipleSIMDModes() || compileFunctionVariants; 2437 if (!(canCompileMultipleSIMD && (pCtx->getModuleMetaData()->csInfo.forcedSIMDSize == 0))) 2438 return SIMDStatus::SIMD_FUNC_FAIL; 2439 } 2440 2441 // Next we check if there is a required sub group size specified 2442 MetaDataUtils* pMdUtils = EP.getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils(); 2443 ModuleMetaData* modMD = pCtx->getModuleMetaData(); 2444 FunctionInfoMetaDataHandle funcInfoMD = pMdUtils->getFunctionsInfoItem(&F); 2445 int simd_size = funcInfoMD->getSubGroupSize()->getSIMD_size(); 2446 2447 // Finds the kernel and get the group simd size from the kernel 2448 if (m_FGA) 2449 { 2450 llvm::Function* Kernel = &F; 2451 auto FG = m_FGA->getGroup(&F); 2452 Kernel = FG->getHead(); 2453 funcInfoMD = pMdUtils->getFunctionsInfoItem(Kernel); 2454 simd_size = funcInfoMD->getSubGroupSize()->getSIMD_size(); 2455 } 2456 2457 // For simd variant functions, detect which SIMD sizes are needed 2458 if (compileFunctionVariants && F.hasFnAttribute("variant-function-def")) 2459 { 2460 bool canCompile = true; 2461 if (simdMode == SIMDMode::SIMD16) 2462 canCompile = F.hasFnAttribute("CompileSIMD16"); 2463 else if (simdMode == SIMDMode::SIMD8) 2464 canCompile = F.hasFnAttribute("CompileSIMD8"); 2465 2466 if (!canCompile) 2467 { 2468 pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2469 return SIMDStatus::SIMD_FUNC_FAIL; 2470 } 2471 } 2472 2473 // Cannot compile simd32 for function calls due to slicing 2474 if (m_FGA && m_FGA->getGroup(&F) && (!m_FGA->getGroup(&F)->isSingle() || m_FGA->getGroup(&F)->hasStackCall())) 2475 { 2476 // Fail on SIMD32 for all groups with function calls 2477 if (simdMode == SIMDMode::SIMD32) 2478 { 2479 pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2480 return SIMDStatus::SIMD_FUNC_FAIL; 2481 } 2482 // Group has no stackcalls, is not the SymbolTable dummy kernel, and subgroup size is not set 2483 // Just subroutines, default to SIMD8 2484 if (!m_FGA->getGroup(&F)->hasStackCall() && 2485 !IGC::isIntelSymbolTableVoidProgram(m_FGA->getGroupHead(&F)) && 2486 simd_size == 0 && 2487 simdMode != SIMDMode::SIMD8) 2488 { 2489 pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2490 return SIMDStatus::SIMD_FUNC_FAIL; 2491 } 2492 } 2493 2494 uint32_t groupSize = 0; 2495 if (modMD->csInfo.maxWorkGroupSize) 2496 { 2497 groupSize = modMD->csInfo.maxWorkGroupSize; 2498 } 2499 else 2500 { 2501 groupSize = IGCMetaDataHelper::getThreadGroupSize(*pMdUtils, &F); 2502 } 2503 2504 if (groupSize == 0) 2505 { 2506 groupSize = IGCMetaDataHelper::getThreadGroupSizeHint(*pMdUtils, &F); 2507 } 2508 2509 if (simd_size) 2510 { 2511 switch (simd_size) 2512 { 2513 // Apparently the only possible simdModes here are SIMD8, SIMD16, SIMD32 2514 case 8: 2515 if (simdMode != SIMDMode::SIMD8) 2516 { 2517 pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2518 return SIMDStatus::SIMD_FUNC_FAIL; 2519 } 2520 break; 2521 case 16: 2522 if (simdMode != SIMDMode::SIMD16) 2523 { 2524 pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2525 return SIMDStatus::SIMD_FUNC_FAIL; 2526 } 2527 EP.m_canAbortOnSpill = false; 2528 break; 2529 case 32: 2530 if (simdMode != SIMDMode::SIMD32) 2531 { 2532 pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2533 return SIMDStatus::SIMD_FUNC_FAIL; 2534 } 2535 else { 2536 EP.m_canAbortOnSpill = false; 2537 } 2538 break; 2539 default: 2540 IGC_ASSERT_MESSAGE(0, "Unsupported required sub group size"); 2541 break; 2542 } 2543 } 2544 else 2545 { 2546 // Checking registry/flag here. Note that if ForceOCLSIMDWidth is set to 2547 // 8/16/32, only corresponding EnableOCLSIMD<N> is set to true. Therefore, 2548 // if any of EnableOCLSIMD<N> is disabled, ForceOCLSIMDWidth must set to 2549 // a value other than <N> if set. See igc_regkeys.cpp for detail. 2550 if ((simdMode == SIMDMode::SIMD32 && IGC_IS_FLAG_DISABLED(EnableOCLSIMD32)) || 2551 (simdMode == SIMDMode::SIMD16 && IGC_IS_FLAG_DISABLED(EnableOCLSIMD16))) 2552 { 2553 return SIMDStatus::SIMD_FUNC_FAIL; 2554 } 2555 2556 // Check if we force code generation for the current SIMD size. 2557 // Note that for SIMD8, we always force it! 2558 //ATTN: This check is redundant! 2559 if (numLanes(simdMode) == pCtx->getModuleMetaData()->csInfo.forcedSIMDSize || 2560 simdMode == SIMDMode::SIMD8) 2561 { 2562 return SIMDStatus::SIMD_PASS; 2563 } 2564 2565 2566 if (groupSize != 0 && groupSize <= 16) 2567 { 2568 if (simdMode == SIMDMode::SIMD32 || 2569 (groupSize <= 8 && simdMode != SIMDMode::SIMD8)) 2570 { 2571 pCtx->SetSIMDInfo(SIMD_SKIP_THGRPSIZE, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2572 return SIMDStatus::SIMD_FUNC_FAIL; 2573 } 2574 } 2575 2576 // Here we check profitablility, etc. 2577 if (simdMode == SIMDMode::SIMD16) 2578 { 2579 bool optDisable = this->GetContext()->getModuleMetaData()->compOpt.OptDisable; 2580 2581 if (optDisable) 2582 { 2583 return SIMDStatus::SIMD_FUNC_FAIL; 2584 } 2585 2586 // bail out of SIMD16 if it's not profitable. 2587 Simd32ProfitabilityAnalysis& PA = EP.getAnalysis<Simd32ProfitabilityAnalysis>(); 2588 if (!PA.isSimd16Profitable()) 2589 { 2590 pCtx->SetSIMDInfo(SIMD_SKIP_PERF, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2591 return SIMDStatus::SIMD_PERF_FAIL; 2592 } 2593 } 2594 if (simdMode == SIMDMode::SIMD32) 2595 { 2596 bool optDisable = this->GetContext()->getModuleMetaData()->compOpt.OptDisable; 2597 2598 if (optDisable) 2599 { 2600 return SIMDStatus::SIMD_FUNC_FAIL; 2601 } 2602 2603 // bail out of SIMD32 if it's not profitable. 2604 Simd32ProfitabilityAnalysis& PA = EP.getAnalysis<Simd32ProfitabilityAnalysis>(); 2605 if (!PA.isSimd32Profitable()) 2606 { 2607 pCtx->SetSIMDInfo(SIMD_SKIP_HW, simdMode, ShaderDispatchMode::NOT_APPLICABLE); 2608 return SIMDStatus::SIMD_PERF_FAIL; 2609 } 2610 } 2611 } 2612 2613 return SIMDStatus::SIMD_PASS; 2614 } 2615 2616 } 2617