1 /* 2 * Copyright (C) 2018-2021 Intel Corporation 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 */ 7 8 #pragma once 9 #include "shared/source/command_stream/command_stream_receiver_hw.h" 10 #include "shared/source/command_stream/csr_properties_flags.h" 11 #include "shared/source/command_stream/thread_arbitration_policy.h" 12 #include "shared/source/debug_settings/debug_settings_manager.h" 13 #include "shared/source/device/device.h" 14 #include "shared/source/helpers/address_patch.h" 15 #include "shared/source/helpers/preamble.h" 16 #include "shared/source/helpers/timestamp_packet.h" 17 #include "shared/source/kernel/implicit_args.h" 18 #include "shared/source/kernel/kernel_execution_type.h" 19 #include "shared/source/program/kernel_info.h" 20 #include "shared/source/unified_memory/unified_memory.h" 21 #include "shared/source/utilities/stackvec.h" 22 23 #include "opencl/extensions/public/cl_ext_private.h" 24 #include "opencl/source/api/cl_types.h" 25 #include "opencl/source/cl_device/cl_device.h" 26 #include "opencl/source/device_queue/device_queue.h" 27 #include "opencl/source/helpers/base_object.h" 28 #include "opencl/source/helpers/properties_helper.h" 29 #include "opencl/source/kernel/kernel_objects_for_aux_translation.h" 30 #include "opencl/source/program/program.h" 31 32 #include <vector> 33 34 namespace NEO { 35 struct CompletionStamp; 36 class Buffer; 37 class CommandStreamReceiver; 38 class GraphicsAllocation; 39 class ImageTransformer; 40 class Surface; 41 class PrintfHandler; 42 class MultiDeviceKernel; 43 44 class Kernel : public ReferenceTrackedObject<Kernel> { 45 public: 46 static const uint32_t kernelBinaryAlignment = 64; 47 48 enum kernelArgType { 49 NONE_OBJ, 50 IMAGE_OBJ, 51 BUFFER_OBJ, 52 PIPE_OBJ, 53 SVM_OBJ, 54 SVM_ALLOC_OBJ, 55 SAMPLER_OBJ, 56 ACCELERATOR_OBJ, 57 DEVICE_QUEUE_OBJ, 58 SLM_OBJ 59 }; 60 61 struct SimpleKernelArgInfo { 62 kernelArgType type; 63 void *object; 64 const void *value; 65 size_t size; 66 GraphicsAllocation *pSvmAlloc; 67 cl_mem_flags svmFlags; 68 bool isPatched = false; 69 bool isStatelessUncacheable = false; 70 }; 71 72 enum class TunningStatus { 73 STANDARD_TUNNING_IN_PROGRESS, 74 SUBDEVICE_TUNNING_IN_PROGRESS, 75 TUNNING_DONE 76 }; 77 78 enum class TunningType { 79 DISABLED, 80 SIMPLE, 81 FULL 82 }; 83 84 typedef int32_t (Kernel::*KernelArgHandler)(uint32_t argIndex, 85 size_t argSize, 86 const void *argVal); 87 88 template <typename kernel_t = Kernel, typename program_t = Program> create(program_t * program,const KernelInfo & kernelInfo,ClDevice & clDevice,cl_int * errcodeRet)89 static kernel_t *create(program_t *program, const KernelInfo &kernelInfo, ClDevice &clDevice, cl_int *errcodeRet) { 90 cl_int retVal; 91 kernel_t *pKernel = nullptr; 92 93 pKernel = new kernel_t(program, kernelInfo, clDevice); 94 retVal = pKernel->initialize(); 95 96 if (retVal != CL_SUCCESS) { 97 delete pKernel; 98 pKernel = nullptr; 99 } 100 101 if (errcodeRet) { 102 *errcodeRet = retVal; 103 } 104 105 if (FileLoggerInstance().enabled()) { 106 std::string source; 107 program->getSource(source); 108 FileLoggerInstance().dumpKernel(kernelInfo.kernelDescriptor.kernelMetadata.kernelName, source); 109 } 110 111 return pKernel; 112 } 113 114 Kernel &operator=(const Kernel &) = delete; 115 Kernel(const Kernel &) = delete; 116 117 virtual ~Kernel(); 118 isMemObj(kernelArgType kernelArg)119 static bool isMemObj(kernelArgType kernelArg) { 120 return kernelArg == BUFFER_OBJ || kernelArg == IMAGE_OBJ || kernelArg == PIPE_OBJ; 121 } 122 isAuxTranslationRequired()123 bool isAuxTranslationRequired() const { return auxTranslationRequired; } setAuxTranslationRequired(bool onOff)124 void setAuxTranslationRequired(bool onOff) { auxTranslationRequired = onOff; } 125 void updateAuxTranslationRequired(); 126 getCrossThreadDataRef()127 ArrayRef<uint8_t> getCrossThreadDataRef() { 128 return ArrayRef<uint8_t>(reinterpret_cast<uint8_t *>(crossThreadData), crossThreadDataSize); 129 } 130 getCrossThreadData()131 char *getCrossThreadData() const { 132 return crossThreadData; 133 } 134 getCrossThreadDataSize()135 uint32_t getCrossThreadDataSize() const { 136 return crossThreadDataSize; 137 } 138 139 cl_int initialize(); 140 141 MOCKABLE_VIRTUAL cl_int cloneKernel(Kernel *pSourceKernel); 142 143 MOCKABLE_VIRTUAL bool canTransformImages() const; 144 MOCKABLE_VIRTUAL bool isPatched() const; 145 146 // API entry points setArgument(uint32_t argIndex,size_t argSize,const void * argVal)147 cl_int setArgument(uint32_t argIndex, size_t argSize, const void *argVal) { return setArg(argIndex, argSize, argVal); } 148 cl_int setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags); 149 cl_int setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc); 150 151 void setSvmKernelExecInfo(GraphicsAllocation *argValue); 152 void clearSvmKernelExecInfo(); 153 154 cl_int getInfo(cl_kernel_info paramName, size_t paramValueSize, 155 void *paramValue, size_t *paramValueSizeRet) const; 156 void getAdditionalInfo(cl_kernel_info paramName, const void *¶mValue, size_t ¶mValueSizeRet) const; 157 void getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *¶mValue, size_t ¶mValueSizeRet) const; 158 159 cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, 160 size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; 161 162 cl_int getWorkGroupInfo(cl_kernel_work_group_info paramName, 163 size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; 164 165 cl_int getSubGroupInfo(cl_kernel_sub_group_info paramName, 166 size_t inputValueSize, const void *inputValue, 167 size_t paramValueSize, void *paramValue, 168 size_t *paramValueSizeRet) const; 169 170 const void *getKernelHeap() const; 171 void *getSurfaceStateHeap() const; 172 const void *getDynamicStateHeap() const; 173 174 size_t getKernelHeapSize() const; 175 size_t getSurfaceStateHeapSize() const; 176 size_t getDynamicStateHeapSize() const; 177 size_t getNumberOfBindingTableStates() const; getBindingTableOffset()178 size_t getBindingTableOffset() const { 179 return localBindingTableOffset; 180 } 181 182 void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset); 183 184 void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize); 185 bool isKernelHeapSubstituted() const; 186 uint64_t getKernelId() const; 187 void setKernelId(uint64_t newKernelId); 188 uint32_t getStartOffset() const; 189 void setStartOffset(uint32_t offset); 190 getKernelArguments()191 const std::vector<SimpleKernelArgInfo> &getKernelArguments() const { 192 return kernelArguments; 193 } 194 getKernelArgsNumber()195 size_t getKernelArgsNumber() const { 196 return kernelArguments.size(); 197 } 198 usesBindfulAddressingForBuffers()199 bool usesBindfulAddressingForBuffers() const { 200 return KernelDescriptor::BindfulAndStateless == kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode; 201 } 202 getDescriptor()203 inline const KernelDescriptor &getDescriptor() const { 204 return kernelInfo.kernelDescriptor; 205 } getKernelInfo()206 inline const KernelInfo &getKernelInfo() const { 207 return kernelInfo; 208 } 209 getContext()210 Context &getContext() const { 211 return program->getContext(); 212 } 213 getProgram()214 Program *getProgram() const { return program; } 215 getScratchSize()216 uint32_t getScratchSize() { 217 return kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0]; 218 } 219 getPrivateScratchSize()220 uint32_t getPrivateScratchSize() { 221 return kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[1]; 222 } 223 224 void createReflectionSurface(); 225 template <bool mockable = false> 226 void patchReflectionSurface(DeviceQueue *devQueue, PrintfHandler *printfHandler); 227 228 void patchDefaultDeviceQueue(DeviceQueue *devQueue); 229 void patchEventPool(DeviceQueue *devQueue); 230 void patchBlocksSimdSize(); 231 bool usesSyncBuffer() const; 232 void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset); 233 void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless); 234 getKernelReflectionSurface()235 GraphicsAllocation *getKernelReflectionSurface() const { 236 return kernelReflectionSurface; 237 } 238 239 size_t getInstructionHeapSizeForExecutionModel() const; 240 241 // Helpers 242 cl_int setArg(uint32_t argIndex, uint32_t argValue); 243 cl_int setArg(uint32_t argIndex, uint64_t argValue); 244 cl_int setArg(uint32_t argIndex, cl_mem argValue); 245 cl_int setArg(uint32_t argIndex, cl_mem argValue, uint32_t mipLevel); 246 cl_int setArg(uint32_t argIndex, size_t argSize, const void *argVal); 247 248 // Handlers 249 void setKernelArgHandler(uint32_t argIndex, KernelArgHandler handler); 250 251 void unsetArg(uint32_t argIndex); 252 253 cl_int setArgImmediate(uint32_t argIndex, 254 size_t argSize, 255 const void *argVal); 256 257 cl_int setArgBuffer(uint32_t argIndex, 258 size_t argSize, 259 const void *argVal); 260 261 cl_int setArgPipe(uint32_t argIndex, 262 size_t argSize, 263 const void *argVal); 264 265 cl_int setArgImage(uint32_t argIndex, 266 size_t argSize, 267 const void *argVal); 268 269 cl_int setArgImageWithMipLevel(uint32_t argIndex, 270 size_t argSize, 271 const void *argVal, uint32_t mipLevel); 272 273 cl_int setArgLocal(uint32_t argIndex, 274 size_t argSize, 275 const void *argVal); 276 277 cl_int setArgSampler(uint32_t argIndex, 278 size_t argSize, 279 const void *argVal); 280 281 cl_int setArgAccelerator(uint32_t argIndex, 282 size_t argSize, 283 const void *argVal); 284 285 cl_int setArgDevQueue(uint32_t argIndex, 286 size_t argSize, 287 const void *argVal); 288 289 void storeKernelArg(uint32_t argIndex, 290 kernelArgType argType, 291 void *argObject, 292 const void *argValue, 293 size_t argSize, 294 GraphicsAllocation *argSvmAlloc = nullptr, 295 cl_mem_flags argSvmFlags = 0); 296 const void *getKernelArg(uint32_t argIndex) const; 297 const SimpleKernelArgInfo &getKernelArgInfo(uint32_t argIndex) const; 298 getAllowNonUniform()299 bool getAllowNonUniform() const { return program->getAllowNonUniform(); } isVmeKernel()300 bool isVmeKernel() const { return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme; } requiresSpecialPipelineSelectMode()301 bool requiresSpecialPipelineSelectMode() const { return specialPipelineSelectMode; } 302 303 void performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer); 304 MOCKABLE_VIRTUAL bool isSingleSubdevicePreferred() const; 305 306 //residency for kernel surfaces 307 MOCKABLE_VIRTUAL void makeResident(CommandStreamReceiver &commandStreamReceiver); 308 MOCKABLE_VIRTUAL void getResidency(std::vector<Surface *> &dst); 309 bool requiresCoherency(); 310 void resetSharedObjectsPatchAddresses(); isUsingSharedObjArgs()311 bool isUsingSharedObjArgs() const { return usingSharedObjArgs; } hasUncacheableStatelessArgs()312 bool hasUncacheableStatelessArgs() const { return statelessUncacheableArgsCount > 0; } 313 314 bool hasPrintfOutput() const; 315 316 void setReflectionSurfaceBlockBtOffset(uint32_t blockID, uint32_t offset); 317 318 cl_int checkCorrectImageAccessQualifier(cl_uint argIndex, 319 size_t argSize, 320 const void *argValue) const; 321 322 static uint32_t dummyPatchLocation; 323 324 uint32_t allBufferArgsStateful = CL_TRUE; 325 326 bool isBuiltIn = false; 327 const bool isParentKernel; 328 const bool isSchedulerKernel; 329 getThreadArbitrationPolicy()330 uint32_t getThreadArbitrationPolicy() const { 331 return threadArbitrationPolicy; 332 } getExecutionType()333 KernelExecutionType getExecutionType() const { 334 return executionType; 335 } 336 337 bool checkIfIsParentKernelAndBlocksUsesPrintf(); 338 is32Bit()339 bool is32Bit() const { 340 return kernelInfo.kernelDescriptor.kernelAttributes.gpuPointerSize == 4; 341 } 342 getPerThreadSystemThreadSurfaceSize()343 size_t getPerThreadSystemThreadSurfaceSize() const { 344 return kernelInfo.kernelDescriptor.kernelAttributes.perThreadSystemThreadSurfaceSize; 345 } 346 getPatchInfoDataList()347 std::vector<PatchInfoData> &getPatchInfoDataList() { return patchInfoDataList; }; usesImages()348 bool usesImages() const { 349 return usingImages; 350 } usesOnlyImages()351 bool usesOnlyImages() const { 352 return usingImagesOnly; 353 } 354 355 void fillWithKernelObjsForAuxTranslation(KernelObjsForAuxTranslation &kernelObjsForAuxTranslation); 356 357 MOCKABLE_VIRTUAL bool requiresCacheFlushCommand(const CommandQueue &commandQueue) const; 358 359 using CacheFlushAllocationsVec = StackVec<GraphicsAllocation *, 32>; 360 void getAllocationsForCacheFlush(CacheFlushAllocationsVec &out) const; 361 setAuxTranslationDirection(AuxTranslationDirection auxTranslationDirection)362 void setAuxTranslationDirection(AuxTranslationDirection auxTranslationDirection) { 363 this->auxTranslationDirection = auxTranslationDirection; 364 } setUnifiedMemorySyncRequirement(bool isUnifiedMemorySyncRequired)365 void setUnifiedMemorySyncRequirement(bool isUnifiedMemorySyncRequired) { 366 this->isUnifiedMemorySyncRequired = isUnifiedMemorySyncRequired; 367 } 368 void setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue); 369 void setUnifiedMemoryExecInfo(GraphicsAllocation *argValue); 370 void clearUnifiedMemoryExecInfo(); 371 areStatelessWritesUsed()372 bool areStatelessWritesUsed() { return containsStatelessWrites; } 373 int setKernelThreadArbitrationPolicy(uint32_t propertyValue); 374 cl_int setKernelExecutionType(cl_execution_info_kernel_type_intel executionType); setThreadArbitrationPolicy(uint32_t policy)375 void setThreadArbitrationPolicy(uint32_t policy) { 376 this->threadArbitrationPolicy = policy; 377 } 378 void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset, 379 size_t *localWorkSize); 380 uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const; 381 382 uint64_t getKernelStartOffset( 383 const bool localIdsGenerationByRuntime, 384 const bool kernelUsesLocalIds, 385 const bool isCssUsed) const; 386 387 bool requiresLimitedWorkgroupSize() const; isKernelDebugEnabled()388 bool isKernelDebugEnabled() const { return debugEnabled; } 389 int32_t setAdditionalKernelExecInfoWithParam(uint32_t paramName, size_t paramValueSize, const void *paramValue); 390 void setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo); 391 uint32_t getAdditionalKernelExecInfo() const; 392 MOCKABLE_VIRTUAL bool requiresWaDisableRccRhwoOptimization() const; 393 394 //dispatch traits 395 void setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ); 396 void setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ); 397 void setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ); 398 void setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ); 399 void setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ); 400 void setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ); 401 void setWorkDim(uint32_t workDim); 402 getDispatchTrait(const CrossThreadDataOffset offset)403 const uint32_t *getDispatchTrait(const CrossThreadDataOffset offset) const { 404 return isValidOffset(offset) ? reinterpret_cast<uint32_t *>(getCrossThreadData() + offset) 405 : &Kernel::dummyPatchLocation; 406 } getWorkDim()407 const uint32_t *getWorkDim() const { return getDispatchTrait(getDescriptor().payloadMappings.dispatchTraits.workDim); } getDispatchTraitArray(const CrossThreadDataOffset dispatchTrait[3])408 std::array<const uint32_t *, 3> getDispatchTraitArray(const CrossThreadDataOffset dispatchTrait[3]) const { return {getDispatchTrait(dispatchTrait[0]), getDispatchTrait(dispatchTrait[1]), getDispatchTrait(dispatchTrait[2])}; } getGlobalWorkOffsetValues()409 std::array<const uint32_t *, 3> getGlobalWorkOffsetValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset); } getLocalWorkSizeValues()410 std::array<const uint32_t *, 3> getLocalWorkSizeValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.localWorkSize); } getLocalWorkSize2Values()411 std::array<const uint32_t *, 3> getLocalWorkSize2Values() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.localWorkSize2); } getEnqueuedLocalWorkSizeValues()412 std::array<const uint32_t *, 3> getEnqueuedLocalWorkSizeValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.enqueuedLocalWorkSize); } getNumWorkGroupsValues()413 std::array<const uint32_t *, 3> getNumWorkGroupsValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.numWorkGroups); } 414 415 bool isLocalWorkSize2Patchable(); 416 417 uint32_t getMaxKernelWorkGroupSize() const; 418 uint32_t getSlmTotalSize() const; getHasIndirectAccess()419 bool getHasIndirectAccess() const { 420 return this->kernelHasIndirectAccess; 421 } 422 getMultiDeviceKernel()423 MultiDeviceKernel *getMultiDeviceKernel() const { return pMultiDeviceKernel; } setMultiDeviceKernel(MultiDeviceKernel * pMultiDeviceKernelToSet)424 void setMultiDeviceKernel(MultiDeviceKernel *pMultiDeviceKernelToSet) { pMultiDeviceKernel = pMultiDeviceKernelToSet; } 425 426 bool areMultipleSubDevicesInContext() const; requiresMemoryMigration()427 bool requiresMemoryMigration() const { return migratableArgsMap.size() > 0; } getMemObjectsToMigrate()428 const std::map<uint32_t, MemObj *> &getMemObjectsToMigrate() const { return migratableArgsMap; } getImplicitArgs()429 ImplicitArgs *getImplicitArgs() const { return pImplicitArgs.get(); } 430 431 protected: 432 struct ObjectCounts { 433 uint32_t imageCount; 434 uint32_t samplerCount; 435 }; 436 437 class ReflectionSurfaceHelper { 438 public: 439 static const uint64_t undefinedOffset = (uint64_t)-1; 440 setKernelDataHeader(void * reflectionSurface,uint32_t numberOfBlocks,uint32_t parentImages,uint32_t parentSamplers,uint32_t imageOffset,uint32_t samplerOffset)441 static void setKernelDataHeader(void *reflectionSurface, uint32_t numberOfBlocks, 442 uint32_t parentImages, uint32_t parentSamplers, 443 uint32_t imageOffset, uint32_t samplerOffset) { 444 IGIL_KernelDataHeader *kernelDataHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface); 445 kernelDataHeader->m_numberOfKernels = numberOfBlocks; 446 kernelDataHeader->m_ParentKernelImageCount = parentImages; 447 kernelDataHeader->m_ParentSamplerCount = parentSamplers; 448 kernelDataHeader->m_ParentImageDataOffset = imageOffset; 449 kernelDataHeader->m_ParentSamplerParamsOffset = samplerOffset; 450 } 451 452 static uint32_t setKernelData(void *reflectionSurface, uint32_t offset, 453 std::vector<IGIL_KernelCurbeParams> &curbeParamsIn, 454 uint64_t tokenMaskIn, size_t maxConstantBufferSize, 455 size_t samplerCount, const KernelInfo &kernelInfo, 456 const HardwareInfo &hwInfo); 457 458 static void setKernelAddressData(void *reflectionSurface, uint32_t offset, 459 uint32_t kernelDataOffset, uint32_t samplerHeapOffset, 460 uint32_t constantBufferOffset, uint32_t samplerParamsOffset, 461 uint32_t sshTokensOffset, uint32_t btOffset, 462 const KernelInfo &kernelInfo, const HardwareInfo &hwInfo); 463 464 static void getCurbeParams(std::vector<IGIL_KernelCurbeParams> &curbeParamsOut, 465 uint64_t &tokenMaskOut, uint32_t &firstSSHTokenIndex, 466 const KernelInfo &kernelInfo, const HardwareInfo &hwInfo); 467 compareFunction(IGIL_KernelCurbeParams argFirst,IGIL_KernelCurbeParams argSecond)468 static bool compareFunction(IGIL_KernelCurbeParams argFirst, IGIL_KernelCurbeParams argSecond) { 469 if (argFirst.m_parameterType == argSecond.m_parameterType) { 470 if (argFirst.m_parameterType == iOpenCL::DATA_PARAMETER_LOCAL_WORK_SIZE) { 471 return argFirst.m_patchOffset < argSecond.m_patchOffset; 472 } else { 473 return argFirst.m_sourceOffset < argSecond.m_sourceOffset; 474 } 475 } else { 476 return argFirst.m_parameterType < argSecond.m_parameterType; 477 } 478 } 479 480 static void setKernelAddressDataBtOffset(void *reflectionSurface, uint32_t blockID, uint32_t btOffset); 481 482 static void setParentImageParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo); 483 static void setParentSamplerParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo); 484 485 template <bool mockable = false> 486 static void patchBlocksCurbe(void *reflectionSurface, uint32_t blockID, 487 uint64_t defaultDeviceQueueCurbeOffset, uint32_t patchSizeDefaultQueue, uint64_t defaultDeviceQueueGpuAddress, 488 uint64_t eventPoolCurbeOffset, uint32_t patchSizeEventPool, uint64_t eventPoolGpuAddress, 489 uint64_t deviceQueueCurbeOffset, uint32_t patchSizeDeviceQueue, uint64_t deviceQueueGpuAddress, 490 uint64_t printfBufferOffset, uint32_t printfBufferSize, uint64_t printfBufferGpuAddress, 491 uint64_t privateSurfaceOffset, uint32_t privateSurfaceSize, uint64_t privateSurfaceGpuAddress); 492 493 static void patchBlocksCurbeWithConstantValues(void *reflectionSurface, uint32_t blockID, 494 uint64_t globalMemoryCurbeOffset, uint32_t globalMemoryPatchSize, uint64_t globalMemoryGpuAddress, 495 uint64_t constantMemoryCurbeOffset, uint32_t constantMemoryPatchSize, uint64_t constantMemoryGpuAddress, 496 uint64_t privateMemoryCurbeOffset, uint32_t privateMemoryPatchSize, uint64_t privateMemoryGpuAddress); 497 }; 498 499 void 500 makeArgsResident(CommandStreamReceiver &commandStreamReceiver); 501 502 void *patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc); 503 504 void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg); 505 506 void getParentObjectCounts(ObjectCounts &objectCount); 507 Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice, bool schedulerKernel = false); 508 void provideInitializationHints(); 509 510 void patchBlocksCurbeWithConstantValues(); 511 512 void markArgPatchedAndResolveArgs(uint32_t argIndex); 513 void resolveArgs(); 514 515 void reconfigureKernel(); 516 bool hasDirectStatelessAccessToSharedBuffer() const; 517 bool hasDirectStatelessAccessToHostMemory() const; 518 bool hasIndirectStatelessAccessToHostMemory() const; 519 520 void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation); 521 bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const; 522 523 const HardwareInfo &getHardwareInfo() const; 524 getDevice()525 const ClDevice &getDevice() const { 526 return clDevice; 527 } 528 cl_int patchPrivateSurface(); 529 530 bool containsStatelessWrites = true; 531 const ExecutionEnvironment &executionEnvironment; 532 Program *program; 533 ClDevice &clDevice; 534 const KernelInfo &kernelInfo; 535 536 std::vector<SimpleKernelArgInfo> kernelArguments; 537 std::vector<KernelArgHandler> kernelArgHandlers; 538 std::vector<GraphicsAllocation *> kernelSvmGfxAllocations; 539 std::vector<GraphicsAllocation *> kernelUnifiedMemoryGfxAllocations; 540 541 AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None; 542 543 GraphicsAllocation *kernelReflectionSurface = nullptr; 544 545 bool usingSharedObjArgs = false; 546 bool usingImages = false; 547 bool usingImagesOnly = false; 548 bool auxTranslationRequired = false; 549 uint32_t patchedArgumentsNum = 0; 550 uint32_t startOffset = 0; 551 uint32_t statelessUncacheableArgsCount = 0; 552 uint32_t threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent; 553 KernelExecutionType executionType = KernelExecutionType::Default; 554 555 std::vector<PatchInfoData> patchInfoDataList; 556 std::unique_ptr<ImageTransformer> imageTransformer; 557 std::map<uint32_t, MemObj *> migratableArgsMap{}; 558 559 bool specialPipelineSelectMode = false; 560 bool svmAllocationsRequireCacheFlush = false; 561 std::vector<GraphicsAllocation *> kernelArgRequiresCacheFlush; 562 UnifiedMemoryControls unifiedMemoryControls{}; 563 bool isUnifiedMemorySyncRequired = true; 564 bool debugEnabled = false; 565 uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::DisableOverdispatch; 566 567 uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation; 568 uint32_t maxKernelWorkGroupSize = 0; 569 uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation; 570 uint32_t *parentEventOffset = &Kernel::dummyPatchLocation; 571 uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation; 572 573 size_t numberOfBindingTableStates = 0u; 574 size_t localBindingTableOffset = 0u; 575 576 std::vector<size_t> slmSizes; 577 uint32_t slmTotalSize = 0u; 578 579 std::unique_ptr<char[]> pSshLocal; 580 uint32_t sshLocalSize = 0u; 581 char *crossThreadData = nullptr; 582 uint32_t crossThreadDataSize = 0u; 583 584 GraphicsAllocation *privateSurface = nullptr; 585 uint64_t privateSurfaceSize = 0u; 586 587 struct KernelConfig { 588 Vec3<size_t> gws; 589 Vec3<size_t> lws; 590 Vec3<size_t> offsets; 591 bool operator==(const KernelConfig &other) const { return this->gws == other.gws && this->lws == other.lws && this->offsets == other.offsets; } 592 }; 593 struct KernelConfigHash { operatorKernelConfigHash594 size_t operator()(KernelConfig const &config) const { 595 auto hash = std::hash<size_t>{}; 596 size_t gwsHashX = hash(config.gws.x); 597 size_t gwsHashY = hash(config.gws.y); 598 size_t gwsHashZ = hash(config.gws.z); 599 size_t gwsHash = hashCombine(gwsHashX, gwsHashY, gwsHashZ); 600 size_t lwsHashX = hash(config.lws.x); 601 size_t lwsHashY = hash(config.lws.y); 602 size_t lwsHashZ = hash(config.lws.z); 603 size_t lwsHash = hashCombine(lwsHashX, lwsHashY, lwsHashZ); 604 size_t offsetsHashX = hash(config.offsets.x); 605 size_t offsetsHashY = hash(config.offsets.y); 606 size_t offsetsHashZ = hash(config.offsets.z); 607 size_t offsetsHash = hashCombine(offsetsHashX, offsetsHashY, offsetsHashZ); 608 return hashCombine(gwsHash, lwsHash, offsetsHash); 609 } 610 hashCombineKernelConfigHash611 size_t hashCombine(size_t hash1, size_t hash2, size_t hash3) const { 612 return (hash1 ^ (hash2 << 1u)) ^ (hash3 << 2u); 613 } 614 }; 615 struct KernelSubmissionData { 616 std::unique_ptr<TimestampPacketContainer> kernelStandardTimestamps; 617 std::unique_ptr<TimestampPacketContainer> kernelSubdeviceTimestamps; 618 TunningStatus status; 619 bool singleSubdevicePreferred = false; 620 }; 621 622 bool hasTunningFinished(KernelSubmissionData &submissionData); 623 bool hasRunFinished(TimestampPacketContainer *timestampContainer); 624 625 std::unordered_map<KernelConfig, KernelSubmissionData, KernelConfigHash> kernelSubmissionMap; 626 bool singleSubdevicePreferredInCurrentEnqueue = false; 627 628 bool kernelHasIndirectAccess = true; 629 MultiDeviceKernel *pMultiDeviceKernel = nullptr; 630 std::unique_ptr<ImplicitArgs> pImplicitArgs = nullptr; 631 }; 632 633 } // namespace NEO 634