/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_stream/command_stream_receiver_hw.h"
#include "shared/source/command_stream/csr_properties_flags.h"
#include "shared/source/command_stream/thread_arbitration_policy.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/address_patch.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_execution_type.h"
#include "shared/source/program/kernel_info.h"
#include "shared/source/unified_memory/unified_memory.h"
#include "shared/source/utilities/stackvec.h"

#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/api/cl_types.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/device_queue/device_queue.h"
#include "opencl/source/helpers/base_object.h"
#include "opencl/source/helpers/properties_helper.h"
#include "opencl/source/kernel/kernel_objects_for_aux_translation.h"
#include "opencl/source/program/program.h"

#include <array>
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>

namespace NEO {
struct CompletionStamp;
class Buffer;
class CommandStreamReceiver;
class GraphicsAllocation;
class ImageTransformer;
class Surface;
class PrintfHandler;
class MultiDeviceKernel;

class Kernel : public ReferenceTrackedObject<Kernel> {
  public:
    static const uint32_t kernelBinaryAlignment = 64;

    enum kernelArgType {
        NONE_OBJ,
        IMAGE_OBJ,
        BUFFER_OBJ,
        PIPE_OBJ,
        SVM_OBJ,
        SVM_ALLOC_OBJ,
        SAMPLER_OBJ,
        ACCELERATOR_OBJ,
        DEVICE_QUEUE_OBJ,
        SLM_OBJ
    };

    struct SimpleKernelArgInfo {
        kernelArgType type;
        void *object;
        const void *value;
        size_t size;
        GraphicsAllocation *pSvmAlloc;
        cl_mem_flags svmFlags;
        bool isPatched = false;
        bool isStatelessUncacheable = false;
    };

    enum class TunningStatus {
        STANDARD_TUNNING_IN_PROGRESS,
        SUBDEVICE_TUNNING_IN_PROGRESS,
        TUNNING_DONE
    };

    enum class TunningType {
        DISABLED,
        SIMPLE,
        FULL
    };

    typedef int32_t (Kernel::*KernelArgHandler)(uint32_t argIndex,
                                                size_t argSize,
                                                const void *argVal);

    template <typename kernel_t = Kernel, typename program_t = Program>
    static kernel_t *create(program_t *program, const KernelInfo &kernelInfo, ClDevice &clDevice, cl_int *errcodeRet) {
        cl_int retVal;
        kernel_t *pKernel = nullptr;

        pKernel = new kernel_t(program, kernelInfo, clDevice);
        retVal = pKernel->initialize();

        if (retVal != CL_SUCCESS) {
            delete pKernel;
            pKernel = nullptr;
        }

        if (errcodeRet) {
            *errcodeRet = retVal;
        }

        if (FileLoggerInstance().enabled()) {
            std::string source;
            program->getSource(source);
            FileLoggerInstance().dumpKernel(kernelInfo.kernelDescriptor.kernelMetadata.kernelName, source);
        }

        return pKernel;
    }
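
    // A minimal usage sketch of the factory above (illustrative only;
    // pProgram, kernelInfo and device are assumed to be supplied by the
    // caller):
    //
    //   cl_int retVal = CL_SUCCESS;
    //   auto *pKernel = Kernel::create(pProgram, kernelInfo, device, &retVal);
    //   // On failure, create() returns nullptr and retVal holds the error code.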

    Kernel &operator=(const Kernel &) = delete;
    Kernel(const Kernel &) = delete;

    virtual ~Kernel();

    static bool isMemObj(kernelArgType kernelArg) {
        return kernelArg == BUFFER_OBJ || kernelArg == IMAGE_OBJ || kernelArg == PIPE_OBJ;
    }

    bool isAuxTranslationRequired() const { return auxTranslationRequired; }
    void setAuxTranslationRequired(bool onOff) { auxTranslationRequired = onOff; }
    void updateAuxTranslationRequired();

    ArrayRef<uint8_t> getCrossThreadDataRef() {
        return ArrayRef<uint8_t>(reinterpret_cast<uint8_t *>(crossThreadData), crossThreadDataSize);
    }

    char *getCrossThreadData() const {
        return crossThreadData;
    }

    uint32_t getCrossThreadDataSize() const {
        return crossThreadDataSize;
    }

    cl_int initialize();

    MOCKABLE_VIRTUAL cl_int cloneKernel(Kernel *pSourceKernel);

    MOCKABLE_VIRTUAL bool canTransformImages() const;
    MOCKABLE_VIRTUAL bool isPatched() const;

    // API entry points
    cl_int setArgument(uint32_t argIndex, size_t argSize, const void *argVal) { return setArg(argIndex, argSize, argVal); }
    cl_int setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags);
    cl_int setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc);

    void setSvmKernelExecInfo(GraphicsAllocation *argValue);
    void clearSvmKernelExecInfo();
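
    // Sketch of the SVM argument flow (illustrative; svmPtr/svmAlloc are
    // assumed to describe an SVM allocation owned by the caller):
    //
    //   kernel->setArgSvmAlloc(argIndex, svmPtr, svmAlloc); // pointer arg backed by a GraphicsAllocation
    //   kernel->setSvmKernelExecInfo(extraAlloc);           // additional allocation to keep resident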

    cl_int getInfo(cl_kernel_info paramName, size_t paramValueSize,
                   void *paramValue, size_t *paramValueSizeRet) const;
    void getAdditionalInfo(cl_kernel_info paramName, const void *&paramValue, size_t &paramValueSizeRet) const;
    void getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *&paramValue, size_t &paramValueSizeRet) const;

    cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName,
                      size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;

    cl_int getWorkGroupInfo(cl_kernel_work_group_info paramName,
                            size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;

    cl_int getSubGroupInfo(cl_kernel_sub_group_info paramName,
                           size_t inputValueSize, const void *inputValue,
                           size_t paramValueSize, void *paramValue,
                           size_t *paramValueSizeRet) const;

    const void *getKernelHeap() const;
    void *getSurfaceStateHeap() const;
    const void *getDynamicStateHeap() const;

    size_t getKernelHeapSize() const;
    size_t getSurfaceStateHeapSize() const;
    size_t getDynamicStateHeapSize() const;
    size_t getNumberOfBindingTableStates() const;
    size_t getBindingTableOffset() const {
        return localBindingTableOffset;
    }

    void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);

    void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
    bool isKernelHeapSubstituted() const;
    uint64_t getKernelId() const;
    void setKernelId(uint64_t newKernelId);
    uint32_t getStartOffset() const;
    void setStartOffset(uint32_t offset);

    const std::vector<SimpleKernelArgInfo> &getKernelArguments() const {
        return kernelArguments;
    }

    size_t getKernelArgsNumber() const {
        return kernelArguments.size();
    }

    bool usesBindfulAddressingForBuffers() const {
        return KernelDescriptor::BindfulAndStateless == kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode;
    }

    inline const KernelDescriptor &getDescriptor() const {
        return kernelInfo.kernelDescriptor;
    }
    inline const KernelInfo &getKernelInfo() const {
        return kernelInfo;
    }

    Context &getContext() const {
        return program->getContext();
    }

    Program *getProgram() const { return program; }

    uint32_t getScratchSize() {
        return kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
    }

    uint32_t getPrivateScratchSize() {
        return kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[1];
    }

    void createReflectionSurface();
    template <bool mockable = false>
    void patchReflectionSurface(DeviceQueue *devQueue, PrintfHandler *printfHandler);

    void patchDefaultDeviceQueue(DeviceQueue *devQueue);
    void patchEventPool(DeviceQueue *devQueue);
    void patchBlocksSimdSize();
    bool usesSyncBuffer() const;
    void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset);
    void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless);

    GraphicsAllocation *getKernelReflectionSurface() const {
        return kernelReflectionSurface;
    }

    size_t getInstructionHeapSizeForExecutionModel() const;

    // Helpers
    cl_int setArg(uint32_t argIndex, uint32_t argValue);
    cl_int setArg(uint32_t argIndex, uint64_t argValue);
    cl_int setArg(uint32_t argIndex, cl_mem argValue);
    cl_int setArg(uint32_t argIndex, cl_mem argValue, uint32_t mipLevel);
    cl_int setArg(uint32_t argIndex, size_t argSize, const void *argVal);
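
    // The untyped setArg(argIndex, argSize, argVal) overload dispatches to the
    // per-argument handler that initialize() registers via setKernelArgHandler()
    // (one of the typed setArg* methods below), conceptually:
    //
    //   return (this->*kernelArgHandlers[argIndex])(argIndex, argSize, argVal);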

    // Handlers
    void setKernelArgHandler(uint32_t argIndex, KernelArgHandler handler);

    void unsetArg(uint32_t argIndex);

    cl_int setArgImmediate(uint32_t argIndex,
                           size_t argSize,
                           const void *argVal);

    cl_int setArgBuffer(uint32_t argIndex,
                        size_t argSize,
                        const void *argVal);

    cl_int setArgPipe(uint32_t argIndex,
                      size_t argSize,
                      const void *argVal);

    cl_int setArgImage(uint32_t argIndex,
                       size_t argSize,
                       const void *argVal);

    cl_int setArgImageWithMipLevel(uint32_t argIndex,
                                   size_t argSize,
                                   const void *argVal, uint32_t mipLevel);

    cl_int setArgLocal(uint32_t argIndex,
                       size_t argSize,
                       const void *argVal);

    cl_int setArgSampler(uint32_t argIndex,
                         size_t argSize,
                         const void *argVal);

    cl_int setArgAccelerator(uint32_t argIndex,
                             size_t argSize,
                             const void *argVal);

    cl_int setArgDevQueue(uint32_t argIndex,
                          size_t argSize,
                          const void *argVal);

    void storeKernelArg(uint32_t argIndex,
                        kernelArgType argType,
                        void *argObject,
                        const void *argValue,
                        size_t argSize,
                        GraphicsAllocation *argSvmAlloc = nullptr,
                        cl_mem_flags argSvmFlags = 0);
    const void *getKernelArg(uint32_t argIndex) const;
    const SimpleKernelArgInfo &getKernelArgInfo(uint32_t argIndex) const;
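
    // storeKernelArg() records the SimpleKernelArgInfo entry that
    // getKernelArg()/getKernelArgInfo() later return; e.g. a successful
    // setArgBuffer() is expected to store the argument as BUFFER_OBJ together
    // with its cl_mem object and size.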

    bool getAllowNonUniform() const { return program->getAllowNonUniform(); }
    bool isVmeKernel() const { return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme; }
    bool requiresSpecialPipelineSelectMode() const { return specialPipelineSelectMode; }

    void performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer);
    MOCKABLE_VIRTUAL bool isSingleSubdevicePreferred() const;

    // Residency for kernel surfaces
    MOCKABLE_VIRTUAL void makeResident(CommandStreamReceiver &commandStreamReceiver);
    MOCKABLE_VIRTUAL void getResidency(std::vector<Surface *> &dst);
    bool requiresCoherency();
    void resetSharedObjectsPatchAddresses();
    bool isUsingSharedObjArgs() const { return usingSharedObjArgs; }
    bool hasUncacheableStatelessArgs() const { return statelessUncacheableArgsCount > 0; }

    bool hasPrintfOutput() const;

    void setReflectionSurfaceBlockBtOffset(uint32_t blockID, uint32_t offset);

    cl_int checkCorrectImageAccessQualifier(cl_uint argIndex,
                                            size_t argSize,
                                            const void *argValue) const;

    static uint32_t dummyPatchLocation;

    uint32_t allBufferArgsStateful = CL_TRUE;

    bool isBuiltIn = false;
    const bool isParentKernel;
    const bool isSchedulerKernel;

    uint32_t getThreadArbitrationPolicy() const {
        return threadArbitrationPolicy;
    }
    KernelExecutionType getExecutionType() const {
        return executionType;
    }

    bool checkIfIsParentKernelAndBlocksUsesPrintf();

    bool is32Bit() const {
        return kernelInfo.kernelDescriptor.kernelAttributes.gpuPointerSize == 4;
    }

    size_t getPerThreadSystemThreadSurfaceSize() const {
        return kernelInfo.kernelDescriptor.kernelAttributes.perThreadSystemThreadSurfaceSize;
    }

    std::vector<PatchInfoData> &getPatchInfoDataList() { return patchInfoDataList; }
    bool usesImages() const {
        return usingImages;
    }
    bool usesOnlyImages() const {
        return usingImagesOnly;
    }

    void fillWithKernelObjsForAuxTranslation(KernelObjsForAuxTranslation &kernelObjsForAuxTranslation);

    MOCKABLE_VIRTUAL bool requiresCacheFlushCommand(const CommandQueue &commandQueue) const;

    using CacheFlushAllocationsVec = StackVec<GraphicsAllocation *, 32>;
    void getAllocationsForCacheFlush(CacheFlushAllocationsVec &out) const;

    void setAuxTranslationDirection(AuxTranslationDirection auxTranslationDirection) {
        this->auxTranslationDirection = auxTranslationDirection;
    }
    void setUnifiedMemorySyncRequirement(bool isUnifiedMemorySyncRequired) {
        this->isUnifiedMemorySyncRequired = isUnifiedMemorySyncRequired;
    }
    void setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue);
    void setUnifiedMemoryExecInfo(GraphicsAllocation *argValue);
    void clearUnifiedMemoryExecInfo();

    bool areStatelessWritesUsed() { return containsStatelessWrites; }
    int setKernelThreadArbitrationPolicy(uint32_t propertyValue);
    cl_int setKernelExecutionType(cl_execution_info_kernel_type_intel executionType);
    void setThreadArbitrationPolicy(uint32_t policy) {
        this->threadArbitrationPolicy = policy;
    }
    void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
                                   size_t *localWorkSize);
    uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const;

    uint64_t getKernelStartOffset(
        const bool localIdsGenerationByRuntime,
        const bool kernelUsesLocalIds,
        const bool isCssUsed) const;

    bool requiresLimitedWorkgroupSize() const;
    bool isKernelDebugEnabled() const { return debugEnabled; }
    int32_t setAdditionalKernelExecInfoWithParam(uint32_t paramName, size_t paramValueSize, const void *paramValue);
    void setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo);
    uint32_t getAdditionalKernelExecInfo() const;
    MOCKABLE_VIRTUAL bool requiresWaDisableRccRhwoOptimization() const;

    // Dispatch traits
    void setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
    void setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
    void setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
    void setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
    void setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
    void setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
    void setWorkDim(uint32_t workDim);

    const uint32_t *getDispatchTrait(const CrossThreadDataOffset offset) const {
        return isValidOffset(offset) ? reinterpret_cast<uint32_t *>(getCrossThreadData() + offset)
                                     : &Kernel::dummyPatchLocation;
    }
    const uint32_t *getWorkDim() const { return getDispatchTrait(getDescriptor().payloadMappings.dispatchTraits.workDim); }
    std::array<const uint32_t *, 3> getDispatchTraitArray(const CrossThreadDataOffset dispatchTrait[3]) const { return {getDispatchTrait(dispatchTrait[0]), getDispatchTrait(dispatchTrait[1]), getDispatchTrait(dispatchTrait[2])}; }
    std::array<const uint32_t *, 3> getGlobalWorkOffsetValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset); }
    std::array<const uint32_t *, 3> getLocalWorkSizeValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.localWorkSize); }
    std::array<const uint32_t *, 3> getLocalWorkSize2Values() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.localWorkSize2); }
    std::array<const uint32_t *, 3> getEnqueuedLocalWorkSizeValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.enqueuedLocalWorkSize); }
    std::array<const uint32_t *, 3> getNumWorkGroupsValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.numWorkGroups); }
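
    // Each getter above resolves a CrossThreadDataOffset from the kernel
    // descriptor into a pointer inside crossThreadData; unpatched traits fall
    // back to &dummyPatchLocation. Illustrative round trip:
    //
    //   kernel->setWorkDim(3);
    //   uint32_t dims = *kernel->getWorkDim(); // 3, provided the workDim trait is patchable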

    bool isLocalWorkSize2Patchable();

    uint32_t getMaxKernelWorkGroupSize() const;
    uint32_t getSlmTotalSize() const;
    bool getHasIndirectAccess() const {
        return this->kernelHasIndirectAccess;
    }

    MultiDeviceKernel *getMultiDeviceKernel() const { return pMultiDeviceKernel; }
    void setMultiDeviceKernel(MultiDeviceKernel *pMultiDeviceKernelToSet) { pMultiDeviceKernel = pMultiDeviceKernelToSet; }

    bool areMultipleSubDevicesInContext() const;
    bool requiresMemoryMigration() const { return migratableArgsMap.size() > 0; }
    const std::map<uint32_t, MemObj *> &getMemObjectsToMigrate() const { return migratableArgsMap; }
    ImplicitArgs *getImplicitArgs() const { return pImplicitArgs.get(); }

  protected:
    struct ObjectCounts {
        uint32_t imageCount;
        uint32_t samplerCount;
    };

    class ReflectionSurfaceHelper {
      public:
        static const uint64_t undefinedOffset = (uint64_t)-1;

        static void setKernelDataHeader(void *reflectionSurface, uint32_t numberOfBlocks,
                                        uint32_t parentImages, uint32_t parentSamplers,
                                        uint32_t imageOffset, uint32_t samplerOffset) {
            IGIL_KernelDataHeader *kernelDataHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
            kernelDataHeader->m_numberOfKernels = numberOfBlocks;
            kernelDataHeader->m_ParentKernelImageCount = parentImages;
            kernelDataHeader->m_ParentSamplerCount = parentSamplers;
            kernelDataHeader->m_ParentImageDataOffset = imageOffset;
            kernelDataHeader->m_ParentSamplerParamsOffset = samplerOffset;
        }

        static uint32_t setKernelData(void *reflectionSurface, uint32_t offset,
                                      std::vector<IGIL_KernelCurbeParams> &curbeParamsIn,
                                      uint64_t tokenMaskIn, size_t maxConstantBufferSize,
                                      size_t samplerCount, const KernelInfo &kernelInfo,
                                      const HardwareInfo &hwInfo);

        static void setKernelAddressData(void *reflectionSurface, uint32_t offset,
                                         uint32_t kernelDataOffset, uint32_t samplerHeapOffset,
                                         uint32_t constantBufferOffset, uint32_t samplerParamsOffset,
                                         uint32_t sshTokensOffset, uint32_t btOffset,
                                         const KernelInfo &kernelInfo, const HardwareInfo &hwInfo);

        static void getCurbeParams(std::vector<IGIL_KernelCurbeParams> &curbeParamsOut,
                                   uint64_t &tokenMaskOut, uint32_t &firstSSHTokenIndex,
                                   const KernelInfo &kernelInfo, const HardwareInfo &hwInfo);

        static bool compareFunction(IGIL_KernelCurbeParams argFirst, IGIL_KernelCurbeParams argSecond) {
            if (argFirst.m_parameterType == argSecond.m_parameterType) {
                if (argFirst.m_parameterType == iOpenCL::DATA_PARAMETER_LOCAL_WORK_SIZE) {
                    return argFirst.m_patchOffset < argSecond.m_patchOffset;
                } else {
                    return argFirst.m_sourceOffset < argSecond.m_sourceOffset;
                }
            } else {
                return argFirst.m_parameterType < argSecond.m_parameterType;
            }
        }
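
        // compareFunction() orders CURBE tokens when the reflection surface is
        // built; a typical use is as a sort predicate (illustrative):
        //
        //   std::sort(curbeParams.begin(), curbeParams.end(),
        //             ReflectionSurfaceHelper::compareFunction);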

        static void setKernelAddressDataBtOffset(void *reflectionSurface, uint32_t blockID, uint32_t btOffset);

        static void setParentImageParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo);
        static void setParentSamplerParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo);

        template <bool mockable = false>
        static void patchBlocksCurbe(void *reflectionSurface, uint32_t blockID,
                                     uint64_t defaultDeviceQueueCurbeOffset, uint32_t patchSizeDefaultQueue, uint64_t defaultDeviceQueueGpuAddress,
                                     uint64_t eventPoolCurbeOffset, uint32_t patchSizeEventPool, uint64_t eventPoolGpuAddress,
                                     uint64_t deviceQueueCurbeOffset, uint32_t patchSizeDeviceQueue, uint64_t deviceQueueGpuAddress,
                                     uint64_t printfBufferOffset, uint32_t printfBufferSize, uint64_t printfBufferGpuAddress,
                                     uint64_t privateSurfaceOffset, uint32_t privateSurfaceSize, uint64_t privateSurfaceGpuAddress);

        static void patchBlocksCurbeWithConstantValues(void *reflectionSurface, uint32_t blockID,
                                                       uint64_t globalMemoryCurbeOffset, uint32_t globalMemoryPatchSize, uint64_t globalMemoryGpuAddress,
                                                       uint64_t constantMemoryCurbeOffset, uint32_t constantMemoryPatchSize, uint64_t constantMemoryGpuAddress,
                                                       uint64_t privateMemoryCurbeOffset, uint32_t privateMemoryPatchSize, uint64_t privateMemoryGpuAddress);
    };

    void makeArgsResident(CommandStreamReceiver &commandStreamReceiver);

    void *patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc);

    void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg);

    void getParentObjectCounts(ObjectCounts &objectCount);
    Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice, bool schedulerKernel = false);
    void provideInitializationHints();

    void patchBlocksCurbeWithConstantValues();

    void markArgPatchedAndResolveArgs(uint32_t argIndex);
    void resolveArgs();

    void reconfigureKernel();
    bool hasDirectStatelessAccessToSharedBuffer() const;
    bool hasDirectStatelessAccessToHostMemory() const;
    bool hasIndirectStatelessAccessToHostMemory() const;

    void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
    bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const;

    const HardwareInfo &getHardwareInfo() const;

    const ClDevice &getDevice() const {
        return clDevice;
    }
    cl_int patchPrivateSurface();

    bool containsStatelessWrites = true;
    const ExecutionEnvironment &executionEnvironment;
    Program *program;
    ClDevice &clDevice;
    const KernelInfo &kernelInfo;

    std::vector<SimpleKernelArgInfo> kernelArguments;
    std::vector<KernelArgHandler> kernelArgHandlers;
    std::vector<GraphicsAllocation *> kernelSvmGfxAllocations;
    std::vector<GraphicsAllocation *> kernelUnifiedMemoryGfxAllocations;

    AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;

    GraphicsAllocation *kernelReflectionSurface = nullptr;

    bool usingSharedObjArgs = false;
    bool usingImages = false;
    bool usingImagesOnly = false;
    bool auxTranslationRequired = false;
    uint32_t patchedArgumentsNum = 0;
    uint32_t startOffset = 0;
    uint32_t statelessUncacheableArgsCount = 0;
    uint32_t threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
    KernelExecutionType executionType = KernelExecutionType::Default;

    std::vector<PatchInfoData> patchInfoDataList;
    std::unique_ptr<ImageTransformer> imageTransformer;
    std::map<uint32_t, MemObj *> migratableArgsMap{};

    bool specialPipelineSelectMode = false;
    bool svmAllocationsRequireCacheFlush = false;
    std::vector<GraphicsAllocation *> kernelArgRequiresCacheFlush;
    UnifiedMemoryControls unifiedMemoryControls{};
    bool isUnifiedMemorySyncRequired = true;
    bool debugEnabled = false;
    uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::DisableOverdispatch;

    uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
    uint32_t maxKernelWorkGroupSize = 0;
    uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
    uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
    uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;

    size_t numberOfBindingTableStates = 0u;
    size_t localBindingTableOffset = 0u;

    std::vector<size_t> slmSizes;
    uint32_t slmTotalSize = 0u;

    std::unique_ptr<char[]> pSshLocal;
    uint32_t sshLocalSize = 0u;
    char *crossThreadData = nullptr;
    uint32_t crossThreadDataSize = 0u;

    GraphicsAllocation *privateSurface = nullptr;
    uint64_t privateSurfaceSize = 0u;

    struct KernelConfig {
        Vec3<size_t> gws;
        Vec3<size_t> lws;
        Vec3<size_t> offsets;
        bool operator==(const KernelConfig &other) const { return this->gws == other.gws && this->lws == other.lws && this->offsets == other.offsets; }
    };
    struct KernelConfigHash {
        size_t operator()(KernelConfig const &config) const {
            auto hash = std::hash<size_t>{};
            size_t gwsHashX = hash(config.gws.x);
            size_t gwsHashY = hash(config.gws.y);
            size_t gwsHashZ = hash(config.gws.z);
            size_t gwsHash = hashCombine(gwsHashX, gwsHashY, gwsHashZ);
            size_t lwsHashX = hash(config.lws.x);
            size_t lwsHashY = hash(config.lws.y);
            size_t lwsHashZ = hash(config.lws.z);
            size_t lwsHash = hashCombine(lwsHashX, lwsHashY, lwsHashZ);
            size_t offsetsHashX = hash(config.offsets.x);
            size_t offsetsHashY = hash(config.offsets.y);
            size_t offsetsHashZ = hash(config.offsets.z);
            size_t offsetsHash = hashCombine(offsetsHashX, offsetsHashY, offsetsHashZ);
            return hashCombine(gwsHash, lwsHash, offsetsHash);
        }

        size_t hashCombine(size_t hash1, size_t hash2, size_t hash3) const {
            return (hash1 ^ (hash2 << 1u)) ^ (hash3 << 2u);
        }
    };
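    // KernelConfig and KernelConfigHash key the kernelSubmissionMap declared
    // below, which caches per-configuration tuning state so that
    // performKernelTuning() can recognize a gws/lws/offsets combination it has
    // already measured. hashCombine() mixes three hashes by shift-and-XOR:
    //
    //   combined = (h1 ^ (h2 << 1)) ^ (h3 << 2)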
    struct KernelSubmissionData {
        std::unique_ptr<TimestampPacketContainer> kernelStandardTimestamps;
        std::unique_ptr<TimestampPacketContainer> kernelSubdeviceTimestamps;
        TunningStatus status;
        bool singleSubdevicePreferred = false;
    };

    bool hasTunningFinished(KernelSubmissionData &submissionData);
    bool hasRunFinished(TimestampPacketContainer *timestampContainer);

    std::unordered_map<KernelConfig, KernelSubmissionData, KernelConfigHash> kernelSubmissionMap;
    bool singleSubdevicePreferredInCurrentEnqueue = false;

    bool kernelHasIndirectAccess = true;
    MultiDeviceKernel *pMultiDeviceKernel = nullptr;
    std::unique_ptr<ImplicitArgs> pImplicitArgs = nullptr;
};

} // namespace NEO