1 /*
2  * Copyright (C) 2018-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #include "opencl/source/kernel/kernel.h"
9 
10 #include "shared/source/built_ins/built_ins.h"
11 #include "shared/source/command_stream/command_stream_receiver.h"
12 #include "shared/source/debug_settings/debug_settings_manager.h"
13 #include "shared/source/device_binary_format/patchtokens_decoder.h"
14 #include "shared/source/gmm_helper/gmm_helper.h"
15 #include "shared/source/helpers/aligned_memory.h"
16 #include "shared/source/helpers/api_specific_config.h"
17 #include "shared/source/helpers/basic_math.h"
18 #include "shared/source/helpers/debug_helpers.h"
19 #include "shared/source/helpers/get_info.h"
20 #include "shared/source/helpers/hw_helper.h"
21 #include "shared/source/helpers/kernel_helpers.h"
22 #include "shared/source/helpers/per_thread_data.h"
23 #include "shared/source/helpers/ptr_math.h"
24 #include "shared/source/helpers/surface_format_info.h"
25 #include "shared/source/kernel/kernel_arg_descriptor_extended_device_side_enqueue.h"
26 #include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h"
27 #include "shared/source/memory_manager/memory_manager.h"
28 #include "shared/source/memory_manager/unified_memory_manager.h"
29 #include "shared/source/os_interface/hw_info_config.h"
30 #include "shared/source/program/kernel_info.h"
31 
32 #include "opencl/source/accelerators/intel_accelerator.h"
33 #include "opencl/source/accelerators/intel_motion_estimation.h"
34 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
35 #include "opencl/source/cl_device/cl_device.h"
36 #include "opencl/source/command_queue/cl_local_work_size.h"
37 #include "opencl/source/command_queue/command_queue.h"
38 #include "opencl/source/context/context.h"
39 #include "opencl/source/device_queue/device_queue.h"
40 #include "opencl/source/execution_model/device_enqueue.h"
41 #include "opencl/source/gtpin/gtpin_notify.h"
42 #include "opencl/source/helpers/cl_hw_helper.h"
43 #include "opencl/source/helpers/dispatch_info.h"
44 #include "opencl/source/helpers/get_info_status_mapper.h"
45 #include "opencl/source/helpers/sampler_helpers.h"
46 #include "opencl/source/kernel/image_transformer.h"
47 #include "opencl/source/kernel/kernel.inl"
48 #include "opencl/source/kernel/kernel_info_cl.h"
49 #include "opencl/source/mem_obj/buffer.h"
50 #include "opencl/source/mem_obj/image.h"
51 #include "opencl/source/mem_obj/pipe.h"
52 #include "opencl/source/memory_manager/mem_obj_surface.h"
53 #include "opencl/source/platform/platform.h"
54 #include "opencl/source/program/block_kernel_manager.h"
55 #include "opencl/source/sampler/sampler.h"
56 
57 #include "patch_list.h"
58 
59 #include <algorithm>
60 #include <cstdint>
61 #include <vector>
62 
63 using namespace iOpenCL;
64 
65 namespace NEO {
66 class Surface;
67 
68 uint32_t Kernel::dummyPatchLocation = 0xbaddf00d;
69 
Kernel(Program * programArg,const KernelInfo & kernelInfoArg,ClDevice & clDeviceArg,bool schedulerKernel)70 Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &clDeviceArg, bool schedulerKernel)
71     : isParentKernel(kernelInfoArg.kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue),
72       isSchedulerKernel(schedulerKernel),
73       executionEnvironment(programArg->getExecutionEnvironment()),
74       program(programArg),
75       clDevice(clDeviceArg),
76       kernelInfo(kernelInfoArg) {
77     program->retain();
78     program->retainForKernel();
79     imageTransformer.reset(new ImageTransformer);
80     auto &deviceInfo = getDevice().getDevice().getDeviceInfo();
81     if (kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize == 1u) {
82         auto &hwInfoConfig = *HwInfoConfig::get(getHardwareInfo().platform.eProductFamily);
83         maxKernelWorkGroupSize = hwInfoConfig.getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
84     } else {
85         maxKernelWorkGroupSize = static_cast<uint32_t>(deviceInfo.maxWorkGroupSize);
86     }
87     slmTotalSize = kernelInfoArg.kernelDescriptor.kernelAttributes.slmInlineSize;
88 }
89 
~Kernel()90 Kernel::~Kernel() {
91     delete[] crossThreadData;
92     crossThreadData = nullptr;
93     crossThreadDataSize = 0;
94 
95     if (privateSurface) {
96         program->peekExecutionEnvironment().memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(privateSurface);
97         privateSurface = nullptr;
98     }
99 
100     if (kernelReflectionSurface) {
101         program->peekExecutionEnvironment().memoryManager->freeGraphicsMemory(kernelReflectionSurface);
102         kernelReflectionSurface = nullptr;
103     }
104 
105     for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
106         if (SAMPLER_OBJ == getKernelArguments()[i].type) {
107             auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
108             if (sampler) {
109                 sampler->decRefInternal();
110             }
111         }
112     }
113 
114     kernelArgHandlers.clear();
115     program->releaseForKernel();
116     program->release();
117 }
118 // If dstOffsetBytes is not an invalid offset, then patches dst at dstOffsetBytes
119 // with src casted to DstT type.
120 template <typename DstT, typename SrcT>
patch(const SrcT & src,void * dst,CrossThreadDataOffset dstOffsetBytes)121 inline void patch(const SrcT &src, void *dst, CrossThreadDataOffset dstOffsetBytes) {
122     if (isValidOffset(dstOffsetBytes)) {
123         DstT *patchLocation = reinterpret_cast<DstT *>(ptrOffset(dst, dstOffsetBytes));
124         *patchLocation = static_cast<DstT>(src);
125     }
126 }
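// Example use, mirroring the bufferOffset patching in patchBufferOffset() below:
//   patch<uint32_t, uint32_t>(offsetToPatch, getCrossThreadData(), argAsPtr.bufferOffset);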

void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg) {
    if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) {
        auto pp = ptrOffset(crossThreadData, arg.stateless);
        uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
        patchWithRequiredSize(pp, arg.pointerSize, addressToPatch);
        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
            PatchInfoData patchInfoData(addressToPatch, 0u, PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData), arg.stateless, PatchInfoAllocationType::IndirectObjectHeap, arg.pointerSize);
            this->patchInfoDataList.push_back(patchInfoData);
        }
    }

    void *ssh = getSurfaceStateHeap();
    if ((nullptr != ssh) && isValidOffset(arg.bindful)) {
        auto surfaceState = ptrOffset(ssh, arg.bindful);
        void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
        size_t sizeToPatch = allocation.getUnderlyingBufferSize();
        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }
}

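// One-time setup performed after kernel creation: validates the SIMD size, allocates
// and pre-fills cross-thread data and the local surface state heap copy, patches the
// implicit surfaces (private/constant/global), and assigns an argument handler
// (setArgBuffer/setArgImage/setArgSampler/...) for every explicit argument.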
cl_int Kernel::initialize() {
    this->kernelHasIndirectAccess = false;
    auto pClDevice = &getDevice();
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
    reconfigureKernel();
    auto &hwInfo = pClDevice->getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    const auto &implicitArgs = kernelDescriptor.payloadMappings.implicitArgs;
    const auto &explicitArgs = kernelDescriptor.payloadMappings.explicitArgs;
    auto maxSimdSize = kernelInfo.getMaxSimdSize();
    const auto &heapInfo = kernelInfo.heapInfo;

    if (maxSimdSize != 1 && maxSimdSize < hwHelper.getMinimalSIMDSize()) {
        return CL_INVALID_KERNEL;
    }

    if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
        pImplicitArgs = std::make_unique<ImplicitArgs>();
        *pImplicitArgs = {};
        pImplicitArgs->structSize = sizeof(ImplicitArgs);
        pImplicitArgs->structVersion = 0;
        pImplicitArgs->simdWidth = maxSimdSize;
    }

    crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;

    // now allocate our own cross-thread data, if necessary
    if (crossThreadDataSize) {
        crossThreadData = new char[crossThreadDataSize];

        if (kernelInfo.crossThreadData) {
            memcpy_s(crossThreadData, crossThreadDataSize,
                     kernelInfo.crossThreadData, crossThreadDataSize);
        } else {
            memset(crossThreadData, 0x00, crossThreadDataSize);
        }

        auto crossThread = reinterpret_cast<uint32_t *>(crossThreadData);
        auto setArgsIfValidOffset = [&](uint32_t *&crossThreadData, NEO::CrossThreadDataOffset offset, uint32_t value) {
            if (isValidOffset(offset)) {
                crossThreadData = ptrOffset(crossThread, offset);
                *crossThreadData = value;
            }
        };
        setArgsIfValidOffset(maxWorkGroupSizeForCrossThreadData, implicitArgs.maxWorkGroupSize, maxKernelWorkGroupSize);
        setArgsIfValidOffset(dataParameterSimdSize, implicitArgs.simdSize, maxSimdSize);
        setArgsIfValidOffset(preferredWkgMultipleOffset, implicitArgs.preferredWkgMultiple, maxSimdSize);
        setArgsIfValidOffset(parentEventOffset, implicitArgs.deviceSideEnqueueParentEvent, undefined<uint32_t>);
    }

    // allocate our own SSH, if necessary
    sshLocalSize = heapInfo.SurfaceStateHeapSize;
    if (sshLocalSize) {
        pSshLocal = std::make_unique<char[]>(sshLocalSize);

        // copy the ssh into our local copy
        memcpy_s(pSshLocal.get(), sshLocalSize,
                 heapInfo.pSsh, heapInfo.SurfaceStateHeapSize);
    }
    numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries;
    localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;

    // patch crossthread data and ssh with inline surfaces, if necessary
    auto status = patchPrivateSurface();
    if (CL_SUCCESS != status) {
        return status;
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
        DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
        uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();

        const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress;
        patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), arg);
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
        DEBUG_BREAK_IF(program->getGlobalSurface(rootDeviceIndex) == nullptr);
        uintptr_t globalMemory = isBuiltIn ? (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();

        const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress;
        patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), arg);
    }

    // Patch Surface State Heap
    bool useGlobalAtomics = kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
                                      kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress.bindful);
        Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0, useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
                                      kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress.bindful);
        Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0, useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    setThreadArbitrationPolicy(hwHelper.getDefaultThreadArbitrationPolicy());
    if (false == kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresSubgroupIndependentForwardProgress) {
        setThreadArbitrationPolicy(ThreadArbitrationPolicy::AgeBased);
    }
    patchBlocksSimdSize();

    auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);

    auxTranslationRequired = !program->getIsBuiltIn() && HwHelper::compressedBuffersSupported(hwInfo) && clHwHelper.requiresAuxResolves(kernelInfo, hwInfo);

    if (DebugManager.flags.ForceAuxTranslationEnabled.get() != -1) {
        auxTranslationRequired &= !!DebugManager.flags.ForceAuxTranslationEnabled.get();
    }
    if (auxTranslationRequired) {
        program->getContextPtr()->setResolvesRequiredInKernels(true);
    }

    if (isParentKernel) {
        program->allocateBlockPrivateSurfaces(*pClDevice);
    }
    if (program->isKernelDebugEnabled() && isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful)) {
        debugEnabled = true;
    }
    auto numArgs = explicitArgs.size();
    slmSizes.resize(numArgs);

    this->kernelHasIndirectAccess |= kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgLoad ||
                                     kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgStore ||
                                     kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgAtomic;

    provideInitializationHints();
    // resolve the new kernel info to account for kernel handlers
    // I think by this time we have decoded the binary and know the number of args etc.
    // double check this assumption
    bool usingBuffers = false;
    kernelArguments.resize(numArgs);
    kernelArgHandlers.resize(numArgs);
    kernelArgRequiresCacheFlush.resize(numArgs);

    for (uint32_t i = 0; i < numArgs; ++i) {
        storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);

        // set the argument handler
        const auto &arg = explicitArgs[i];
        if (arg.is<ArgDescriptor::ArgTPointer>()) {
            if (arg.getTraits().addressQualifier == KernelArgMetadata::AddrLocal) {
                kernelArgHandlers[i] = &Kernel::setArgLocal;
            } else if (arg.getTraits().typeQualifiers.pipeQual) {
                kernelArgHandlers[i] = &Kernel::setArgPipe;
                kernelArguments[i].type = PIPE_OBJ;
            } else if (arg.getExtendedTypeInfo().isDeviceQueue) {
                kernelArgHandlers[i] = &Kernel::setArgDevQueue;
                kernelArguments[i].type = DEVICE_QUEUE_OBJ;
            } else {
                kernelArgHandlers[i] = &Kernel::setArgBuffer;
                kernelArguments[i].type = BUFFER_OBJ;
                usingBuffers = true;
                allBufferArgsStateful &= static_cast<uint32_t>(arg.as<ArgDescPointer>().isPureStateful());
            }
        } else if (arg.is<ArgDescriptor::ArgTImage>()) {
            kernelArgHandlers[i] = &Kernel::setArgImage;
            kernelArguments[i].type = IMAGE_OBJ;
            usingImages = true;
        } else if (arg.is<ArgDescriptor::ArgTSampler>()) {
            if (arg.getExtendedTypeInfo().isAccelerator) {
                kernelArgHandlers[i] = &Kernel::setArgAccelerator;
            } else {
                kernelArgHandlers[i] = &Kernel::setArgSampler;
                kernelArguments[i].type = SAMPLER_OBJ;
            }
        } else {
            kernelArgHandlers[i] = &Kernel::setArgImmediate;
        }
    }

    if (usingImages && !usingBuffers) {
        usingImagesOnly = true;
    }

    return CL_SUCCESS;
}

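// Allocates the per-kernel private memory surface on first use (sized per HW thread and
// scaled by computeUnitsUsedForScratch) and patches its GPU address into cross-thread
// data and, when present, the bindful surface state.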
cl_int Kernel::patchPrivateSurface() {
    auto pClDevice = &getDevice();
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
    if (perHwThreadPrivateMemorySize) {
        if (!privateSurface) {
            privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
            DEBUG_BREAK_IF(privateSurfaceSize == 0);

            if (privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
                return CL_OUT_OF_RESOURCES;
            }
            privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
                {rootDeviceIndex,
                 static_cast<size_t>(privateSurfaceSize),
                 GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
                 pClDevice->getDeviceBitfield()});
            if (privateSurface == nullptr) {
                return CL_OUT_OF_RESOURCES;
            }
        }

        const auto &privateMemoryAddress = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
        patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, privateMemoryAddress);
    }
    return CL_SUCCESS;
}

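// Copies state from pSourceKernel into this kernel: cross-thread data (immediate args),
// every argument already set via clSetKernelArg/clSetKernelArgSVMPointer, and the SVM /
// unified-memory allocations registered with clSetKernelExecInfo.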
cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
    // copy cross thread data to store arguments set to source kernel with clSetKernelArg on immediate data (non-pointer types)
    memcpy_s(crossThreadData, crossThreadDataSize,
             pSourceKernel->crossThreadData, pSourceKernel->crossThreadDataSize);
    DEBUG_BREAK_IF(pSourceKernel->crossThreadDataSize != crossThreadDataSize);

    [[maybe_unused]] auto status = patchPrivateSurface();
    DEBUG_BREAK_IF(status != CL_SUCCESS);

    // copy arguments set to source kernel with clSetKernelArg or clSetKernelArgSVMPointer
    for (uint32_t i = 0; i < pSourceKernel->kernelArguments.size(); i++) {
        if (0 == pSourceKernel->getKernelArgInfo(i).size) {
            // skip copying arguments that haven't been set to source kernel
            continue;
        }
        switch (pSourceKernel->kernelArguments[i].type) {
        case NONE_OBJ:
            // all arguments with immediate data (non-pointer types) have been copied in cross thread data
            storeKernelArg(i, NONE_OBJ, nullptr, nullptr, pSourceKernel->getKernelArgInfo(i).size);
            patchedArgumentsNum++;
            kernelArguments[i].isPatched = true;
            break;
        case SVM_OBJ:
            setArgSvm(i, pSourceKernel->getKernelArgInfo(i).size, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                      pSourceKernel->getKernelArgInfo(i).pSvmAlloc, pSourceKernel->getKernelArgInfo(i).svmFlags);
            break;
        case SVM_ALLOC_OBJ:
            setArgSvmAlloc(i, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                           (GraphicsAllocation *)pSourceKernel->getKernelArgInfo(i).object);
            break;
        default:
            setArg(i, pSourceKernel->getKernelArgInfo(i).size, pSourceKernel->getKernelArgInfo(i).value);
            break;
        }
    }

    // copy additional information other than argument values set to source kernel with clSetKernelExecInfo
    for (auto &gfxAlloc : pSourceKernel->kernelSvmGfxAllocations) {
        kernelSvmGfxAllocations.push_back(gfxAlloc);
    }
    for (auto &gfxAlloc : pSourceKernel->kernelUnifiedMemoryGfxAllocations) {
        kernelUnifiedMemoryGfxAllocations.push_back(gfxAlloc);
    }

    if (pImplicitArgs) {
        memcpy_s(pImplicitArgs.get(), sizeof(ImplicitArgs), pSourceKernel->getImplicitArgs(), sizeof(ImplicitArgs));
    }
    this->isBuiltIn = pSourceKernel->isBuiltIn;

    return CL_SUCCESS;
}

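// Backs clGetKernelInfo: selects the source buffer for the requested query and lets
// GetInfo::getInfo() handle size checking and copying into the caller's buffer.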
cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
                       void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    cl_uint numArgs = 0;
    const _cl_program *prog;
    const _cl_context *ctxt;
    cl_uint refCount = 0;
    uint64_t nonCannonizedGpuAddress = 0llu;

    switch (paramName) {
    case CL_KERNEL_FUNCTION_NAME:
        pSrc = kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str();
        srcSize = kernelInfo.kernelDescriptor.kernelMetadata.kernelName.length() + 1;
        break;

    case CL_KERNEL_NUM_ARGS:
        srcSize = sizeof(cl_uint);
        numArgs = static_cast<cl_uint>(kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size());
        pSrc = &numArgs;
        break;

    case CL_KERNEL_CONTEXT:
        ctxt = &program->getContext();
        srcSize = sizeof(ctxt);
        pSrc = &ctxt;
        break;

    case CL_KERNEL_PROGRAM:
        prog = program;
        srcSize = sizeof(prog);
        pSrc = &prog;
        break;

    case CL_KERNEL_REFERENCE_COUNT:
        refCount = static_cast<cl_uint>(pMultiDeviceKernel->getRefApiCount());
        srcSize = sizeof(refCount);
        pSrc = &refCount;
        break;

    case CL_KERNEL_ATTRIBUTES:
        pSrc = kernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.c_str();
        srcSize = kernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.length() + 1;
        break;

    case CL_KERNEL_BINARY_PROGRAM_INTEL:
        pSrc = getKernelHeap();
        srcSize = getKernelHeapSize();
        break;
    case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL:
        nonCannonizedGpuAddress = GmmHelper::decanonize(kernelInfo.kernelAllocation->getGpuAddress());
        pSrc = &nonCannonizedGpuAddress;
        srcSize = sizeof(nonCannonizedGpuAddress);
        break;
    default:
        getAdditionalInfo(paramName, pSrc, srcSize);
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

cl_int Kernel::getArgInfo(cl_uint argIndex, cl_kernel_arg_info paramName, size_t paramValueSize,
                          void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;

    if (argIndex >= args.size()) {
        retVal = CL_INVALID_ARG_INDEX;
        return retVal;
    }

    const auto &argTraits = args[argIndex].getTraits();
    const auto &argMetadata = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[argIndex];

    cl_kernel_arg_address_qualifier addressQualifier;
    cl_kernel_arg_access_qualifier accessQualifier;
    cl_kernel_arg_type_qualifier typeQualifier;

    switch (paramName) {
    case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
        addressQualifier = asClKernelArgAddressQualifier(argTraits.getAddressQualifier());
        srcSize = sizeof(addressQualifier);
        pSrc = &addressQualifier;
        break;

    case CL_KERNEL_ARG_ACCESS_QUALIFIER:
        accessQualifier = asClKernelArgAccessQualifier(argTraits.getAccessQualifier());
        srcSize = sizeof(accessQualifier);
        pSrc = &accessQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_QUALIFIER:
        typeQualifier = asClKernelArgTypeQualifier(argTraits.typeQualifiers);
        srcSize = sizeof(typeQualifier);
        pSrc = &typeQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_NAME:
        srcSize = argMetadata.type.length() + 1;
        pSrc = argMetadata.type.c_str();
        break;

    case CL_KERNEL_ARG_NAME:
        srcSize = argMetadata.argName.length() + 1;
        pSrc = argMetadata.argName.c_str();
        break;

    default:
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

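// Backs clGetKernelWorkGroupInfo; note that CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
// is doubled when fused EU dispatch is enabled for the platform.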
cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName,
                                size_t paramValueSize, void *paramValue,
                                size_t *paramValueSizeRet) const {
    cl_int retVal = CL_INVALID_VALUE;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    struct size_t3 {
        size_t val[3];
    } requiredWorkGroupSize;
    cl_ulong localMemorySize;
    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    size_t preferredWorkGroupSizeMultiple = 0;
    cl_ulong scratchSize;
    cl_ulong privateMemSize;
    size_t maxWorkgroupSize;
    const auto &hwInfo = clDevice.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    switch (paramName) {
    case CL_KERNEL_WORK_GROUP_SIZE:
        maxWorkgroupSize = maxKernelWorkGroupSize;
        if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
            auto divisionSize = CommonConstants::maximalSimdSize / kernelInfo.getMaxSimdSize();
            maxWorkgroupSize /= divisionSize;
        }
        srcSize = sizeof(maxWorkgroupSize);
        pSrc = &maxWorkgroupSize;
        break;

    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
        requiredWorkGroupSize.val[0] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
        requiredWorkGroupSize.val[1] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
        requiredWorkGroupSize.val[2] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
        srcSize = sizeof(requiredWorkGroupSize);
        pSrc = &requiredWorkGroupSize;
        break;

    case CL_KERNEL_LOCAL_MEM_SIZE:
        localMemorySize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize;
        srcSize = sizeof(localMemorySize);
        pSrc = &localMemorySize;
        break;

    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
        preferredWorkGroupSizeMultiple = kernelInfo.getMaxSimdSize();
        if (hwHelper.isFusedEuDispatchEnabled(hwInfo)) {
            preferredWorkGroupSizeMultiple *= 2;
        }
        srcSize = sizeof(preferredWorkGroupSizeMultiple);
        pSrc = &preferredWorkGroupSizeMultiple;
        break;

    case CL_KERNEL_SPILL_MEM_SIZE_INTEL:
        scratchSize = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
        srcSize = sizeof(scratchSize);
        pSrc = &scratchSize;
        break;
    case CL_KERNEL_PRIVATE_MEM_SIZE:
        privateMemSize = clHwHelper.getKernelPrivateMemSize(kernelInfo);
        srcSize = sizeof(privateMemSize);
        pSrc = &privateMemSize;
        break;
    default:
        getAdditionalWorkGroupInfo(paramName, pSrc, srcSize);
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

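// Backs clGetKernelSubGroupInfo; the sub-group size is the compiled SIMD width, so the
// sub-group count for an ND-range is computed as ceil(workGroupSize / maxSimdSize).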
cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
                               size_t inputValueSize, const void *inputValue,
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet) const {
    size_t numDimensions = 0;
    size_t WGS = 1;
    auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize()));
    auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());

    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    if ((paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) ||
        (paramName == CL_KERNEL_MAX_NUM_SUB_GROUPS) ||
        (paramName == CL_KERNEL_COMPILE_NUM_SUB_GROUPS)) {
        if (clDevice.areOcl21FeaturesEnabled() == false) {
            return CL_INVALID_OPERATION;
        }
    }

    if ((paramName == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR) ||
        (paramName == CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR)) {
        if (!inputValue) {
            return CL_INVALID_VALUE;
        }
        if (inputValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = inputValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    if (paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) {
        if (!paramValue) {
            return CL_INVALID_VALUE;
        }
        if (paramValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = paramValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    switch (paramName) {
    case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(maxSimdSize));
    }
    case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR: {
        for (size_t i = 0; i < numDimensions; i++) {
            WGS *= ((size_t *)inputValue)[i];
        }
        return changeGetInfoStatusToCLResultType(
            info.set<size_t>((WGS / maxSimdSize) + std::min(static_cast<size_t>(1), WGS % maxSimdSize))); // add 1 if WGS % maxSimdSize != 0
    }
    case CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT: {
        auto subGroupsNum = *(size_t *)inputValue;
        auto workGroupSize = subGroupsNum * largestCompiledSIMDSize;
        // return workgroup size in first dimension, the rest shall be 1 in positive case
        if (workGroupSize > maxRequiredWorkGroupSize) {
            workGroupSize = 0;
        }
        // If no work group size can accommodate the requested number of subgroups, return 0 in each element of the returned array.
        switch (numDimensions) {
        case 1:
            return changeGetInfoStatusToCLResultType(info.set<size_t>(workGroupSize));
        case 2:
            struct size_t2 {
                size_t val[2];
            } workGroupSize2;
            workGroupSize2.val[0] = workGroupSize;
            workGroupSize2.val[1] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set<size_t2>(workGroupSize2));
        default:
            struct size_t3 {
                size_t val[3];
            } workGroupSize3;
            workGroupSize3.val[0] = workGroupSize;
            workGroupSize3.val[1] = (workGroupSize > 0) ? 1 : 0;
            workGroupSize3.val[2] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set<size_t3>(workGroupSize3));
        }
    }
    case CL_KERNEL_MAX_NUM_SUB_GROUPS: {
        // round-up maximum number of subgroups
        return changeGetInfoStatusToCLResultType(info.set<size_t>(Math::divideAndRoundUp(maxRequiredWorkGroupSize, largestCompiledSIMDSize)));
    }
    case CL_KERNEL_COMPILE_NUM_SUB_GROUPS: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(static_cast<size_t>(kernelInfo.kernelDescriptor.kernelMetadata.compiledSubGroupsNumber)));
    }
    case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(kernelInfo.kernelDescriptor.kernelMetadata.requiredSubGroupSize));
    }
    default:
        return CL_INVALID_VALUE;
    }
}

const void *Kernel::getKernelHeap() const {
    return kernelInfo.heapInfo.pKernelHeap;
}

size_t Kernel::getKernelHeapSize() const {
    return kernelInfo.heapInfo.KernelHeapSize;
}

void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
    void **pKernelHeap = const_cast<void **>(&pKernelInfo->heapInfo.pKernelHeap);
    *pKernelHeap = newKernelHeap;
    auto &heapInfo = pKernelInfo->heapInfo;
    heapInfo.KernelHeapSize = static_cast<uint32_t>(newKernelHeapSize);
    pKernelInfo->isKernelHeapSubstituted = true;
    auto memoryManager = executionEnvironment.memoryManager.get();

    auto currentAllocationSize = pKernelInfo->kernelAllocation->getUnderlyingBufferSize();
    bool status = false;

    const auto &hwInfo = clDevice.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    size_t isaPadding = hwHelper.getPaddingForISAAllocation();
    if (currentAllocationSize >= newKernelHeapSize + isaPadding) {
        auto &hwInfo = clDevice.getDevice().getHardwareInfo();
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        status = MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *pKernelInfo->getGraphicsAllocation()),
                                                                  clDevice.getDevice(), pKernelInfo->getGraphicsAllocation(), 0, newKernelHeap,
                                                                  static_cast<size_t>(newKernelHeapSize));
    } else {
        memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
        pKernelInfo->kernelAllocation = nullptr;
        status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn);
    }
    UNRECOVERABLE_IF(!status);
}

bool Kernel::isKernelHeapSubstituted() const {
    return kernelInfo.isKernelHeapSubstituted;
}

uint64_t Kernel::getKernelId() const {
    return kernelInfo.kernelId;
}

void Kernel::setKernelId(uint64_t newKernelId) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
    pKernelInfo->kernelId = newKernelId;
}
uint32_t Kernel::getStartOffset() const {
    return this->startOffset;
}
void Kernel::setStartOffset(uint32_t offset) {
    this->startOffset = offset;
}

void *Kernel::getSurfaceStateHeap() const {
    return pSshLocal.get();
}

size_t Kernel::getDynamicStateHeapSize() const {
    return kernelInfo.heapInfo.DynamicStateHeapSize;
}

const void *Kernel::getDynamicStateHeap() const {
    return kernelInfo.heapInfo.pDsh;
}

size_t Kernel::getSurfaceStateHeapSize() const {
    return sshLocalSize;
}

size_t Kernel::getNumberOfBindingTableStates() const {
    return numberOfBindingTableStates;
}

void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
    pSshLocal.reset(static_cast<char *>(pNewSsh));
    sshLocalSize = static_cast<uint32_t>(newSshSize);
    numberOfBindingTableStates = newBindingTableCount;
    localBindingTableOffset = newBindingTableOffset;
}

void Kernel::markArgPatchedAndResolveArgs(uint32_t argIndex) {
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
    if (program->getContextPtr() && getContext().getRootDeviceIndices().size() > 1u && Kernel::isMemObj(kernelArguments[argIndex].type) && kernelArguments[argIndex].object) {
        auto argMemObj = castToObjectOrAbort<MemObj>(reinterpret_cast<cl_mem>(kernelArguments[argIndex].object));
        auto memObj = argMemObj->getHighestRootMemObj();
        auto migrateRequiredForArg = memObj->getMultiGraphicsAllocation().requiresMigrations();

        if (migratableArgsMap.find(argIndex) == migratableArgsMap.end() && migrateRequiredForArg) {
            migratableArgsMap.insert({argIndex, memObj});
        } else if (migrateRequiredForArg) {
            migratableArgsMap[argIndex] = memObj;
        } else {
            migratableArgsMap.erase(argIndex);
        }
    }

    resolveArgs();
}

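// Main clSetKernelArg entry point: optionally lets a builtin dispatch builder consume
// the argument first, then routes it to the per-argument handler chosen in initialize().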
cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
    cl_int retVal = CL_SUCCESS;
    bool updateExposedKernel = true;
    auto argWasUncacheable = false;
    if (kernelInfo.builtinDispatchBuilder != nullptr) {
        updateExposedKernel = kernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
    }
    if (updateExposedKernel) {
        if (argIndex >= kernelArgHandlers.size()) {
            return CL_INVALID_ARG_INDEX;
        }
        argWasUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        auto argHandler = kernelArgHandlers[argIndex];
        retVal = (this->*argHandler)(argIndex, argSize, argVal);
    }
    if (retVal == CL_SUCCESS) {
        auto argIsUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);
        markArgPatchedAndResolveArgs(argIndex);
    }
    return retVal;
}

cl_int Kernel::setArg(uint32_t argIndex, uint32_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, uint64_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal, uint32_t mipLevel) {
    auto retVal = setArgImageWithMipLevel(argIndex, sizeof(argVal), &argVal, mipLevel);
    if (retVal == CL_SUCCESS) {
        markArgPatchedAndResolveArgs(argIndex);
    }
    return retVal;
}

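// When the kernel exposes a bufferOffset slot, the surface base is aligned down to
// 4 bytes and the remaining byte offset is patched into cross-thread data; returns the
// pointer that should actually be programmed into the surface state.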
void *Kernel::patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc) {
    if (isUndefinedOffset(argAsPtr.bufferOffset)) {
        return svmPtr;
    }
    void *ptrToPatch = svmPtr;
    if (svmAlloc != nullptr) {
        ptrToPatch = reinterpret_cast<void *>(svmAlloc->getGpuAddressToPatch());
    }

    constexpr uint32_t minimumAlignment = 4;
    ptrToPatch = alignDown(ptrToPatch, minimumAlignment);
    DEBUG_BREAK_IF(ptrDiff(svmPtr, ptrToPatch) != static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch)));
    uint32_t offsetToPatch = static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch));

    patch<uint32_t, uint32_t>(offsetToPatch, getCrossThreadData(), argAsPtr.bufferOffset);
    return ptrToPatch;
}

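// Sets an SVM pointer argument: patches the stateless cross-thread slot, programs the
// bindful surface state when present, and records the argument for residency tracking.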
cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags) {
    const auto &argAsPtr = getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();

    auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
    patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, reinterpret_cast<uintptr_t>(svmPtr));

    void *ptrToPatch = patchBufferOffset(argAsPtr, svmPtr, svmAlloc);
    if (isValidOffset(argAsPtr.bindful)) {
        auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags);
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
    addAllocationToCacheFlushVector(argIndex, svmAlloc);

    return CL_SUCCESS;
}

cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc) {
    DBG_LOG_INPUTS("setArgBuffer svm_alloc", svmAlloc);

    const auto &argAsPtr = getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();

    auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
    patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, reinterpret_cast<uintptr_t>(svmPtr));

    bool disableL3 = false;
    bool forceNonAuxMode = false;
    bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);
    auto &hwInfo = getDevice().getHardwareInfo();
    auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);

    if (isAuxTranslationKernel) {
        if (((AuxTranslationDirection::AuxToNonAux == auxTranslationDirection) && argIndex == 1) ||
            ((AuxTranslationDirection::NonAuxToAux == auxTranslationDirection) && argIndex == 0)) {
            forceNonAuxMode = true;
        }
        disableL3 = (argIndex == 0);
    } else if (svmAlloc && svmAlloc->isCompressionEnabled() && clHwHelper.requiresNonAuxMode(argAsPtr, hwInfo)) {
        forceNonAuxMode = true;
    }

    bool argWasUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
    bool argIsUncacheable = svmAlloc ? svmAlloc->isUncacheable() : false;
    statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);

    void *ptrToPatch = patchBufferOffset(argAsPtr, svmPtr, svmAlloc);
    if (isValidOffset(argAsPtr.bindful)) {
        auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
        size_t allocSize = 0;
        size_t offset = 0;
        if (svmAlloc != nullptr) {
            allocSize = svmAlloc->getUnderlyingBufferSize();
            offset = ptrDiff(ptrToPatch, svmAlloc->getGpuAddressToPatch());
            allocSize -= offset;
        }
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, forceNonAuxMode, disableL3, allocSize, ptrToPatch, offset, svmAlloc, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t));
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
    addAllocationToCacheFlushVector(argIndex, svmAlloc);

    return CL_SUCCESS;
}

void Kernel::storeKernelArg(uint32_t argIndex, kernelArgType argType, void *argObject,
                            const void *argValue, size_t argSize,
                            GraphicsAllocation *argSvmAlloc, cl_mem_flags argSvmFlags) {
    kernelArguments[argIndex].type = argType;
    kernelArguments[argIndex].object = argObject;
    kernelArguments[argIndex].value = argValue;
    kernelArguments[argIndex].size = argSize;
    kernelArguments[argIndex].pSvmAlloc = argSvmAlloc;
    kernelArguments[argIndex].svmFlags = argSvmFlags;
}

const void *Kernel::getKernelArg(uint32_t argIndex) const {
    return kernelArguments[argIndex].object;
}

const Kernel::SimpleKernelArgInfo &Kernel::getKernelArgInfo(uint32_t argIndex) const {
    return kernelArguments[argIndex];
}

void Kernel::setSvmKernelExecInfo(GraphicsAllocation *argValue) {
    kernelSvmGfxAllocations.push_back(argValue);
    if (allocationForCacheFlush(argValue)) {
        svmAllocationsRequireCacheFlush = true;
    }
}

void Kernel::clearSvmKernelExecInfo() {
    kernelSvmGfxAllocations.clear();
    svmAllocationsRequireCacheFlush = false;
}

void Kernel::setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue) {
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectDeviceAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectHostAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectSharedAllocationsAllowed = infoValue;
        return;
    }
}

void Kernel::setUnifiedMemoryExecInfo(GraphicsAllocation *unifiedMemoryAllocation) {
    kernelUnifiedMemoryGfxAllocations.push_back(unifiedMemoryAllocation);
}

void Kernel::clearUnifiedMemoryExecInfo() {
    kernelUnifiedMemoryGfxAllocations.clear();
}

cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel executionType) {
    switch (executionType) {
    case CL_KERNEL_EXEC_INFO_DEFAULT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Default;
        break;
    case CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Concurrent;
        break;
    default: {
        return CL_INVALID_VALUE;
    }
    }
    return CL_SUCCESS;
}

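// Suggests a local work size for the given ND-range: honors a compile-time required
// work-group size when present, otherwise defers to computeWorkgroupSize().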
void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
                                       size_t *localWorkSize) {
    UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
    UNRECOVERABLE_IF(globalWorkSize == nullptr);
    Vec3<size_t> elws{0, 0, 0};
    Vec3<size_t> gws{
        globalWorkSize[0],
        (workDim > 1) ? globalWorkSize[1] : 1,
        (workDim > 2) ? globalWorkSize[2] : 1};
    Vec3<size_t> offset{0, 0, 0};
    if (globalWorkOffset) {
        offset.x = globalWorkOffset[0];
        if (workDim > 1) {
            offset.y = globalWorkOffset[1];
            if (workDim > 2) {
                offset.z = globalWorkOffset[2];
            }
        }
    }

    Vec3<size_t> suggestedLws{0, 0, 0};

    if (kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] != 0) {
        suggestedLws.x = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
        suggestedLws.y = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
        suggestedLws.z = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
    } else {
        uint32_t dispatchWorkDim = std::max(1U, std::max(gws.getSimplifiedDim(), offset.getSimplifiedDim()));
        const DispatchInfo dispatchInfo{&clDevice, this, dispatchWorkDim, gws, elws, offset};
        suggestedLws = computeWorkgroupSize(dispatchInfo);
    }

    localWorkSize[0] = suggestedLws.x;
    if (workDim > 1)
        localWorkSize[1] = suggestedLws.y;
    if (workDim > 2)
        localWorkSize[2] = suggestedLws.z;
}

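// Computes how many work groups can run concurrently, limited by available HW threads,
// SLM capacity per (dual) sub-slice, and barrier registers; the result may be further
// adjusted for the engine group the command queue dispatches to.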
uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const {
    auto &hardwareInfo = getHardwareInfo();
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);

    auto engineGroupType = hwHelper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(),
                                                       commandQueue->getGpgpuEngine().getEngineUsage(), hardwareInfo);

    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
    }
    auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
        hardwareInfo.platform.eProductFamily,
        kernelDescriptor.kernelAttributes.numGrfRequired,
        hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount);

    auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount;
    auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(),
                                                                availableThreadCount,
                                                                dssCount,
                                                                dssCount * KB * hardwareInfo.capabilityTable.slmSize,
                                                                hwHelper.alignSlmSize(slmTotalSize),
                                                                static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
                                                                hwHelper.getBarriersCountFromHasBarriers(barrierCount),
                                                                workDim,
                                                                localWorkSize);
    auto isEngineInstanced = commandQueue->getGpgpuCommandStreamReceiver().getOsContext().isEngineInstanced();
    maxWorkGroupCount = hwHelper.adjustMaxWorkGroupCount(maxWorkGroupCount, engineGroupType, hardwareInfo, isEngineInstanced);
    return maxWorkGroupCount;
}

inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) {
    auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                auto pageFaultManager = executionEnvironment.memoryManager->getPageFaultManager();
                if (pageFaultManager &&
                    this->isUnifiedMemorySyncRequired) {
                    pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(pSVMAlloc->getGpuAddress()));
                }
                commandStreamReceiver.makeResident(*pSVMAlloc);
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObjectOrAbort<MemObj>(clMem);
                auto image = castToObject<Image>(clMem);
                if (image && image->isImageFromImage()) {
                    commandStreamReceiver.setSamplerCacheFlushRequired(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore);
                }
                commandStreamReceiver.makeResident(*memObj->getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()));
                if (memObj->getMcsAllocation()) {
                    commandStreamReceiver.makeResident(*memObj->getMcsAllocation());
                }
            }
        }
    }
}

1126 void Kernel::performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer) {
1127     auto performTunning = TunningType::DISABLED;
1128 
1129     if (DebugManager.flags.EnableKernelTunning.get() != -1) {
1130         performTunning = static_cast<TunningType>(DebugManager.flags.EnableKernelTunning.get());
1131     }
1132 
1133     if (performTunning == TunningType::SIMPLE) {
1134         this->singleSubdevicePreferredInCurrentEnqueue = !this->kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;
1135 
1136     } else if (performTunning == TunningType::FULL) {
1137         KernelConfig config{gws, lws, offsets};
1138 
1139         auto submissionDataIt = this->kernelSubmissionMap.find(config);
1140         if (submissionDataIt == this->kernelSubmissionMap.end()) {
1141             KernelSubmissionData submissionData;
1142             submissionData.kernelStandardTimestamps = std::make_unique<TimestampPacketContainer>();
1143             submissionData.kernelSubdeviceTimestamps = std::make_unique<TimestampPacketContainer>();
1144             submissionData.status = TunningStatus::STANDARD_TUNNING_IN_PROGRESS;
1145             submissionData.kernelStandardTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);
1146             this->kernelSubmissionMap[config] = std::move(submissionData);
1147             this->singleSubdevicePreferredInCurrentEnqueue = false;
1148             return;
1149         }
1150 
1151         auto &submissionData = submissionDataIt->second;
1152 
1153         if (submissionData.status == TunningStatus::TUNNING_DONE) {
1154             this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred;
1155         }
1156 
1157         if (submissionData.status == TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS) {
1158             if (this->hasTunningFinished(submissionData)) {
1159                 submissionData.status = TunningStatus::TUNNING_DONE;
1160                 submissionData.kernelStandardTimestamps.reset();
1161                 submissionData.kernelSubdeviceTimestamps.reset();
1162                 this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred;
1163             } else {
1164                 this->singleSubdevicePreferredInCurrentEnqueue = false;
1165             }
1166         }
1167 
1168         if (submissionData.status == TunningStatus::STANDARD_TUNNING_IN_PROGRESS) {
1169             submissionData.status = TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS;
1170             submissionData.kernelSubdeviceTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);
1171             this->singleSubdevicePreferredInCurrentEnqueue = true;
1172         }
1173     }
1174 }
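// Reading of the FULL tuning path above (derived from this code, not from a separate spec):
// the first enqueue of a given {gws, lws, offsets} config captures "standard" timestamps and
// runs with the default multi-subdevice distribution; the next enqueue of the same config
// captures "subdevice" timestamps and is steered to a single subdevice; once both runs have
// completed, hasTunningFinished() compares the two durations and the faster variant is used
// for every later enqueue of that config.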
1175 
1176 bool Kernel::hasTunningFinished(KernelSubmissionData &submissionData) {
1177     if (!this->hasRunFinished(submissionData.kernelStandardTimestamps.get()) ||
1178         !this->hasRunFinished(submissionData.kernelSubdeviceTimestamps.get())) {
1179         return false;
1180     }
1181 
1182     uint64_t globalStartTS = 0u;
1183     uint64_t globalEndTS = 0u;
1184 
1185     Event::getBoundaryTimestampValues(submissionData.kernelStandardTimestamps.get(), globalStartTS, globalEndTS);
1186     auto standardTSDiff = globalEndTS - globalStartTS;
1187 
1188     Event::getBoundaryTimestampValues(submissionData.kernelSubdeviceTimestamps.get(), globalStartTS, globalEndTS);
1189     auto subdeviceTSDiff = globalEndTS - globalStartTS;
1190 
1191     submissionData.singleSubdevicePreferred = standardTSDiff > subdeviceTSDiff;
1192 
1193     return true;
1194 }
1195 
1196 bool Kernel::hasRunFinished(TimestampPacketContainer *timestampContainer) {
1197     for (const auto &node : timestampContainer->peekNodes()) {
1198         for (uint32_t i = 0; i < node->getPacketsUsed(); i++) {
1199             if (node->getContextEndValue(i) == 1) {
1200                 return false;
1201             }
1202         }
1203     }
1204     return true;
1205 }
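// Assumption behind the check above: timestamp packets appear to be initialised to 1 elsewhere
// in the runtime, so a context-end value still equal to 1 is read as "not yet written by the GPU"
// and the run only counts as finished once every used packet holds a real timestamp.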
1206 
1207 bool Kernel::isSingleSubdevicePreferred() const {
1208     return this->singleSubdevicePreferredInCurrentEnqueue || this->usesSyncBuffer();
1209 }
1210 
1211 void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
1212     auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
1213     if (privateSurface) {
1214         commandStreamReceiver.makeResident(*privateSurface);
1215     }
1216 
1217     if (program->getConstantSurface(rootDeviceIndex)) {
1218         commandStreamReceiver.makeResident(*(program->getConstantSurface(rootDeviceIndex)));
1219     }
1220 
1221     if (program->getGlobalSurface(rootDeviceIndex)) {
1222         commandStreamReceiver.makeResident(*(program->getGlobalSurface(rootDeviceIndex)));
1223     }
1224 
1225     if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
1226         commandStreamReceiver.makeResident(*(program->getExportedFunctionsSurface(rootDeviceIndex)));
1227     }
1228 
1229     for (auto gfxAlloc : kernelSvmGfxAllocations) {
1230         commandStreamReceiver.makeResident(*gfxAlloc);
1231     }
1232 
1233     auto pageFaultManager = program->peekExecutionEnvironment().memoryManager->getPageFaultManager();
1234 
1235     for (auto gfxAlloc : kernelUnifiedMemoryGfxAllocations) {
1236         commandStreamReceiver.makeResident(*gfxAlloc);
1237         if (pageFaultManager) {
1238             pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(gfxAlloc->getGpuAddress()));
1239         }
1240     }
1241 
1242     if (unifiedMemoryControls.indirectSharedAllocationsAllowed && pageFaultManager) {
1243         pageFaultManager->moveAllocationsWithinUMAllocsManagerToGpuDomain(this->getContext().getSVMAllocsManager());
1244     }
1245     makeArgsResident(commandStreamReceiver);
1246 
1247     auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
1248     if (kernelIsaAllocation) {
1249         commandStreamReceiver.makeResident(*kernelIsaAllocation);
1250     }
1251 
1252     gtpinNotifyMakeResident(this, &commandStreamReceiver);
1253 
1254     if (unifiedMemoryControls.indirectDeviceAllocationsAllowed ||
1255         unifiedMemoryControls.indirectHostAllocationsAllowed ||
1256         unifiedMemoryControls.indirectSharedAllocationsAllowed) {
1257         this->getContext().getSVMAllocsManager()->makeInternalAllocationsResident(commandStreamReceiver, unifiedMemoryControls.generateMask());
1258     }
1259 }
1260 
1261 void Kernel::getResidency(std::vector<Surface *> &dst) {
1262     if (privateSurface) {
1263         GeneralSurface *surface = new GeneralSurface(privateSurface);
1264         dst.push_back(surface);
1265     }
1266 
1267     auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1268     if (program->getConstantSurface(rootDeviceIndex)) {
1269         GeneralSurface *surface = new GeneralSurface(program->getConstantSurface(rootDeviceIndex));
1270         dst.push_back(surface);
1271     }
1272 
1273     if (program->getGlobalSurface(rootDeviceIndex)) {
1274         GeneralSurface *surface = new GeneralSurface(program->getGlobalSurface(rootDeviceIndex));
1275         dst.push_back(surface);
1276     }
1277 
1278     if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
1279         GeneralSurface *surface = new GeneralSurface(program->getExportedFunctionsSurface(rootDeviceIndex));
1280         dst.push_back(surface);
1281     }
1282 
1283     for (auto gfxAlloc : kernelSvmGfxAllocations) {
1284         GeneralSurface *surface = new GeneralSurface(gfxAlloc);
1285         dst.push_back(surface);
1286     }
1287 
1288     auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
1289     for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
1290         if (kernelArguments[argIndex].object) {
1291             if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
1292                 auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
1293                 dst.push_back(new GeneralSurface(pSVMAlloc));
1294             } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
1295                 auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
1296                 auto memObj = castToObject<MemObj>(clMem);
1297                 DEBUG_BREAK_IF(memObj == nullptr);
1298                 dst.push_back(new MemObjSurface(memObj));
1299             }
1300         }
1301     }
1302 
1303     auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
1304     if (kernelIsaAllocation) {
1305         GeneralSurface *surface = new GeneralSurface(kernelIsaAllocation);
1306         dst.push_back(surface);
1307     }
1308 
1309     gtpinNotifyUpdateResidencyList(this, &dst);
1310 }
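// Ownership note: every Surface pushed into dst above is heap-allocated and becomes the caller's
// responsibility to release. A minimal caller-side sketch (hypothetical, not code from this file):
//
//   std::vector<Surface *> surfaces;
//   kernel->getResidency(surfaces);
//   // ... hand the surfaces to the submission path ...
//   for (auto surface : surfaces) {
//       delete surface;
//   }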
1311 
1312 bool Kernel::requiresCoherency() {
1313     auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
1314     for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
1315         if (kernelArguments[argIndex].object) {
1316             if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
1317                 auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
1318                 if (pSVMAlloc->isCoherent()) {
1319                     return true;
1320                 }
1321             }
1322 
1323             if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
1324                 auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
1325                 auto memObj = castToObjectOrAbort<MemObj>(clMem);
1326                 if (memObj->getMultiGraphicsAllocation().isCoherent()) {
1327                     return true;
1328                 }
1329             }
1330         }
1331     }
1332     return false;
1333 }
1334 
1335 cl_int Kernel::setArgLocal(uint32_t argIndexIn,
1336                            size_t argSize,
1337                            const void *argVal) {
1338     storeKernelArg(argIndexIn, SLM_OBJ, nullptr, argVal, argSize);
1339     uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
1340     uint32_t argIndex = argIndexIn;
1341 
1342     const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
1343     const auto &currArg = args[argIndex];
1344     UNRECOVERABLE_IF(currArg.getTraits().getAddressQualifier() != KernelArgMetadata::AddrLocal);
1345 
1346     slmSizes[argIndex] = static_cast<uint32_t>(argSize);
1347 
1348     UNRECOVERABLE_IF(isUndefinedOffset(currArg.as<NEO::ArgDescPointer>().slmOffset));
1349     auto slmOffset = *ptrOffset(crossThreadData, currArg.as<ArgDescPointer>().slmOffset);
1350     slmOffset += static_cast<uint32_t>(argSize);
1351 
1352     ++argIndex;
1353     while (argIndex < slmSizes.size()) {
1354         if (args[argIndex].getTraits().getAddressQualifier() != KernelArgMetadata::AddrLocal) {
1355             ++argIndex;
1356             continue;
1357         }
1358 
1359         const auto &nextArg = args[argIndex].as<ArgDescPointer>();
1360         UNRECOVERABLE_IF(0 == nextArg.requiredSlmAlignment);
1361 
1362         slmOffset = alignUp<uint32_t>(slmOffset, nextArg.requiredSlmAlignment);
1363 
1364         auto patchLocation = ptrOffset(crossThreadData, nextArg.slmOffset);
1365         *patchLocation = slmOffset;
1366 
1367         slmOffset += static_cast<uint32_t>(slmSizes[argIndex]);
1368         ++argIndex;
1369     }
1370 
1371     slmTotalSize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize + alignUp(slmOffset, KB);
1372 
1373     return CL_SUCCESS;
1374 }
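// Worked example with illustrative values: two __local arguments, arg0 set to 100 bytes and
// arg1 (8-byte required alignment) set to 64 bytes. Assuming arg0's patched offset starts at 0,
// after both setArg calls:
//   arg0: running offset = 0 + 100 = 100
//   arg1: alignUp(100, 8) = 104 is written to its crossThreadData slot, offset = 104 + 64 = 168
//   slmTotalSize = slmInlineSize + alignUp(168, KB) = slmInlineSize + 1024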
1375 
1376 cl_int Kernel::setArgBuffer(uint32_t argIndex,
1377                             size_t argSize,
1378                             const void *argVal) {
1379 
1380     if (argSize != sizeof(cl_mem *)) {
1381         return CL_INVALID_ARG_SIZE;
1382     }
1383 
1384     auto clMem = reinterpret_cast<const cl_mem *>(argVal);
1385     auto pClDevice = &getDevice();
1386     auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
1387 
1388     const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1389     const auto &argAsPtr = arg.as<ArgDescPointer>();
1390 
1391     if (clMem && *clMem) {
1392         auto clMemObj = *clMem;
1393         DBG_LOG_INPUTS("setArgBuffer cl_mem", clMemObj);
1394 
1395         storeKernelArg(argIndex, BUFFER_OBJ, clMemObj, argVal, argSize);
1396 
1397         auto buffer = castToObject<Buffer>(clMemObj);
1398         if (!buffer)
1399             return CL_INVALID_MEM_OBJECT;
1400 
1401         if (buffer->peekSharingHandler()) {
1402             usingSharedObjArgs = true;
1403         }
1404         patchBufferOffset(argAsPtr, nullptr, nullptr);
1405 
1406         if (isValidOffset(argAsPtr.stateless)) {
1407             auto patchLocation = ptrOffset(crossThreadData, argAsPtr.stateless);
1408             uint64_t addressToPatch = buffer->setArgStateless(patchLocation, argAsPtr.pointerSize, rootDeviceIndex, !this->isBuiltIn);
1409 
1410             if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
1411                 PatchInfoData patchInfoData(addressToPatch - buffer->getOffset(), static_cast<uint64_t>(buffer->getOffset()),
1412                                             PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData),
1413                                             static_cast<uint64_t>(argAsPtr.stateless),
1414                                             PatchInfoAllocationType::IndirectObjectHeap, argAsPtr.pointerSize);
1415                 this->patchInfoDataList.push_back(patchInfoData);
1416             }
1417         }
1418 
1419         bool disableL3 = false;
1420         bool forceNonAuxMode = false;
1421         bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);
1422         auto graphicsAllocation = buffer->getGraphicsAllocation(rootDeviceIndex);
1423         auto &hwInfo = pClDevice->getHardwareInfo();
1424         auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
1425 
1426         if (isAuxTranslationKernel) {
1427             if (((AuxTranslationDirection::AuxToNonAux == auxTranslationDirection) && argIndex == 1) ||
1428                 ((AuxTranslationDirection::NonAuxToAux == auxTranslationDirection) && argIndex == 0)) {
1429                 forceNonAuxMode = true;
1430             }
1431             disableL3 = (argIndex == 0);
1432         } else if (graphicsAllocation->isCompressionEnabled() && clHwHelper.requiresNonAuxMode(argAsPtr, hwInfo)) {
1433             forceNonAuxMode = true;
1434         }
1435 
1436         if (isValidOffset(argAsPtr.bindful)) {
1437             buffer->setArgStateful(ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful), forceNonAuxMode,
1438                                    disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
1439                                    kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1440         } else if (isValidOffset(argAsPtr.bindless)) {
1441             buffer->setArgStateful(patchBindlessSurfaceState(graphicsAllocation, argAsPtr.bindless), forceNonAuxMode,
1442                                    disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
1443                                    kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1444         }
1445 
1446         kernelArguments[argIndex].isStatelessUncacheable = argAsPtr.isPureStateful() ? false : buffer->isMemObjUncacheable();
1447 
1448         auto allocationForCacheFlush = graphicsAllocation;
1449 
1450         // If the object is made uncacheable for surface state access and there are no stateless accesses, there is no need to flush caches.
1451         if (buffer->isMemObjUncacheableForSurfaceState() && argAsPtr.isPureStateful()) {
1452             allocationForCacheFlush = nullptr;
1453         }
1454 
1455         addAllocationToCacheFlushVector(argIndex, allocationForCacheFlush);
1456 
1457         return CL_SUCCESS;
1458     } else {
1459         storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
1460         if (isValidOffset(argAsPtr.stateless)) {
1461             auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
1462             patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, 0u);
1463         }
1464 
1465         if (isValidOffset(argAsPtr.bindful)) {
1466             auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
1467             Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0,
1468                                     kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1469         }
1470 
1471         return CL_SUCCESS;
1472     }
1473 }
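// Host-side usage sketch (standard OpenCL API, shown only for context; error handling omitted):
//
//   cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, nullptr, &errcode);
//   clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer); // routed to setArgBuffer above
//   clSetKernelArg(kernel, 1, sizeof(cl_mem), nullptr); // NULL buffer: stateless slot patched to 0
//                                                       // and a null surface state is programmed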
1474 
1475 cl_int Kernel::setArgPipe(uint32_t argIndex,
1476                           size_t argSize,
1477                           const void *argVal) {
1478 
1479     if (argSize != sizeof(cl_mem *)) {
1480         return CL_INVALID_ARG_SIZE;
1481     }
1482 
1483     auto clMem = reinterpret_cast<const cl_mem *>(argVal);
1484 
1485     if (clMem && *clMem) {
1486         auto clMemObj = *clMem;
1487         DBG_LOG_INPUTS("setArgPipe cl_mem", clMemObj);
1488 
1489         storeKernelArg(argIndex, PIPE_OBJ, clMemObj, argVal, argSize);
1490 
1491         auto memObj = castToObject<MemObj>(clMemObj);
1492         if (!memObj) {
1493             return CL_INVALID_MEM_OBJECT;
1494         }
1495 
1496         auto pipe = castToObject<Pipe>(clMemObj);
1497         if (!pipe) {
1498             return CL_INVALID_ARG_VALUE;
1499         }
1500 
1501         if (memObj->getContext() != &(this->getContext())) {
1502             return CL_INVALID_MEM_OBJECT;
1503         }
1504 
1505         auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1506         const auto &argAsPtr = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();
1507 
1508         auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
1509         pipe->setPipeArg(patchLocation, argAsPtr.pointerSize, rootDeviceIndex);
1510 
1511         if (isValidOffset(argAsPtr.bindful)) {
1512             auto graphicsAllocation = pipe->getGraphicsAllocation(rootDeviceIndex);
1513             auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
1514             Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false,
1515                                     pipe->getSize(), pipe->getCpuAddress(), 0,
1516                                     graphicsAllocation, 0, 0,
1517                                     kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1518         }
1519 
1520         return CL_SUCCESS;
1521     } else {
1522         return CL_INVALID_MEM_OBJECT;
1523     }
1524 }
1525 
1526 cl_int Kernel::setArgImage(uint32_t argIndex,
1527                            size_t argSize,
1528                            const void *argVal) {
1529     return setArgImageWithMipLevel(argIndex, argSize, argVal, 0u);
1530 }
1531 
1532 cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
1533                                        size_t argSize,
1534                                        const void *argVal, uint32_t mipLevel) {
1535     auto retVal = CL_INVALID_ARG_VALUE;
1536     auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1537 
1538     const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1539     const auto &argAsImg = arg.as<ArgDescImage>();
1540 
1541     uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
1542     auto clMemObj = *(static_cast<const cl_mem *>(argVal));
1543     auto pImage = castToObject<Image>(clMemObj);
1544 
1545     if (pImage && argSize == sizeof(cl_mem *)) {
1546         if (pImage->peekSharingHandler()) {
1547             usingSharedObjArgs = true;
1548         }
1549 
1550         DBG_LOG_INPUTS("setArgImage cl_mem", clMemObj);
1551 
1552         storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);
1553 
1554         void *surfaceState = nullptr;
1555         if (isValidOffset(argAsImg.bindless)) {
1556             surfaceState = patchBindlessSurfaceState(pImage->getGraphicsAllocation(rootDeviceIndex), argAsImg.bindless);
1557         } else {
1558             DEBUG_BREAK_IF(isUndefinedOffset(argAsImg.bindful));
1559             surfaceState = ptrOffset(getSurfaceStateHeap(), argAsImg.bindful);
1560         }
1561 
1562         // Program the surface state (SS) structure for the image
1563         if (arg.getExtendedTypeInfo().isMediaImage) {
1564             DEBUG_BREAK_IF(!kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme);
1565             pImage->setMediaImageArg(surfaceState, rootDeviceIndex);
1566         } else {
1567             pImage->setImageArg(surfaceState, arg.getExtendedTypeInfo().isMediaBlockImage, mipLevel, rootDeviceIndex,
1568                                 getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics);
1569         }
1570 
1571         auto &imageDesc = pImage->getImageDesc();
1572         auto &imageFormat = pImage->getImageFormat();
1573         auto graphicsAllocation = pImage->getGraphicsAllocation(rootDeviceIndex);
1574 
1575         if (imageDesc.image_type == CL_MEM_OBJECT_IMAGE3D) {
1576             imageTransformer->registerImage3d(argIndex);
1577         }
1578 
1579         patch<uint32_t, cl_uint>(imageDesc.num_samples, crossThreadData, argAsImg.metadataPayload.numSamples);
1580         patch<uint32_t, cl_uint>(imageDesc.num_mip_levels, crossThreadData, argAsImg.metadataPayload.numMipLevels);
1581         patch<uint32_t, uint64_t>(imageDesc.image_width, crossThreadData, argAsImg.metadataPayload.imgWidth);
1582         patch<uint32_t, uint64_t>(imageDesc.image_height, crossThreadData, argAsImg.metadataPayload.imgHeight);
1583         patch<uint32_t, uint64_t>(imageDesc.image_depth, crossThreadData, argAsImg.metadataPayload.imgDepth);
1584         patch<uint32_t, uint64_t>(imageDesc.image_array_size, crossThreadData, argAsImg.metadataPayload.arraySize);
1585         patch<uint32_t, cl_channel_type>(imageFormat.image_channel_data_type, crossThreadData, argAsImg.metadataPayload.channelDataType);
1586         patch<uint32_t, cl_channel_order>(imageFormat.image_channel_order, crossThreadData, argAsImg.metadataPayload.channelOrder);
1587         if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
1588             const auto &explicitArgsExtendedDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
1589             UNRECOVERABLE_IF(argIndex >= explicitArgsExtendedDescriptors.size());
1590             auto deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(explicitArgsExtendedDescriptors[argIndex].get());
1591             patch<uint32_t, uint32_t>(argAsImg.bindful, crossThreadData, deviceSideEnqueueDescriptor->objectId);
1592         }
1593 
1594         auto pixelSize = pImage->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
1595         patch<uint64_t, uint64_t>(graphicsAllocation->getGpuAddress(), crossThreadData, argAsImg.metadataPayload.flatBaseOffset);
1596         patch<uint32_t, uint64_t>((imageDesc.image_width * pixelSize) - 1, crossThreadData, argAsImg.metadataPayload.flatWidth);
1597         patch<uint32_t, uint64_t>((imageDesc.image_height * pixelSize) - 1, crossThreadData, argAsImg.metadataPayload.flatHeight);
1598         patch<uint32_t, uint64_t>(imageDesc.image_row_pitch - 1, crossThreadData, argAsImg.metadataPayload.flatPitch);
1599 
1600         addAllocationToCacheFlushVector(argIndex, graphicsAllocation);
1601         retVal = CL_SUCCESS;
1602     }
1603 
1604     return retVal;
1605 }
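// Worked example (illustrative): for a 1024x512 CL_RGBA / CL_UNORM_INT8 image (4 bytes per pixel)
// with a 4096-byte row pitch, the flat-image metadata patched above becomes
//   flatWidth  = 1024 * 4 - 1 = 4095
//   flatHeight =  512 * 4 - 1 = 2047
//   flatPitch  = 4096 - 1     = 4095
// while flatBaseOffset carries the allocation's GPU virtual address.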
1606 
1607 cl_int Kernel::setArgImmediate(uint32_t argIndex,
1608                                size_t argSize,
1609                                const void *argVal) {
1610 
1611     auto retVal = CL_INVALID_ARG_VALUE;
1612 
1613     if (argVal) {
1614         storeKernelArg(argIndex, NONE_OBJ, nullptr, nullptr, argSize);
1615 
1616         [[maybe_unused]] auto crossThreadDataEnd = ptrOffset(crossThreadData, crossThreadDataSize);
1617         const auto &argAsVal = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescValue>();
1618         for (const auto &element : argAsVal.elements) {
1619             DEBUG_BREAK_IF(element.size <= 0);
1620 
1621             auto pDst = ptrOffset(crossThreadData, element.offset);
1622             auto pSrc = ptrOffset(argVal, element.sourceOffset);
1623 
1624             DEBUG_BREAK_IF(!(ptrOffset(pDst, element.size) <= crossThreadDataEnd));
1625 
1626             if (element.sourceOffset < argSize) {
1627                 size_t maxBytesToCopy = argSize - element.sourceOffset;
1628                 size_t bytesToCopy = std::min(static_cast<size_t>(element.size), maxBytesToCopy);
1629                 memcpy_s(pDst, element.size, pSrc, bytesToCopy);
1630             }
1631         }
1632 
1633         retVal = CL_SUCCESS;
1634     }
1635 
1636     return retVal;
1637 }
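// Illustrative example: a by-value argument may be described by several ArgDescValue elements,
// each with its own sourceOffset into the caller's data and its own crossThreadData offset.
// For a hypothetical
//
//   struct Params { cl_int a; cl_float b; }; // set via clSetKernelArg(kernel, n, sizeof(Params), &p)
//
// the loop above copies the element {size 4, sourceOffset 0} and the element {size 4, sourceOffset 4}
// independently, clamping each copy to the number of bytes the application actually provided.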
1638 
1639 cl_int Kernel::setArgSampler(uint32_t argIndex,
1640                              size_t argSize,
1641                              const void *argVal) {
1642     auto retVal = CL_INVALID_SAMPLER;
1643 
1644     if (!argVal) {
1645         return retVal;
1646     }
1647 
1648     uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
1649     auto clSamplerObj = *(static_cast<const cl_sampler *>(argVal));
1650     auto pSampler = castToObject<Sampler>(clSamplerObj);
1651 
1652     if (pSampler) {
1653         pSampler->incRefInternal();
1654     }
1655 
1656     if (kernelArguments.at(argIndex).object) {
1657         auto oldSampler = castToObject<Sampler>(kernelArguments.at(argIndex).object);
1658         UNRECOVERABLE_IF(!oldSampler);
1659         oldSampler->decRefInternal();
1660     }
1661 
1662     if (pSampler && argSize == sizeof(cl_sampler *)) {
1663         const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1664         const auto &argAsSmp = arg.as<ArgDescSampler>();
1665 
1666         storeKernelArg(argIndex, SAMPLER_OBJ, clSamplerObj, argVal, argSize);
1667 
1668         auto dsh = getDynamicStateHeap();
1669         auto samplerState = ptrOffset(dsh, argAsSmp.bindful);
1670 
1671         pSampler->setArg(const_cast<void *>(samplerState), clDevice.getHardwareInfo());
1672 
1673         patch<uint32_t, uint32_t>(pSampler->getSnapWaValue(), crossThreadData, argAsSmp.metadataPayload.samplerSnapWa);
1674         patch<uint32_t, uint32_t>(GetAddrModeEnum(pSampler->addressingMode), crossThreadData, argAsSmp.metadataPayload.samplerAddressingMode);
1675         patch<uint32_t, uint32_t>(GetNormCoordsEnum(pSampler->normalizedCoordinates), crossThreadData, argAsSmp.metadataPayload.samplerNormalizedCoords);
1676         if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
1677             const auto &explicitArgsExtendedDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
1678             UNRECOVERABLE_IF(argIndex >= explicitArgsExtendedDescriptors.size());
1679             auto deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(explicitArgsExtendedDescriptors[argIndex].get());
1680             patch<uint32_t, uint32_t>(SAMPLER_OBJECT_ID_SHIFT + argAsSmp.bindful, crossThreadData, deviceSideEnqueueDescriptor->objectId);
1681         }
1682 
1683         retVal = CL_SUCCESS;
1684     }
1685 
1686     return retVal;
1687 }
1688 
1689 cl_int Kernel::setArgAccelerator(uint32_t argIndex,
1690                                  size_t argSize,
1691                                  const void *argVal) {
1692     auto retVal = CL_INVALID_ARG_VALUE;
1693 
1694     if (argSize != sizeof(cl_accelerator_intel)) {
1695         return CL_INVALID_ARG_SIZE;
1696     }
1697 
1698     if (!argVal) {
1699         return retVal;
1700     }
1701 
1702     auto clAcceleratorObj = *(static_cast<const cl_accelerator_intel *>(argVal));
1703     DBG_LOG_INPUTS("setArgAccelerator cl_mem", clAcceleratorObj);
1704 
1705     const auto pAccelerator = castToObject<IntelAccelerator>(clAcceleratorObj);
1706 
1707     if (pAccelerator) {
1708         storeKernelArg(argIndex, ACCELERATOR_OBJ, clAcceleratorObj, argVal, argSize);
1709 
1710         const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1711         const auto &argAsSmp = arg.as<ArgDescSampler>();
1712 
1713         if (argAsSmp.samplerType == iOpenCL::SAMPLER_OBJECT_VME) {
1714 
1715             const auto pVmeAccelerator = castToObjectOrAbort<VmeAccelerator>(pAccelerator);
1716             auto pDesc = static_cast<const cl_motion_estimation_desc_intel *>(pVmeAccelerator->getDescriptor());
1717             DEBUG_BREAK_IF(!pDesc);
1718 
1719             if (arg.getExtendedTypeInfo().hasVmeExtendedDescriptor) {
1720                 const auto &explicitArgsExtendedDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
1721                 UNRECOVERABLE_IF(argIndex >= explicitArgsExtendedDescriptors.size());
1722                 auto vmeDescriptor = static_cast<ArgDescVme *>(explicitArgsExtendedDescriptors[argIndex].get());
1723 
1724                 auto pVmeMbBlockTypeDst = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->mbBlockType));
1725                 *pVmeMbBlockTypeDst = pDesc->mb_block_type;
1726 
1727                 auto pVmeSubpixelMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->subpixelMode));
1728                 *pVmeSubpixelMode = pDesc->subpixel_mode;
1729 
1730                 auto pVmeSadAdjustMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->sadAdjustMode));
1731                 *pVmeSadAdjustMode = pDesc->sad_adjust_mode;
1732 
1733                 auto pVmeSearchPathType = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->searchPathType));
1734                 *pVmeSearchPathType = pDesc->search_path_type;
1735             }
1736 
1737             retVal = CL_SUCCESS;
1738         } else if (argAsSmp.samplerType == iOpenCL::SAMPLER_OBJECT_VE) {
1739             retVal = CL_SUCCESS;
1740         }
1741     }
1742 
1743     return retVal;
1744 }
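// Context note: for a VME accelerator argument, the fields of the application-provided
// cl_motion_estimation_desc_intel (mb_block_type, subpixel_mode, sad_adjust_mode, search_path_type)
// are copied verbatim into the cross-thread data at the offsets named by the kernel's extended
// VME descriptor; no surface state is programmed in this function.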
1745 
1746 cl_int Kernel::setArgDevQueue(uint32_t argIndex,
1747                               size_t argSize,
1748                               const void *argVal) {
1749     if (argVal == nullptr) {
1750         return CL_INVALID_ARG_VALUE;
1751     }
1752 
1753     if (argSize != sizeof(cl_command_queue)) {
1754         return CL_INVALID_ARG_SIZE;
1755     }
1756 
1757     auto clDeviceQueue = *(static_cast<const device_queue *>(argVal));
1758     auto pDeviceQueue = castToObject<DeviceQueue>(clDeviceQueue);
1759 
1760     if (pDeviceQueue == nullptr) {
1761         return CL_INVALID_DEVICE_QUEUE;
1762     }
1763 
1764     storeKernelArg(argIndex, DEVICE_QUEUE_OBJ, clDeviceQueue, argVal, argSize);
1765 
1766     const auto &argAsPtr = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();
1767     auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), argAsPtr.stateless);
1768     patchWithRequiredSize(patchLocation, argAsPtr.pointerSize,
1769                           static_cast<uintptr_t>(pDeviceQueue->getQueueBuffer()->getGpuAddressToPatch()));
1770 
1771     return CL_SUCCESS;
1772 }
1773 
1774 void Kernel::setKernelArgHandler(uint32_t argIndex, KernelArgHandler handler) {
1775     if (kernelArgHandlers.size() <= argIndex) {
1776         kernelArgHandlers.resize(argIndex + 1);
1777     }
1778 
1779     kernelArgHandlers[argIndex] = handler;
1780 }
1781 
1782 void Kernel::unsetArg(uint32_t argIndex) {
1783     if (kernelArguments[argIndex].isPatched) {
1784         patchedArgumentsNum--;
1785         kernelArguments[argIndex].isPatched = false;
1786         if (kernelArguments[argIndex].isStatelessUncacheable) {
1787             statelessUncacheableArgsCount--;
1788             kernelArguments[argIndex].isStatelessUncacheable = false;
1789         }
1790     }
1791 }
1792 
1793 void Kernel::createReflectionSurface() {
1794     auto pClDevice = &clDevice;
1795     if (this->isParentKernel && kernelReflectionSurface == nullptr) {
1796         auto &hwInfo = pClDevice->getHardwareInfo();
1797         auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
1798         BlockKernelManager *blockManager = program->getBlockKernelManager();
1799         uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
1800 
1801         ObjectCounts objectCount;
1802         getParentObjectCounts(objectCount);
1803         uint32_t parentImageCount = objectCount.imageCount;
1804         uint32_t parentSamplerCount = objectCount.samplerCount;
1805         size_t maxConstantBufferSize = 0;
1806 
1807         std::vector<IGIL_KernelCurbeParams> *curbeParamsForBlocks = new std::vector<IGIL_KernelCurbeParams>[blockCount];
1808 
1809         uint64_t *tokenMask = new uint64_t[blockCount];
1810         uint32_t *sshTokenOffsetsFromKernelData = new uint32_t[blockCount];
1811 
1812         size_t kernelReflectionSize = alignUp(sizeof(IGIL_KernelDataHeader) + blockCount * sizeof(IGIL_KernelAddressData), sizeof(void *));
1813         uint32_t kernelDataOffset = static_cast<uint32_t>(kernelReflectionSize);
1814         uint32_t parentSSHAlignedSize = alignUp(this->kernelInfo.heapInfo.SurfaceStateHeapSize, hwHelper.getBindingTableStateAlignement());
1815         uint32_t btOffset = parentSSHAlignedSize;
1816 
1817         for (uint32_t i = 0; i < blockCount; i++) {
1818             const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
1819             size_t samplerStateAndBorderColorSize = 0;
1820 
1821             uint32_t firstSSHTokenIndex = 0;
1822 
1823             ReflectionSurfaceHelper::getCurbeParams(curbeParamsForBlocks[i], tokenMask[i], firstSSHTokenIndex, *pBlockInfo, hwInfo);
1824 
1825             maxConstantBufferSize = std::max(maxConstantBufferSize, static_cast<size_t>(pBlockInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize));
1826 
1827             samplerStateAndBorderColorSize = pBlockInfo->getSamplerStateArraySize(hwInfo);
1828             samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, Sampler::samplerStateArrayAlignment);
1829             samplerStateAndBorderColorSize += pBlockInfo->getBorderColorStateSize();
1830             samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, sizeof(void *));
1831 
1832             sshTokenOffsetsFromKernelData[i] = offsetof(IGIL_KernelData, m_data) + sizeof(IGIL_KernelCurbeParams) * firstSSHTokenIndex;
1833 
1834             kernelReflectionSize += alignUp(sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams) * curbeParamsForBlocks[i].size(), sizeof(void *));
1835             kernelReflectionSize += parentSamplerCount * sizeof(IGIL_SamplerParams) + samplerStateAndBorderColorSize;
1836         }
1837 
1838         maxConstantBufferSize = alignUp(maxConstantBufferSize, sizeof(void *));
1839         kernelReflectionSize += blockCount * alignUp(maxConstantBufferSize, sizeof(void *));
1840         kernelReflectionSize += parentImageCount * sizeof(IGIL_ImageParamters);
1841         kernelReflectionSize += parentSamplerCount * sizeof(IGIL_ParentSamplerParams);
1842         kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
1843             {pClDevice->getRootDeviceIndex(), kernelReflectionSize,
1844              GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER,
1845              pClDevice->getDeviceBitfield()});
1846 
1847         for (uint32_t i = 0; i < blockCount; i++) {
1848             const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
1849             uint32_t newKernelDataOffset = ReflectionSurfaceHelper::setKernelData(kernelReflectionSurface->getUnderlyingBuffer(),
1850                                                                                   kernelDataOffset,
1851                                                                                   curbeParamsForBlocks[i],
1852                                                                                   tokenMask[i],
1853                                                                                   maxConstantBufferSize,
1854                                                                                   parentSamplerCount,
1855                                                                                   *pBlockInfo,
1856                                                                                   hwInfo);
1857 
1858             uint32_t offset = static_cast<uint32_t>(offsetof(IGIL_KernelDataHeader, m_data) + sizeof(IGIL_KernelAddressData) * i);
1859 
1860             uint32_t samplerHeapOffset = static_cast<uint32_t>(alignUp(kernelDataOffset + sizeof(IGIL_KernelData) + curbeParamsForBlocks[i].size() * sizeof(IGIL_KernelCurbeParams), sizeof(void *)));
1861             uint32_t samplerHeapSize = static_cast<uint32_t>(alignUp(pBlockInfo->getSamplerStateArraySize(hwInfo), Sampler::samplerStateArrayAlignment) + pBlockInfo->getBorderColorStateSize());
1862             uint32_t constantBufferOffset = alignUp(samplerHeapOffset + samplerHeapSize, sizeof(void *));
1863 
1864             uint32_t samplerParamsOffset = 0;
1865             if (parentSamplerCount) {
1866                 samplerParamsOffset = newKernelDataOffset - sizeof(IGIL_SamplerParams) * parentSamplerCount;
1867                 IGIL_SamplerParams *pSamplerParams = (IGIL_SamplerParams *)ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerParamsOffset);
1868                 uint32_t sampler = 0;
1869                 const auto &args = pBlockInfo->kernelDescriptor.payloadMappings.explicitArgs;
1870                 for (uint32_t argID = 0; argID < args.size(); argID++) {
1871                     if (args[argID].is<ArgDescriptor::ArgTSampler>()) {
1872 
1873                         pSamplerParams[sampler].m_ArgID = argID;
1874                         pSamplerParams[sampler].m_SamplerStateOffset = args[argID].as<ArgDescSampler>().bindful;
1875                         sampler++;
1876                     }
1877                 }
1878             }
1879 
1880             ReflectionSurfaceHelper::setKernelAddressData(kernelReflectionSurface->getUnderlyingBuffer(),
1881                                                           offset,
1882                                                           kernelDataOffset,
1883                                                           samplerHeapOffset,
1884                                                           constantBufferOffset,
1885                                                           samplerParamsOffset,
1886                                                           sshTokenOffsetsFromKernelData[i] + kernelDataOffset,
1887                                                           btOffset,
1888                                                           *pBlockInfo,
1889                                                           hwInfo);
1890 
1891             if (samplerHeapSize > 0) {
1892                 void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerHeapOffset);
1893                 const void *pSrc = ptrOffset(pBlockInfo->heapInfo.pDsh, pBlockInfo->getBorderColorOffset());
1894                 memcpy_s(pDst, samplerHeapSize, pSrc, samplerHeapSize);
1895             }
1896 
1897             void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), constantBufferOffset);
1898             const char *pSrc = pBlockInfo->crossThreadData;
1899             memcpy_s(pDst, pBlockInfo->getConstantBufferSize(), pSrc, pBlockInfo->getConstantBufferSize());
1900 
1901             btOffset += pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset;
1902             kernelDataOffset = newKernelDataOffset;
1903         }
1904 
1905         uint32_t samplerOffset = 0;
1906         if (parentSamplerCount) {
1907             samplerOffset = kernelDataOffset + parentImageCount * sizeof(IGIL_ImageParamters);
1908         }
1909         ReflectionSurfaceHelper::setKernelDataHeader(kernelReflectionSurface->getUnderlyingBuffer(), blockCount, parentImageCount, parentSamplerCount, kernelDataOffset, samplerOffset);
1910         delete[] curbeParamsForBlocks;
1911         delete[] tokenMask;
1912         delete[] sshTokenOffsetsFromKernelData;
1913 
1914         // Patch constant values once after reflection surface creation
1915         patchBlocksCurbeWithConstantValues();
1916     }
1917 
1918     if (DebugManager.flags.ForceDispatchScheduler.get()) {
1919         if (this->isSchedulerKernel && kernelReflectionSurface == nullptr) {
1920             kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
1921                 {pClDevice->getRootDeviceIndex(), MemoryConstants::pageSize,
1922                  GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER,
1923                  pClDevice->getDeviceBitfield()});
1924         }
1925     }
1926 }
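// Layout sketch of the reflection surface built above, derived from the offset arithmetic and
// shown for orientation only:
//
//   IGIL_KernelDataHeader
//   IGIL_KernelAddressData[blockCount]
//   per block: IGIL_KernelData + IGIL_KernelCurbeParams[...] (pointer-size aligned)
//              sampler heap copy (border color + sampler states)
//              constant buffer copy (maxConstantBufferSize, pointer-size aligned)
//              IGIL_SamplerParams[parentSamplerCount]
//   IGIL_ImageParamters[parentImageCount]
//   IGIL_ParentSamplerParams[parentSamplerCount]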
1927 
1928 void Kernel::getParentObjectCounts(ObjectCounts &objectCount) {
1929     objectCount.imageCount = 0;
1930     objectCount.samplerCount = 0;
1931     DEBUG_BREAK_IF(!isParentKernel);
1932 
1933     for (const auto &arg : this->kernelArguments) {
1934         if (arg.type == SAMPLER_OBJ) {
1935             objectCount.samplerCount++;
1936         } else if (arg.type == IMAGE_OBJ) {
1937             objectCount.imageCount++;
1938         }
1939     }
1940 }
1941 
1942 bool Kernel::hasPrintfOutput() const {
1943     return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesPrintf;
1944 }
1945 
1946 size_t Kernel::getInstructionHeapSizeForExecutionModel() const {
1947     BlockKernelManager *blockManager = program->getBlockKernelManager();
1948     uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
1949 
1950     size_t totalSize = 0;
1951     if (isParentKernel) {
1952         totalSize = kernelBinaryAlignment - 1; // for initial alignment
1953         for (uint32_t i = 0; i < blockCount; i++) {
1954             const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
1955             totalSize += pBlockInfo->heapInfo.KernelHeapSize;
1956             totalSize = alignUp(totalSize, kernelBinaryAlignment);
1957         }
1958     }
1959     return totalSize;
1960 }
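// Sizing note: the running total starts at kernelBinaryAlignment - 1 so the first block's ISA can
// be aligned up without under-allocating, and each block's heap size is rounded up afterwards.
// Illustrative arithmetic, assuming a 64-byte kernelBinaryAlignment: a single block of 0x1234 bytes
// reserves alignUp(63 + 0x1234, 64) = 0x1280 bytes.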
1961 
1962 void Kernel::patchBlocksCurbeWithConstantValues() {
1963     auto rootDeviceIndex = clDevice.getRootDeviceIndex();
1964     BlockKernelManager *blockManager = program->getBlockKernelManager();
1965     uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
1966 
1967     uint64_t globalMemoryGpuAddress = program->getGlobalSurface(rootDeviceIndex) != nullptr ? program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch() : 0;
1968     uint64_t constantMemoryGpuAddress = program->getConstantSurface(rootDeviceIndex) != nullptr ? program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch() : 0;
1969 
1970     for (uint32_t blockID = 0; blockID < blockCount; blockID++) {
1971         const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(blockID);
1972 
1973         uint64_t globalMemoryCurbeOffset = ReflectionSurfaceHelper::undefinedOffset;
1974         uint32_t globalMemoryPatchSize = 0;
1975         uint64_t constantMemoryCurbeOffset = ReflectionSurfaceHelper::undefinedOffset;
1976         uint32_t constantMemoryPatchSize = 0;
1977 
1978         if (isValidOffset(pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
1979             globalMemoryCurbeOffset = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless;
1980             globalMemoryPatchSize = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.pointerSize;
1981         }
1982 
1983         if (isValidOffset(pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
1984             constantMemoryCurbeOffset = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless;
1985             constantMemoryPatchSize = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.pointerSize;
1986         }
1987 
1988         ReflectionSurfaceHelper::patchBlocksCurbeWithConstantValues(kernelReflectionSurface->getUnderlyingBuffer(), blockID,
1989                                                                     globalMemoryCurbeOffset, globalMemoryPatchSize, globalMemoryGpuAddress,
1990                                                                     constantMemoryCurbeOffset, constantMemoryPatchSize, constantMemoryGpuAddress,
1991                                                                     ReflectionSurfaceHelper::undefinedOffset, 0, 0);
1992     }
1993 }
1994 
1995 void Kernel::ReflectionSurfaceHelper::getCurbeParams(std::vector<IGIL_KernelCurbeParams> &curbeParamsOut, uint64_t &tokenMaskOut, uint32_t &firstSSHTokenIndex, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
1996     const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
1997     const auto gpuPointerSize = kernelInfo.kernelDescriptor.kernelAttributes.gpuPointerSize;
1998     uint32_t bindingTableIndex = 253;
1999     uint64_t tokenMask = 0;
2000 
2001     for (size_t argNum = 0; argNum < args.size(); argNum++) {
2002         const auto &arg = args[argNum];
2003 
2004         auto sizeOfKernelArgForSSH = gpuPointerSize;
2005         bindingTableIndex = 253;
2006 
2007         if (arg.is<ArgDescriptor::ArgTPointer>()) {
2008             const auto &argAsPtr = arg.as<ArgDescPointer>();
2009 
2010             if (argAsPtr.requiredSlmAlignment) {
2011                 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES, 0, argAsPtr.slmOffset, argAsPtr.requiredSlmAlignment});
2012                 tokenMask |= shiftLeftBy(DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES);
2013             } else {
2014                 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{COMPILER_DATA_PARAMETER_GLOBAL_SURFACE, gpuPointerSize, argAsPtr.stateless, static_cast<uint>(argNum)});
2015                 tokenMask |= shiftLeftBy(63);
2016             }
2017         } else if (arg.is<ArgDescriptor::ArgTImage>()) {
2018             const auto &argAsImg = arg.as<ArgDescImage>();
2019 
2020             auto emplaceIfValidOffset = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2021                 if (isValidOffset(offset)) {
2022                     curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType + 50, sizeof(uint32_t), offset, static_cast<uint>(argNum)});
2023                 }
2024             };
2025             emplaceIfValidOffset(DATA_PARAMETER_IMAGE_WIDTH, argAsImg.metadataPayload.imgWidth);
2026             emplaceIfValidOffset(DATA_PARAMETER_IMAGE_HEIGHT, argAsImg.metadataPayload.imgHeight);
2027             emplaceIfValidOffset(DATA_PARAMETER_IMAGE_DEPTH, argAsImg.metadataPayload.imgDepth);
2028             emplaceIfValidOffset(DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE, argAsImg.metadataPayload.channelDataType);
2029             emplaceIfValidOffset(DATA_PARAMETER_IMAGE_CHANNEL_ORDER, argAsImg.metadataPayload.channelOrder);
2030             emplaceIfValidOffset(DATA_PARAMETER_IMAGE_ARRAY_SIZE, argAsImg.metadataPayload.arraySize);
2031             if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
2032                 const auto &argsExtDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
2033                 UNRECOVERABLE_IF(argNum >= argsExtDescriptors.size());
2034                 const auto &deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(argsExtDescriptors[argNum].get());
2035                 emplaceIfValidOffset(DATA_PARAMETER_OBJECT_ID, deviceSideEnqueueDescriptor->objectId);
2036             }
2037 
2038             const auto &bindingTable = kernelInfo.kernelDescriptor.payloadMappings.bindingTable;
2039             if (isValidOffset(bindingTable.tableOffset)) {
2040                 auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
2041                 const auto ssh = static_cast<const char *>(kernelInfo.heapInfo.pSsh) + bindingTable.tableOffset;
2042 
2043                 for (uint8_t i = 0; i < bindingTable.numEntries; i++) {
2044                     const auto pointer = static_cast<NEO::SurfaceStateHeapOffset>(hwHelper.getBindingTableStateSurfaceStatePointer(ssh, i));
2045                     if (pointer == argAsImg.bindful) {
2046                         bindingTableIndex = i;
2047                         break;
2048                     }
2049                 }
2050                 DEBUG_BREAK_IF(bindingTableIndex == 253);
2051             }
2052 
2053             tokenMask |= shiftLeftBy(50);
2054         } else if (arg.is<ArgDescriptor::ArgTSampler>()) {
2055             const auto &argAsSmp = arg.as<ArgDescSampler>();
2056 
2057             auto emplaceIfValidOffset = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2058                 if (isValidOffset(offset)) {
2059                     curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType + 100, sizeof(uint32_t), offset, static_cast<uint>(argNum)});
2060                 }
2061             };
2062             emplaceIfValidOffset(DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED, argAsSmp.metadataPayload.samplerSnapWa);
2063             emplaceIfValidOffset(DATA_PARAMETER_SAMPLER_ADDRESS_MODE, argAsSmp.metadataPayload.samplerAddressingMode);
2064             emplaceIfValidOffset(DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS, argAsSmp.metadataPayload.samplerNormalizedCoords);
2065             if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
2066                 const auto &argsExtDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
2067                 UNRECOVERABLE_IF(argNum >= argsExtDescriptors.size());
2068                 const auto &deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(argsExtDescriptors[argNum].get());
2069                 emplaceIfValidOffset(DATA_PARAMETER_OBJECT_ID, deviceSideEnqueueDescriptor->objectId);
2070             }
2071 
2072             tokenMask |= shiftLeftBy(51);
2073         } else {
2074             bindingTableIndex = 0;
2075             sizeOfKernelArgForSSH = 0;
2076         }
2077 
2078         curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{1024, sizeOfKernelArgForSSH, bindingTableIndex, static_cast<uint>(argNum)});
2079     }
2080 
2081     for (const auto &param : kernelInfo.kernelDescriptor.kernelMetadata.allByValueKernelArguments) {
2082         curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_KERNEL_ARGUMENT, param.byValueElement.size, param.byValueElement.offset, param.argNum});
2083         tokenMask |= shiftLeftBy(DATA_PARAMETER_KERNEL_ARGUMENT);
2084     }
2085 
2086     const auto &dispatchTraits = kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits;
2087     for (uint32_t i = 0; i < 3U; i++) {
2088         auto emplaceIfValidOffsetAndSetTokenMask = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2089             constexpr uint paramSize = sizeof(uint32_t);
2090             if (isValidOffset(offset)) {
2091                 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType, paramSize, offset, static_cast<uint>(i * paramSize)});
2092                 tokenMask |= shiftLeftBy(parameterType);
2093             }
2094         };
2095         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_LOCAL_WORK_SIZE, dispatchTraits.localWorkSize[i]);
2096         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_LOCAL_WORK_SIZE, dispatchTraits.localWorkSize2[i]);
2097         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_GLOBAL_WORK_OFFSET, dispatchTraits.globalWorkOffset[i]);
2098         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE, dispatchTraits.enqueuedLocalWorkSize[i]);
2099         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_GLOBAL_WORK_SIZE, dispatchTraits.globalWorkSize[i]);
2100         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_NUM_WORK_GROUPS, dispatchTraits.numWorkGroups[i]);
2101     }
2102     {
2103         const auto &payloadMappings = kernelInfo.kernelDescriptor.payloadMappings;
2104         auto emplaceIfValidOffsetAndSetTokenMask = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2105             if (isValidOffset(offset)) {
2106                 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType, sizeof(uint32_t), offset, 0});
2107                 tokenMask |= shiftLeftBy(parameterType);
2108             }
2109         };
2110         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_PARENT_EVENT, payloadMappings.implicitArgs.deviceSideEnqueueParentEvent);
2111         emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_WORK_DIMENSIONS, payloadMappings.dispatchTraits.workDim);
2112     }
2113 
2114     std::sort(curbeParamsOut.begin(), curbeParamsOut.end(), compareFunction);
2115     tokenMaskOut = tokenMask;
2116     firstSSHTokenIndex = static_cast<uint32_t>(curbeParamsOut.size() - args.size());
2117 }
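// Note on the ordering above: one token of type 1024 is emplaced per explicit argument, and the
// final std::sort(compareFunction) is relied upon to move those SSH tokens to the tail of the
// vector, which is why firstSSHTokenIndex can be computed as curbeParamsOut.size() - args.size().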
2118 
2119 uint32_t Kernel::ReflectionSurfaceHelper::setKernelData(void *reflectionSurface, uint32_t offset,
2120                                                         std::vector<IGIL_KernelCurbeParams> &curbeParamsIn, uint64_t tokenMaskIn,
2121                                                         size_t maxConstantBufferSize, size_t samplerCount, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
2122     uint32_t offsetToEnd = 0;
2123     IGIL_KernelData *kernelData = reinterpret_cast<IGIL_KernelData *>(ptrOffset(reflectionSurface, offset));
2124     size_t samplerHeapSize = alignUp(kernelInfo.getSamplerStateArraySize(hwInfo), Sampler::samplerStateArrayAlignment) + kernelInfo.getBorderColorStateSize();
2125 
2126     kernelData->m_numberOfCurbeParams = static_cast<uint32_t>(curbeParamsIn.size()); // number of parameters to patch
2127     kernelData->m_numberOfCurbeTokens = static_cast<uint32_t>(curbeParamsIn.size() - kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size());
2128     kernelData->m_numberOfSamplerStates = static_cast<uint32_t>(kernelInfo.getSamplerStateArrayCount());
2129     kernelData->m_SizeOfSamplerHeap = static_cast<uint32_t>(samplerHeapSize);
2130     kernelData->m_SamplerBorderColorStateOffsetOnDSH = isValidOffset(kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor) ? kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor : 0;
2131     kernelData->m_SamplerStateArrayOffsetOnDSH = isValidOffset(kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset) ? kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset : -1;
2132     kernelData->m_sizeOfConstantBuffer = kernelInfo.getConstantBufferSize();
2133     kernelData->m_PatchTokensMask = tokenMaskIn;
2134     kernelData->m_ScratchSpacePatchValue = 0;
2135     kernelData->m_SIMDSize = kernelInfo.getMaxSimdSize();
2136     kernelData->m_HasBarriers = kernelInfo.kernelDescriptor.kernelAttributes.barrierCount;
2137     kernelData->m_RequiredWkgSizes[0] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
2138     kernelData->m_RequiredWkgSizes[1] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
2139     kernelData->m_RequiredWkgSizes[2] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
2140     kernelData->m_InilineSLMSize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize;
2141 
2142     bool localIdRequired = false;
2143     if (kernelInfo.kernelDescriptor.kernelAttributes.flags.usesFlattenedLocalIds || (kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels > 0)) {
2144         localIdRequired = true;
2145     }
2146     kernelData->m_PayloadSize = PerThreadDataHelper::getThreadPayloadSize(kernelInfo.kernelDescriptor, hwInfo.capabilityTable.grfSize);
2147 
2148     kernelData->m_NeedLocalIDS = localIdRequired ? 1 : 0;
2149     kernelData->m_DisablePreemption = 0u;
2150 
2151     bool concurrentExecAllowed = true;
2152 
2153     if (kernelInfo.kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize > 0) {
2154         concurrentExecAllowed = false;
2155     }
2156     kernelData->m_CanRunConcurently = concurrentExecAllowed ? 1 : 0;
2157 
2158     if (DebugManager.flags.DisableConcurrentBlockExecution.get()) {
2159         kernelData->m_CanRunConcurently = false;
2160     }
2161 
2162     IGIL_KernelCurbeParams *kernelCurbeParams = kernelData->m_data;
2163 
2164     for (uint32_t i = 0; i < curbeParamsIn.size(); i++) {
2165         kernelCurbeParams[i] = curbeParamsIn[i];
2166     }
2167 
2168     offsetToEnd = static_cast<uint32_t>(offset +
2169                                         alignUp(sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams) * curbeParamsIn.size(), sizeof(void *)) +
2170                                         alignUp(samplerHeapSize, sizeof(void *)) +
2171                                         alignUp(maxConstantBufferSize, sizeof(void *)) +
2172                                         sizeof(IGIL_SamplerParams) * samplerCount);
2173 
2174     return offsetToEnd;
2175 }
2176 
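// setKernelAddressDataBtOffset() updates only the binding table offset (m_BTSoffset)
// of the address-data entry for the given block inside the reflection surface header.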
2177 void Kernel::ReflectionSurfaceHelper::setKernelAddressDataBtOffset(void *reflectionSurface, uint32_t blockID, uint32_t btOffset) {
2178 
2179     uint32_t offset = static_cast<uint32_t>(offsetof(IGIL_KernelDataHeader, m_data) + sizeof(IGIL_KernelAddressData) * blockID);
2180     IGIL_KernelAddressData *kernelAddressData = reinterpret_cast<IGIL_KernelAddressData *>(ptrOffset(reflectionSurface, offset));
2181 
2182     kernelAddressData->m_BTSoffset = btOffset;
2183 }
2184 
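// setKernelAddressData() fills the complete IGIL_KernelAddressData entry for a block:
// offsets of the kernel data, sampler heap, sampler params, constant buffer, SSH tokens
// and binding table, plus the binding table size derived from the descriptor's entry
// count and the per-platform binding table state size.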
2185 void Kernel::ReflectionSurfaceHelper::setKernelAddressData(void *reflectionSurface, uint32_t offset, uint32_t kernelDataOffset, uint32_t samplerHeapOffset,
2186                                                            uint32_t constantBufferOffset, uint32_t samplerParamsOffset,
2187                                                            uint32_t sshTokensOffset, uint32_t btOffset, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
2188     IGIL_KernelAddressData *kernelAddressData = reinterpret_cast<IGIL_KernelAddressData *>(ptrOffset(reflectionSurface, offset));
2189 
2190     auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
2191 
2192     kernelAddressData->m_KernelDataOffset = kernelDataOffset;
2193     kernelAddressData->m_SamplerHeapOffset = samplerHeapOffset;
2194     kernelAddressData->m_SamplerParamsOffset = samplerParamsOffset;
2195     kernelAddressData->m_ConstantBufferOffset = constantBufferOffset;
2196     kernelAddressData->m_SSHTokensOffset = sshTokensOffset;
2197     kernelAddressData->m_BTSoffset = btOffset;
2198     kernelAddressData->m_BTSize = static_cast<uint32_t>(kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries * hwHelper.getBindingTableStateSize());
2199 }
2200 
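// The <false> specialization patches a block's constant buffer in place on the
// reflection surface: every offset that differs from undefinedOffset selects a curbe
// location, which is then patched with the matching GPU address (default device queue,
// event pool, device queue, printf buffer, private surface).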
2201 template <>
2202 void Kernel::ReflectionSurfaceHelper::patchBlocksCurbe<false>(void *reflectionSurface, uint32_t blockID,
2203                                                               uint64_t defaultDeviceQueueCurbeOffset, uint32_t patchSizeDefaultQueue, uint64_t defaultDeviceQueueGpuAddress,
2204                                                               uint64_t eventPoolCurbeOffset, uint32_t patchSizeEventPool, uint64_t eventPoolGpuAddress,
2205                                                               uint64_t deviceQueueCurbeOffset, uint32_t patchSizeDeviceQueue, uint64_t deviceQueueGpuAddress,
2206                                                               uint64_t printfBufferOffset, uint32_t patchSizePrintfBuffer, uint64_t printfBufferGpuAddress,
2207                                                               uint64_t privateSurfaceOffset, uint32_t privateSurfaceSize, uint64_t privateSurfaceGpuAddress) {
2208 
2209     IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2210 
2211     // The kernel reflection surface (KRS) must be initialized before the blocks' curbe is patched
2212     DEBUG_BREAK_IF(blockID >= pKernelHeader->m_numberOfKernels);
2213 
2214     IGIL_KernelAddressData *addressData = pKernelHeader->m_data;
2215     // const buffer offsets must be set
2216     DEBUG_BREAK_IF(addressData[blockID].m_ConstantBufferOffset == 0);
2217 
2218     void *pCurbe = ptrOffset(reflectionSurface, addressData[blockID].m_ConstantBufferOffset);
2219 
2220     if (defaultDeviceQueueCurbeOffset != undefinedOffset) {
2221         auto *patchedPointer = ptrOffset(pCurbe, (size_t)defaultDeviceQueueCurbeOffset);
2222         patchWithRequiredSize(patchedPointer, patchSizeDefaultQueue, (uintptr_t)defaultDeviceQueueGpuAddress);
2223     }
2224     if (eventPoolCurbeOffset != undefinedOffset) {
2225         auto *patchedPointer = ptrOffset(pCurbe, (size_t)eventPoolCurbeOffset);
2226         patchWithRequiredSize(patchedPointer, patchSizeEventPool, (uintptr_t)eventPoolGpuAddress);
2227     }
2228     if (deviceQueueCurbeOffset != undefinedOffset) {
2229         auto *patchedPointer = ptrOffset(pCurbe, (size_t)deviceQueueCurbeOffset);
2230         patchWithRequiredSize(patchedPointer, patchSizeDeviceQueue, (uintptr_t)deviceQueueGpuAddress);
2231     }
2232     if (printfBufferOffset != undefinedOffset) {
2233         auto *patchedPointer = ptrOffset(pCurbe, (size_t)printfBufferOffset);
2234         patchWithRequiredSize(patchedPointer, patchSizePrintfBuffer, (uintptr_t)printfBufferGpuAddress);
2235     }
2236 
2237     if (privateSurfaceOffset != undefinedOffset) {
2238         auto *patchedPointer = ptrOffset(pCurbe, (size_t)privateSurfaceOffset);
2239         patchWithRequiredSize(patchedPointer, privateSurfaceSize, (uintptr_t)privateSurfaceGpuAddress);
2240     }
2241 }
2242 
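// Same patching scheme as patchBlocksCurbe(), but for the global, constant and private
// memory surfaces: every offset different from undefinedOffset is patched inside the
// block's constant buffer with the matching GPU address.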
2243 void Kernel::ReflectionSurfaceHelper::patchBlocksCurbeWithConstantValues(void *reflectionSurface, uint32_t blockID,
2244                                                                          uint64_t globalMemoryCurbeOffset, uint32_t globalMemoryPatchSize, uint64_t globalMemoryGpuAddress,
2245                                                                          uint64_t constantMemoryCurbeOffset, uint32_t constantMemoryPatchSize, uint64_t constantMemoryGpuAddress,
2246                                                                          uint64_t privateMemoryCurbeOffset, uint32_t privateMemoryPatchSize, uint64_t privateMemoryGpuAddress) {
2247 
2248     IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2249 
2250     // The kernel reflection surface (KRS) must be initialized before the blocks' curbe is patched
2251     DEBUG_BREAK_IF(blockID >= pKernelHeader->m_numberOfKernels);
2252 
2253     IGIL_KernelAddressData *addressData = pKernelHeader->m_data;
2254     // const buffer offsets must be set
2255     DEBUG_BREAK_IF(addressData[blockID].m_ConstantBufferOffset == 0);
2256 
2257     void *pCurbe = ptrOffset(reflectionSurface, addressData[blockID].m_ConstantBufferOffset);
2258 
2259     if (globalMemoryCurbeOffset != undefinedOffset) {
2260         auto *patchedPointer = ptrOffset(pCurbe, (size_t)globalMemoryCurbeOffset);
2261         patchWithRequiredSize(patchedPointer, globalMemoryPatchSize, (uintptr_t)globalMemoryGpuAddress);
2262     }
2263     if (constantMemoryCurbeOffset != undefinedOffset) {
2264         auto *patchedPointer = ptrOffset(pCurbe, (size_t)constantMemoryCurbeOffset);
2265         patchWithRequiredSize(patchedPointer, constantMemoryPatchSize, (uintptr_t)constantMemoryGpuAddress);
2266     }
2267     if (privateMemoryCurbeOffset != undefinedOffset) {
2268         auto *patchedPointer = ptrOffset(pCurbe, (size_t)privateMemoryCurbeOffset);
2269         patchWithRequiredSize(patchedPointer, privateMemoryPatchSize, (uintptr_t)privateMemoryGpuAddress);
2270     }
2271 }
2272 
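// setParentImageParams() walks the parent kernel's arguments and, for every image
// argument that maps to a valid Image object, appends its geometry (array size, depth,
// height, width, mip levels, samples), format and bindful surface-state offset to the
// IGIL_ImageParamters array stored on the reflection surface.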
2273 void Kernel::ReflectionSurfaceHelper::setParentImageParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo) {
2274     IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2275     IGIL_ImageParamters *pImageParameters = reinterpret_cast<IGIL_ImageParamters *>(ptrOffset(pKernelHeader, (size_t)pKernelHeader->m_ParentImageDataOffset));
2276 
2277     uint32_t numArgs = (uint32_t)parentArguments.size();
2278     for (uint32_t i = 0; i < numArgs; i++) {
2279         if (parentArguments[i].type == Kernel::kernelArgType::IMAGE_OBJ) {
2280             const Image *image = castToObject<Image>((cl_mem)parentArguments[i].object);
2281             if (image) {
2282                 pImageParameters->m_ArraySize = (uint32_t)image->getImageDesc().image_array_size;
2283                 pImageParameters->m_Depth = (uint32_t)image->getImageDesc().image_depth;
2284                 pImageParameters->m_Height = (uint32_t)image->getImageDesc().image_height;
2285                 pImageParameters->m_Width = (uint32_t)image->getImageDesc().image_width;
2286                 pImageParameters->m_NumMipLevels = (uint32_t)image->getImageDesc().num_mip_levels;
2287                 pImageParameters->m_NumSamples = (uint32_t)image->getImageDesc().num_samples;
2288 
2289                 pImageParameters->m_ChannelDataType = (uint32_t)image->getImageFormat().image_channel_data_type;
2290                 pImageParameters->m_ChannelOrder = (uint32_t)image->getImageFormat().image_channel_order;
2291                 pImageParameters->m_ObjectID = (uint32_t)parentKernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i].as<ArgDescImage>().bindful;
2292                 pImageParameters++;
2293             }
2294         }
2295     }
2296 }
2297 
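// Analogous to setParentImageParams(), but for sampler arguments: the snap workaround
// value, addressing mode, normalized-coordinates flag and the sampler object id
// (derived from the bindful offset) are written to the IGIL_ParentSamplerParams array
// on the reflection surface.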
2298 void Kernel::ReflectionSurfaceHelper::setParentSamplerParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo) {
2299     IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2300     IGIL_ParentSamplerParams *pParentSamplerParams = reinterpret_cast<IGIL_ParentSamplerParams *>(ptrOffset(pKernelHeader, (size_t)pKernelHeader->m_ParentSamplerParamsOffset));
2301 
2302     uint32_t numArgs = (uint32_t)parentArguments.size();
2303     for (uint32_t i = 0; i < numArgs; i++) {
2304         if (parentArguments[i].type == Kernel::kernelArgType::SAMPLER_OBJ) {
2305             const Sampler *sampler = castToObject<Sampler>((cl_sampler)parentArguments[i].object);
2306             if (sampler) {
2307                 pParentSamplerParams->CoordinateSnapRequired = (uint32_t)sampler->getSnapWaValue();
2308                 pParentSamplerParams->m_AddressingMode = (uint32_t)sampler->addressingMode;
2309                 pParentSamplerParams->NormalizedCoords = (uint32_t)sampler->normalizedCoordinates;
2310 
2311                 pParentSamplerParams->m_ObjectID = OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID((uint32_t)parentKernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i].as<ArgDescSampler>().bindful);
2312                 pParentSamplerParams++;
2313             }
2314         }
2315     }
2316 }
2317 
2318 void Kernel::resetSharedObjectsPatchAddresses() {
2319     for (size_t i = 0; i < getKernelArgsNumber(); i++) {
2320         auto clMem = (cl_mem)kernelArguments[i].object;
2321         auto memObj = castToObject<MemObj>(clMem);
2322         if (memObj && memObj->peekSharingHandler()) {
2323             setArg((uint32_t)i, sizeof(cl_mem), &clMem);
2324         }
2325     }
2326 }
2327 
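// provideInitializationHints() emits driver diagnostics when the context requests
// performance hints: a private-memory-usage hint if the kernel allocates a private
// surface, and a register-pressure hint when scratch space is required
// (perThreadScratchSize * computeUnitsUsedForScratch * SIMD size).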
2328 void Kernel::provideInitializationHints() {
2329 
2330     Context *context = program->getContextPtr();
2331     if (context == nullptr || !context->isProvidingPerformanceHints())
2332         return;
2333 
2334     auto pClDevice = &getDevice();
2335     if (privateSurfaceSize) {
2336         context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, PRIVATE_MEMORY_USAGE_TOO_HIGH,
2337                                         kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
2338                                         privateSurfaceSize);
2339     }
2340     auto scratchSize = kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0] *
2341                        pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch * kernelInfo.getMaxSimdSize();
2342     if (scratchSize > 0) {
2343         context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, REGISTER_PRESSURE_TOO_HIGH,
2344                                         kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), scratchSize);
2345     }
2346 }
2347 
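// patchDefaultDeviceQueue() patches the device-side-enqueue default queue surface: the
// stateless pointer in cross-thread data (when present) receives the queue buffer's GPU
// address, and the bindful slot (when present) receives a full surface state describing
// the queue buffer.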
2348 void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
2349     const auto &defaultQueueSurfaceAddress = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress;
2350     if (isValidOffset(defaultQueueSurfaceAddress.stateless) && crossThreadData) {
2351         auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), defaultQueueSurfaceAddress.stateless);
2352         patchWithRequiredSize(patchLocation, defaultQueueSurfaceAddress.pointerSize,
2353                               static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
2354     }
2355     if (isValidOffset(defaultQueueSurfaceAddress.bindful)) {
2356         auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), defaultQueueSurfaceAddress.bindful);
2357         Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, false, false, devQueue->getQueueBuffer()->getUnderlyingBufferSize(),
2358                                 (void *)devQueue->getQueueBuffer()->getGpuAddress(), 0, devQueue->getQueueBuffer(), 0, 0,
2359                                 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
2360     }
2361 }
2362 
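// patchEventPool() follows the same pattern as patchDefaultDeviceQueue(), but for the
// device-side-enqueue event pool buffer.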
2363 void Kernel::patchEventPool(DeviceQueue *devQueue) {
2364     const auto &eventPoolSurfaceAddress = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress;
2365 
2366     if (isValidOffset(eventPoolSurfaceAddress.stateless) && crossThreadData) {
2367         auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), eventPoolSurfaceAddress.stateless);
2368         patchWithRequiredSize(patchLocation, eventPoolSurfaceAddress.pointerSize,
2369                               static_cast<uintptr_t>(devQueue->getEventPoolBuffer()->getGpuAddressToPatch()));
2370     }
2371 
2372     if (isValidOffset(eventPoolSurfaceAddress.bindful)) {
2373         auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), eventPoolSurfaceAddress.bindful);
2374         auto eventPoolBuffer = devQueue->getEventPoolBuffer();
2375         Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, false, false, eventPoolBuffer->getUnderlyingBufferSize(),
2376                                 (void *)eventPoolBuffer->getGpuAddress(), 0, eventPoolBuffer, 0, 0,
2377                                 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
2378     }
2379 }
2380 
2381 void Kernel::patchBlocksSimdSize() {
2382     BlockKernelManager *blockManager = program->getBlockKernelManager();
2383 
2384     for (auto &idOffset : kernelInfo.childrenKernelsIdOffset) {
2385 
2386         DEBUG_BREAK_IF(!(idOffset.first < static_cast<uint32_t>(blockManager->getCount())));
2387 
2388         const KernelInfo *blockInfo = blockManager->getBlockKernelInfo(idOffset.first);
2389         uint32_t *simdSize = reinterpret_cast<uint32_t *>(&crossThreadData[idOffset.second]);
2390         *simdSize = blockInfo->getMaxSimdSize();
2391     }
2392 }
2393 
2394 bool Kernel::usesSyncBuffer() const {
2395     return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesSyncBuffer;
2396 }
2397 
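// patchSyncBuffer() patches the sync-buffer implicit argument with the allocation's GPU
// address plus bufferOffset, and programs a bindful surface state for it when the
// kernel descriptor exposes one.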
2398 void Kernel::patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
2399     const auto &syncBuffer = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress;
2400     auto bufferPatchAddress = ptrOffset(crossThreadData, syncBuffer.stateless);
2401     patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize,
2402                           ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));
2403 
2404     if (isValidOffset(syncBuffer.bindful)) {
2405         auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), syncBuffer.bindful);
2406         auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
2407         auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
2408         Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0,
2409                                 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
2410     }
2411 }
2412 
2413 template void Kernel::patchReflectionSurface<false>(DeviceQueue *, PrintfHandler *);
2414 
2415 bool Kernel::isPatched() const {
2416     return patchedArgumentsNum == kernelInfo.kernelDescriptor.kernelAttributes.numArgsToPatch;
2417 }
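// checkCorrectImageAccessQualifier() rejects an image argument whose access qualifier
// contradicts the mem object flags. For example (illustrative only), passing a
// CL_MEM_WRITE_ONLY image to a parameter declared __read_only yields CL_INVALID_ARG_VALUE.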
2418 cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex,
2419                                                 size_t argSize,
2420                                                 const void *argValue) const {
2421     const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
2422     if (arg.is<ArgDescriptor::ArgTImage>()) {
2423         cl_mem mem = *(static_cast<const cl_mem *>(argValue));
2424         MemObj *pMemObj = nullptr;
2425         WithCastToInternal(mem, &pMemObj);
2426         if (pMemObj) {
2427             auto accessQualifier = arg.getTraits().accessQualifier;
2428             cl_mem_flags flags = pMemObj->getFlags();
2429             if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) ||
2430                 (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) {
2431                 return CL_INVALID_ARG_VALUE;
2432             }
2433         } else {
2434             return CL_INVALID_ARG_VALUE;
2435         }
2436     }
2437     return CL_SUCCESS;
2438 }
2439 
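// resolveArgs(): once all arguments are patched, images registered as 3D can be
// transformed to 2D arrays when every sampler argument is transformable; if any sampler
// is not, previously transformed images are converted back to 3D.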
2440 void Kernel::resolveArgs() {
2441     if (!Kernel::isPatched() || !imageTransformer->hasRegisteredImages3d() || !canTransformImages())
2442         return;
2443     bool canTransformImageTo2dArray = true;
2444     const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
2445     for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
2446         if (args[i].is<ArgDescriptor::ArgTSampler>()) {
2447             auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
2448             if (sampler->isTransformable()) {
2449                 canTransformImageTo2dArray = true;
2450             } else {
2451                 canTransformImageTo2dArray = false;
2452                 break;
2453             }
2454         }
2455     }
2456 
2457     if (canTransformImageTo2dArray) {
2458         imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap());
2459     } else if (imageTransformer->didTransform()) {
2460         imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap());
2461     }
2462 }
2463 
2464 bool Kernel::canTransformImages() const {
2465     auto renderCoreFamily = clDevice.getHardwareInfo().platform.eRenderCoreFamily;
2466     return renderCoreFamily >= IGFX_GEN9_CORE && renderCoreFamily <= IGFX_GEN11LP_CORE && !isBuiltIn;
2467 }
2468 
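// fillWithKernelObjsForAuxTranslation() collects every object that may need
// auxiliary-surface (compression) translation before the kernel runs: buffer and SVM
// arguments that are not purely stateful and whose allocations are compressed, and,
// when stateless compression is allowed for the platform, also the kernel's
// unified-memory allocations and all SVM allocations tracked by the context. A
// performance hint is emitted for each object when the context requests hints.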
2469 void Kernel::fillWithKernelObjsForAuxTranslation(KernelObjsForAuxTranslation &kernelObjsForAuxTranslation) {
2470     kernelObjsForAuxTranslation.reserve(getKernelArgsNumber());
2471     for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
2472         const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
2473         if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
2474             auto buffer = castToObject<Buffer>(getKernelArg(i));
2475             if (buffer && buffer->getMultiGraphicsAllocation().getDefaultGraphicsAllocation()->isCompressionEnabled()) {
2476                 kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::MEM_OBJ, buffer});
2477                 auto &context = this->program->getContext();
2478                 if (context.isProvidingPerformanceHints()) {
2479                     const auto &argExtMeta = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[i];
2480                     context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION,
2481                                                    kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, argExtMeta.argName.c_str());
2482                 }
2483             }
2484         }
2485         if (SVM_ALLOC_OBJ == getKernelArguments().at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
2486             auto svmAlloc = reinterpret_cast<GraphicsAllocation *>(const_cast<void *>(getKernelArg(i)));
2487             if (svmAlloc && svmAlloc->isCompressionEnabled()) {
2488                 kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, svmAlloc});
2489                 auto &context = this->program->getContext();
2490                 if (context.isProvidingPerformanceHints()) {
2491                     const auto &argExtMeta = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[i];
2492                     context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION,
2493                                                    kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, argExtMeta.argName.c_str());
2494                 }
2495             }
2496         }
2497     }
2498     const auto &hwInfoConfig = *HwInfoConfig::get(getDevice().getHardwareInfo().platform.eProductFamily);
2499     if (hwInfoConfig.allowStatelessCompression(getDevice().getHardwareInfo())) {
2500         for (auto gfxAllocation : kernelUnifiedMemoryGfxAllocations) {
2501             if (gfxAllocation->isCompressionEnabled()) {
2502                 kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, gfxAllocation});
2503                 auto &context = this->program->getContext();
2504                 if (context.isProvidingPerformanceHints()) {
2505                     context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ALLOCATION_AUX_TRANSLATION,
2506                                                    kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
2507                                                    reinterpret_cast<void *>(gfxAllocation->getGpuAddress()), gfxAllocation->getUnderlyingBufferSize());
2508                 }
2509             }
2510         }
2511         if (getContext().getSVMAllocsManager()) {
2512             for (auto &allocation : getContext().getSVMAllocsManager()->getSVMAllocs()->allocations) {
2513                 auto gfxAllocation = allocation.second.gpuAllocations.getDefaultGraphicsAllocation();
2514                 if (gfxAllocation->isCompressionEnabled()) {
2515                     kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, gfxAllocation});
2516                     auto &context = this->program->getContext();
2517                     if (context.isProvidingPerformanceHints()) {
2518                         context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ALLOCATION_AUX_TRANSLATION,
2519                                                        kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
2520                                                        reinterpret_cast<void *>(gfxAllocation->getGpuAddress()), gfxAllocation->getUnderlyingBufferSize());
2521                     }
2522                 }
2523             }
2524         }
2525     }
2526 }
2527 
2528 bool Kernel::hasDirectStatelessAccessToSharedBuffer() const {
2529     for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
2530         const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
2531         if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
2532             auto buffer = castToObject<Buffer>(getKernelArg(i));
2533             if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == GraphicsAllocation::AllocationType::SHARED_BUFFER) {
2534                 return true;
2535             }
2536         }
2537     }
2538     return false;
2539 }
2540 
2541 bool Kernel::hasDirectStatelessAccessToHostMemory() const {
2542     for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
2543         const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
2544         if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
2545             auto buffer = castToObject<Buffer>(getKernelArg(i));
2546             if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
2547                 return true;
2548             }
2549         }
2550         if (SVM_ALLOC_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
2551             auto svmAlloc = reinterpret_cast<const GraphicsAllocation *>(getKernelArg(i));
2552             if (svmAlloc && svmAlloc->getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
2553                 return true;
2554             }
2555         }
2556     }
2557     return false;
2558 }
2559 
2560 bool Kernel::hasIndirectStatelessAccessToHostMemory() const {
2561     if (!kernelInfo.hasIndirectStatelessAccess) {
2562         return false;
2563     }
2564 
2565     for (auto gfxAllocation : kernelUnifiedMemoryGfxAllocations) {
2566         if (gfxAllocation->getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
2567             return true;
2568         }
2569     }
2570 
2571     if (unifiedMemoryControls.indirectHostAllocationsAllowed) {
2572         return getContext().getSVMAllocsManager()->hasHostAllocations();
2573     }
2574 
2575     return false;
2576 }
2577 
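// getAllocationsForCacheFlush(): when cache flush after walker is supported, gathers
// the allocations whose L3 cache lines must be flushed: argument allocations previously
// recorded in kernelArgRequiresCacheFlush, the program's global surface (if any) and,
// when flagged, the SVM allocations that require a flush.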
2578 void Kernel::getAllocationsForCacheFlush(CacheFlushAllocationsVec &out) const {
2579     if (false == HwHelper::cacheFlushAfterWalkerSupported(getHardwareInfo())) {
2580         return;
2581     }
2582     for (GraphicsAllocation *alloc : this->kernelArgRequiresCacheFlush) {
2583         if (nullptr == alloc) {
2584             continue;
2585         }
2586 
2587         out.push_back(alloc);
2588     }
2589 
2590     auto rootDeviceIndex = getDevice().getRootDeviceIndex();
2591     auto global = getProgram()->getGlobalSurface(rootDeviceIndex);
2592     if (global != nullptr) {
2593         out.push_back(global);
2594     }
2595 
2596     if (svmAllocationsRequireCacheFlush) {
2597         for (GraphicsAllocation *alloc : kernelSvmGfxAllocations) {
2598             if (allocationForCacheFlush(alloc)) {
2599                 out.push_back(alloc);
2600             }
2601         }
2602     }
2603 }
2604 
2605 bool Kernel::allocationForCacheFlush(GraphicsAllocation *argAllocation) const {
2606     return argAllocation->isFlushL3Required();
2607 }
2608 
2609 void Kernel::addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation) {
2610     if (argAllocation == nullptr) {
2611         kernelArgRequiresCacheFlush[argIndex] = nullptr;
2612     } else {
2613         if (allocationForCacheFlush(argAllocation)) {
2614             kernelArgRequiresCacheFlush[argIndex] = argAllocation;
2615         } else {
2616             kernelArgRequiresCacheFlush[argIndex] = nullptr;
2617         }
2618     }
2619 }
2620 
2621 void Kernel::setReflectionSurfaceBlockBtOffset(uint32_t blockID, uint32_t offset) {
2622     DEBUG_BREAK_IF(blockID >= program->getBlockKernelManager()->getCount());
2623     ReflectionSurfaceHelper::setKernelAddressDataBtOffset(getKernelReflectionSurface()->getUnderlyingBuffer(), blockID, offset);
2624 }
2625 
2626 bool Kernel::checkIfIsParentKernelAndBlocksUsesPrintf() {
2627     return isParentKernel && getProgram()->getBlockKernelManager()->getIfBlockUsesPrintf();
2628 }
2629 
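// getKernelStartOffset() computes the ISA start offset for this kernel: the GPU address
// of the kernel heap allocation, skipping the per-thread-data load prologue when local
// ids are not generated by the runtime, plus the kernel's start offset and, when the
// platform workaround requires it, the offset that skips the FFID GP setup.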
2630 uint64_t Kernel::getKernelStartOffset(
2631     const bool localIdsGenerationByRuntime,
2632     const bool kernelUsesLocalIds,
2633     const bool isCssUsed) const {
2634 
2635     uint64_t kernelStartOffset = 0;
2636 
2637     if (kernelInfo.getGraphicsAllocation()) {
2638         kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
2639         if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) {
2640             kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad;
2641         }
2642     }
2643 
2644     kernelStartOffset += getStartOffset();
2645 
2646     auto &hardwareInfo = getHardwareInfo();
2647     auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
2648 
2649     if (isCssUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
2650         kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipSetFFIDGP;
2651     }
2652 
2653     return kernelStartOffset;
2654 }
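// patchBindlessSurfaceState() allocates a surface state slot in the global bindless
// heap for the given allocation, patches the cross-thread data at the bindless offset
// with the extended-message-descriptor value for that slot, and returns a pointer to
// the surface state so the caller can program it.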
2655 void *Kernel::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
2656     auto &hwHelper = HwHelper::get(getDevice().getHardwareInfo().platform.eRenderCoreFamily);
2657     auto surfaceStateSize = hwHelper.getRenderSurfaceStateSize();
2658     NEO::BindlessHeapsHelper *bindlessHeapsHelper = getDevice().getDevice().getBindlessHeapsHelper();
2659     auto ssInHeap = bindlessHeapsHelper->allocateSSInHeap(surfaceStateSize, alloc, NEO::BindlessHeapsHelper::GLOBAL_SSH);
2660     auto patchLocation = ptrOffset(getCrossThreadData(), bindless);
2661     auto patchValue = hwHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(ssInHeap.surfaceStateOffset));
2662     patchWithRequiredSize(patchLocation, sizeof(patchValue), patchValue);
2663     return ssInHeap.ssPtr;
2664 }
2665 
2666 void Kernel::setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo) {
2667     this->additionalKernelExecInfo = additionalKernelExecInfo;
2668 }
2669 
2670 uint32_t Kernel::getAdditionalKernelExecInfo() const {
2671     return this->additionalKernelExecInfo;
2672 }
2673 
2674 bool Kernel::requiresWaDisableRccRhwoOptimization() const {
2675     auto &hardwareInfo = getHardwareInfo();
2676     auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
2677     auto rootDeviceIndex = getDevice().getRootDeviceIndex();
2678 
2679     if (hwHelper.isWaDisableRccRhwoOptimizationRequired() && isUsingSharedObjArgs()) {
2680         for (auto &arg : getKernelArguments()) {
2681             auto clMemObj = static_cast<cl_mem>(arg.object);
2682             auto memObj = castToObject<MemObj>(clMemObj);
2683             if (memObj && memObj->peekSharingHandler()) {
2684                 auto allocation = memObj->getGraphicsAllocation(rootDeviceIndex);
2685                 for (uint32_t handleId = 0u; handleId < allocation->getNumGmms(); handleId++) {
2686                     if (allocation->getGmm(handleId)->gmmResourceInfo->getResourceFlags()->Info.MediaCompressed) {
2687                         return true;
2688                     }
2689                 }
2690             }
2691         }
2692     }
2693     return false;
2694 }
2695 
2696 const HardwareInfo &Kernel::getHardwareInfo() const {
2697     return getDevice().getHardwareInfo();
2698 }
2699 
2700 void Kernel::setWorkDim(uint32_t workDim) {
2701     patchNonPointer(getCrossThreadDataRef(), getDescriptor().payloadMappings.dispatchTraits.workDim, workDim);
2702     if (pImplicitArgs) {
2703         pImplicitArgs->numWorkDim = workDim;
2704     }
2705 }
2706 
2707 void Kernel::setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
2708     patchVecNonPointer(getCrossThreadDataRef(),
2709                        getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset,
2710                        {globalWorkOffsetX, globalWorkOffsetY, globalWorkOffsetZ});
2711     if (pImplicitArgs) {
2712         pImplicitArgs->globalOffsetX = globalWorkOffsetX;
2713         pImplicitArgs->globalOffsetY = globalWorkOffsetY;
2714         pImplicitArgs->globalOffsetZ = globalWorkOffsetZ;
2715     }
2716 }
2717 
2718 void Kernel::setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
2719     patchVecNonPointer(getCrossThreadDataRef(),
2720                        getDescriptor().payloadMappings.dispatchTraits.globalWorkSize,
2721                        {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ});
2722     if (pImplicitArgs) {
2723         pImplicitArgs->globalSizeX = globalWorkSizeX;
2724         pImplicitArgs->globalSizeY = globalWorkSizeY;
2725         pImplicitArgs->globalSizeZ = globalWorkSizeZ;
2726     }
2727 }
2728 
2729 void Kernel::setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
2730     patchVecNonPointer(getCrossThreadDataRef(),
2731                        getDescriptor().payloadMappings.dispatchTraits.localWorkSize,
2732                        {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
2733     if (pImplicitArgs) {
2734         pImplicitArgs->localSizeX = localWorkSizeX;
2735         pImplicitArgs->localSizeY = localWorkSizeY;
2736         pImplicitArgs->localSizeZ = localWorkSizeZ;
2737     }
2738 }
2739 
2740 void Kernel::setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
2741     patchVecNonPointer(getCrossThreadDataRef(),
2742                        getDescriptor().payloadMappings.dispatchTraits.localWorkSize2,
2743                        {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
2744 }
2745 
2746 void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
2747     patchVecNonPointer(getCrossThreadDataRef(),
2748                        getDescriptor().payloadMappings.dispatchTraits.enqueuedLocalWorkSize,
2749                        {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
2750 }
2751 
2752 void Kernel::setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {
2753     patchVecNonPointer(getCrossThreadDataRef(),
2754                        getDescriptor().payloadMappings.dispatchTraits.numWorkGroups,
2755                        {numWorkGroupsX, numWorkGroupsY, numWorkGroupsZ});
2756     if (pImplicitArgs) {
2757         pImplicitArgs->groupCountX = numWorkGroupsX;
2758         pImplicitArgs->groupCountY = numWorkGroupsY;
2759         pImplicitArgs->groupCountZ = numWorkGroupsZ;
2760     }
2761 }
2762 
2763 bool Kernel::isLocalWorkSize2Patchable() {
2764     const auto &localWorkSize2 = getDescriptor().payloadMappings.dispatchTraits.localWorkSize2;
2765     return isValidOffset(localWorkSize2[0]) && isValidOffset(localWorkSize2[1]) && isValidOffset(localWorkSize2[2]);
2766 }
2767 
2768 uint32_t Kernel::getMaxKernelWorkGroupSize() const {
2769     return maxKernelWorkGroupSize;
2770 }
2771 
2772 uint32_t Kernel::getSlmTotalSize() const {
2773     return slmTotalSize;
2774 }
2775 
2776 bool Kernel::areMultipleSubDevicesInContext() const {
2777     auto context = program->getContextPtr();
2778     return context ? context->containsMultipleSubDevices(clDevice.getRootDeviceIndex()) : false;
2779 }
2780 
2781 void Kernel::reconfigureKernel() {
2782     auto &kernelDescriptor = kernelInfo.kernelDescriptor;
2783     if (kernelDescriptor.kernelAttributes.numGrfRequired == GrfConfig::LargeGrfNumber) {
2784         maxKernelWorkGroupSize >>= 1;
2785     }
2786     this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
2787     this->specialPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSpecialPipelineSelectMode;
2788 }
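// requiresCacheFlushCommand(): a cache flush command is emitted only when the platform
// supports cache flush after walker, the queue requests it, the CSR is not
// multi-OS-context capable, the context spans multiple sub-devices and is not the
// default context, and the kernel actually touches flushable surfaces (global surface,
// flagged SVM allocations, or arguments recorded in kernelArgRequiresCacheFlush). A
// debug flag can force the decision either way.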
2789 bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
2790     if (false == HwHelper::cacheFlushAfterWalkerSupported(commandQueue.getDevice().getHardwareInfo())) {
2791         return false;
2792     }
2793 
2794     if (DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get() != -1) {
2795         return !!DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get();
2796     }
2797 
2798     bool cmdQueueRequiresCacheFlush = commandQueue.getRequiresCacheFlushAfterWalker();
2799     if (false == cmdQueueRequiresCacheFlush) {
2800         return false;
2801     }
2802     if (commandQueue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()) {
2803         return false;
2804     }
2805     bool isMultiDevice = commandQueue.getContext().containsMultipleSubDevices(commandQueue.getDevice().getRootDeviceIndex());
2806     if (false == isMultiDevice) {
2807         return false;
2808     }
2809     bool isDefaultContext = (commandQueue.getContext().peekContextType() == ContextType::CONTEXT_TYPE_DEFAULT);
2810     if (true == isDefaultContext) {
2811         return false;
2812     }
2813 
2814     if (getProgram()->getGlobalSurface(commandQueue.getDevice().getRootDeviceIndex()) != nullptr) {
2815         return true;
2816     }
2817     if (svmAllocationsRequireCacheFlush) {
2818         return true;
2819     }
2820     size_t args = kernelArgRequiresCacheFlush.size();
2821     for (size_t i = 0; i < args; i++) {
2822         if (kernelArgRequiresCacheFlush[i] != nullptr) {
2823             return true;
2824         }
2825     }
2826     return false;
2827 }
2828 
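// requiresLimitedWorkgroupSize(): limiting the workgroup size is only considered for
// built-in kernels that are not part of aux translation; it is skipped when the first
// argument is a buffer already placed in local memory, or when the first argument is an
// image.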
2829 bool Kernel::requiresLimitedWorkgroupSize() const {
2830     if (!this->isBuiltIn) {
2831         return false;
2832     }
2833     if (this->auxTranslationDirection != AuxTranslationDirection::None) {
2834         return false;
2835     }
2836 
2837     // If the source buffer resides in local memory, there is no need to limit the workgroup size
2838     if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTPointer>()) {
2839         if (this->getKernelArgInfo(0).object) {
2840             auto rootDeviceIndex = getDevice().getRootDeviceIndex();
2841             auto buffer = castToObject<Buffer>(this->getKernelArgInfo(0u).object);
2842             if (buffer && buffer->getGraphicsAllocation(rootDeviceIndex)->getMemoryPool() == MemoryPool::LocalMemory) {
2843                 return false;
2844             }
2845         }
2846     }
2847 
2848     // If we are reading from an image, there is no need to limit the workgroup size
2849     if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTImage>()) {
2850         return false;
2851     }
2852 
2853     return true;
2854 }
2855 
2856 void Kernel::updateAuxTranslationRequired() {
2857     const auto &hwInfoConfig = *HwInfoConfig::get(getDevice().getHardwareInfo().platform.eProductFamily);
2858     if (hwInfoConfig.allowStatelessCompression(getDevice().getHardwareInfo())) {
2859         if (hasDirectStatelessAccessToHostMemory() ||
2860             hasIndirectStatelessAccessToHostMemory() ||
2861             hasDirectStatelessAccessToSharedBuffer()) {
2862             setAuxTranslationRequired(true);
2863         }
2864     }
2865 }
2866 
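// setKernelThreadArbitrationPolicy() maps the CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_*_INTEL
// values onto the internal ThreadArbitrationPolicy enum. For example (illustrative only),
// passing CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL selects
// ThreadArbitrationPolicy::RoundRobin; unsupported platforms return CL_INVALID_DEVICE
// and unknown values return CL_INVALID_VALUE, both leaving the policy as NotPresent.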
2867 int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
2868     auto &hwInfo = clDevice.getHardwareInfo();
2869     auto &hwHelper = NEO::ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
2870     if (!hwHelper.isSupportedKernelThreadArbitrationPolicy()) {
2871         this->threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
2872         return CL_INVALID_DEVICE;
2873     } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL) {
2874         this->threadArbitrationPolicy = ThreadArbitrationPolicy::RoundRobin;
2875     } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_OLDEST_FIRST_INTEL) {
2876         this->threadArbitrationPolicy = ThreadArbitrationPolicy::AgeBased;
2877     } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_AFTER_DEPENDENCY_ROUND_ROBIN_INTEL ||
2878                policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_STALL_BASED_ROUND_ROBIN_INTEL) {
2879         this->threadArbitrationPolicy = ThreadArbitrationPolicy::RoundRobinAfterDependency;
2880     } else {
2881         this->threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
2882         return CL_INVALID_VALUE;
2883     }
2884     return CL_SUCCESS;
2885 }
2886 
2887 } // namespace NEO
2888