/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "opencl/source/kernel/kernel.h"

#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device_binary_format/patchtokens_decoder.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/get_info.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/per_thread_data.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/surface_format_info.h"
#include "shared/source/kernel/kernel_arg_descriptor_extended_device_side_enqueue.h"
#include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/program/kernel_info.h"

#include "opencl/source/accelerators/intel_accelerator.h"
#include "opencl/source/accelerators/intel_motion_estimation.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/command_queue/cl_local_work_size.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/context/context.h"
#include "opencl/source/device_queue/device_queue.h"
#include "opencl/source/execution_model/device_enqueue.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/helpers/cl_hw_helper.h"
#include "opencl/source/helpers/dispatch_info.h"
#include "opencl/source/helpers/get_info_status_mapper.h"
#include "opencl/source/helpers/sampler_helpers.h"
#include "opencl/source/kernel/image_transformer.h"
#include "opencl/source/kernel/kernel.inl"
#include "opencl/source/kernel/kernel_info_cl.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/mem_obj/pipe.h"
#include "opencl/source/memory_manager/mem_obj_surface.h"
#include "opencl/source/platform/platform.h"
#include "opencl/source/program/block_kernel_manager.h"
#include "opencl/source/sampler/sampler.h"

#include "patch_list.h"

#include <algorithm>
#include <cstdint>
#include <vector>

using namespace iOpenCL;

namespace NEO {
class Surface;

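// 0xbaddf00d is a recognizable poison pattern; presumably it marks patch locations that are
// expected to be overwritten with a real value before use.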
uint32_t Kernel::dummyPatchLocation = 0xbaddf00d;

Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &clDeviceArg, bool schedulerKernel)
    : isParentKernel(kernelInfoArg.kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue),
      isSchedulerKernel(schedulerKernel),
      executionEnvironment(programArg->getExecutionEnvironment()),
      program(programArg),
      clDevice(clDeviceArg),
      kernelInfo(kernelInfoArg) {
    program->retain();
    program->retainForKernel();
    imageTransformer.reset(new ImageTransformer);
    auto &deviceInfo = getDevice().getDevice().getDeviceInfo();
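    // SIMD-1 kernels are limited by the number of threads available in a (dual) subslice,
    // not by the generic device maximum workgroup size.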
    if (kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize == 1u) {
        auto &hwInfoConfig = *HwInfoConfig::get(getHardwareInfo().platform.eProductFamily);
        maxKernelWorkGroupSize = hwInfoConfig.getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
    } else {
        maxKernelWorkGroupSize = static_cast<uint32_t>(deviceInfo.maxWorkGroupSize);
    }
    slmTotalSize = kernelInfoArg.kernelDescriptor.kernelAttributes.slmInlineSize;
}

Kernel::~Kernel() {
    delete[] crossThreadData;
    crossThreadData = nullptr;
    crossThreadDataSize = 0;

    if (privateSurface) {
        program->peekExecutionEnvironment().memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(privateSurface);
        privateSurface = nullptr;
    }

    if (kernelReflectionSurface) {
        program->peekExecutionEnvironment().memoryManager->freeGraphicsMemory(kernelReflectionSurface);
        kernelReflectionSurface = nullptr;
    }

    for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
        if (SAMPLER_OBJ == getKernelArguments()[i].type) {
            auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
            if (sampler) {
                sampler->decRefInternal();
            }
        }
    }

    kernelArgHandlers.clear();
    program->releaseForKernel();
    program->release();
}
// If dstOffsetBytes is a valid offset, patches dst at dstOffsetBytes
// with src cast to the DstT type.
template <typename DstT, typename SrcT>
inline void patch(const SrcT &src, void *dst, CrossThreadDataOffset dstOffsetBytes) {
    if (isValidOffset(dstOffsetBytes)) {
        DstT *patchLocation = reinterpret_cast<DstT *>(ptrOffset(dst, dstOffsetBytes));
        *patchLocation = static_cast<DstT>(src);
    }
}

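// Patches an implicit surface into both bindings the kernel may use: the stateless pointer
// in cross-thread data and, when present, the bindful surface state in the SSH.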
void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg) {
    if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) {
        auto pp = ptrOffset(crossThreadData, arg.stateless);
        uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
        patchWithRequiredSize(pp, arg.pointerSize, addressToPatch);
        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
            PatchInfoData patchInfoData(addressToPatch, 0u, PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData), arg.stateless, PatchInfoAllocationType::IndirectObjectHeap, arg.pointerSize);
            this->patchInfoDataList.push_back(patchInfoData);
        }
    }

    void *ssh = getSurfaceStateHeap();
    if ((nullptr != ssh) && isValidOffset(arg.bindful)) {
        auto surfaceState = ptrOffset(ssh, arg.bindful);
        void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
        size_t sizeToPatch = allocation.getUnderlyingBufferSize();
        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }
}

cl_int Kernel::initialize() {
    this->kernelHasIndirectAccess = false;
    auto pClDevice = &getDevice();
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
    reconfigureKernel();
    auto &hwInfo = pClDevice->getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    const auto &implicitArgs = kernelDescriptor.payloadMappings.implicitArgs;
    const auto &explicitArgs = kernelDescriptor.payloadMappings.explicitArgs;
    auto maxSimdSize = kernelInfo.getMaxSimdSize();
    const auto &heapInfo = kernelInfo.heapInfo;

    if (maxSimdSize != 1 && maxSimdSize < hwHelper.getMinimalSIMDSize()) {
        return CL_INVALID_KERNEL;
    }

    if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
        pImplicitArgs = std::make_unique<ImplicitArgs>();
        *pImplicitArgs = {};
        pImplicitArgs->structSize = sizeof(ImplicitArgs);
        pImplicitArgs->structVersion = 0;
        pImplicitArgs->simdWidth = maxSimdSize;
    }

    crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;

    // now allocate our own cross-thread data, if necessary
    if (crossThreadDataSize) {
        crossThreadData = new char[crossThreadDataSize];

        if (kernelInfo.crossThreadData) {
            memcpy_s(crossThreadData, crossThreadDataSize,
                     kernelInfo.crossThreadData, crossThreadDataSize);
        } else {
            memset(crossThreadData, 0x00, crossThreadDataSize);
        }

        auto crossThread = reinterpret_cast<uint32_t *>(crossThreadData);
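        // For each implicit argument with a valid offset, remember where it lives in
        // cross-thread data and write its initial value.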
        auto setArgsIfValidOffset = [&](uint32_t *&crossThreadData, NEO::CrossThreadDataOffset offset, uint32_t value) {
            if (isValidOffset(offset)) {
                crossThreadData = ptrOffset(crossThread, offset);
                *crossThreadData = value;
            }
        };
        setArgsIfValidOffset(maxWorkGroupSizeForCrossThreadData, implicitArgs.maxWorkGroupSize, maxKernelWorkGroupSize);
        setArgsIfValidOffset(dataParameterSimdSize, implicitArgs.simdSize, maxSimdSize);
        setArgsIfValidOffset(preferredWkgMultipleOffset, implicitArgs.preferredWkgMultiple, maxSimdSize);
        setArgsIfValidOffset(parentEventOffset, implicitArgs.deviceSideEnqueueParentEvent, undefined<uint32_t>);
    }

    // allocate our own SSH, if necessary
    sshLocalSize = heapInfo.SurfaceStateHeapSize;
    if (sshLocalSize) {
        pSshLocal = std::make_unique<char[]>(sshLocalSize);

        // copy the SSH into our local copy
        memcpy_s(pSshLocal.get(), sshLocalSize,
                 heapInfo.pSsh, heapInfo.SurfaceStateHeapSize);
    }
    numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries;
    localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;

    // patch cross-thread data and SSH with implicit surfaces, if necessary
    auto status = patchPrivateSurface();
    if (CL_SUCCESS != status) {
        return status;
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
        DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
        uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();

        const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress;
        patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), arg);
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
        DEBUG_BREAK_IF(program->getGlobalSurface(rootDeviceIndex) == nullptr);
        uintptr_t globalMemory = isBuiltIn ? (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();

        const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress;
        patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), arg);
    }

    // Patch Surface State Heap
    bool useGlobalAtomics = kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
                                      kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress.bindful);
        Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0, useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
                                      kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress.bindful);
        Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0, useGlobalAtomics, areMultipleSubDevicesInContext());
    }

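    // Kernels that do not require subgroup independent forward progress can use the
    // age-based thread arbitration policy instead of the platform default.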
    setThreadArbitrationPolicy(hwHelper.getDefaultThreadArbitrationPolicy());
    if (false == kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresSubgroupIndependentForwardProgress) {
        setThreadArbitrationPolicy(ThreadArbitrationPolicy::AgeBased);
    }
    patchBlocksSimdSize();

    auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);

    auxTranslationRequired = !program->getIsBuiltIn() && HwHelper::compressedBuffersSupported(hwInfo) && clHwHelper.requiresAuxResolves(kernelInfo, hwInfo);

    if (DebugManager.flags.ForceAuxTranslationEnabled.get() != -1) {
        auxTranslationRequired &= !!DebugManager.flags.ForceAuxTranslationEnabled.get();
    }
    if (auxTranslationRequired) {
        program->getContextPtr()->setResolvesRequiredInKernels(true);
    }

    if (isParentKernel) {
        program->allocateBlockPrivateSurfaces(*pClDevice);
    }
    if (program->isKernelDebugEnabled() && isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful)) {
        debugEnabled = true;
    }
    auto numArgs = explicitArgs.size();
    slmSizes.resize(numArgs);

    this->kernelHasIndirectAccess |= kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgLoad ||
                                     kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgStore ||
                                     kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgAtomic;

    provideInitializationHints();
    // resolve argument handlers from the decoded kernel info; by this point the binary
    // has been decoded, so the argument count and types are known
    bool usingBuffers = false;
    kernelArguments.resize(numArgs);
    kernelArgHandlers.resize(numArgs);
    kernelArgRequiresCacheFlush.resize(numArgs);

    for (uint32_t i = 0; i < numArgs; ++i) {
        storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);

        // set the argument handler
        const auto &arg = explicitArgs[i];
        if (arg.is<ArgDescriptor::ArgTPointer>()) {
            if (arg.getTraits().addressQualifier == KernelArgMetadata::AddrLocal) {
                kernelArgHandlers[i] = &Kernel::setArgLocal;
            } else if (arg.getTraits().typeQualifiers.pipeQual) {
                kernelArgHandlers[i] = &Kernel::setArgPipe;
                kernelArguments[i].type = PIPE_OBJ;
            } else if (arg.getExtendedTypeInfo().isDeviceQueue) {
                kernelArgHandlers[i] = &Kernel::setArgDevQueue;
                kernelArguments[i].type = DEVICE_QUEUE_OBJ;
            } else {
                kernelArgHandlers[i] = &Kernel::setArgBuffer;
                kernelArguments[i].type = BUFFER_OBJ;
                usingBuffers = true;
                allBufferArgsStateful &= static_cast<uint32_t>(arg.as<ArgDescPointer>().isPureStateful());
            }
        } else if (arg.is<ArgDescriptor::ArgTImage>()) {
            kernelArgHandlers[i] = &Kernel::setArgImage;
            kernelArguments[i].type = IMAGE_OBJ;
            usingImages = true;
        } else if (arg.is<ArgDescriptor::ArgTSampler>()) {
            if (arg.getExtendedTypeInfo().isAccelerator) {
                kernelArgHandlers[i] = &Kernel::setArgAccelerator;
            } else {
                kernelArgHandlers[i] = &Kernel::setArgSampler;
                kernelArguments[i].type = SAMPLER_OBJ;
            }
        } else {
            kernelArgHandlers[i] = &Kernel::setArgImmediate;
        }
    }

    if (usingImages && !usingBuffers) {
        usingImagesOnly = true;
    }

    return CL_SUCCESS;
}

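// Allocates the per-thread private memory surface (if the kernel needs one and it has not
// been allocated yet, e.g. when called again from cloneKernel) and patches it as an
// implicit surface.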
cl_int Kernel::patchPrivateSurface() {
    auto pClDevice = &getDevice();
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
    if (perHwThreadPrivateMemorySize) {
        if (!privateSurface) {
            privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
            DEBUG_BREAK_IF(privateSurfaceSize == 0);

            if (privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
                return CL_OUT_OF_RESOURCES;
            }
            privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
                {rootDeviceIndex,
                 static_cast<size_t>(privateSurfaceSize),
                 GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
                 pClDevice->getDeviceBitfield()});
            if (privateSurface == nullptr) {
                return CL_OUT_OF_RESOURCES;
            }
        }

        const auto &privateMemoryAddress = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
        patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, privateMemoryAddress);
    }
    return CL_SUCCESS;
}

cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
    // copy cross-thread data to carry over arguments that were set on the source kernel with clSetKernelArg as immediate data (non-pointer types)
    memcpy_s(crossThreadData, crossThreadDataSize,
             pSourceKernel->crossThreadData, pSourceKernel->crossThreadDataSize);
    DEBUG_BREAK_IF(pSourceKernel->crossThreadDataSize != crossThreadDataSize);

    [[maybe_unused]] auto status = patchPrivateSurface();
    DEBUG_BREAK_IF(status != CL_SUCCESS);

    // copy arguments set on the source kernel with clSetKernelArg or clSetKernelArgSVMPointer
    for (uint32_t i = 0; i < pSourceKernel->kernelArguments.size(); i++) {
        if (0 == pSourceKernel->getKernelArgInfo(i).size) {
            // skip arguments that have not been set on the source kernel
            continue;
        }
        switch (pSourceKernel->kernelArguments[i].type) {
        case NONE_OBJ:
            // all arguments with immediate data (non-pointer types) have already been copied via cross-thread data
            storeKernelArg(i, NONE_OBJ, nullptr, nullptr, pSourceKernel->getKernelArgInfo(i).size);
            patchedArgumentsNum++;
            kernelArguments[i].isPatched = true;
            break;
        case SVM_OBJ:
            setArgSvm(i, pSourceKernel->getKernelArgInfo(i).size, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                      pSourceKernel->getKernelArgInfo(i).pSvmAlloc, pSourceKernel->getKernelArgInfo(i).svmFlags);
            break;
        case SVM_ALLOC_OBJ:
            setArgSvmAlloc(i, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                           (GraphicsAllocation *)pSourceKernel->getKernelArgInfo(i).object);
            break;
        default:
            setArg(i, pSourceKernel->getKernelArgInfo(i).size, pSourceKernel->getKernelArgInfo(i).value);
            break;
        }
    }

    // copy additional information (other than argument values) set on the source kernel with clSetKernelExecInfo
    for (auto &gfxAlloc : pSourceKernel->kernelSvmGfxAllocations) {
        kernelSvmGfxAllocations.push_back(gfxAlloc);
    }
    for (auto &gfxAlloc : pSourceKernel->kernelUnifiedMemoryGfxAllocations) {
        kernelUnifiedMemoryGfxAllocations.push_back(gfxAlloc);
    }

    if (pImplicitArgs) {
        memcpy_s(pImplicitArgs.get(), sizeof(ImplicitArgs), pSourceKernel->getImplicitArgs(), sizeof(ImplicitArgs));
    }
    this->isBuiltIn = pSourceKernel->isBuiltIn;

    return CL_SUCCESS;
}

cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
                       void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    cl_uint numArgs = 0;
    const _cl_program *prog;
    const _cl_context *ctxt;
    cl_uint refCount = 0;
    uint64_t nonCanonizedGpuAddress = 0llu;

    switch (paramName) {
    case CL_KERNEL_FUNCTION_NAME:
        pSrc = kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str();
        srcSize = kernelInfo.kernelDescriptor.kernelMetadata.kernelName.length() + 1;
        break;

    case CL_KERNEL_NUM_ARGS:
        srcSize = sizeof(cl_uint);
        numArgs = static_cast<cl_uint>(kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size());
        pSrc = &numArgs;
        break;

    case CL_KERNEL_CONTEXT:
        ctxt = &program->getContext();
        srcSize = sizeof(ctxt);
        pSrc = &ctxt;
        break;

    case CL_KERNEL_PROGRAM:
        prog = program;
        srcSize = sizeof(prog);
        pSrc = &prog;
        break;

    case CL_KERNEL_REFERENCE_COUNT:
        refCount = static_cast<cl_uint>(pMultiDeviceKernel->getRefApiCount());
        srcSize = sizeof(refCount);
        pSrc = &refCount;
        break;

    case CL_KERNEL_ATTRIBUTES:
        pSrc = kernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.c_str();
        srcSize = kernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.length() + 1;
        break;

    case CL_KERNEL_BINARY_PROGRAM_INTEL:
        pSrc = getKernelHeap();
        srcSize = getKernelHeapSize();
        break;
    case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL:
        nonCanonizedGpuAddress = GmmHelper::decanonize(kernelInfo.kernelAllocation->getGpuAddress());
        pSrc = &nonCanonizedGpuAddress;
        srcSize = sizeof(nonCanonizedGpuAddress);
        break;
    default:
        getAdditionalInfo(paramName, pSrc, srcSize);
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

cl_int Kernel::getArgInfo(cl_uint argIndex, cl_kernel_arg_info paramName, size_t paramValueSize,
                          void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;

    if (argIndex >= args.size()) {
        retVal = CL_INVALID_ARG_INDEX;
        return retVal;
    }

    const auto &argTraits = args[argIndex].getTraits();
    const auto &argMetadata = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[argIndex];

    cl_kernel_arg_address_qualifier addressQualifier;
    cl_kernel_arg_access_qualifier accessQualifier;
    cl_kernel_arg_type_qualifier typeQualifier;

    switch (paramName) {
    case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
        addressQualifier = asClKernelArgAddressQualifier(argTraits.getAddressQualifier());
        srcSize = sizeof(addressQualifier);
        pSrc = &addressQualifier;
        break;

    case CL_KERNEL_ARG_ACCESS_QUALIFIER:
        accessQualifier = asClKernelArgAccessQualifier(argTraits.getAccessQualifier());
        srcSize = sizeof(accessQualifier);
        pSrc = &accessQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_QUALIFIER:
        typeQualifier = asClKernelArgTypeQualifier(argTraits.typeQualifiers);
        srcSize = sizeof(typeQualifier);
        pSrc = &typeQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_NAME:
        srcSize = argMetadata.type.length() + 1;
        pSrc = argMetadata.type.c_str();
        break;

    case CL_KERNEL_ARG_NAME:
        srcSize = argMetadata.argName.length() + 1;
        pSrc = argMetadata.argName.c_str();
        break;

    default:
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName,
                                size_t paramValueSize, void *paramValue,
                                size_t *paramValueSizeRet) const {
    cl_int retVal = CL_INVALID_VALUE;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    struct size_t3 {
        size_t val[3];
    } requiredWorkGroupSize;
    cl_ulong localMemorySize;
    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    size_t preferredWorkGroupSizeMultiple = 0;
    cl_ulong scratchSize;
    cl_ulong privateMemSize;
    size_t maxWorkgroupSize;
    const auto &hwInfo = clDevice.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    switch (paramName) {
    case CL_KERNEL_WORK_GROUP_SIZE:
        maxWorkgroupSize = maxKernelWorkGroupSize;
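        // with this debug flag, scale the reported maximum down by the ratio of the
        // maximal SIMD size to the kernel's SIMD size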
        if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
            auto divisionSize = CommonConstants::maximalSimdSize / kernelInfo.getMaxSimdSize();
            maxWorkgroupSize /= divisionSize;
        }
        srcSize = sizeof(maxWorkgroupSize);
        pSrc = &maxWorkgroupSize;
        break;

    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
        requiredWorkGroupSize.val[0] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
        requiredWorkGroupSize.val[1] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
        requiredWorkGroupSize.val[2] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
        srcSize = sizeof(requiredWorkGroupSize);
        pSrc = &requiredWorkGroupSize;
        break;

    case CL_KERNEL_LOCAL_MEM_SIZE:
        localMemorySize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize;
        srcSize = sizeof(localMemorySize);
        pSrc = &localMemorySize;
        break;

    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
        preferredWorkGroupSizeMultiple = kernelInfo.getMaxSimdSize();
        if (hwHelper.isFusedEuDispatchEnabled(hwInfo)) {
            preferredWorkGroupSizeMultiple *= 2;
        }
        srcSize = sizeof(preferredWorkGroupSizeMultiple);
        pSrc = &preferredWorkGroupSizeMultiple;
        break;

    case CL_KERNEL_SPILL_MEM_SIZE_INTEL:
        scratchSize = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
        srcSize = sizeof(scratchSize);
        pSrc = &scratchSize;
        break;
    case CL_KERNEL_PRIVATE_MEM_SIZE:
        privateMemSize = clHwHelper.getKernelPrivateMemSize(kernelInfo);
        srcSize = sizeof(privateMemSize);
        pSrc = &privateMemSize;
        break;
    default:
        getAdditionalWorkGroupInfo(paramName, pSrc, srcSize);
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
                               size_t inputValueSize, const void *inputValue,
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet) const {
    size_t numDimensions = 0;
    size_t WGS = 1;
    auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize()));
    auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());

    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    if ((paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) ||
        (paramName == CL_KERNEL_MAX_NUM_SUB_GROUPS) ||
        (paramName == CL_KERNEL_COMPILE_NUM_SUB_GROUPS)) {
        if (clDevice.areOcl21FeaturesEnabled() == false) {
            return CL_INVALID_OPERATION;
        }
    }

    if ((paramName == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR) ||
        (paramName == CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR)) {
        if (!inputValue) {
            return CL_INVALID_VALUE;
        }
        if (inputValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = inputValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    if (paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) {
        if (!paramValue) {
            return CL_INVALID_VALUE;
        }
        if (paramValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = paramValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    switch (paramName) {
    case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(maxSimdSize));
    }
    case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR: {
        for (size_t i = 0; i < numDimensions; i++) {
            WGS *= ((size_t *)inputValue)[i];
        }
        return changeGetInfoStatusToCLResultType(
            info.set<size_t>((WGS / maxSimdSize) + std::min(static_cast<size_t>(1), WGS % maxSimdSize))); // add 1 if WGS % maxSimdSize != 0
    }
    case CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT: {
        auto subGroupsNum = *(size_t *)inputValue;
        auto workGroupSize = subGroupsNum * largestCompiledSIMDSize;
        // return the workgroup size in the first dimension; the remaining dimensions are 1 when a valid size exists
        if (workGroupSize > maxRequiredWorkGroupSize) {
            workGroupSize = 0;
        }
        // if no workgroup size can accommodate the requested number of subgroups, return 0 in each element of the returned array
        switch (numDimensions) {
        case 1:
            return changeGetInfoStatusToCLResultType(info.set<size_t>(workGroupSize));
        case 2:
            struct size_t2 {
                size_t val[2];
            } workGroupSize2;
            workGroupSize2.val[0] = workGroupSize;
            workGroupSize2.val[1] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set<size_t2>(workGroupSize2));
        default:
            struct size_t3 {
                size_t val[3];
            } workGroupSize3;
            workGroupSize3.val[0] = workGroupSize;
            workGroupSize3.val[1] = (workGroupSize > 0) ? 1 : 0;
            workGroupSize3.val[2] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set<size_t3>(workGroupSize3));
        }
    }
    case CL_KERNEL_MAX_NUM_SUB_GROUPS: {
        // round up the maximum number of subgroups
        return changeGetInfoStatusToCLResultType(info.set<size_t>(Math::divideAndRoundUp(maxRequiredWorkGroupSize, largestCompiledSIMDSize)));
    }
    case CL_KERNEL_COMPILE_NUM_SUB_GROUPS: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(static_cast<size_t>(kernelInfo.kernelDescriptor.kernelMetadata.compiledSubGroupsNumber)));
    }
    case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(kernelInfo.kernelDescriptor.kernelMetadata.requiredSubGroupSize));
    }
    default:
        return CL_INVALID_VALUE;
    }
}

const void *Kernel::getKernelHeap() const {
    return kernelInfo.heapInfo.pKernelHeap;
}

size_t Kernel::getKernelHeapSize() const {
    return kernelInfo.heapInfo.KernelHeapSize;
}

void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
    void **pKernelHeap = const_cast<void **>(&pKernelInfo->heapInfo.pKernelHeap);
    *pKernelHeap = newKernelHeap;
    auto &heapInfo = pKernelInfo->heapInfo;
    heapInfo.KernelHeapSize = static_cast<uint32_t>(newKernelHeapSize);
    pKernelInfo->isKernelHeapSubstituted = true;
    auto memoryManager = executionEnvironment.memoryManager.get();

    auto currentAllocationSize = pKernelInfo->kernelAllocation->getUnderlyingBufferSize();
    bool status = false;

    const auto &hwInfo = clDevice.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    size_t isaPadding = hwHelper.getPaddingForISAAllocation();
    if (currentAllocationSize >= newKernelHeapSize + isaPadding) {
        status = MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *pKernelInfo->getGraphicsAllocation()),
                                                                  clDevice.getDevice(), pKernelInfo->getGraphicsAllocation(), 0, newKernelHeap,
                                                                  static_cast<size_t>(newKernelHeapSize));
    } else {
        memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
        pKernelInfo->kernelAllocation = nullptr;
        status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn);
    }
    UNRECOVERABLE_IF(!status);
}

bool Kernel::isKernelHeapSubstituted() const {
    return kernelInfo.isKernelHeapSubstituted;
}

uint64_t Kernel::getKernelId() const {
    return kernelInfo.kernelId;
}

void Kernel::setKernelId(uint64_t newKernelId) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
    pKernelInfo->kernelId = newKernelId;
}

uint32_t Kernel::getStartOffset() const {
    return this->startOffset;
}

void Kernel::setStartOffset(uint32_t offset) {
    this->startOffset = offset;
}

void *Kernel::getSurfaceStateHeap() const {
    return pSshLocal.get();
}

size_t Kernel::getDynamicStateHeapSize() const {
    return kernelInfo.heapInfo.DynamicStateHeapSize;
}

const void *Kernel::getDynamicStateHeap() const {
    return kernelInfo.heapInfo.pDsh;
}

size_t Kernel::getSurfaceStateHeapSize() const {
    return sshLocalSize;
}

size_t Kernel::getNumberOfBindingTableStates() const {
    return numberOfBindingTableStates;
}

void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
    pSshLocal.reset(static_cast<char *>(pNewSsh));
    sshLocalSize = static_cast<uint32_t>(newSshSize);
    numberOfBindingTableStates = newBindingTableCount;
    localBindingTableOffset = newBindingTableOffset;
}

void Kernel::markArgPatchedAndResolveArgs(uint32_t argIndex) {
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
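    // On contexts spanning multiple root devices, track mem-object arguments whose
    // backing allocations may need to migrate to the device that executes the kernel.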
    if (program->getContextPtr() && getContext().getRootDeviceIndices().size() > 1u && Kernel::isMemObj(kernelArguments[argIndex].type) && kernelArguments[argIndex].object) {
        auto argMemObj = castToObjectOrAbort<MemObj>(reinterpret_cast<cl_mem>(kernelArguments[argIndex].object));
        auto memObj = argMemObj->getHighestRootMemObj();
        auto migrateRequiredForArg = memObj->getMultiGraphicsAllocation().requiresMigrations();

        if (migratableArgsMap.find(argIndex) == migratableArgsMap.end() && migrateRequiredForArg) {
            migratableArgsMap.insert({argIndex, memObj});
        } else if (migrateRequiredForArg) {
            migratableArgsMap[argIndex] = memObj;
        } else {
            migratableArgsMap.erase(argIndex);
        }
    }

    resolveArgs();
}

cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
    cl_int retVal = CL_SUCCESS;
    bool updateExposedKernel = true;
    auto argWasUncacheable = false;
    if (kernelInfo.builtinDispatchBuilder != nullptr) {
        updateExposedKernel = kernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
    }
    if (updateExposedKernel) {
        if (argIndex >= kernelArgHandlers.size()) {
            return CL_INVALID_ARG_INDEX;
        }
        argWasUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        auto argHandler = kernelArgHandlers[argIndex];
        retVal = (this->*argHandler)(argIndex, argSize, argVal);
    }
    if (retVal == CL_SUCCESS) {
        auto argIsUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);
        markArgPatchedAndResolveArgs(argIndex);
    }
    return retVal;
}

cl_int Kernel::setArg(uint32_t argIndex, uint32_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, uint64_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal, uint32_t mipLevel) {
    auto retVal = setArgImageWithMipLevel(argIndex, sizeof(argVal), &argVal, mipLevel);
    if (retVal == CL_SUCCESS) {
        markArgPatchedAndResolveArgs(argIndex);
    }
    return retVal;
}

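// When the kernel exposes a buffer-offset implicit argument, align the pointer down to the
// minimum 4-byte alignment, patch the remainder as a 32-bit offset in cross-thread data,
// and return the aligned base that should be programmed into the surface state.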
void *Kernel::patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc) {
    if (isUndefinedOffset(argAsPtr.bufferOffset)) {
        return svmPtr;
    }
    void *ptrToPatch = svmPtr;
    if (svmAlloc != nullptr) {
        ptrToPatch = reinterpret_cast<void *>(svmAlloc->getGpuAddressToPatch());
    }

    constexpr uint32_t minimumAlignment = 4;
    ptrToPatch = alignDown(ptrToPatch, minimumAlignment);
    DEBUG_BREAK_IF(ptrDiff(svmPtr, ptrToPatch) != static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch)));
    uint32_t offsetToPatch = static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch));

    patch<uint32_t, uint32_t>(offsetToPatch, getCrossThreadData(), argAsPtr.bufferOffset);
    return ptrToPatch;
}

cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags) {
    const auto &argAsPtr = getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();

    auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
    patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, reinterpret_cast<uintptr_t>(svmPtr));

    void *ptrToPatch = patchBufferOffset(argAsPtr, svmPtr, svmAlloc);
    if (isValidOffset(argAsPtr.bindful)) {
        auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags);
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
    addAllocationToCacheFlushVector(argIndex, svmAlloc);

    return CL_SUCCESS;
}

cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc) {
    DBG_LOG_INPUTS("setArgBuffer svm_alloc", svmAlloc);

    const auto &argAsPtr = getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();

    auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
    patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, reinterpret_cast<uintptr_t>(svmPtr));

    bool disableL3 = false;
    bool forceNonAuxMode = false;
    bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);
    auto &hwInfo = getDevice().getHardwareInfo();
    auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);

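    // For the aux-translation builtin, the argument holding the non-aux view of the copy
    // must be programmed with aux mode forced off; compressed allocations used by regular
    // kernels may likewise require non-aux programming on some platforms.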
    if (isAuxTranslationKernel) {
        if (((AuxTranslationDirection::AuxToNonAux == auxTranslationDirection) && argIndex == 1) ||
            ((AuxTranslationDirection::NonAuxToAux == auxTranslationDirection) && argIndex == 0)) {
            forceNonAuxMode = true;
        }
        disableL3 = (argIndex == 0);
    } else if (svmAlloc && svmAlloc->isCompressionEnabled() && clHwHelper.requiresNonAuxMode(argAsPtr, hwInfo)) {
        forceNonAuxMode = true;
    }

    bool argWasUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
    bool argIsUncacheable = svmAlloc ? svmAlloc->isUncacheable() : false;
    statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);

    void *ptrToPatch = patchBufferOffset(argAsPtr, svmPtr, svmAlloc);
    if (isValidOffset(argAsPtr.bindful)) {
        auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
        size_t allocSize = 0;
        size_t offset = 0;
        if (svmAlloc != nullptr) {
            allocSize = svmAlloc->getUnderlyingBufferSize();
            offset = ptrDiff(ptrToPatch, svmAlloc->getGpuAddressToPatch());
            allocSize -= offset;
        }
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, forceNonAuxMode, disableL3, allocSize, ptrToPatch, offset, svmAlloc, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t));
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
    addAllocationToCacheFlushVector(argIndex, svmAlloc);

    return CL_SUCCESS;
}

void Kernel::storeKernelArg(uint32_t argIndex, kernelArgType argType, void *argObject,
                            const void *argValue, size_t argSize,
                            GraphicsAllocation *argSvmAlloc, cl_mem_flags argSvmFlags) {
    kernelArguments[argIndex].type = argType;
    kernelArguments[argIndex].object = argObject;
    kernelArguments[argIndex].value = argValue;
    kernelArguments[argIndex].size = argSize;
    kernelArguments[argIndex].pSvmAlloc = argSvmAlloc;
    kernelArguments[argIndex].svmFlags = argSvmFlags;
}

const void *Kernel::getKernelArg(uint32_t argIndex) const {
    return kernelArguments[argIndex].object;
}

const Kernel::SimpleKernelArgInfo &Kernel::getKernelArgInfo(uint32_t argIndex) const {
    return kernelArguments[argIndex];
}

void Kernel::setSvmKernelExecInfo(GraphicsAllocation *argValue) {
    kernelSvmGfxAllocations.push_back(argValue);
    if (allocationForCacheFlush(argValue)) {
        svmAllocationsRequireCacheFlush = true;
    }
}

void Kernel::clearSvmKernelExecInfo() {
    kernelSvmGfxAllocations.clear();
    svmAllocationsRequireCacheFlush = false;
}

void Kernel::setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue) {
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectDeviceAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectHostAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectSharedAllocationsAllowed = infoValue;
        return;
    }
}

void Kernel::setUnifiedMemoryExecInfo(GraphicsAllocation *unifiedMemoryAllocation) {
    kernelUnifiedMemoryGfxAllocations.push_back(unifiedMemoryAllocation);
}

void Kernel::clearUnifiedMemoryExecInfo() {
    kernelUnifiedMemoryGfxAllocations.clear();
}

cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel executionType) {
    switch (executionType) {
    case CL_KERNEL_EXEC_INFO_DEFAULT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Default;
        break;
    case CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Concurrent;
        break;
    default: {
        return CL_INVALID_VALUE;
    }
    }
    return CL_SUCCESS;
}

void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
                                       size_t *localWorkSize) {
    UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
    UNRECOVERABLE_IF(globalWorkSize == nullptr);
    Vec3<size_t> elws{0, 0, 0};
    Vec3<size_t> gws{
        globalWorkSize[0],
        (workDim > 1) ? globalWorkSize[1] : 1,
        (workDim > 2) ? globalWorkSize[2] : 1};
    Vec3<size_t> offset{0, 0, 0};
    if (globalWorkOffset) {
        offset.x = globalWorkOffset[0];
        if (workDim > 1) {
            offset.y = globalWorkOffset[1];
            if (workDim > 2) {
                offset.z = globalWorkOffset[2];
            }
        }
    }

    Vec3<size_t> suggestedLws{0, 0, 0};

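    // A workgroup size required at compile time takes precedence; otherwise derive a
    // heuristic size from the dispatch geometry.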
    if (kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] != 0) {
        suggestedLws.x = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
        suggestedLws.y = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
        suggestedLws.z = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
    } else {
        uint32_t dispatchWorkDim = std::max(1U, std::max(gws.getSimplifiedDim(), offset.getSimplifiedDim()));
        const DispatchInfo dispatchInfo{&clDevice, this, dispatchWorkDim, gws, elws, offset};
        suggestedLws = computeWorkgroupSize(dispatchInfo);
    }

    localWorkSize[0] = suggestedLws.x;
    if (workDim > 1)
        localWorkSize[1] = suggestedLws.y;
    if (workDim > 2)
        localWorkSize[2] = suggestedLws.z;
}

uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const {
    auto &hardwareInfo = getHardwareInfo();
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);

    auto engineGroupType = hwHelper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(),
                                                       commandQueue->getGpgpuEngine().getEngineUsage(), hardwareInfo);

    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
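    // fall back to the subslice count on platforms that do not report dual subslices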
    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
    }
    auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
        hardwareInfo.platform.eProductFamily,
        kernelDescriptor.kernelAttributes.numGrfRequired,
        hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount);

    auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount;
    auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(),
                                                                availableThreadCount,
                                                                dssCount,
                                                                dssCount * KB * hardwareInfo.capabilityTable.slmSize,
                                                                hwHelper.alignSlmSize(slmTotalSize),
                                                                static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
                                                                hwHelper.getBarriersCountFromHasBarriers(barrierCount),
                                                                workDim,
                                                                localWorkSize);
    auto isEngineInstanced = commandQueue->getGpgpuCommandStreamReceiver().getOsContext().isEngineInstanced();
    maxWorkGroupCount = hwHelper.adjustMaxWorkGroupCount(maxWorkGroupCount, engineGroupType, hardwareInfo, isEngineInstanced);
    return maxWorkGroupCount;
}

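// Makes every allocation backing a set kernel argument resident; SVM allocations tracked by
// the page-fault manager are migrated back to the GPU domain first when required.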
inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) {
    auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                auto pageFaultManager = executionEnvironment.memoryManager->getPageFaultManager();
                if (pageFaultManager &&
                    this->isUnifiedMemorySyncRequired) {
                    pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(pSVMAlloc->getGpuAddress()));
                }
                commandStreamReceiver.makeResident(*pSVMAlloc);
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObjectOrAbort<MemObj>(clMem);
                auto image = castToObject<Image>(clMem);
                if (image && image->isImageFromImage()) {
                    commandStreamReceiver.setSamplerCacheFlushRequired(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore);
                }
                commandStreamReceiver.makeResident(*memObj->getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()));
                if (memObj->getMcsAllocation()) {
                    commandStreamReceiver.makeResident(*memObj->getMcsAllocation());
                }
            }
        }
    }
}

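// Full kernel tuning submits each workgroup configuration twice (first with the standard
// multi-subdevice submission, then restricted to a single subdevice), compares the measured
// timestamps, and keeps whichever variant was faster. Simple tuning only checks the
// global-atomics flag.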
void Kernel::performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer) {
    auto performTunning = TunningType::DISABLED;

    if (DebugManager.flags.EnableKernelTunning.get() != -1) {
        performTunning = static_cast<TunningType>(DebugManager.flags.EnableKernelTunning.get());
    }

    if (performTunning == TunningType::SIMPLE) {
        this->singleSubdevicePreferredInCurrentEnqueue = !this->kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;

    } else if (performTunning == TunningType::FULL) {
        KernelConfig config{gws, lws, offsets};

        auto submissionDataIt = this->kernelSubmissionMap.find(config);
        if (submissionDataIt == this->kernelSubmissionMap.end()) {
            KernelSubmissionData submissionData;
            submissionData.kernelStandardTimestamps = std::make_unique<TimestampPacketContainer>();
            submissionData.kernelSubdeviceTimestamps = std::make_unique<TimestampPacketContainer>();
            submissionData.status = TunningStatus::STANDARD_TUNNING_IN_PROGRESS;
            submissionData.kernelStandardTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);
            this->kernelSubmissionMap[config] = std::move(submissionData);
            this->singleSubdevicePreferredInCurrentEnqueue = false;
            return;
        }

        auto &submissionData = submissionDataIt->second;

        if (submissionData.status == TunningStatus::TUNNING_DONE) {
            this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred;
        }

        if (submissionData.status == TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS) {
            if (this->hasTunningFinished(submissionData)) {
                submissionData.status = TunningStatus::TUNNING_DONE;
                submissionData.kernelStandardTimestamps.reset();
                submissionData.kernelSubdeviceTimestamps.reset();
                this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred;
            } else {
                this->singleSubdevicePreferredInCurrentEnqueue = false;
            }
        }

        if (submissionData.status == TunningStatus::STANDARD_TUNNING_IN_PROGRESS) {
            submissionData.status = TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS;
            submissionData.kernelSubdeviceTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);
            this->singleSubdevicePreferredInCurrentEnqueue = true;
        }
    }
}

bool Kernel::hasTunningFinished(KernelSubmissionData &submissionData) {
    if (!this->hasRunFinished(submissionData.kernelStandardTimestamps.get()) ||
        !this->hasRunFinished(submissionData.kernelSubdeviceTimestamps.get())) {
        return false;
    }

    uint64_t globalStartTS = 0u;
    uint64_t globalEndTS = 0u;

    Event::getBoundaryTimestampValues(submissionData.kernelStandardTimestamps.get(), globalStartTS, globalEndTS);
    auto standardTSDiff = globalEndTS - globalStartTS;

    Event::getBoundaryTimestampValues(submissionData.kernelSubdeviceTimestamps.get(), globalStartTS, globalEndTS);
    auto subdeviceTSDiff = globalEndTS - globalStartTS;

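    // prefer the single-subdevice submission when it completed faster than the standard one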
    submissionData.singleSubdevicePreferred = standardTSDiff > subdeviceTSDiff;

    return true;
}

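// A timestamp packet whose context-end field still holds its initial value (1) has not yet
// been completed by the GPU.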
bool Kernel::hasRunFinished(TimestampPacketContainer *timestampContainer) {
    for (const auto &node : timestampContainer->peekNodes()) {
        for (uint32_t i = 0; i < node->getPacketsUsed(); i++) {
            if (node->getContextEndValue(i) == 1) {
                return false;
            }
        }
    }
    return true;
}

bool Kernel::isSingleSubdevicePreferred() const {
    return this->singleSubdevicePreferredInCurrentEnqueue || this->usesSyncBuffer();
}

void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
    auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
    if (privateSurface) {
        commandStreamReceiver.makeResident(*privateSurface);
    }

    if (program->getConstantSurface(rootDeviceIndex)) {
        commandStreamReceiver.makeResident(*(program->getConstantSurface(rootDeviceIndex)));
    }

    if (program->getGlobalSurface(rootDeviceIndex)) {
        commandStreamReceiver.makeResident(*(program->getGlobalSurface(rootDeviceIndex)));
    }

    if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
        commandStreamReceiver.makeResident(*(program->getExportedFunctionsSurface(rootDeviceIndex)));
    }

    for (auto gfxAlloc : kernelSvmGfxAllocations) {
        commandStreamReceiver.makeResident(*gfxAlloc);
    }

    auto pageFaultManager = program->peekExecutionEnvironment().memoryManager->getPageFaultManager();

    for (auto gfxAlloc : kernelUnifiedMemoryGfxAllocations) {
        commandStreamReceiver.makeResident(*gfxAlloc);
        if (pageFaultManager) {
            pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(gfxAlloc->getGpuAddress()));
        }
    }

    if (unifiedMemoryControls.indirectSharedAllocationsAllowed && pageFaultManager) {
        pageFaultManager->moveAllocationsWithinUMAllocsManagerToGpuDomain(this->getContext().getSVMAllocsManager());
    }
    makeArgsResident(commandStreamReceiver);

    auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
    if (kernelIsaAllocation) {
        commandStreamReceiver.makeResident(*kernelIsaAllocation);
    }

    gtpinNotifyMakeResident(this, &commandStreamReceiver);

    if (unifiedMemoryControls.indirectDeviceAllocationsAllowed ||
        unifiedMemoryControls.indirectHostAllocationsAllowed ||
        unifiedMemoryControls.indirectSharedAllocationsAllowed) {
        this->getContext().getSVMAllocsManager()->makeInternalAllocationsResident(commandStreamReceiver, unifiedMemoryControls.generateMask());
    }
}

void Kernel::getResidency(std::vector<Surface *> &dst) {
    if (privateSurface) {
        GeneralSurface *surface = new GeneralSurface(privateSurface);
        dst.push_back(surface);
    }

    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    if (program->getConstantSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getConstantSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    if (program->getGlobalSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getGlobalSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getExportedFunctionsSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    for (auto gfxAlloc : kernelSvmGfxAllocations) {
        GeneralSurface *surface = new GeneralSurface(gfxAlloc);
        dst.push_back(surface);
    }

    auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                dst.push_back(new GeneralSurface(pSVMAlloc));
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObject<MemObj>(clMem);
                DEBUG_BREAK_IF(memObj == nullptr);
                dst.push_back(new MemObjSurface(memObj));
            }
        }
    }

    auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
    if (kernelIsaAllocation) {
        GeneralSurface *surface = new GeneralSurface(kernelIsaAllocation);
        dst.push_back(surface);
    }

    gtpinNotifyUpdateResidencyList(this, &dst);
}

1312 bool Kernel::requiresCoherency() {
1313 auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
1314 for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
1315 if (kernelArguments[argIndex].object) {
1316 if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
1317 auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
1318 if (pSVMAlloc->isCoherent()) {
1319 return true;
1320 }
1321 }
1322
1323 if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
1324 auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
1325 auto memObj = castToObjectOrAbort<MemObj>(clMem);
1326 if (memObj->getMultiGraphicsAllocation().isCoherent()) {
1327 return true;
1328 }
1329 }
1330 }
1331 }
1332 return false;
1333 }
1334
1335 cl_int Kernel::setArgLocal(uint32_t argIndexIn,
1336 size_t argSize,
1337 const void *argVal) {
1338 storeKernelArg(argIndexIn, SLM_OBJ, nullptr, argVal, argSize);
1339 uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
1340 uint32_t argIndex = argIndexIn;
1341
1342 const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
1343 const auto &currArg = args[argIndex];
1344 UNRECOVERABLE_IF(currArg.getTraits().getAddressQualifier() != KernelArgMetadata::AddrLocal);
1345
1346 slmSizes[argIndex] = static_cast<uint32_t>(argSize);
1347
1348 UNRECOVERABLE_IF(isUndefinedOffset(currArg.as<NEO::ArgDescPointer>().slmOffset));
1349 auto slmOffset = *ptrOffset(crossThreadData, currArg.as<ArgDescPointer>().slmOffset);
1350 slmOffset += static_cast<uint32_t>(argSize);
1351
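    // Re-pack the cross-thread-data SLM offsets of all subsequent __local arguments, honoring each argument's required alignment.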
1352 ++argIndex;
1353 while (argIndex < slmSizes.size()) {
1354 if (args[argIndex].getTraits().getAddressQualifier() != KernelArgMetadata::AddrLocal) {
1355 ++argIndex;
1356 continue;
1357 }
1358
1359 const auto &nextArg = args[argIndex].as<ArgDescPointer>();
1360 UNRECOVERABLE_IF(0 == nextArg.requiredSlmAlignment);
1361
1362 slmOffset = alignUp<uint32_t>(slmOffset, nextArg.requiredSlmAlignment);
1363
1364 auto patchLocation = ptrOffset(crossThreadData, nextArg.slmOffset);
1365 *patchLocation = slmOffset;
1366
1367 slmOffset += static_cast<uint32_t>(slmSizes[argIndex]);
1368 ++argIndex;
1369 }
1370
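    // Total SLM is the inline SLM declared by the kernel plus the dynamic SLM usage rounded up to a 1 KB multiple.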
1371 slmTotalSize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize + alignUp(slmOffset, KB);
1372
1373 return CL_SUCCESS;
1374 }
1375
1376 cl_int Kernel::setArgBuffer(uint32_t argIndex,
1377 size_t argSize,
1378 const void *argVal) {
1379
1380 if (argSize != sizeof(cl_mem *)) {
1381 return CL_INVALID_ARG_SIZE;
1382 }
1383
1384 auto clMem = reinterpret_cast<const cl_mem *>(argVal);
1385 auto pClDevice = &getDevice();
1386 auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
1387
1388 const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1389 const auto &argAsPtr = arg.as<ArgDescPointer>();
1390
1391 if (clMem && *clMem) {
1392 auto clMemObj = *clMem;
1393 DBG_LOG_INPUTS("setArgBuffer cl_mem", clMemObj);
1394
1395 storeKernelArg(argIndex, BUFFER_OBJ, clMemObj, argVal, argSize);
1396
1397 auto buffer = castToObject<Buffer>(clMemObj);
1398 if (!buffer)
1399 return CL_INVALID_MEM_OBJECT;
1400
1401 if (buffer->peekSharingHandler()) {
1402 usingSharedObjArgs = true;
1403 }
1404 patchBufferOffset(argAsPtr, nullptr, nullptr);
1405
1406 if (isValidOffset(argAsPtr.stateless)) {
1407 auto patchLocation = ptrOffset(crossThreadData, argAsPtr.stateless);
1408 uint64_t addressToPatch = buffer->setArgStateless(patchLocation, argAsPtr.pointerSize, rootDeviceIndex, !this->isBuiltIn);
1409
1410 if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
1411 PatchInfoData patchInfoData(addressToPatch - buffer->getOffset(), static_cast<uint64_t>(buffer->getOffset()),
1412 PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData),
1413 static_cast<uint64_t>(argAsPtr.stateless),
1414 PatchInfoAllocationType::IndirectObjectHeap, argAsPtr.pointerSize);
1415 this->patchInfoDataList.push_back(patchInfoData);
1416 }
1417 }
1418
1419 bool disableL3 = false;
1420 bool forceNonAuxMode = false;
1421 bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);
1422 auto graphicsAllocation = buffer->getGraphicsAllocation(rootDeviceIndex);
1423 auto &hwInfo = pClDevice->getHardwareInfo();
1424 auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
1425
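        // For the aux-translation builtin (which copies a buffer onto itself), the translated side (arg 1 for aux->non-aux, arg 0 for non-aux->aux) is forced to non-aux mode, and L3 is disabled for arg 0.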
1426 if (isAuxTranslationKernel) {
1427 if (((AuxTranslationDirection::AuxToNonAux == auxTranslationDirection) && argIndex == 1) ||
1428 ((AuxTranslationDirection::NonAuxToAux == auxTranslationDirection) && argIndex == 0)) {
1429 forceNonAuxMode = true;
1430 }
1431 disableL3 = (argIndex == 0);
1432 } else if (graphicsAllocation->isCompressionEnabled() && clHwHelper.requiresNonAuxMode(argAsPtr, hwInfo)) {
1433 forceNonAuxMode = true;
1434 }
1435
1436 if (isValidOffset(argAsPtr.bindful)) {
1437 buffer->setArgStateful(ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful), forceNonAuxMode,
1438 disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
1439 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1440 } else if (isValidOffset(argAsPtr.bindless)) {
1441 buffer->setArgStateful(patchBindlessSurfaceState(graphicsAllocation, argAsPtr.bindless), forceNonAuxMode,
1442 disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
1443 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1444 }
1445
1446 kernelArguments[argIndex].isStatelessUncacheable = argAsPtr.isPureStateful() ? false : buffer->isMemObjUncacheable();
1447
1448 auto allocationForCacheFlush = graphicsAllocation;
1449
1450         // if the object is made uncacheable via its surface state and there are no stateless accesses, then there is no need to flush caches
1451 if (buffer->isMemObjUncacheableForSurfaceState() && argAsPtr.isPureStateful()) {
1452 allocationForCacheFlush = nullptr;
1453 }
1454
1455 addAllocationToCacheFlushVector(argIndex, allocationForCacheFlush);
1456
1457 return CL_SUCCESS;
1458 } else {
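        // A null cl_mem is a legal argument value: patch a zero stateless address and program a null surface state.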
1459 storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
1460 if (isValidOffset(argAsPtr.stateless)) {
1461 auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
1462 patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, 0u);
1463 }
1464
1465 if (isValidOffset(argAsPtr.bindful)) {
1466 auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
1467 Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0,
1468 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1469 }
1470
1471 return CL_SUCCESS;
1472 }
1473 }
1474
1475 cl_int Kernel::setArgPipe(uint32_t argIndex,
1476 size_t argSize,
1477 const void *argVal) {
1478
1479 if (argSize != sizeof(cl_mem *)) {
1480 return CL_INVALID_ARG_SIZE;
1481 }
1482
1483 auto clMem = reinterpret_cast<const cl_mem *>(argVal);
1484
1485 if (clMem && *clMem) {
1486 auto clMemObj = *clMem;
1487 DBG_LOG_INPUTS("setArgPipe cl_mem", clMemObj);
1488
1489 storeKernelArg(argIndex, PIPE_OBJ, clMemObj, argVal, argSize);
1490
1491 auto memObj = castToObject<MemObj>(clMemObj);
1492 if (!memObj) {
1493 return CL_INVALID_MEM_OBJECT;
1494 }
1495
1496 auto pipe = castToObject<Pipe>(clMemObj);
1497 if (!pipe) {
1498 return CL_INVALID_ARG_VALUE;
1499 }
1500
1501 if (memObj->getContext() != &(this->getContext())) {
1502 return CL_INVALID_MEM_OBJECT;
1503 }
1504
1505 auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1506 const auto &argAsPtr = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();
1507
1508 auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
1509 pipe->setPipeArg(patchLocation, argAsPtr.pointerSize, rootDeviceIndex);
1510
1511 if (isValidOffset(argAsPtr.bindful)) {
1512 auto graphicsAllocation = pipe->getGraphicsAllocation(rootDeviceIndex);
1513 auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
1514 Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false,
1515 pipe->getSize(), pipe->getCpuAddress(), 0,
1516 graphicsAllocation, 0, 0,
1517 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
1518 }
1519
1520 return CL_SUCCESS;
1521 } else {
1522 return CL_INVALID_MEM_OBJECT;
1523 }
1524 }
1525
1526 cl_int Kernel::setArgImage(uint32_t argIndex,
1527 size_t argSize,
1528 const void *argVal) {
1529 return setArgImageWithMipLevel(argIndex, argSize, argVal, 0u);
1530 }
1531
1532 cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
1533 size_t argSize,
1534 const void *argVal, uint32_t mipLevel) {
1535 auto retVal = CL_INVALID_ARG_VALUE;
1536 auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1537
1538 const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1539 const auto &argAsImg = arg.as<ArgDescImage>();
1540
1541 uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
1542 auto clMemObj = *(static_cast<const cl_mem *>(argVal));
1543 auto pImage = castToObject<Image>(clMemObj);
1544
1545 if (pImage && argSize == sizeof(cl_mem *)) {
1546 if (pImage->peekSharingHandler()) {
1547 usingSharedObjArgs = true;
1548 }
1549
1550 DBG_LOG_INPUTS("setArgImage cl_mem", clMemObj);
1551
1552 storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);
1553
1554 void *surfaceState = nullptr;
1555 if (isValidOffset(argAsImg.bindless)) {
1556 surfaceState = patchBindlessSurfaceState(pImage->getGraphicsAllocation(rootDeviceIndex), argAsImg.bindless);
1557 } else {
1558 DEBUG_BREAK_IF(isUndefinedOffset(argAsImg.bindful));
1559 surfaceState = ptrOffset(getSurfaceStateHeap(), argAsImg.bindful);
1560 }
1561
1562         // Program the image's surface state: media images take the VME path, everything else the regular image path
1563 if (arg.getExtendedTypeInfo().isMediaImage) {
1564 DEBUG_BREAK_IF(!kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme);
1565 pImage->setMediaImageArg(surfaceState, rootDeviceIndex);
1566 } else {
1567 pImage->setImageArg(surfaceState, arg.getExtendedTypeInfo().isMediaBlockImage, mipLevel, rootDeviceIndex,
1568 getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics);
1569 }
1570
1571 auto &imageDesc = pImage->getImageDesc();
1572 auto &imageFormat = pImage->getImageFormat();
1573 auto graphicsAllocation = pImage->getGraphicsAllocation(rootDeviceIndex);
1574
1575 if (imageDesc.image_type == CL_MEM_OBJECT_IMAGE3D) {
1576 imageTransformer->registerImage3d(argIndex);
1577 }
1578
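        // Patch image metadata (dimensions, mip levels, channel format) into the cross-thread data.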
1579 patch<uint32_t, cl_uint>(imageDesc.num_samples, crossThreadData, argAsImg.metadataPayload.numSamples);
1580 patch<uint32_t, cl_uint>(imageDesc.num_mip_levels, crossThreadData, argAsImg.metadataPayload.numMipLevels);
1581 patch<uint32_t, uint64_t>(imageDesc.image_width, crossThreadData, argAsImg.metadataPayload.imgWidth);
1582 patch<uint32_t, uint64_t>(imageDesc.image_height, crossThreadData, argAsImg.metadataPayload.imgHeight);
1583 patch<uint32_t, uint64_t>(imageDesc.image_depth, crossThreadData, argAsImg.metadataPayload.imgDepth);
1584 patch<uint32_t, uint64_t>(imageDesc.image_array_size, crossThreadData, argAsImg.metadataPayload.arraySize);
1585 patch<uint32_t, cl_channel_type>(imageFormat.image_channel_data_type, crossThreadData, argAsImg.metadataPayload.channelDataType);
1586 patch<uint32_t, cl_channel_order>(imageFormat.image_channel_order, crossThreadData, argAsImg.metadataPayload.channelOrder);
1587 if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
1588 const auto &explicitArgsExtendedDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
1589 UNRECOVERABLE_IF(argIndex >= explicitArgsExtendedDescriptors.size());
1590 auto deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(explicitArgsExtendedDescriptors[argIndex].get());
1591 patch<uint32_t, uint32_t>(argAsImg.bindful, crossThreadData, deviceSideEnqueueDescriptor->objectId);
1592 }
1593
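        // Flattened image parameters: GPU base address, plus width/height/pitch each patched as (value - 1).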
1594 auto pixelSize = pImage->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
1595 patch<uint64_t, uint64_t>(graphicsAllocation->getGpuAddress(), crossThreadData, argAsImg.metadataPayload.flatBaseOffset);
1596 patch<uint32_t, uint64_t>((imageDesc.image_width * pixelSize) - 1, crossThreadData, argAsImg.metadataPayload.flatWidth);
1597 patch<uint32_t, uint64_t>((imageDesc.image_height * pixelSize) - 1, crossThreadData, argAsImg.metadataPayload.flatHeight);
1598 patch<uint32_t, uint64_t>(imageDesc.image_row_pitch - 1, crossThreadData, argAsImg.metadataPayload.flatPitch);
1599
1600 addAllocationToCacheFlushVector(argIndex, graphicsAllocation);
1601 retVal = CL_SUCCESS;
1602 }
1603
1604 return retVal;
1605 }
1606
1607 cl_int Kernel::setArgImmediate(uint32_t argIndex,
1608 size_t argSize,
1609 const void *argVal) {
1610
1611 auto retVal = CL_INVALID_ARG_VALUE;
1612
1613 if (argVal) {
1614 storeKernelArg(argIndex, NONE_OBJ, nullptr, nullptr, argSize);
1615
1616 [[maybe_unused]] auto crossThreadDataEnd = ptrOffset(crossThreadData, crossThreadDataSize);
1617 const auto &argAsVal = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescValue>();
1618 for (const auto &element : argAsVal.elements) {
1619 DEBUG_BREAK_IF(element.size <= 0);
1620
1621 auto pDst = ptrOffset(crossThreadData, element.offset);
1622 auto pSrc = ptrOffset(argVal, element.sourceOffset);
1623
1624 DEBUG_BREAK_IF(!(ptrOffset(pDst, element.size) <= crossThreadDataEnd));
1625
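            // Copy only as many bytes as the caller supplied: clamp to argSize so a short source never overruns the destination element.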
1626 if (element.sourceOffset < argSize) {
1627 size_t maxBytesToCopy = argSize - element.sourceOffset;
1628 size_t bytesToCopy = std::min(static_cast<size_t>(element.size), maxBytesToCopy);
1629 memcpy_s(pDst, element.size, pSrc, bytesToCopy);
1630 }
1631 }
1632
1633 retVal = CL_SUCCESS;
1634 }
1635
1636 return retVal;
1637 }
1638
1639 cl_int Kernel::setArgSampler(uint32_t argIndex,
1640 size_t argSize,
1641 const void *argVal) {
1642 auto retVal = CL_INVALID_SAMPLER;
1643
1644 if (!argVal) {
1645 return retVal;
1646 }
1647
1648 uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
1649 auto clSamplerObj = *(static_cast<const cl_sampler *>(argVal));
1650 auto pSampler = castToObject<Sampler>(clSamplerObj);
1651
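    // Retain the incoming sampler before releasing whichever sampler was previously bound to this argument.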
1652 if (pSampler) {
1653 pSampler->incRefInternal();
1654 }
1655
1656 if (kernelArguments.at(argIndex).object) {
1657 auto oldSampler = castToObject<Sampler>(kernelArguments.at(argIndex).object);
1658 UNRECOVERABLE_IF(!oldSampler);
1659 oldSampler->decRefInternal();
1660 }
1661
1662 if (pSampler && argSize == sizeof(cl_sampler *)) {
1663 const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1664 const auto &argAsSmp = arg.as<ArgDescSampler>();
1665
1666 storeKernelArg(argIndex, SAMPLER_OBJ, clSamplerObj, argVal, argSize);
1667
1668 auto dsh = getDynamicStateHeap();
1669 auto samplerState = ptrOffset(dsh, argAsSmp.bindful);
1670
1671 pSampler->setArg(const_cast<void *>(samplerState), clDevice.getHardwareInfo());
1672
1673 patch<uint32_t, uint32_t>(pSampler->getSnapWaValue(), crossThreadData, argAsSmp.metadataPayload.samplerSnapWa);
1674 patch<uint32_t, uint32_t>(GetAddrModeEnum(pSampler->addressingMode), crossThreadData, argAsSmp.metadataPayload.samplerAddressingMode);
1675 patch<uint32_t, uint32_t>(GetNormCoordsEnum(pSampler->normalizedCoordinates), crossThreadData, argAsSmp.metadataPayload.samplerNormalizedCoords);
1676 if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
1677 const auto &explicitArgsExtendedDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
1678 UNRECOVERABLE_IF(argIndex >= explicitArgsExtendedDescriptors.size());
1679 auto deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(explicitArgsExtendedDescriptors[argIndex].get());
1680 patch<uint32_t, uint32_t>(SAMPLER_OBJECT_ID_SHIFT + argAsSmp.bindful, crossThreadData, deviceSideEnqueueDescriptor->objectId);
1681 }
1682
1683 retVal = CL_SUCCESS;
1684 }
1685
1686 return retVal;
1687 }
1688
1689 cl_int Kernel::setArgAccelerator(uint32_t argIndex,
1690 size_t argSize,
1691 const void *argVal) {
1692 auto retVal = CL_INVALID_ARG_VALUE;
1693
1694 if (argSize != sizeof(cl_accelerator_intel)) {
1695 return CL_INVALID_ARG_SIZE;
1696 }
1697
1698 if (!argVal) {
1699 return retVal;
1700 }
1701
1702 auto clAcceleratorObj = *(static_cast<const cl_accelerator_intel *>(argVal));
1703 DBG_LOG_INPUTS("setArgAccelerator cl_mem", clAcceleratorObj);
1704
1705 const auto pAccelerator = castToObject<IntelAccelerator>(clAcceleratorObj);
1706
1707 if (pAccelerator) {
1708 storeKernelArg(argIndex, ACCELERATOR_OBJ, clAcceleratorObj, argVal, argSize);
1709
1710 const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
1711 const auto &argAsSmp = arg.as<ArgDescSampler>();
1712
1713 if (argAsSmp.samplerType == iOpenCL::SAMPLER_OBJECT_VME) {
1714
1715 const auto pVmeAccelerator = castToObjectOrAbort<VmeAccelerator>(pAccelerator);
1716 auto pDesc = static_cast<const cl_motion_estimation_desc_intel *>(pVmeAccelerator->getDescriptor());
1717 DEBUG_BREAK_IF(!pDesc);
1718
1719 if (arg.getExtendedTypeInfo().hasVmeExtendedDescriptor) {
1720 const auto &explicitArgsExtendedDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
1721 UNRECOVERABLE_IF(argIndex >= explicitArgsExtendedDescriptors.size());
1722 auto vmeDescriptor = static_cast<ArgDescVme *>(explicitArgsExtendedDescriptors[argIndex].get());
1723
1724 auto pVmeMbBlockTypeDst = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->mbBlockType));
1725 *pVmeMbBlockTypeDst = pDesc->mb_block_type;
1726
1727 auto pVmeSubpixelMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->subpixelMode));
1728 *pVmeSubpixelMode = pDesc->subpixel_mode;
1729
1730 auto pVmeSadAdjustMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->sadAdjustMode));
1731 *pVmeSadAdjustMode = pDesc->sad_adjust_mode;
1732
1733 auto pVmeSearchPathType = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->searchPathType));
1734 *pVmeSearchPathType = pDesc->search_path_type;
1735 }
1736
1737 retVal = CL_SUCCESS;
1738 } else if (argAsSmp.samplerType == iOpenCL::SAMPLER_OBJECT_VE) {
1739 retVal = CL_SUCCESS;
1740 }
1741 }
1742
1743 return retVal;
1744 }
1745
1746 cl_int Kernel::setArgDevQueue(uint32_t argIndex,
1747 size_t argSize,
1748 const void *argVal) {
1749 if (argVal == nullptr) {
1750 return CL_INVALID_ARG_VALUE;
1751 }
1752
1753 if (argSize != sizeof(cl_command_queue)) {
1754 return CL_INVALID_ARG_SIZE;
1755 }
1756
1757 auto clDeviceQueue = *(static_cast<const device_queue *>(argVal));
1758 auto pDeviceQueue = castToObject<DeviceQueue>(clDeviceQueue);
1759
1760 if (pDeviceQueue == nullptr) {
1761 return CL_INVALID_DEVICE_QUEUE;
1762 }
1763
1764 storeKernelArg(argIndex, DEVICE_QUEUE_OBJ, clDeviceQueue, argVal, argSize);
1765
1766 const auto &argAsPtr = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();
1767 auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), argAsPtr.stateless);
1768 patchWithRequiredSize(patchLocation, argAsPtr.pointerSize,
1769 static_cast<uintptr_t>(pDeviceQueue->getQueueBuffer()->getGpuAddressToPatch()));
1770
1771 return CL_SUCCESS;
1772 }
1773
1774 void Kernel::setKernelArgHandler(uint32_t argIndex, KernelArgHandler handler) {
1775 if (kernelArgHandlers.size() <= argIndex) {
1776 kernelArgHandlers.resize(argIndex + 1);
1777 }
1778
1779 kernelArgHandlers[argIndex] = handler;
1780 }
1781
1782 void Kernel::unsetArg(uint32_t argIndex) {
1783 if (kernelArguments[argIndex].isPatched) {
1784 patchedArgumentsNum--;
1785 kernelArguments[argIndex].isPatched = false;
1786 if (kernelArguments[argIndex].isStatelessUncacheable) {
1787 statelessUncacheableArgsCount--;
1788 kernelArguments[argIndex].isStatelessUncacheable = false;
1789 }
1790 }
1791 }
1792
1793 void Kernel::createReflectionSurface() {
1794 auto pClDevice = &clDevice;
1795 if (this->isParentKernel && kernelReflectionSurface == nullptr) {
1796 auto &hwInfo = pClDevice->getHardwareInfo();
1797 auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
1798 BlockKernelManager *blockManager = program->getBlockKernelManager();
1799 uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
1800
1801 ObjectCounts objectCount;
1802 getParentObjectCounts(objectCount);
1803 uint32_t parentImageCount = objectCount.imageCount;
1804 uint32_t parentSamplerCount = objectCount.samplerCount;
1805 size_t maxConstantBufferSize = 0;
1806
1807 std::vector<IGIL_KernelCurbeParams> *curbeParamsForBlocks = new std::vector<IGIL_KernelCurbeParams>[blockCount];
1808
1809 uint64_t *tokenMask = new uint64_t[blockCount];
1810 uint32_t *sshTokenOffsetsFromKernelData = new uint32_t[blockCount];
1811
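        // Surface layout: IGIL_KernelDataHeader, one IGIL_KernelAddressData per block, then each block's IGIL_KernelData (curbe params) followed by its sampler heap, constant buffer and sampler params.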
1812 size_t kernelReflectionSize = alignUp(sizeof(IGIL_KernelDataHeader) + blockCount * sizeof(IGIL_KernelAddressData), sizeof(void *));
1813 uint32_t kernelDataOffset = static_cast<uint32_t>(kernelReflectionSize);
1814 uint32_t parentSSHAlignedSize = alignUp(this->kernelInfo.heapInfo.SurfaceStateHeapSize, hwHelper.getBindingTableStateAlignement());
1815 uint32_t btOffset = parentSSHAlignedSize;
1816
1817 for (uint32_t i = 0; i < blockCount; i++) {
1818 const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
1819 size_t samplerStateAndBorderColorSize = 0;
1820
1821 uint32_t firstSSHTokenIndex = 0;
1822
1823 ReflectionSurfaceHelper::getCurbeParams(curbeParamsForBlocks[i], tokenMask[i], firstSSHTokenIndex, *pBlockInfo, hwInfo);
1824
1825 maxConstantBufferSize = std::max(maxConstantBufferSize, static_cast<size_t>(pBlockInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize));
1826
1827 samplerStateAndBorderColorSize = pBlockInfo->getSamplerStateArraySize(hwInfo);
1828 samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, Sampler::samplerStateArrayAlignment);
1829 samplerStateAndBorderColorSize += pBlockInfo->getBorderColorStateSize();
1830 samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, sizeof(void *));
1831
1832 sshTokenOffsetsFromKernelData[i] = offsetof(IGIL_KernelData, m_data) + sizeof(IGIL_KernelCurbeParams) * firstSSHTokenIndex;
1833
1834 kernelReflectionSize += alignUp(sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams) * curbeParamsForBlocks[i].size(), sizeof(void *));
1835 kernelReflectionSize += parentSamplerCount * sizeof(IGIL_SamplerParams) + samplerStateAndBorderColorSize;
1836 }
1837
1838 maxConstantBufferSize = alignUp(maxConstantBufferSize, sizeof(void *));
1839 kernelReflectionSize += blockCount * alignUp(maxConstantBufferSize, sizeof(void *));
1840 kernelReflectionSize += parentImageCount * sizeof(IGIL_ImageParamters);
1841 kernelReflectionSize += parentSamplerCount * sizeof(IGIL_ParentSamplerParams);
1842 kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
1843 {pClDevice->getRootDeviceIndex(), kernelReflectionSize,
1844 GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER,
1845 pClDevice->getDeviceBitfield()});
1846
1847 for (uint32_t i = 0; i < blockCount; i++) {
1848 const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
1849 uint32_t newKernelDataOffset = ReflectionSurfaceHelper::setKernelData(kernelReflectionSurface->getUnderlyingBuffer(),
1850 kernelDataOffset,
1851 curbeParamsForBlocks[i],
1852 tokenMask[i],
1853 maxConstantBufferSize,
1854 parentSamplerCount,
1855 *pBlockInfo,
1856 hwInfo);
1857
1858 uint32_t offset = static_cast<uint32_t>(offsetof(IGIL_KernelDataHeader, m_data) + sizeof(IGIL_KernelAddressData) * i);
1859
1860 uint32_t samplerHeapOffset = static_cast<uint32_t>(alignUp(kernelDataOffset + sizeof(IGIL_KernelData) + curbeParamsForBlocks[i].size() * sizeof(IGIL_KernelCurbeParams), sizeof(void *)));
1861 uint32_t samplerHeapSize = static_cast<uint32_t>(alignUp(pBlockInfo->getSamplerStateArraySize(hwInfo), Sampler::samplerStateArrayAlignment) + pBlockInfo->getBorderColorStateSize());
1862 uint32_t constantBufferOffset = alignUp(samplerHeapOffset + samplerHeapSize, sizeof(void *));
1863
1864 uint32_t samplerParamsOffset = 0;
1865 if (parentSamplerCount) {
1866 samplerParamsOffset = newKernelDataOffset - sizeof(IGIL_SamplerParams) * parentSamplerCount;
1867 IGIL_SamplerParams *pSamplerParams = (IGIL_SamplerParams *)ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerParamsOffset);
1868 uint32_t sampler = 0;
1869 const auto &args = pBlockInfo->kernelDescriptor.payloadMappings.explicitArgs;
1870 for (uint32_t argID = 0; argID < args.size(); argID++) {
1871 if (args[argID].is<ArgDescriptor::ArgTSampler>()) {
1872
1873 pSamplerParams[sampler].m_ArgID = argID;
1874 pSamplerParams[sampler].m_SamplerStateOffset = args[argID].as<ArgDescSampler>().bindful;
1875 sampler++;
1876 }
1877 }
1878 }
1879
1880 ReflectionSurfaceHelper::setKernelAddressData(kernelReflectionSurface->getUnderlyingBuffer(),
1881 offset,
1882 kernelDataOffset,
1883 samplerHeapOffset,
1884 constantBufferOffset,
1885 samplerParamsOffset,
1886 sshTokenOffsetsFromKernelData[i] + kernelDataOffset,
1887 btOffset,
1888 *pBlockInfo,
1889 hwInfo);
1890
1891 if (samplerHeapSize > 0) {
1892 void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerHeapOffset);
1893 const void *pSrc = ptrOffset(pBlockInfo->heapInfo.pDsh, pBlockInfo->getBorderColorOffset());
1894 memcpy_s(pDst, samplerHeapSize, pSrc, samplerHeapSize);
1895 }
1896
1897 void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), constantBufferOffset);
1898 const char *pSrc = pBlockInfo->crossThreadData;
1899 memcpy_s(pDst, pBlockInfo->getConstantBufferSize(), pSrc, pBlockInfo->getConstantBufferSize());
1900
1901 btOffset += pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset;
1902 kernelDataOffset = newKernelDataOffset;
1903 }
1904
1905 uint32_t samplerOffset = 0;
1906 if (parentSamplerCount) {
1907 samplerOffset = kernelDataOffset + parentImageCount * sizeof(IGIL_ImageParamters);
1908 }
1909 ReflectionSurfaceHelper::setKernelDataHeader(kernelReflectionSurface->getUnderlyingBuffer(), blockCount, parentImageCount, parentSamplerCount, kernelDataOffset, samplerOffset);
1910 delete[] curbeParamsForBlocks;
1911 delete[] tokenMask;
1912 delete[] sshTokenOffsetsFromKernelData;
1913
1914 // Patch constant values once after reflection surface creation
1915 patchBlocksCurbeWithConstantValues();
1916 }
1917
1918 if (DebugManager.flags.ForceDispatchScheduler.get()) {
1919 if (this->isSchedulerKernel && kernelReflectionSurface == nullptr) {
1920 kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
1921 {pClDevice->getRootDeviceIndex(), MemoryConstants::pageSize,
1922 GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER,
1923 pClDevice->getDeviceBitfield()});
1924 }
1925 }
1926 }
1927
1928 void Kernel::getParentObjectCounts(ObjectCounts &objectCount) {
1929 objectCount.imageCount = 0;
1930 objectCount.samplerCount = 0;
1931 DEBUG_BREAK_IF(!isParentKernel);
1932
1933 for (const auto &arg : this->kernelArguments) {
1934 if (arg.type == SAMPLER_OBJ) {
1935 objectCount.samplerCount++;
1936 } else if (arg.type == IMAGE_OBJ) {
1937 objectCount.imageCount++;
1938 }
1939 }
1940 }
1941
1942 bool Kernel::hasPrintfOutput() const {
1943 return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesPrintf;
1944 }
1945
1946 size_t Kernel::getInstructionHeapSizeForExecutionModel() const {
1947 BlockKernelManager *blockManager = program->getBlockKernelManager();
1948 uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
1949
1950 size_t totalSize = 0;
1951 if (isParentKernel) {
1952 totalSize = kernelBinaryAlignment - 1; // for initial alignment
1953 for (uint32_t i = 0; i < blockCount; i++) {
1954 const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
1955 totalSize += pBlockInfo->heapInfo.KernelHeapSize;
1956 totalSize = alignUp(totalSize, kernelBinaryAlignment);
1957 }
1958 }
1959 return totalSize;
1960 }
1961
1962 void Kernel::patchBlocksCurbeWithConstantValues() {
1963 auto rootDeviceIndex = clDevice.getRootDeviceIndex();
1964 BlockKernelManager *blockManager = program->getBlockKernelManager();
1965 uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
1966
1967 uint64_t globalMemoryGpuAddress = program->getGlobalSurface(rootDeviceIndex) != nullptr ? program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch() : 0;
1968 uint64_t constantMemoryGpuAddress = program->getConstantSurface(rootDeviceIndex) != nullptr ? program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch() : 0;
1969
1970 for (uint32_t blockID = 0; blockID < blockCount; blockID++) {
1971 const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(blockID);
1972
1973 uint64_t globalMemoryCurbeOffset = ReflectionSurfaceHelper::undefinedOffset;
1974 uint32_t globalMemoryPatchSize = 0;
1975 uint64_t constantMemoryCurbeOffset = ReflectionSurfaceHelper::undefinedOffset;
1976 uint32_t constantMemoryPatchSize = 0;
1977
1978 if (isValidOffset(pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
1979 globalMemoryCurbeOffset = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless;
1980 globalMemoryPatchSize = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.pointerSize;
1981 }
1982
1983 if (isValidOffset(pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
1984 constantMemoryCurbeOffset = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless;
1985 constantMemoryPatchSize = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.pointerSize;
1986 }
1987
1988 ReflectionSurfaceHelper::patchBlocksCurbeWithConstantValues(kernelReflectionSurface->getUnderlyingBuffer(), blockID,
1989 globalMemoryCurbeOffset, globalMemoryPatchSize, globalMemoryGpuAddress,
1990 constantMemoryCurbeOffset, constantMemoryPatchSize, constantMemoryGpuAddress,
1991 ReflectionSurfaceHelper::undefinedOffset, 0, 0);
1992 }
1993 }
1994
1995 void Kernel::ReflectionSurfaceHelper::getCurbeParams(std::vector<IGIL_KernelCurbeParams> &curbeParamsOut, uint64_t &tokenMaskOut, uint32_t &firstSSHTokenIndex, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
1996 const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
1997 const auto gpuPointerSize = kernelInfo.kernelDescriptor.kernelAttributes.gpuPointerSize;
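    // 253 acts as the "no binding table entry assigned" sentinel (checked via DEBUG_BREAK_IF below).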
1998 uint32_t bindingTableIndex = 253;
1999 uint64_t tokenMask = 0;
2000
2001 for (size_t argNum = 0; argNum < args.size(); argNum++) {
2002 const auto &arg = args[argNum];
2003
2004 auto sizeOfKernelArgForSSH = gpuPointerSize;
2005 bindingTableIndex = 253;
2006
2007 if (arg.is<ArgDescriptor::ArgTPointer>()) {
2008 const auto &argAsPtr = arg.as<ArgDescPointer>();
2009
2010 if (argAsPtr.requiredSlmAlignment) {
2011 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES, 0, argAsPtr.slmOffset, argAsPtr.requiredSlmAlignment});
2012 tokenMask |= shiftLeftBy(DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES);
2013 } else {
2014 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{COMPILER_DATA_PARAMETER_GLOBAL_SURFACE, gpuPointerSize, argAsPtr.stateless, static_cast<uint>(argNum)});
2015 tokenMask |= shiftLeftBy(63);
2016 }
2017 } else if (arg.is<ArgDescriptor::ArgTImage>()) {
2018 const auto &argAsImg = arg.as<ArgDescImage>();
2019
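            // Image metadata tokens are emitted with a +50 parameter-type bias; token-mask bit 50 marks the presence of image tokens.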
2020 auto emplaceIfValidOffset = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2021 if (isValidOffset(offset)) {
2022 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType + 50, sizeof(uint32_t), offset, static_cast<uint>(argNum)});
2023 }
2024 };
2025 emplaceIfValidOffset(DATA_PARAMETER_IMAGE_WIDTH, argAsImg.metadataPayload.imgWidth);
2026 emplaceIfValidOffset(DATA_PARAMETER_IMAGE_HEIGHT, argAsImg.metadataPayload.imgHeight);
2027 emplaceIfValidOffset(DATA_PARAMETER_IMAGE_DEPTH, argAsImg.metadataPayload.imgDepth);
2028 emplaceIfValidOffset(DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE, argAsImg.metadataPayload.channelDataType);
2029 emplaceIfValidOffset(DATA_PARAMETER_IMAGE_CHANNEL_ORDER, argAsImg.metadataPayload.channelOrder);
2030 emplaceIfValidOffset(DATA_PARAMETER_IMAGE_ARRAY_SIZE, argAsImg.metadataPayload.arraySize);
2031 if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
2032 const auto &argsExtDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
2033 UNRECOVERABLE_IF(argNum >= argsExtDescriptors.size());
2034 const auto &deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(argsExtDescriptors[argNum].get());
2035 emplaceIfValidOffset(DATA_PARAMETER_OBJECT_ID, deviceSideEnqueueDescriptor->objectId);
2036 }
2037
2038 const auto &bindingTable = kernelInfo.kernelDescriptor.payloadMappings.bindingTable;
2039 if (isValidOffset(bindingTable.tableOffset)) {
2040 auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
2041 const auto ssh = static_cast<const char *>(kernelInfo.heapInfo.pSsh) + bindingTable.tableOffset;
2042
2043 for (uint8_t i = 0; i < bindingTable.numEntries; i++) {
2044 const auto pointer = static_cast<NEO::SurfaceStateHeapOffset>(hwHelper.getBindingTableStateSurfaceStatePointer(ssh, i));
2045 if (pointer == argAsImg.bindful) {
2046 bindingTableIndex = i;
2047 break;
2048 }
2049 }
2050 DEBUG_BREAK_IF(bindingTableIndex == 253);
2051 }
2052
2053 tokenMask |= shiftLeftBy(50);
2054 } else if (arg.is<ArgDescriptor::ArgTSampler>()) {
2055 const auto &argAsSmp = arg.as<ArgDescSampler>();
2056
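            // Sampler metadata tokens are emitted with a +100 parameter-type bias; token-mask bit 51 marks the presence of sampler tokens.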
2057 auto emplaceIfValidOffset = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2058 if (isValidOffset(offset)) {
2059 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType + 100, sizeof(uint32_t), offset, static_cast<uint>(argNum)});
2060 }
2061 };
2062 emplaceIfValidOffset(DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED, argAsSmp.metadataPayload.samplerSnapWa);
2063 emplaceIfValidOffset(DATA_PARAMETER_SAMPLER_ADDRESS_MODE, argAsSmp.metadataPayload.samplerAddressingMode);
2064 emplaceIfValidOffset(DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS, argAsSmp.metadataPayload.samplerNormalizedCoords);
2065 if (arg.getExtendedTypeInfo().hasDeviceSideEnqueueExtendedDescriptor) {
2066 const auto &argsExtDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
2067 UNRECOVERABLE_IF(argNum >= argsExtDescriptors.size());
2068 const auto &deviceSideEnqueueDescriptor = static_cast<ArgDescriptorDeviceSideEnqueue *>(argsExtDescriptors[argNum].get());
2069 emplaceIfValidOffset(DATA_PARAMETER_OBJECT_ID, deviceSideEnqueueDescriptor->objectId);
2070 }
2071
2072 tokenMask |= shiftLeftBy(51);
2073 } else {
2074 bindingTableIndex = 0;
2075 sizeOfKernelArgForSSH = 0;
2076 }
2077
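        // Every explicit argument additionally emits an SSH token (type 1024) carrying its binding table index.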
2078 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{1024, sizeOfKernelArgForSSH, bindingTableIndex, static_cast<uint>(argNum)});
2079 }
2080
2081 for (const auto ¶m : kernelInfo.kernelDescriptor.kernelMetadata.allByValueKernelArguments) {
2082 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_KERNEL_ARGUMENT, param.byValueElement.size, param.byValueElement.offset, param.argNum});
2083 tokenMask |= shiftLeftBy(DATA_PARAMETER_KERNEL_ARGUMENT);
2084 }
2085
2086 const auto &dispatchTraits = kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits;
2087 for (uint32_t i = 0; i < 3U; i++) {
2088 auto emplaceIfValidOffsetAndSetTokenMask = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2089 constexpr uint paramSize = sizeof(uint32_t);
2090 if (isValidOffset(offset)) {
2091 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType, paramSize, offset, static_cast<uint>(i * paramSize)});
2092 tokenMask |= shiftLeftBy(parameterType);
2093 }
2094 };
2095 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_LOCAL_WORK_SIZE, dispatchTraits.localWorkSize[i]);
2096 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_LOCAL_WORK_SIZE, dispatchTraits.localWorkSize2[i]);
2097 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_GLOBAL_WORK_OFFSET, dispatchTraits.globalWorkOffset[i]);
2098 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE, dispatchTraits.enqueuedLocalWorkSize[i]);
2099 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_GLOBAL_WORK_SIZE, dispatchTraits.globalWorkSize[i]);
2100 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_NUM_WORK_GROUPS, dispatchTraits.numWorkGroups[i]);
2101 }
2102 {
2103 const auto &payloadMappings = kernelInfo.kernelDescriptor.payloadMappings;
2104 auto emplaceIfValidOffsetAndSetTokenMask = [&](uint parameterType, NEO::CrossThreadDataOffset offset) {
2105 if (isValidOffset(offset)) {
2106 curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{parameterType, sizeof(uint32_t), offset, 0});
2107 tokenMask |= shiftLeftBy(parameterType);
2108 }
2109 };
2110 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_PARENT_EVENT, payloadMappings.implicitArgs.deviceSideEnqueueParentEvent);
2111 emplaceIfValidOffsetAndSetTokenMask(DATA_PARAMETER_WORK_DIMENSIONS, payloadMappings.dispatchTraits.workDim);
2112 }
2113
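    // The sort is expected to order tokens by type, moving the type-1024 SSH tokens (one per explicit arg) to the tail, which is what firstSSHTokenIndex relies on.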
2114 std::sort(curbeParamsOut.begin(), curbeParamsOut.end(), compareFunction);
2115 tokenMaskOut = tokenMask;
2116 firstSSHTokenIndex = static_cast<uint32_t>(curbeParamsOut.size() - args.size());
2117 }
2118
2119 uint32_t Kernel::ReflectionSurfaceHelper::setKernelData(void *reflectionSurface, uint32_t offset,
2120 std::vector<IGIL_KernelCurbeParams> &curbeParamsIn, uint64_t tokenMaskIn,
2121 size_t maxConstantBufferSize, size_t samplerCount, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
2122 uint32_t offsetToEnd = 0;
2123 IGIL_KernelData *kernelData = reinterpret_cast<IGIL_KernelData *>(ptrOffset(reflectionSurface, offset));
2124 size_t samplerHeapSize = alignUp(kernelInfo.getSamplerStateArraySize(hwInfo), Sampler::samplerStateArrayAlignment) + kernelInfo.getBorderColorStateSize();
2125
2126     kernelData->m_numberOfCurbeParams = static_cast<uint32_t>(curbeParamsIn.size()); // number of parameters to patch
2127 kernelData->m_numberOfCurbeTokens = static_cast<uint32_t>(curbeParamsIn.size() - kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size());
2128 kernelData->m_numberOfSamplerStates = static_cast<uint32_t>(kernelInfo.getSamplerStateArrayCount());
2129 kernelData->m_SizeOfSamplerHeap = static_cast<uint32_t>(samplerHeapSize);
2130 kernelData->m_SamplerBorderColorStateOffsetOnDSH = isValidOffset(kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor) ? kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor : 0;
2131 kernelData->m_SamplerStateArrayOffsetOnDSH = isValidOffset(kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset) ? kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset : -1;
2132 kernelData->m_sizeOfConstantBuffer = kernelInfo.getConstantBufferSize();
2133 kernelData->m_PatchTokensMask = tokenMaskIn;
2134 kernelData->m_ScratchSpacePatchValue = 0;
2135 kernelData->m_SIMDSize = kernelInfo.getMaxSimdSize();
2136 kernelData->m_HasBarriers = kernelInfo.kernelDescriptor.kernelAttributes.barrierCount;
2137 kernelData->m_RequiredWkgSizes[0] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
2138 kernelData->m_RequiredWkgSizes[1] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
2139 kernelData->m_RequiredWkgSizes[2] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
2140 kernelData->m_InilineSLMSize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize;
2141
2142 bool localIdRequired = false;
2143 if (kernelInfo.kernelDescriptor.kernelAttributes.flags.usesFlattenedLocalIds || (kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels > 0)) {
2144 localIdRequired = true;
2145 }
2146 kernelData->m_PayloadSize = PerThreadDataHelper::getThreadPayloadSize(kernelInfo.kernelDescriptor, hwInfo.capabilityTable.grfSize);
2147
2148 kernelData->m_NeedLocalIDS = localIdRequired ? 1 : 0;
2149 kernelData->m_DisablePreemption = 0u;
2150
2151 bool concurrentExecAllowed = true;
2152
2153 if (kernelInfo.kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize > 0) {
2154 concurrentExecAllowed = false;
2155 }
2156 kernelData->m_CanRunConcurently = concurrentExecAllowed ? 1 : 0;
2157
2158 if (DebugManager.flags.DisableConcurrentBlockExecution.get()) {
2159 kernelData->m_CanRunConcurently = false;
2160 }
2161
2162 IGIL_KernelCurbeParams *kernelCurbeParams = kernelData->m_data;
2163
2164 for (uint32_t i = 0; i < curbeParamsIn.size(); i++) {
2165 kernelCurbeParams[i] = curbeParamsIn[i];
2166 }
2167
2168 offsetToEnd = static_cast<uint32_t>(offset +
2169 alignUp(sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams) * curbeParamsIn.size(), sizeof(void *)) +
2170 alignUp(samplerHeapSize, sizeof(void *)) +
2171 alignUp(maxConstantBufferSize, sizeof(void *)) +
2172 sizeof(IGIL_SamplerParams) * samplerCount);
2173
2174 return offsetToEnd;
2175 }
2176
2177 void Kernel::ReflectionSurfaceHelper::setKernelAddressDataBtOffset(void *reflectionSurface, uint32_t blockID, uint32_t btOffset) {
2178
2179 uint32_t offset = static_cast<uint32_t>(offsetof(IGIL_KernelDataHeader, m_data) + sizeof(IGIL_KernelAddressData) * blockID);
2180 IGIL_KernelAddressData *kernelAddressData = reinterpret_cast<IGIL_KernelAddressData *>(ptrOffset(reflectionSurface, offset));
2181
2182 kernelAddressData->m_BTSoffset = btOffset;
2183 }
2184
2185 void Kernel::ReflectionSurfaceHelper::setKernelAddressData(void *reflectionSurface, uint32_t offset, uint32_t kernelDataOffset, uint32_t samplerHeapOffset,
2186 uint32_t constantBufferOffset, uint32_t samplerParamsOffset,
2187 uint32_t sshTokensOffset, uint32_t btOffset, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
2188 IGIL_KernelAddressData *kernelAddressData = reinterpret_cast<IGIL_KernelAddressData *>(ptrOffset(reflectionSurface, offset));
2189
2190 auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
2191
2192 kernelAddressData->m_KernelDataOffset = kernelDataOffset;
2193 kernelAddressData->m_SamplerHeapOffset = samplerHeapOffset;
2194 kernelAddressData->m_SamplerParamsOffset = samplerParamsOffset;
2195 kernelAddressData->m_ConstantBufferOffset = constantBufferOffset;
2196 kernelAddressData->m_SSHTokensOffset = sshTokensOffset;
2197 kernelAddressData->m_BTSoffset = btOffset;
2198 kernelAddressData->m_BTSize = static_cast<uint32_t>(kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries * hwHelper.getBindingTableStateSize());
2199 }
2200
2201 template <>
2202 void Kernel::ReflectionSurfaceHelper::patchBlocksCurbe<false>(void *reflectionSurface, uint32_t blockID,
2203 uint64_t defaultDeviceQueueCurbeOffset, uint32_t patchSizeDefaultQueue, uint64_t defaultDeviceQueueGpuAddress,
2204 uint64_t eventPoolCurbeOffset, uint32_t patchSizeEventPool, uint64_t eventPoolGpuAddress,
2205 uint64_t deviceQueueCurbeOffset, uint32_t patchSizeDeviceQueue, uint64_t deviceQueueGpuAddress,
2206 uint64_t printfBufferOffset, uint32_t patchSizePrintfBuffer, uint64_t printfBufferGpuAddress,
2207 uint64_t privateSurfaceOffset, uint32_t privateSurfaceSize, uint64_t privateSurfaceGpuAddress) {
2208
2209 IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2210
2211     // Reflection surface must be initialized prior to patching a block's curbe on the kernel reflection surface (KRS)
2212 DEBUG_BREAK_IF(blockID >= pKernelHeader->m_numberOfKernels);
2213
2214 IGIL_KernelAddressData *addressData = pKernelHeader->m_data;
2215 // const buffer offsets must be set
2216 DEBUG_BREAK_IF(addressData[blockID].m_ConstantBufferOffset == 0);
2217
2218 void *pCurbe = ptrOffset(reflectionSurface, addressData[blockID].m_ConstantBufferOffset);
2219
2220 if (defaultDeviceQueueCurbeOffset != undefinedOffset) {
2221 auto *patchedPointer = ptrOffset(pCurbe, (size_t)defaultDeviceQueueCurbeOffset);
2222 patchWithRequiredSize(patchedPointer, patchSizeDefaultQueue, (uintptr_t)defaultDeviceQueueGpuAddress);
2223 }
2224 if (eventPoolCurbeOffset != undefinedOffset) {
2225 auto *patchedPointer = ptrOffset(pCurbe, (size_t)eventPoolCurbeOffset);
2226 patchWithRequiredSize(patchedPointer, patchSizeEventPool, (uintptr_t)eventPoolGpuAddress);
2227 }
2228 if (deviceQueueCurbeOffset != undefinedOffset) {
2229 auto *patchedPointer = ptrOffset(pCurbe, (size_t)deviceQueueCurbeOffset);
2230 patchWithRequiredSize(patchedPointer, patchSizeDeviceQueue, (uintptr_t)deviceQueueGpuAddress);
2231 }
2232 if (printfBufferOffset != undefinedOffset) {
2233 auto *patchedPointer = ptrOffset(pCurbe, (size_t)printfBufferOffset);
2234 patchWithRequiredSize(patchedPointer, patchSizePrintfBuffer, (uintptr_t)printfBufferGpuAddress);
2235 }
2236
2237 if (privateSurfaceOffset != undefinedOffset) {
2238 auto *patchedPointer = ptrOffset(pCurbe, (size_t)privateSurfaceOffset);
2239 patchWithRequiredSize(patchedPointer, privateSurfaceSize, (uintptr_t)privateSurfaceGpuAddress);
2240 }
2241 }
2242
2243 void Kernel::ReflectionSurfaceHelper::patchBlocksCurbeWithConstantValues(void *reflectionSurface, uint32_t blockID,
2244 uint64_t globalMemoryCurbeOffset, uint32_t globalMemoryPatchSize, uint64_t globalMemoryGpuAddress,
2245 uint64_t constantMemoryCurbeOffset, uint32_t constantMemoryPatchSize, uint64_t constantMemoryGpuAddress,
2246 uint64_t privateMemoryCurbeOffset, uint32_t privateMemoryPatchSize, uint64_t privateMemoryGpuAddress) {
2247
2248 IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2249
2250     // Reflection surface must be initialized prior to patching a block's curbe on the kernel reflection surface (KRS)
2251 DEBUG_BREAK_IF(blockID >= pKernelHeader->m_numberOfKernels);
2252
2253 IGIL_KernelAddressData *addressData = pKernelHeader->m_data;
2254 // const buffer offsets must be set
2255 DEBUG_BREAK_IF(addressData[blockID].m_ConstantBufferOffset == 0);
2256
2257 void *pCurbe = ptrOffset(reflectionSurface, addressData[blockID].m_ConstantBufferOffset);
2258
2259 if (globalMemoryCurbeOffset != undefinedOffset) {
2260 auto *patchedPointer = ptrOffset(pCurbe, (size_t)globalMemoryCurbeOffset);
2261 patchWithRequiredSize(patchedPointer, globalMemoryPatchSize, (uintptr_t)globalMemoryGpuAddress);
2262 }
2263 if (constantMemoryCurbeOffset != undefinedOffset) {
2264 auto *patchedPointer = ptrOffset(pCurbe, (size_t)constantMemoryCurbeOffset);
2265 patchWithRequiredSize(patchedPointer, constantMemoryPatchSize, (uintptr_t)constantMemoryGpuAddress);
2266 }
2267 if (privateMemoryCurbeOffset != undefinedOffset) {
2268 auto *patchedPointer = ptrOffset(pCurbe, (size_t)privateMemoryCurbeOffset);
2269 patchWithRequiredSize(patchedPointer, privateMemoryPatchSize, (uintptr_t)privateMemoryGpuAddress);
2270 }
2271 }
2272
2273 void Kernel::ReflectionSurfaceHelper::setParentImageParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo) {
2274 IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2275 IGIL_ImageParamters *pImageParameters = reinterpret_cast<IGIL_ImageParamters *>(ptrOffset(pKernelHeader, (size_t)pKernelHeader->m_ParentImageDataOffset));
2276
2277 uint32_t numArgs = (uint32_t)parentArguments.size();
2278 for (uint32_t i = 0; i < numArgs; i++) {
2279 if (parentArguments[i].type == Kernel::kernelArgType::IMAGE_OBJ) {
2280 const Image *image = castToObject<Image>((cl_mem)parentArguments[i].object);
2281 if (image) {
2282 pImageParameters->m_ArraySize = (uint32_t)image->getImageDesc().image_array_size;
2283 pImageParameters->m_Depth = (uint32_t)image->getImageDesc().image_depth;
2284 pImageParameters->m_Height = (uint32_t)image->getImageDesc().image_height;
2285 pImageParameters->m_Width = (uint32_t)image->getImageDesc().image_width;
2286 pImageParameters->m_NumMipLevels = (uint32_t)image->getImageDesc().num_mip_levels;
2287 pImageParameters->m_NumSamples = (uint32_t)image->getImageDesc().num_samples;
2288
2289 pImageParameters->m_ChannelDataType = (uint32_t)image->getImageFormat().image_channel_data_type;
2290                 pImageParameters->m_ChannelOrder = (uint32_t)image->getImageFormat().image_channel_order;
2291 pImageParameters->m_ObjectID = (uint32_t)parentKernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i].as<ArgDescImage>().bindful;
2292 pImageParameters++;
2293 }
2294 }
2295 }
2296 }
2297
2298 void Kernel::ReflectionSurfaceHelper::setParentSamplerParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo) {
2299 IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
2300 IGIL_ParentSamplerParams *pParentSamplerParams = reinterpret_cast<IGIL_ParentSamplerParams *>(ptrOffset(pKernelHeader, (size_t)pKernelHeader->m_ParentSamplerParamsOffset));
2301
2302 uint32_t numArgs = (uint32_t)parentArguments.size();
2303 for (uint32_t i = 0; i < numArgs; i++) {
2304 if (parentArguments[i].type == Kernel::kernelArgType::SAMPLER_OBJ) {
2305 const Sampler *sampler = castToObject<Sampler>((cl_sampler)parentArguments[i].object);
2306 if (sampler) {
2307 pParentSamplerParams->CoordinateSnapRequired = (uint32_t)sampler->getSnapWaValue();
2308 pParentSamplerParams->m_AddressingMode = (uint32_t)sampler->addressingMode;
2309 pParentSamplerParams->NormalizedCoords = (uint32_t)sampler->normalizedCoordinates;
2310
2311 pParentSamplerParams->m_ObjectID = OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID((uint32_t)parentKernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i].as<ArgDescSampler>().bindful);
2312 pParentSamplerParams++;
2313 }
2314 }
2315 }
2316 }
2317
2318 void Kernel::resetSharedObjectsPatchAddresses() {
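    // Re-run setArg for every shared (interop) memory object so its refreshed GPU address gets patched again.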
2319 for (size_t i = 0; i < getKernelArgsNumber(); i++) {
2320 auto clMem = (cl_mem)kernelArguments[i].object;
2321 auto memObj = castToObject<MemObj>(clMem);
2322 if (memObj && memObj->peekSharingHandler()) {
2323 setArg((uint32_t)i, sizeof(cl_mem), &clMem);
2324 }
2325 }
2326 }
2327
2328 void Kernel::provideInitializationHints() {
2329
2330 Context *context = program->getContextPtr();
2331 if (context == nullptr || !context->isProvidingPerformanceHints())
2332 return;
2333
2334 auto pClDevice = &getDevice();
2335 if (privateSurfaceSize) {
2336 context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, PRIVATE_MEMORY_USAGE_TOO_HIGH,
2337 kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
2338 privateSurfaceSize);
2339 }
2340 auto scratchSize = kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0] *
2341 pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch * kernelInfo.getMaxSimdSize();
2342 if (scratchSize > 0) {
2343 context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, REGISTER_PRESSURE_TOO_HIGH,
2344 kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), scratchSize);
2345 }
2346 }
2347
2348 void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
2349 const auto &defaultQueueSurfaceAddress = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress;
2350 if (isValidOffset(defaultQueueSurfaceAddress.stateless) && crossThreadData) {
2351 auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), defaultQueueSurfaceAddress.stateless);
2352 patchWithRequiredSize(patchLocation, defaultQueueSurfaceAddress.pointerSize,
2353 static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
2354 }
2355 if (isValidOffset(defaultQueueSurfaceAddress.bindful)) {
2356 auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), defaultQueueSurfaceAddress.bindful);
2357 Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, false, false, devQueue->getQueueBuffer()->getUnderlyingBufferSize(),
2358 (void *)devQueue->getQueueBuffer()->getGpuAddress(), 0, devQueue->getQueueBuffer(), 0, 0,
2359 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
2360 }
2361 }
2362
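// Same patching scheme as patchDefaultDeviceQueue, applied to the event pool
// buffer used by device-side enqueue.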
void Kernel::patchEventPool(DeviceQueue *devQueue) {
    const auto &eventPoolSurfaceAddress = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress;

    if (isValidOffset(eventPoolSurfaceAddress.stateless) && crossThreadData) {
        auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), eventPoolSurfaceAddress.stateless);
        patchWithRequiredSize(patchLocation, eventPoolSurfaceAddress.pointerSize,
                              static_cast<uintptr_t>(devQueue->getEventPoolBuffer()->getGpuAddressToPatch()));
    }

    if (isValidOffset(eventPoolSurfaceAddress.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), eventPoolSurfaceAddress.bindful);
        auto eventPoolBuffer = devQueue->getEventPoolBuffer();
        Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, false, false, eventPoolBuffer->getUnderlyingBufferSize(),
                                (void *)eventPoolBuffer->getGpuAddress(), 0, eventPoolBuffer, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }
}

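// Writes each child block kernel's SIMD size into this parent kernel's
// cross-thread data at the offset recorded for that block id.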
void Kernel::patchBlocksSimdSize() {
    BlockKernelManager *blockManager = program->getBlockKernelManager();

    for (auto &idOffset : kernelInfo.childrenKernelsIdOffset) {

        DEBUG_BREAK_IF(!(idOffset.first < static_cast<uint32_t>(blockManager->getCount())));

        const KernelInfo *blockInfo = blockManager->getBlockKernelInfo(idOffset.first);
        uint32_t *simdSize = reinterpret_cast<uint32_t *>(&crossThreadData[idOffset.second]);
        *simdSize = blockInfo->getMaxSimdSize();
    }
}

bool Kernel::usesSyncBuffer() const {
    return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesSyncBuffer;
}

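// Patches the sync buffer (used for cross-work-group synchronization) into the
// kernel payload at the given offset within the allocation.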
void Kernel::patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
    const auto &syncBuffer = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress;
    auto bufferPatchAddress = ptrOffset(crossThreadData, syncBuffer.stateless);
    patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize,
                          ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));

    if (isValidOffset(syncBuffer.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), syncBuffer.bindful);
        auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
        auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }
}

template void Kernel::patchReflectionSurface<false>(DeviceQueue *, PrintfHandler *);

bool Kernel::isPatched() const {
    return patchedArgumentsNum == kernelInfo.kernelDescriptor.kernelAttributes.numArgsToPatch;
}

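// Verifies that an image argument's memory flags do not conflict with the
// access qualifier declared for that argument in the kernel source.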
cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex,
                                                size_t argSize,
                                                const void *argValue) const {
    const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
    if (arg.is<ArgDescriptor::ArgTImage>()) {
        cl_mem mem = *(static_cast<const cl_mem *>(argValue));
        MemObj *pMemObj = nullptr;
        WithCastToInternal(mem, &pMemObj);
        if (pMemObj) {
            auto accessQualifier = arg.getTraits().accessQualifier;
            cl_mem_flags flags = pMemObj->getFlags();
            if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) ||
                (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) {
                return CL_INVALID_ARG_VALUE;
            }
        } else {
            return CL_INVALID_ARG_VALUE;
        }
    }
    return CL_SUCCESS;
}

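// Once all arguments are patched, converts registered 3D images to 2D arrays
// when every sampler argument is transformable; otherwise reverts any previous
// transform back to 3D.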
void Kernel::resolveArgs() {
    if (!Kernel::isPatched() || !imageTransformer->hasRegisteredImages3d() || !canTransformImages())
        return;
    bool canTransformImageTo2dArray = true;
    const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
    for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
        if (args[i].is<ArgDescriptor::ArgTSampler>()) {
            auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
            if (sampler->isTransformable()) {
                canTransformImageTo2dArray = true;
            } else {
                canTransformImageTo2dArray = false;
                break;
            }
        }
    }

    if (canTransformImageTo2dArray) {
        imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap());
    } else if (imageTransformer->didTransform()) {
        imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap());
    }
}

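// The 3D-image transform is supported only on Gen9 through Gen11LP cores and
// is never applied to built-in kernels.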
bool Kernel::canTransformImages() const {
    auto renderCoreFamily = clDevice.getHardwareInfo().platform.eRenderCoreFamily;
    return renderCoreFamily >= IGFX_GEN9_CORE && renderCoreFamily <= IGFX_GEN11LP_CORE && !isBuiltIn;
}

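// Collects every compressed allocation this kernel may access statelessly
// (buffer args, SVM args and, when stateless compression is allowed, indirect
// USM/SVM allocations) so aux translation can be scheduled for them.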
void Kernel::fillWithKernelObjsForAuxTranslation(KernelObjsForAuxTranslation &kernelObjsForAuxTranslation) {
    kernelObjsForAuxTranslation.reserve(getKernelArgsNumber());
    for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
        if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto buffer = castToObject<Buffer>(getKernelArg(i));
            if (buffer && buffer->getMultiGraphicsAllocation().getDefaultGraphicsAllocation()->isCompressionEnabled()) {
                kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::MEM_OBJ, buffer});
                auto &context = this->program->getContext();
                if (context.isProvidingPerformanceHints()) {
                    const auto &argExtMeta = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[i];
                    context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION,
                                                   kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, argExtMeta.argName.c_str());
                }
            }
        }
        if (SVM_ALLOC_OBJ == getKernelArguments().at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto svmAlloc = reinterpret_cast<GraphicsAllocation *>(const_cast<void *>(getKernelArg(i)));
            if (svmAlloc && svmAlloc->isCompressionEnabled()) {
                kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, svmAlloc});
                auto &context = this->program->getContext();
                if (context.isProvidingPerformanceHints()) {
                    const auto &argExtMeta = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[i];
                    context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION,
                                                   kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, argExtMeta.argName.c_str());
                }
            }
        }
    }
    const auto &hwInfoConfig = *HwInfoConfig::get(getDevice().getHardwareInfo().platform.eProductFamily);
    if (hwInfoConfig.allowStatelessCompression(getDevice().getHardwareInfo())) {
        for (auto gfxAllocation : kernelUnifiedMemoryGfxAllocations) {
            if (gfxAllocation->isCompressionEnabled()) {
                kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, gfxAllocation});
                auto &context = this->program->getContext();
                if (context.isProvidingPerformanceHints()) {
                    context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ALLOCATION_AUX_TRANSLATION,
                                                   kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
                                                   reinterpret_cast<void *>(gfxAllocation->getGpuAddress()), gfxAllocation->getUnderlyingBufferSize());
                }
            }
        }
        if (getContext().getSVMAllocsManager()) {
            for (auto &allocation : getContext().getSVMAllocsManager()->getSVMAllocs()->allocations) {
                auto gfxAllocation = allocation.second.gpuAllocations.getDefaultGraphicsAllocation();
                if (gfxAllocation->isCompressionEnabled()) {
                    kernelObjsForAuxTranslation.insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, gfxAllocation});
                    auto &context = this->program->getContext();
                    if (context.isProvidingPerformanceHints()) {
                        context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ALLOCATION_AUX_TRANSLATION,
                                                       kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
                                                       reinterpret_cast<void *>(gfxAllocation->getGpuAddress()), gfxAllocation->getUnderlyingBufferSize());
                    }
                }
            }
        }
    }
}

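// True when any non-pure-stateful buffer argument is backed by a shared
// (interop) buffer allocation.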
bool Kernel::hasDirectStatelessAccessToSharedBuffer() const {
    for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
        if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto buffer = castToObject<Buffer>(getKernelArg(i));
            if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == GraphicsAllocation::AllocationType::SHARED_BUFFER) {
                return true;
            }
        }
    }
    return false;
}

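// True when any non-pure-stateful buffer or SVM argument resolves to a
// host-memory allocation.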
bool Kernel::hasDirectStatelessAccessToHostMemory() const {
    for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
        if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto buffer = castToObject<Buffer>(getKernelArg(i));
            if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
                return true;
            }
        }
        if (SVM_ALLOC_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto svmAlloc = reinterpret_cast<const GraphicsAllocation *>(getKernelArg(i));
            if (svmAlloc && svmAlloc->getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
                return true;
            }
        }
    }
    return false;
}

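// True when the kernel makes indirect stateless accesses and host memory is
// reachable either through its bound USM allocations or, with indirect host
// allocations allowed, through the context's SVM manager.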
bool Kernel::hasIndirectStatelessAccessToHostMemory() const {
    if (!kernelInfo.hasIndirectStatelessAccess) {
        return false;
    }

    for (auto gfxAllocation : kernelUnifiedMemoryGfxAllocations) {
        if (gfxAllocation->getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY) {
            return true;
        }
    }

    if (unifiedMemoryControls.indirectHostAllocationsAllowed) {
        return getContext().getSVMAllocsManager()->hasHostAllocations();
    }

    return false;
}

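// Gathers the allocations that need an L3 flush after the walker: flagged
// kernel arguments, the program's global surface and, when required, SVM
// allocations.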
void Kernel::getAllocationsForCacheFlush(CacheFlushAllocationsVec &out) const {
    if (false == HwHelper::cacheFlushAfterWalkerSupported(getHardwareInfo())) {
        return;
    }
    for (GraphicsAllocation *alloc : this->kernelArgRequiresCacheFlush) {
        if (nullptr == alloc) {
            continue;
        }

        out.push_back(alloc);
    }

    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    auto global = getProgram()->getGlobalSurface(rootDeviceIndex);
    if (global != nullptr) {
        out.push_back(global);
    }

    if (svmAllocationsRequireCacheFlush) {
        for (GraphicsAllocation *alloc : kernelSvmGfxAllocations) {
            if (allocationForCacheFlush(alloc)) {
                out.push_back(alloc);
            }
        }
    }
}

bool Kernel::allocationForCacheFlush(GraphicsAllocation *argAllocation) const {
    return argAllocation->isFlushL3Required();
}

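// Records the cache-flush requirement for the argument at argIndex; only
// allocations that actually require an L3 flush are kept.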
void Kernel::addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation) {
    if (argAllocation == nullptr) {
        kernelArgRequiresCacheFlush[argIndex] = nullptr;
    } else {
        if (allocationForCacheFlush(argAllocation)) {
            kernelArgRequiresCacheFlush[argIndex] = argAllocation;
        } else {
            kernelArgRequiresCacheFlush[argIndex] = nullptr;
        }
    }
}

void Kernel::setReflectionSurfaceBlockBtOffset(uint32_t blockID, uint32_t offset) {
    DEBUG_BREAK_IF(blockID >= program->getBlockKernelManager()->getCount());
    ReflectionSurfaceHelper::setKernelAddressDataBtOffset(getKernelReflectionSurface()->getUnderlyingBuffer(), blockID, offset);
}

bool Kernel::checkIfIsParentKernelAndBlocksUsesPrintf() {
    return isParentKernel && getProgram()->getBlockKernelManager()->getIfBlockUsesPrintf();
}

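// Computes the kernel's start offset: skips the per-thread-data load when
// local ids are generated in hardware and the kernel uses them, and applies
// the skipSetFFIDGP workaround offset when the compute command streamer is
// used and the platform requires it.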
uint64_t Kernel::getKernelStartOffset(
    const bool localIdsGenerationByRuntime,
    const bool kernelUsesLocalIds,
    const bool isCssUsed) const {

    uint64_t kernelStartOffset = 0;

    if (kernelInfo.getGraphicsAllocation()) {
        kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
        if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) {
            kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad;
        }
    }

    kernelStartOffset += getStartOffset();

    auto &hardwareInfo = getHardwareInfo();
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);

    if (isCssUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
        kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipSetFFIDGP;
    }

    return kernelStartOffset;
}

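// Allocates a surface state in the global bindless heap for the given
// allocation, patches its heap offset into cross-thread data and returns a
// CPU pointer to that surface state.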
void *Kernel::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
    auto &hwHelper = HwHelper::get(getDevice().getHardwareInfo().platform.eRenderCoreFamily);
    auto surfaceStateSize = hwHelper.getRenderSurfaceStateSize();
    NEO::BindlessHeapsHelper *bindlessHeapsHelper = getDevice().getDevice().getBindlessHeapsHelper();
    auto ssInHeap = bindlessHeapsHelper->allocateSSInHeap(surfaceStateSize, alloc, NEO::BindlessHeapsHelper::GLOBAL_SSH);
    auto patchLocation = ptrOffset(getCrossThreadData(), bindless);
    auto patchValue = hwHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(ssInHeap.surfaceStateOffset));
    patchWithRequiredSize(patchLocation, sizeof(patchValue), patchValue);
    return ssInHeap.ssPtr;
}

void Kernel::setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo) {
    this->additionalKernelExecInfo = additionalKernelExecInfo;
}

uint32_t Kernel::getAdditionalKernelExecInfo() const {
    return this->additionalKernelExecInfo;
}

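// The RCC RHWO optimization must be disabled when the platform requires the
// workaround and any shared (interop) argument uses media-compressed storage.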
bool Kernel::requiresWaDisableRccRhwoOptimization() const {
    auto &hardwareInfo = getHardwareInfo();
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();

    if (hwHelper.isWaDisableRccRhwoOptimizationRequired() && isUsingSharedObjArgs()) {
        for (auto &arg : getKernelArguments()) {
            auto clMemObj = static_cast<cl_mem>(arg.object);
            auto memObj = castToObject<MemObj>(clMemObj);
            if (memObj && memObj->peekSharingHandler()) {
                auto allocation = memObj->getGraphicsAllocation(rootDeviceIndex);
                for (uint32_t handleId = 0u; handleId < allocation->getNumGmms(); handleId++) {
                    if (allocation->getGmm(handleId)->gmmResourceInfo->getResourceFlags()->Info.MediaCompressed) {
                        return true;
                    }
                }
            }
        }
    }
    return false;
}

const HardwareInfo &Kernel::getHardwareInfo() const {
    return getDevice().getHardwareInfo();
}

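// The setters below patch dispatch traits (work dimension, offsets, sizes and
// group counts) into cross-thread data and mirror the values into the
// implicit-args struct when the kernel uses one.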
void Kernel::setWorkDim(uint32_t workDim) {
    patchNonPointer(getCrossThreadDataRef(), getDescriptor().payloadMappings.dispatchTraits.workDim, workDim);
    if (pImplicitArgs) {
        pImplicitArgs->numWorkDim = workDim;
    }
}

void Kernel::setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset,
                       {globalWorkOffsetX, globalWorkOffsetY, globalWorkOffsetZ});
    if (pImplicitArgs) {
        pImplicitArgs->globalOffsetX = globalWorkOffsetX;
        pImplicitArgs->globalOffsetY = globalWorkOffsetY;
        pImplicitArgs->globalOffsetZ = globalWorkOffsetZ;
    }
}

void Kernel::setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.globalWorkSize,
                       {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ});
    if (pImplicitArgs) {
        pImplicitArgs->globalSizeX = globalWorkSizeX;
        pImplicitArgs->globalSizeY = globalWorkSizeY;
        pImplicitArgs->globalSizeZ = globalWorkSizeZ;
    }
}

void Kernel::setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.localWorkSize,
                       {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
    if (pImplicitArgs) {
        pImplicitArgs->localSizeX = localWorkSizeX;
        pImplicitArgs->localSizeY = localWorkSizeY;
        pImplicitArgs->localSizeZ = localWorkSizeZ;
    }
}

void Kernel::setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.localWorkSize2,
                       {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
}

void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.enqueuedLocalWorkSize,
                       {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
}

void Kernel::setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.numWorkGroups,
                       {numWorkGroupsX, numWorkGroupsY, numWorkGroupsZ});
    if (pImplicitArgs) {
        pImplicitArgs->groupCountX = numWorkGroupsX;
        pImplicitArgs->groupCountY = numWorkGroupsY;
        pImplicitArgs->groupCountZ = numWorkGroupsZ;
    }
}

bool Kernel::isLocalWorkSize2Patchable() {
    const auto &localWorkSize2 = getDescriptor().payloadMappings.dispatchTraits.localWorkSize2;
    return isValidOffset(localWorkSize2[0]) && isValidOffset(localWorkSize2[1]) && isValidOffset(localWorkSize2[2]);
}

uint32_t Kernel::getMaxKernelWorkGroupSize() const {
    return maxKernelWorkGroupSize;
}

uint32_t Kernel::getSlmTotalSize() const {
    return slmTotalSize;
}

bool Kernel::areMultipleSubDevicesInContext() const {
    auto context = program->getContextPtr();
    return context ? context->containsMultipleSubDevices(clDevice.getRootDeviceIndex()) : false;
}

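// Halves the maximum work-group size for kernels compiled with the large GRF
// configuration and caches descriptor flags used at dispatch time.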
void Kernel::reconfigureKernel() {
    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    if (kernelDescriptor.kernelAttributes.numGrfRequired == GrfConfig::LargeGrfNumber) {
        maxKernelWorkGroupSize >>= 1;
    }
    this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
    this->specialPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSpecialPipelineSelectMode;
}

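// Decides whether a cache-flush command must follow this kernel's walker. A
// debug flag can force the decision either way; otherwise the flush is limited
// to flushing queues on non-default, multi-subdevice contexts with a
// single-context CSR, and only when a global surface, flagged SVM allocation
// or flagged argument is present.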
bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
    if (false == HwHelper::cacheFlushAfterWalkerSupported(commandQueue.getDevice().getHardwareInfo())) {
        return false;
    }

    if (DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get() != -1) {
        return !!DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get();
    }

    bool cmdQueueRequiresCacheFlush = commandQueue.getRequiresCacheFlushAfterWalker();
    if (false == cmdQueueRequiresCacheFlush) {
        return false;
    }
    if (commandQueue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()) {
        return false;
    }
    bool isMultiDevice = commandQueue.getContext().containsMultipleSubDevices(commandQueue.getDevice().getRootDeviceIndex());
    if (false == isMultiDevice) {
        return false;
    }
    bool isDefaultContext = (commandQueue.getContext().peekContextType() == ContextType::CONTEXT_TYPE_DEFAULT);
    if (true == isDefaultContext) {
        return false;
    }

    if (getProgram()->getGlobalSurface(commandQueue.getDevice().getRootDeviceIndex()) != nullptr) {
        return true;
    }
    if (svmAllocationsRequireCacheFlush) {
        return true;
    }
    size_t args = kernelArgRequiresCacheFlush.size();
    for (size_t i = 0; i < args; i++) {
        if (kernelArgRequiresCacheFlush[i] != nullptr) {
            return true;
        }
    }
    return false;
}

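// Work-group size limiting applies only to built-in kernels outside of aux
// translation; it is skipped when the first argument is a local-memory buffer
// or an image.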
bool Kernel::requiresLimitedWorkgroupSize() const {
    if (!this->isBuiltIn) {
        return false;
    }
    if (this->auxTranslationDirection != AuxTranslationDirection::None) {
        return false;
    }

    // if the source is a buffer in local memory, there is no need for a limited workgroup
    if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTPointer>()) {
        if (this->getKernelArgInfo(0).object) {
            auto rootDeviceIndex = getDevice().getRootDeviceIndex();
            auto buffer = castToObject<Buffer>(this->getKernelArgInfo(0u).object);
            if (buffer && buffer->getGraphicsAllocation(rootDeviceIndex)->getMemoryPool() == MemoryPool::LocalMemory) {
                return false;
            }
        }
    }

    // if we are reading from an image, there is no need for a limited workgroup
    if (this->kernelInfo.getArgDescriptorAt(0).is<ArgDescriptor::ArgTImage>()) {
        return false;
    }

    return true;
}

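// When stateless compression is allowed, any direct or indirect stateless
// access to host memory or to a shared buffer forces aux translation.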
void Kernel::updateAuxTranslationRequired() {
    const auto &hwInfoConfig = *HwInfoConfig::get(getDevice().getHardwareInfo().platform.eProductFamily);
    if (hwInfoConfig.allowStatelessCompression(getDevice().getHardwareInfo())) {
        if (hasDirectStatelessAccessToHostMemory() ||
            hasIndirectStatelessAccessToHostMemory() ||
            hasDirectStatelessAccessToSharedBuffer()) {
            setAuxTranslationRequired(true);
        }
    }
}

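// Maps a CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_*_INTEL value to the
// internal ThreadArbitrationPolicy; returns CL_INVALID_DEVICE when the device
// does not support the feature and CL_INVALID_VALUE for an unrecognized policy.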
int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
    auto &hwInfo = clDevice.getHardwareInfo();
    auto &hwHelper = NEO::ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
    if (!hwHelper.isSupportedKernelThreadArbitrationPolicy()) {
        this->threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
        return CL_INVALID_DEVICE;
    } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL) {
        this->threadArbitrationPolicy = ThreadArbitrationPolicy::RoundRobin;
    } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_OLDEST_FIRST_INTEL) {
        this->threadArbitrationPolicy = ThreadArbitrationPolicy::AgeBased;
    } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_AFTER_DEPENDENCY_ROUND_ROBIN_INTEL ||
               policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_STALL_BASED_ROUND_ROBIN_INTEL) {
        this->threadArbitrationPolicy = ThreadArbitrationPolicy::RoundRobinAfterDependency;
    } else {
        this->threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
        return CL_INVALID_VALUE;
    }
    return CL_SUCCESS;
}

} // namespace NEO