/*
 * Copyright (C) 2020-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "level_zero/core/source/kernel/kernel_imp.h"

#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/local_work_size.h"
#include "shared/source/helpers/per_thread_data.h"
#include "shared/source/helpers/ray_tracing_helper.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/helpers/string.h"
#include "shared/source/helpers/surface_format_info.h"
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/program/kernel_info.h"
#include "shared/source/utilities/arrayref.h"

#include "level_zero/core/source/debugger/debugger_l0.h"
#include "level_zero/core/source/device/device.h"
#include "level_zero/core/source/device/device_imp.h"
#include "level_zero/core/source/driver/driver_handle_imp.h"
#include "level_zero/core/source/image/image.h"
#include "level_zero/core/source/image/image_format_desc_helper.h"
#include "level_zero/core/source/module/module.h"
#include "level_zero/core/source/module/module_imp.h"
#include "level_zero/core/source/printf_handler/printf_handler.h"
#include "level_zero/core/source/sampler/sampler.h"

#include <memory>

namespace L0 {
enum class SamplerPatchValues : uint32_t {
    DefaultSampler = 0x00,
    AddressNone = 0x00,
    AddressClamp = 0x01,
    AddressClampToEdge = 0x02,
    AddressRepeat = 0x03,
    AddressMirroredRepeat = 0x04,
    AddressMirroredRepeat101 = 0x05,
    NormalizedCoordsFalse = 0x00,
    NormalizedCoordsTrue = 0x08
};

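// Maps a Level Zero sampler addressing mode onto the patch value expected by
// the sampler metadata slots in cross-thread data.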
inline SamplerPatchValues getAddrMode(ze_sampler_address_mode_t addressingMode) {
    switch (addressingMode) {
    case ZE_SAMPLER_ADDRESS_MODE_REPEAT:
        return SamplerPatchValues::AddressRepeat;
    case ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER:
        return SamplerPatchValues::AddressClampToEdge;
    case ZE_SAMPLER_ADDRESS_MODE_CLAMP:
        return SamplerPatchValues::AddressClamp;
    case ZE_SAMPLER_ADDRESS_MODE_NONE:
        return SamplerPatchValues::AddressNone;
    case ZE_SAMPLER_ADDRESS_MODE_MIRROR:
        return SamplerPatchValues::AddressMirroredRepeat;
    default:
        DEBUG_BREAK_IF(true);
    }
    return SamplerPatchValues::AddressNone;
}

KernelImmutableData::KernelImmutableData(L0::Device *l0device) : device(l0device) {}

KernelImmutableData::~KernelImmutableData() {
    if (nullptr != isaGraphicsAllocation) {
        this->getDevice()->getNEODevice()->getMemoryManager()->freeGraphicsMemory(&*isaGraphicsAllocation);
        isaGraphicsAllocation.release();
    }
    crossThreadDataTemplate.reset();
    surfaceStateHeapTemplate.reset();
    dynamicStateHeapTemplate.reset();
}

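// Patches an implicit surface (global constants/variables, private memory,
// debug surface) into the kernel payload: the GPU address goes into
// cross-thread data, and for bindful bindings a buffer surface state is
// encoded into the surface state heap.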
inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                     uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) {
    if (false == crossThreadData.empty()) {
        NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
    }

    if ((false == surfaceStateHeap.empty()) && (NEO::isValidOffset(ptr.bindful))) {
        auto surfaceState = surfaceStateHeap.begin() + ptr.bindful;
        auto addressToPatch = allocation.getGpuAddress();
        size_t sizeToPatch = allocation.getUnderlyingBufferSize();

        auto &hwInfo = device.getHardwareInfo();
        auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);

        NEO::EncodeSurfaceStateArgs args;
        args.outMemory = surfaceState;
        args.size = sizeToPatch;
        args.graphicsAddress = addressToPatch;
        args.gmmHelper = device.getGmmHelper();
        args.allocation = &allocation;
        args.useGlobalAtomics = useGlobalAtomics;
        args.numAvailableDevices = device.getNumGenericSubDevices();
        args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
        args.mocs = hwHelper.getMocsIndex(*args.gmmHelper, true, false) << 1;

        hwHelper.encodeBufferSurfaceState(args);
    }
}

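// Per-module, one-time setup of a kernel's immutable state: allocates the ISA
// allocation, captures the cross-thread data, SSH and DSH templates from the
// kernel info, and patches the global constants/variables surfaces into them.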
void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device,
                                     uint32_t computeUnitsUsedForScratch,
                                     NEO::GraphicsAllocation *globalConstBuffer,
                                     NEO::GraphicsAllocation *globalVarBuffer, bool internalKernel) {

    UNRECOVERABLE_IF(kernelInfo == nullptr);
    this->kernelInfo = kernelInfo;
    this->kernelDescriptor = &kernelInfo->kernelDescriptor;

    DeviceImp *deviceImp = static_cast<DeviceImp *>(device);
    auto neoDevice = deviceImp->getActiveDevice();
    auto memoryManager = neoDevice->getMemoryManager();

    auto kernelIsaSize = kernelInfo->heapInfo.KernelHeapSize;
    UNRECOVERABLE_IF(kernelIsaSize == 0);
    UNRECOVERABLE_IF(!kernelInfo->heapInfo.pKernelHeap);
    const auto allocType = internalKernel ? NEO::GraphicsAllocation::AllocationType::KERNEL_ISA_INTERNAL : NEO::GraphicsAllocation::AllocationType::KERNEL_ISA;

    auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(
        {neoDevice->getRootDeviceIndex(), kernelIsaSize, allocType, neoDevice->getDeviceBitfield()});
    UNRECOVERABLE_IF(allocation == nullptr);

    isaGraphicsAllocation.reset(allocation);

    if (neoDevice->getDebugger() && kernelInfo->kernelDescriptor.external.debugData.get()) {
        createRelocatedDebugData(globalConstBuffer, globalVarBuffer);
        if (device->getL0Debugger()) {
            device->getL0Debugger()->registerElf(kernelInfo->kernelDescriptor.external.debugData.get(), allocation);
        }
    }

    this->crossThreadDataSize = this->kernelDescriptor->kernelAttributes.crossThreadDataSize;

    ArrayRef<uint8_t> crossThreadDataArrayRef;
    if (crossThreadDataSize != 0) {
        crossThreadDataTemplate.reset(new uint8_t[crossThreadDataSize]);

        if (kernelInfo->crossThreadData) {
            memcpy_s(crossThreadDataTemplate.get(), crossThreadDataSize,
                     kernelInfo->crossThreadData, crossThreadDataSize);
        } else {
            memset(crossThreadDataTemplate.get(), 0x00, crossThreadDataSize);
        }

        crossThreadDataArrayRef = ArrayRef<uint8_t>(this->crossThreadDataTemplate.get(), this->crossThreadDataSize);

        NEO::patchNonPointer<uint32_t>(crossThreadDataArrayRef,
                                       kernelDescriptor->payloadMappings.implicitArgs.simdSize, kernelDescriptor->kernelAttributes.simdSize);
    }

    if (kernelInfo->heapInfo.SurfaceStateHeapSize != 0) {
        this->surfaceStateHeapSize = kernelInfo->heapInfo.SurfaceStateHeapSize;
        surfaceStateHeapTemplate.reset(new uint8_t[surfaceStateHeapSize]);

        memcpy_s(surfaceStateHeapTemplate.get(), surfaceStateHeapSize,
                 kernelInfo->heapInfo.pSsh, surfaceStateHeapSize);
    }

    if (kernelInfo->heapInfo.DynamicStateHeapSize != 0) {
        this->dynamicStateHeapSize = kernelInfo->heapInfo.DynamicStateHeapSize;
        dynamicStateHeapTemplate.reset(new uint8_t[dynamicStateHeapSize]);

        memcpy_s(dynamicStateHeapTemplate.get(), dynamicStateHeapSize,
                 kernelInfo->heapInfo.pDsh, dynamicStateHeapSize);
    }

    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(surfaceStateHeapTemplate.get(), getSurfaceStateHeapSize());

    if (NEO::isValidOffset(kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
        UNRECOVERABLE_IF(nullptr == globalConstBuffer);

        patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                                 static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
                                 *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
        this->residencyContainer.push_back(globalConstBuffer);
    } else if (nullptr != globalConstBuffer) {
        this->residencyContainer.push_back(globalConstBuffer);
    }

    if (NEO::isValidOffset(kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
        UNRECOVERABLE_IF(globalVarBuffer == nullptr);

        patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                                 static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
                                 *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
        this->residencyContainer.push_back(globalVarBuffer);
    } else if (nullptr != globalVarBuffer) {
        this->residencyContainer.push_back(globalVarBuffer);
    }
}

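// Applies debug-data relocations against the final GPU addresses of the text,
// global-variable and global-constant segments, storing the relocated copy in
// the kernel descriptor's relocatedDebugData.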
void KernelImmutableData::createRelocatedDebugData(NEO::GraphicsAllocation *globalConstBuffer,
                                                   NEO::GraphicsAllocation *globalVarBuffer) {
    NEO::Linker::SegmentInfo globalData;
    NEO::Linker::SegmentInfo constData;
    if (globalVarBuffer) {
        globalData.gpuAddress = globalVarBuffer->getGpuAddress();
        globalData.segmentSize = globalVarBuffer->getUnderlyingBufferSize();
    }
    if (globalConstBuffer) {
        constData.gpuAddress = globalConstBuffer->getGpuAddress();
        constData.segmentSize = globalConstBuffer->getUnderlyingBufferSize();
    }

    if (kernelInfo->kernelDescriptor.external.debugData.get()) {
        std::string outErrReason;
        std::string outWarning;
        auto decodedElf = NEO::Elf::decodeElf<NEO::Elf::EI_CLASS_64>(ArrayRef<const uint8_t>(reinterpret_cast<const uint8_t *>(kernelInfo->kernelDescriptor.external.debugData->vIsa),
                                                                                             kernelInfo->kernelDescriptor.external.debugData->vIsaSize),
                                                                     outErrReason, outWarning);

        if (decodedElf.getDebugInfoRelocations().size() > 1) {
            auto size = kernelInfo->kernelDescriptor.external.debugData->vIsaSize;
            kernelInfo->kernelDescriptor.external.relocatedDebugData = std::make_unique<uint8_t[]>(size);

            memcpy_s(kernelInfo->kernelDescriptor.external.relocatedDebugData.get(), size, kernelInfo->kernelDescriptor.external.debugData->vIsa, kernelInfo->kernelDescriptor.external.debugData->vIsaSize);

            NEO::Linker::SegmentInfo textSegment = {getIsaGraphicsAllocation()->getGpuAddress(),
                                                    getIsaGraphicsAllocation()->getUnderlyingBufferSize()};

            NEO::Linker::applyDebugDataRelocations(decodedElf, ArrayRef<uint8_t>(kernelInfo->kernelDescriptor.external.relocatedDebugData.get(), size),
                                                   textSegment, globalData, constData);
        }
    }
}

uint32_t KernelImmutableData::getIsaSize() const {
    return static_cast<uint32_t>(isaGraphicsAllocation->getUnderlyingBufferSize());
}

KernelImp::KernelImp(Module *module) : module(module) {}

KernelImp::~KernelImp() {
    if (nullptr != privateMemoryGraphicsAllocation) {
        module->getDevice()->getNEODevice()->getMemoryManager()->freeGraphicsMemory(privateMemoryGraphicsAllocation);
    }

    if (perThreadDataForWholeThreadGroup != nullptr) {
        alignedFree(perThreadDataForWholeThreadGroup);
    }
    if (printfBuffer != nullptr) {
        // Not allowed to call a virtual function from the destructor, so call printOutput directly.
        PrintfHandler::printOutput(kernelImmData, this->printfBuffer, module->getDevice());
        module->getDevice()->getNEODevice()->getMemoryManager()->freeGraphicsMemory(printfBuffer);
    }
    slmArgSizes.clear();
    crossThreadData.reset();
    surfaceStateHeapData.reset();
    dynamicStateHeapData.reset();
}

ze_result_t KernelImp::setArgumentValue(uint32_t argIndex, size_t argSize,
                                        const void *pArgValue) {
    if (argIndex >= kernelArgHandlers.size()) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }
    return (this->*kernelArgHandlers[argIndex])(argIndex, argSize, pArgValue);
}

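// Patches the dispatch traits derived from the group count (global work size,
// number of work groups, work dimension) into cross-thread data and mirrors
// them into the implicit-args structure when the kernel uses one.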
void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    uint32_t globalWorkSize[3] = {groupCountX * groupSize[0], groupCountY * groupSize[1],
                                  groupCountZ * groupSize[2]};
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkSize, globalWorkSize);

    uint32_t groupCount[3] = {groupCountX, groupCountY, groupCountZ};
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.numWorkGroups, groupCount);

    uint32_t workDim = 1;
    if (groupCountZ * groupSize[2] > 1) {
        workDim = 3;
    } else if (groupCountY * groupSize[1] > 1) {
        workDim = 2;
    }
    auto workDimOffset = desc.payloadMappings.dispatchTraits.workDim;
    if (NEO::isValidOffset(workDimOffset)) {
        auto destinationBuffer = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
        NEO::patchNonPointer(destinationBuffer, desc.payloadMappings.dispatchTraits.workDim, workDim);
    }

    if (pImplicitArgs) {
        pImplicitArgs->numWorkDim = workDim;

        pImplicitArgs->globalSizeX = globalWorkSize[0];
        pImplicitArgs->globalSizeY = globalWorkSize[1];
        pImplicitArgs->globalSizeZ = globalWorkSize[2];

        pImplicitArgs->groupCountX = groupCount[0];
        pImplicitArgs->groupCountY = groupCount[1];
        pImplicitArgs->groupCountZ = groupCount[2];
    }
}

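// Validates the requested group size, patches the local work size into
// cross-thread data, and, when local IDs have to be generated by the runtime,
// (re)builds the per-thread data for the whole thread group.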
ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
                                    uint32_t groupSizeZ) {
    if ((0 == groupSizeX) || (0 == groupSizeY) || (0 == groupSizeZ)) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    auto numChannels = kernelImmData->getDescriptor().kernelAttributes.numLocalIdChannels;
    Vec3<size_t> groupSize{groupSizeX, groupSizeY, groupSizeZ};
    auto itemsInGroup = Math::computeTotalElementsCount(groupSize);

    if (itemsInGroup > module->getMaxGroupSize()) {
        DEBUG_BREAK_IF(true);
        return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
    }

    this->groupSize[0] = groupSizeX;
    this->groupSize[1] = groupSizeY;
    this->groupSize[2] = groupSizeZ;
    const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
    for (uint32_t i = 0u; i < 3u; i++) {
        if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != 0 &&
            kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != this->groupSize[i]) {
            NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                                  "Invalid group size {%d, %d, %d} specified, requiredWorkGroupSize = {%d, %d, %d}\n",
                                  this->groupSize[0], this->groupSize[1], this->groupSize[2],
                                  kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0],
                                  kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1],
                                  kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2]);
            return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
        }
    }

    auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
    this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
    patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);

    auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
    threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes));
    if (!threadExecutionMask) {
        threadExecutionMask = static_cast<uint32_t>(maxNBitValue((simdSize == 1) ? 32 : simdSize));
    }
    evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);

    if (kernelRequiresGenerationOfLocalIdsByRuntime) {
        auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
        uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
            static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
                simdSize, grfSize, numChannels, itemsInGroup));
        if (perThreadDataSizeForWholeThreadGroupNeeded >
            perThreadDataSizeForWholeThreadGroupAllocated) {
            alignedFree(perThreadDataForWholeThreadGroup);
            perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
            perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
        }
        perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;

        if (numChannels > 0) {
            UNRECOVERABLE_IF(3 != numChannels);
            NEO::generateLocalIDs(
                perThreadDataForWholeThreadGroup,
                static_cast<uint16_t>(simdSize),
                std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
                                         static_cast<uint16_t>(groupSizeY),
                                         static_cast<uint16_t>(groupSizeZ)}},
                std::array<uint8_t, 3>{{0, 1, 2}},
                false, grfSize);
        }

        this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
    }
    return ZE_RESULT_SUCCESS;
}

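// Suggests a group size for the given global work size using the shared
// work-group-size heuristics (ND, 1D, squared or 2D variants).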
ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY,
                                        uint32_t globalSizeZ, uint32_t *groupSizeX,
                                        uint32_t *groupSizeY, uint32_t *groupSizeZ) {
    size_t retGroupSize[3] = {};
    auto maxWorkGroupSize = module->getMaxGroupSize();
    auto simd = kernelImmData->getDescriptor().kernelAttributes.simdSize;
    size_t workItems[3] = {globalSizeX, globalSizeY, globalSizeZ};
    uint32_t dim = (globalSizeY > 1U) ? 2 : 1U;
    dim = (globalSizeZ > 1U) ? 3 : dim;

    if (NEO::DebugManager.flags.EnableComputeWorkSizeND.get()) {
        auto usesImages = getImmutableData()->getDescriptor().kernelAttributes.flags.usesImages;
        auto neoDevice = module->getDevice()->getNEODevice();
        const auto hwInfo = &neoDevice->getHardwareInfo();
        const auto &deviceInfo = neoDevice->getDeviceInfo();
        uint32_t numThreadsPerSubSlice = static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice) * deviceInfo.numThreadsPerEU;
        uint32_t localMemSize = static_cast<uint32_t>(deviceInfo.localMemSize);

        NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(),
                                 hwInfo, numThreadsPerSubSlice, localMemSize,
                                 usesImages, false);
        NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim);
    } else {
        if (1U == dim) {
            NEO::computeWorkgroupSize1D(maxWorkGroupSize, retGroupSize, workItems, simd);
        } else if (NEO::DebugManager.flags.EnableComputeWorkSizeSquared.get() && (2U == dim)) {
            NEO::computeWorkgroupSizeSquared(maxWorkGroupSize, retGroupSize, workItems, simd, dim);
        } else {
            NEO::computeWorkgroupSize2D(maxWorkGroupSize, retGroupSize, workItems, simd);
        }
    }

    *groupSizeX = static_cast<uint32_t>(retGroupSize[0]);
    *groupSizeY = static_cast<uint32_t>(retGroupSize[1]);
    *groupSizeZ = static_cast<uint32_t>(retGroupSize[2]);

    return ZE_RESULT_SUCCESS;
}

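// Computes the maximum number of work groups that can run concurrently for a
// cooperative dispatch, based on the available thread count, SLM usage and
// barrier requirements.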
ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType,
                                                       bool isEngineInstanced) {
    UNRECOVERABLE_IF(0 == groupSize[0]);
    UNRECOVERABLE_IF(0 == groupSize[1]);
    UNRECOVERABLE_IF(0 == groupSize[2]);

    auto &hardwareInfo = module->getDevice()->getHwInfo();

    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
    }
    auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    auto &descriptor = kernelImmData->getDescriptor();
    auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
        hardwareInfo.platform.eProductFamily,
        descriptor.kernelAttributes.numGrfRequired,
        hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount);

    auto barrierCount = descriptor.kernelAttributes.barrierCount;
    const uint32_t workDim = 3;
    const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
    *totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(descriptor.kernelAttributes.simdSize,
                                                               availableThreadCount,
                                                               dssCount,
                                                               dssCount * KB * hardwareInfo.capabilityTable.slmSize,
                                                               hwHelper.alignSlmSize(slmArgsTotalSize + descriptor.kernelAttributes.slmInlineSize),
                                                               static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
                                                               hwHelper.getBarriersCountFromHasBarriers(barrierCount),
                                                               workDim,
                                                               localWorkSize);
    *totalGroupCount = hwHelper.adjustMaxWorkGroupCount(*totalGroupCount, engineGroupType, hardwareInfo, isEngineInstanced);
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setIndirectAccess(ze_kernel_indirect_access_flags_t flags) {
    if (NEO::DebugManager.flags.DisableIndirectAccess.get() == 1 || this->kernelHasIndirectAccess == false) {
        return ZE_RESULT_SUCCESS;
    }

    if (flags & ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE) {
        this->unifiedMemoryControls.indirectDeviceAllocationsAllowed = true;
    }
    if (flags & ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST) {
        this->unifiedMemoryControls.indirectHostAllocationsAllowed = true;
    }
    if (flags & ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED) {
        this->unifiedMemoryControls.indirectSharedAllocationsAllowed = true;
    }

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getIndirectAccess(ze_kernel_indirect_access_flags_t *flags) {
    *flags = 0;
    if (this->unifiedMemoryControls.indirectDeviceAllocationsAllowed) {
        *flags |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
    }
    if (this->unifiedMemoryControls.indirectHostAllocationsAllowed) {
        *flags |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
    }
    if (this->unifiedMemoryControls.indirectSharedAllocationsAllowed) {
        *flags |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
    }

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getSourceAttributes(uint32_t *pSize, char **pString) {
    auto &desc = kernelImmData->getDescriptor();
    if (pString == nullptr) {
        *pSize = static_cast<uint32_t>(desc.kernelMetadata.kernelLanguageAttributes.length()) + 1;
    } else {
        strncpy_s(*pString, desc.kernelMetadata.kernelLanguageAttributes.length() + 1,
                  desc.kernelMetadata.kernelLanguageAttributes.c_str(),
                  desc.kernelMetadata.kernelLanguageAttributes.length() + 1);
    }
    return ZE_RESULT_SUCCESS;
}

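// Copies an immediate (by-value) argument into cross-thread data element by
// element; a null argVal zero-fills the destination instead.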
ze_result_t KernelImp::setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (kernelImmData->getDescriptor().payloadMappings.explicitArgs.size() <= argIndex) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex];

    for (const auto &element : arg.as<NEO::ArgDescValue>().elements) {
        if (element.sourceOffset < argSize) {
            size_t maxBytesToCopy = argSize - element.sourceOffset;
            size_t bytesToCopy = std::min(static_cast<size_t>(element.size), maxBytesToCopy);

            auto pDst = ptrOffset(crossThreadData.get(), element.offset);
            if (argVal) {
                auto pSrc = ptrOffset(argVal, element.sourceOffset);
                memcpy_s(pDst, element.size, pSrc, bytesToCopy);
            } else {
                uint64_t val = 0;
                memcpy_s(pDst, element.size,
                         reinterpret_cast<void *>(&val), bytesToCopy);
            }
        } else {
            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
        }
    }
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle_t argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescImage>();
    if (argVal == nullptr) {
        residencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

    const auto image = Image::fromHandle(argVal);
    image->copyRedescribedSurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful);
    residencyContainer[argIndex] = image->getAllocation();

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgBufferWithAlloc(uint32_t argIndex, uintptr_t argVal, NEO::GraphicsAllocation *allocation) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
    const auto val = argVal;

    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg, val);
    if (NEO::isValidOffset(arg.bindful) || NEO::isValidOffset(arg.bindless)) {
        setBufferSurfaceState(argIndex, reinterpret_cast<void *>(val), allocation);
    }

    auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(allocation->getGpuAddress()));
    if (allocData) {
        bool argWasUncacheable = isArgUncached[argIndex];
        bool argIsUncacheable = allocData->allocationFlagsProperty.flags.locallyUncachedResource;
        if (argWasUncacheable == false && argIsUncacheable) {
            kernelRequiresUncachedMocsCount++;
        } else if (argWasUncacheable && argIsUncacheable == false) {
            kernelRequiresUncachedMocsCount--;
        }
        this->setKernelArgUncached(argIndex, argIsUncacheable);
    }

    residencyContainer[argIndex] = allocation;

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgUnknown(uint32_t argIndex, size_t argSize, const void *argVal) {
    return ZE_RESULT_SUCCESS;
}

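// Sets a buffer argument. Local (SLM) pointers are handled by re-laying-out
// the aligned SLM offsets of all subsequent SLM arguments; other pointers are
// resolved to their backing allocation, going through the peer-allocation path
// when the address belongs to a remote device.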
ze_result_t KernelImp::setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal) {
    const auto &allArgs = kernelImmData->getDescriptor().payloadMappings.explicitArgs;
    const auto &currArg = allArgs[argIndex];
    if (currArg.getTraits().getAddressQualifier() == NEO::KernelArgMetadata::AddrLocal) {
        slmArgSizes[argIndex] = static_cast<uint32_t>(argSize);
        UNRECOVERABLE_IF(NEO::isUndefinedOffset(currArg.as<NEO::ArgDescPointer>().slmOffset));
        auto slmOffset = *reinterpret_cast<uint32_t *>(crossThreadData.get() + currArg.as<NEO::ArgDescPointer>().slmOffset);
        slmOffset += static_cast<uint32_t>(argSize);
        ++argIndex;
        while (argIndex < kernelImmData->getDescriptor().payloadMappings.explicitArgs.size()) {
            if (allArgs[argIndex].getTraits().getAddressQualifier() != NEO::KernelArgMetadata::AddrLocal) {
                ++argIndex;
                continue;
            }
            const auto &nextArg = allArgs[argIndex].as<NEO::ArgDescPointer>();
            UNRECOVERABLE_IF(0 == nextArg.requiredSlmAlignment);
            slmOffset = alignUp<uint32_t>(slmOffset, nextArg.requiredSlmAlignment);
            NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), nextArg.slmOffset, slmOffset);

            slmOffset += static_cast<uint32_t>(slmArgSizes[argIndex]);
            ++argIndex;
        }
        slmArgsTotalSize = static_cast<uint32_t>(alignUp(slmOffset, KB));
        return ZE_RESULT_SUCCESS;
    }

    if (nullptr == argVal) {
        residencyContainer[argIndex] = nullptr;
        const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
        uintptr_t nullBufferValue = 0;
        NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg, nullBufferValue);
        return ZE_RESULT_SUCCESS;
    }

    auto requestedAddress = *reinterpret_cast<void *const *>(argVal);
    uintptr_t gpuAddress = 0u;
    NEO::GraphicsAllocation *alloc = module->getDevice()->getDriverHandle()->getDriverSystemMemoryAllocation(requestedAddress,
                                                                                                             1u,
                                                                                                             module->getDevice()->getRootDeviceIndex(),
                                                                                                             &gpuAddress);
    DeviceImp *device = static_cast<DeviceImp *>(this->module->getDevice());
    DriverHandleImp *driverHandle = static_cast<DriverHandleImp *>(device->getDriverHandle());
    auto allocData = driverHandle->getSvmAllocsManager()->getSVMAlloc(requestedAddress);
    if (driverHandle->isRemoteResourceNeeded(requestedAddress, alloc, allocData, device)) {
        if (allocData == nullptr) {
            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
        }

        uint64_t pbase = allocData->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress();
        uint64_t offset = reinterpret_cast<uint64_t>(requestedAddress) - pbase;

        alloc = driverHandle->getPeerAllocation(device, allocData, reinterpret_cast<void *>(pbase), &gpuAddress);
        if (alloc == nullptr) {
            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
        }
        gpuAddress += offset;
    }

    return setArgBufferWithAlloc(argIndex, gpuAddress, alloc);
}

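// Binds an image argument: copies its surface state into the SSH (or a
// bindless slot) and patches the image metadata (dimensions, channel layout,
// flat-surface parameters) into cross-thread data.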
ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (argVal == nullptr) {
        residencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

    const auto &hwInfo = module->getDevice()->getNEODevice()->getHardwareInfo();
    auto isMediaBlockImage = (hwInfo.capabilityTable.supportsMediaBlock &&
                              kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].getExtendedTypeInfo().isMediaBlockImage);
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescImage>();
    const auto image = Image::fromHandle(*static_cast<const ze_image_handle_t *>(argVal));

    if (kernelImmData->getDescriptor().kernelAttributes.imageAddressingMode == NEO::KernelDescriptor::Bindless) {
        image->copySurfaceStateToSSH(patchBindlessSurfaceState(image->getAllocation(), arg.bindless), 0u, isMediaBlockImage);
    } else {
        image->copySurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful, isMediaBlockImage);
    }

    residencyContainer[argIndex] = image->getAllocation();

    auto imageInfo = image->getImageInfo();
    auto clChannelType = getClChannelDataType(image->getImageDesc().format);
    auto clChannelOrder = getClChannelOrder(image->getImageDesc().format);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.imgWidth, imageInfo.imgDesc.imageWidth);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.imgHeight, imageInfo.imgDesc.imageHeight);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.imgDepth, imageInfo.imgDesc.imageDepth);
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.numSamples, imageInfo.imgDesc.numSamples);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.arraySize, imageInfo.imgDesc.imageArraySize);
    NEO::patchNonPointer<cl_channel_type>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.channelDataType, clChannelType);
    NEO::patchNonPointer<cl_channel_order>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.channelOrder, clChannelOrder);
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.numMipLevels, imageInfo.imgDesc.numMipLevels);

    auto pixelSize = imageInfo.surfaceFormat->ImageElementSizeInBytes;
    NEO::patchNonPointer<uint64_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatBaseOffset, image->getAllocation()->getGpuAddress());
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatWidth, (imageInfo.imgDesc.imageWidth * pixelSize) - 1u);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatHeight, (imageInfo.imgDesc.imageHeight * pixelSize) - 1u);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatPitch, imageInfo.imgDesc.imageRowPitch - 1u);

    return ZE_RESULT_SUCCESS;
}

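// Binds a sampler argument: copies the sampler state into the DSH and patches
// the sampler metadata (snap workaround, addressing mode, normalized coords)
// into cross-thread data.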
ze_result_t KernelImp::setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescSampler>();
    const auto sampler = Sampler::fromHandle(*static_cast<const ze_sampler_handle_t *>(argVal));
    sampler->copySamplerStateToDSH(dynamicStateHeapData.get(), dynamicStateHeapDataSize, arg.bindful);

    auto samplerDesc = sampler->getSamplerDesc();

    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.samplerSnapWa, (samplerDesc.addressMode == ZE_SAMPLER_ADDRESS_MODE_CLAMP && samplerDesc.filterMode == ZE_SAMPLER_FILTER_MODE_NEAREST) ? std::numeric_limits<uint32_t>::max() : 0u);
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.samplerAddressingMode, static_cast<uint32_t>(getAddrMode(samplerDesc.addressMode)));
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.samplerNormalizedCoords, samplerDesc.isNormalized ? static_cast<uint32_t>(SamplerPatchValues::NormalizedCoordsTrue) : static_cast<uint32_t>(SamplerPatchValues::NormalizedCoordsFalse));

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getKernelName(size_t *pSize, char *pName) {
    size_t kernelNameSize = this->kernelImmData->getDescriptor().kernelMetadata.kernelName.size() + 1;
    if (0 == *pSize || nullptr == pName) {
        *pSize = kernelNameSize;
        return ZE_RESULT_SUCCESS;
    }

    *pSize = std::min(*pSize, kernelNameSize);
    strncpy_s(pName, *pSize,
              this->kernelImmData->getDescriptor().kernelMetadata.kernelName.c_str(), kernelNameSize);

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties) {
    const auto &kernelDescriptor = this->kernelImmData->getDescriptor();
    pKernelProperties->numKernelArgs = static_cast<uint32_t>(kernelDescriptor.payloadMappings.explicitArgs.size());
    pKernelProperties->requiredGroupSizeX = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
    pKernelProperties->requiredGroupSizeY = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
    pKernelProperties->requiredGroupSizeZ = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
    pKernelProperties->requiredNumSubGroups = kernelDescriptor.kernelMetadata.compiledSubGroupsNumber;
    pKernelProperties->requiredSubgroupSize = kernelDescriptor.kernelMetadata.requiredSubGroupSize;
    pKernelProperties->maxSubgroupSize = kernelDescriptor.kernelAttributes.simdSize;
    pKernelProperties->localMemSize = kernelDescriptor.kernelAttributes.slmInlineSize;
    pKernelProperties->privateMemSize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
    pKernelProperties->spillMemSize = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
    memset(pKernelProperties->uuid.kid, 0, ZE_MAX_KERNEL_UUID_SIZE);
    memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);

    uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getDevice()->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
    pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;

    void *pNext = pKernelProperties->pNext;
    while (pNext) {
        ze_base_desc_t *extendedProperties = reinterpret_cast<ze_base_desc_t *>(pNext);
        if (extendedProperties->stype == ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES) {
            ze_kernel_preferred_group_size_properties_t *preferredGroupSizeProperties =
                reinterpret_cast<ze_kernel_preferred_group_size_properties_t *>(extendedProperties);

            preferredGroupSizeProperties->preferredMultiple = this->kernelImmData->getKernelInfo()->getMaxSimdSize();
            auto &hwHelper = NEO::HwHelper::get(this->module->getDevice()->getHwInfo().platform.eRenderCoreFamily);
            if (hwHelper.isFusedEuDispatchEnabled(this->module->getDevice()->getHwInfo())) {
                preferredGroupSizeProperties->preferredMultiple *= 2;
            }
        }

        pNext = const_cast<void *>(extendedProperties->pNext);
    }

    return ZE_RESULT_SUCCESS;
}

NEO::GraphicsAllocation *KernelImp::allocatePrivateMemoryGraphicsAllocation() {
    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
    auto neoDevice = module->getDevice()->getNEODevice();

    auto privateSurfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(kernelAttributes.perHwThreadPrivateMemorySize,
                                                                       neoDevice->getDeviceInfo().computeUnitsUsedForScratch);

    UNRECOVERABLE_IF(privateSurfaceSize == 0);
    auto privateMemoryGraphicsAllocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
        {neoDevice->getRootDeviceIndex(), privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});

    UNRECOVERABLE_IF(privateMemoryGraphicsAllocation == nullptr);
    return privateMemoryGraphicsAllocation;
}

void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) {
    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
    auto device = module->getDevice();

    ArrayRef<uint8_t> crossThreadDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);

    patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                             static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                             *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics);
}

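// Per-instance kernel setup: resolves the immutable data by kernel name,
// copies the heap templates, registers the per-argument handlers, applies the
// default group size and allocates auxiliary resources (private memory,
// printf buffer, implicit args, ray-tracing buffers).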
ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
    this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
    if (this->kernelImmData == nullptr) {
        return ZE_RESULT_ERROR_INVALID_KERNEL_NAME;
    }

    auto isaAllocation = this->kernelImmData->getIsaGraphicsAllocation();

    auto neoDevice = module->getDevice()->getNEODevice();
    auto &hwInfo = neoDevice->getHardwareInfo();
    auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    auto &kernelDescriptor = kernelImmData->getDescriptor();

    this->schedulingHintExpFlag = hwHelper.getDefaultThreadArbitrationPolicy();
    UNRECOVERABLE_IF(!this->kernelImmData->getKernelInfo()->heapInfo.pKernelHeap);

    if (isaAllocation->getAllocationType() == NEO::GraphicsAllocation::AllocationType::KERNEL_ISA_INTERNAL) {
        NEO::MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *isaAllocation),
                                                              *neoDevice,
                                                              isaAllocation,
                                                              0,
                                                              this->kernelImmData->getKernelInfo()->heapInfo.pKernelHeap,
                                                              static_cast<size_t>(this->kernelImmData->getKernelInfo()->heapInfo.KernelHeapSize));
    }

    for (const auto &argT : kernelDescriptor.payloadMappings.explicitArgs) {
        switch (argT.type) {
        default:
            this->kernelArgHandlers.push_back(&KernelImp::setArgUnknown);
            break;
        case NEO::ArgDescriptor::ArgTPointer:
            this->kernelArgHandlers.push_back(&KernelImp::setArgBuffer);
            break;
        case NEO::ArgDescriptor::ArgTImage:
            this->kernelArgHandlers.push_back(&KernelImp::setArgImage);
            break;
        case NEO::ArgDescriptor::ArgTSampler:
            this->kernelArgHandlers.push_back(&KernelImp::setArgSampler);
            break;
        case NEO::ArgDescriptor::ArgTValue:
            this->kernelArgHandlers.push_back(&KernelImp::setArgImmediate);
            break;
        }
    }

    slmArgSizes.resize(this->kernelArgHandlers.size(), 0);

    isArgUncached.resize(this->kernelArgHandlers.size(), 0);

    if (kernelImmData->getSurfaceStateHeapSize() > 0) {
        this->surfaceStateHeapData.reset(new uint8_t[kernelImmData->getSurfaceStateHeapSize()]);
        memcpy_s(this->surfaceStateHeapData.get(),
                 kernelImmData->getSurfaceStateHeapSize(),
                 kernelImmData->getSurfaceStateHeapTemplate(),
                 kernelImmData->getSurfaceStateHeapSize());
        this->surfaceStateHeapDataSize = kernelImmData->getSurfaceStateHeapSize();
    }

    if (kernelDescriptor.kernelAttributes.crossThreadDataSize != 0) {
        this->crossThreadData.reset(new uint8_t[kernelDescriptor.kernelAttributes.crossThreadDataSize]);
        memcpy_s(this->crossThreadData.get(),
                 kernelDescriptor.kernelAttributes.crossThreadDataSize,
                 kernelImmData->getCrossThreadDataTemplate(),
                 kernelDescriptor.kernelAttributes.crossThreadDataSize);
        this->crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;
    }

    if (kernelImmData->getDynamicStateHeapDataSize() != 0) {
        this->dynamicStateHeapData.reset(new uint8_t[kernelImmData->getDynamicStateHeapDataSize()]);
        memcpy_s(this->dynamicStateHeapData.get(),
                 kernelImmData->getDynamicStateHeapDataSize(),
                 kernelImmData->getDynamicStateHeapTemplate(),
                 kernelImmData->getDynamicStateHeapDataSize());
        this->dynamicStateHeapDataSize = kernelImmData->getDynamicStateHeapDataSize();
    }

    if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] > 0) {
        auto *reqdSize = kernelDescriptor.kernelAttributes.requiredWorkgroupSize;
        UNRECOVERABLE_IF(reqdSize[1] == 0);
        UNRECOVERABLE_IF(reqdSize[2] == 0);
        auto result = setGroupSize(reqdSize[0], reqdSize[1], reqdSize[2]);
        if (result != ZE_RESULT_SUCCESS) {
            return result;
        }
    } else {
        auto result = setGroupSize(kernelDescriptor.kernelAttributes.simdSize, 1, 1);
        if (result != ZE_RESULT_SUCCESS) {
            return result;
        }
    }

    residencyContainer.resize(this->kernelArgHandlers.size(), nullptr);

    auto &kernelAttributes = kernelDescriptor.kernelAttributes;
    if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
        this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
        this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
        this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
    }
    if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
        pImplicitArgs = std::make_unique<NEO::ImplicitArgs>();
        *pImplicitArgs = {};
        pImplicitArgs->structSize = sizeof(NEO::ImplicitArgs);
        pImplicitArgs->structVersion = 0;
        pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize;
    }

    this->createPrintfBuffer();

    this->setDebugSurface();

    residencyContainer.insert(residencyContainer.end(), kernelImmData->getResidencyContainer().begin(),
                              kernelImmData->getResidencyContainer().end());

    kernelHasIndirectAccess = kernelDescriptor.kernelAttributes.hasNonKernelArgLoad ||
                              kernelDescriptor.kernelAttributes.hasNonKernelArgStore ||
                              kernelDescriptor.kernelAttributes.hasNonKernelArgAtomic;

    if (this->usesRayTracing()) {
        if (this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.rtDispatchGlobals.pointerSize > 0) {
            uint32_t bvhLevels = NEO::RayTracingHelper::maxBvhLevels;
            neoDevice->initializeRayTracing(bvhLevels);
            auto rtDispatchGlobals = neoDevice->getRTDispatchGlobals(bvhLevels);
            if (rtDispatchGlobals == nullptr) {
                return ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY;
            }
            this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer());
            this->residencyContainer.push_back(rtDispatchGlobals);

            NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                              this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.rtDispatchGlobals,
                              static_cast<uintptr_t>(rtDispatchGlobals->getGpuAddressToPatch()));
        } else {
            neoDevice->initializeRayTracing(0);
            this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer());
        }
    }

    return ZE_RESULT_SUCCESS;
}

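// Allocates the printf output buffer when the kernel prints (or uses implicit
// args) and patches its GPU address into the payload.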
void KernelImp::createPrintfBuffer() {
    if (this->kernelImmData->getDescriptor().kernelAttributes.flags.usesPrintf || pImplicitArgs) {
        this->printfBuffer = PrintfHandler::createPrintfBuffer(this->module->getDevice());
        this->residencyContainer.push_back(printfBuffer);
        if (this->kernelImmData->getDescriptor().kernelAttributes.flags.usesPrintf) {
            NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                              this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.printfSurfaceAddress,
                              static_cast<uintptr_t>(this->printfBuffer->getGpuAddressToPatch()));
        }
        if (pImplicitArgs) {
            pImplicitArgs->printfBufferPtr = printfBuffer->getGpuAddress();
        }
    }
}

void KernelImp::printPrintfOutput() {
    PrintfHandler::printOutput(kernelImmData, this->printfBuffer, module->getDevice());
}

bool KernelImp::usesSyncBuffer() {
    return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer;
}

void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
    this->residencyContainer.push_back(gfxAllocation);
    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                      this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress,
                      static_cast<uintptr_t>(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)));
}

void KernelImp::setDebugSurface() {
    auto device = module->getDevice();
    if (module->isDebugEnabled() && device->getNEODevice()->getDebugger()) {

        auto surfaceStateHeapRef = ArrayRef<uint8_t>(surfaceStateHeapData.get(), surfaceStateHeapDataSize);

        patchWithImplicitSurface(ArrayRef<uint8_t>(), surfaceStateHeapRef,
                                 0,
                                 *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress,
                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics);
    }
}
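
// Allocates a surface-state slot in the global bindless heap for the given
// allocation, patches its descriptor offset into cross-thread data and returns
// the CPU pointer to the slot so the caller can encode the surface state.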
void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
    auto &hwHelper = NEO::HwHelper::get(this->module->getDevice()->getHwInfo().platform.eRenderCoreFamily);
    auto surfaceStateSize = hwHelper.getRenderSurfaceStateSize();
    NEO::BindlessHeapsHelper *bindlessHeapsHelper = this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper();
    auto ssInHeap = bindlessHeapsHelper->allocateSSInHeap(surfaceStateSize, alloc, NEO::BindlessHeapsHelper::GLOBAL_SSH);
    this->residencyContainer.push_back(ssInHeap.heapAllocation);
    auto patchLocation = ptrOffset(getCrossThreadData(), bindless);
    auto patchValue = hwHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(ssInHeap.surfaceStateOffset));
    patchWithRequiredSize(const_cast<uint8_t *>(patchLocation), sizeof(patchValue), patchValue);
    return ssInHeap.ssPtr;
}

void KernelImp::patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint32_t z) {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    uint32_t workgroupSize[3] = {x, y, z};
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize, workgroupSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize2, workgroupSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.enqueuedLocalWorkSize, workgroupSize);
    if (pImplicitArgs) {
        pImplicitArgs->localSizeX = x;
        pImplicitArgs->localSizeY = y;
        pImplicitArgs->localSizeZ = z;
    }
}

ze_result_t KernelImp::setGlobalOffsetExp(uint32_t offsetX,
                                          uint32_t offsetY,
                                          uint32_t offsetZ) {
    this->globalOffsets[0] = offsetX;
    this->globalOffsets[1] = offsetY;
    this->globalOffsets[2] = offsetZ;

    return ZE_RESULT_SUCCESS;
}

void KernelImp::patchGlobalOffset() {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets);
    if (pImplicitArgs) {
        pImplicitArgs->globalOffsetX = globalOffsets[0];
        pImplicitArgs->globalOffsetY = globalOffsets[1];
        pImplicitArgs->globalOffsetZ = globalOffsets[2];
    }
}

Kernel *Kernel::create(uint32_t productFamily, Module *module,
                       const ze_kernel_desc_t *desc, ze_result_t *res) {
    UNRECOVERABLE_IF(productFamily >= IGFX_MAX_PRODUCT);
    KernelAllocatorFn allocator = kernelFactory[productFamily];
    auto kernel = static_cast<KernelImp *>(allocator(module));
    *res = kernel->initialize(desc);
    if (*res) {
        kernel->destroy();
        return nullptr;
    }
    return kernel;
}

bool KernelImp::hasIndirectAllocationsAllowed() const {
    return (unifiedMemoryControls.indirectDeviceAllocationsAllowed ||
            unifiedMemoryControls.indirectHostAllocationsAllowed ||
            unifiedMemoryControls.indirectSharedAllocationsAllowed);
}

uint32_t KernelImp::getSlmTotalSize() const {
    return slmArgsTotalSize + getImmutableData()->getDescriptor().kernelAttributes.slmInlineSize;
}

ze_result_t KernelImp::setCacheConfig(ze_cache_config_flags_t flags) {
    cacheConfigFlags = flags;
    return ZE_RESULT_SUCCESS;
}

NEO::GraphicsAllocation *KernelImp::getIsaAllocation() const {
    return getImmutableData()->getIsaGraphicsAllocation();
}

ze_result_t KernelImp::setSchedulingHintExp(ze_scheduling_hint_exp_desc_t *pHint) {
    this->schedulingHintExpFlag = pHint->flags;
    return ZE_RESULT_SUCCESS;
}

uint32_t KernelImp::getSchedulingHintExp() {
    if (NEO::DebugManager.flags.OverrideThreadArbitrationPolicy.get() != -1) {
        this->schedulingHintExpFlag = static_cast<uint32_t>(NEO::DebugManager.flags.OverrideThreadArbitrationPolicy.get());
    }
    return this->schedulingHintExpFlag;
}

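// Size of the implicit-args structure plus the cache-line-aligned local-ID
// payload that patchImplicitArgs() emits in front of it.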
uint32_t KernelImp::getSizeForImplicitArgsPatching() const {
    if (!pImplicitArgs) {
        return 0;
    }
    auto implicitArgsSize = static_cast<uint32_t>(sizeof(NEO::ImplicitArgs));
    const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
    auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
    Vec3<size_t> groupSize{this->groupSize[0], this->groupSize[1], this->groupSize[2]};
    auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
    uint32_t localIdsSizeNeeded =
        alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
                    kernelDescriptor.kernelAttributes.simdSize, grfSize, 3u, itemsInGroup)),
                MemoryConstants::cacheLineSize);
    return implicitArgsSize + localIdsSizeNeeded;
}

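// Writes the runtime-generated local IDs followed by the ImplicitArgs
// structure into the destination buffer; pOut is advanced past everything
// written.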
void KernelImp::patchImplicitArgs(void *&pOut) const {
    if (!pImplicitArgs) {
        return;
    }
    const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
    auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
    NEO::generateLocalIDs(
        pOut,
        static_cast<uint16_t>(kernelDescriptor.kernelAttributes.simdSize),
        std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSize[0]),
                                 static_cast<uint16_t>(groupSize[1]),
                                 static_cast<uint16_t>(groupSize[2])}},
        std::array<uint8_t, 3>{{
            kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
            kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
            kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2],
        }},
        false, grfSize);
    auto sizeForLocalIdsProgramming = getSizeForImplicitArgsPatching() - sizeof(NEO::ImplicitArgs);
    pOut = ptrOffset(pOut, sizeForLocalIdsProgramming);
    memcpy_s(pOut, sizeof(NEO::ImplicitArgs), pImplicitArgs.get(), sizeof(NEO::ImplicitArgs));
    pOut = ptrOffset(pOut, sizeof(NEO::ImplicitArgs));
}
} // namespace L0