/*
 * Copyright (C) 2020-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "level_zero/core/source/kernel/kernel_imp.h"

#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/local_work_size.h"
#include "shared/source/helpers/per_thread_data.h"
#include "shared/source/helpers/ray_tracing_helper.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/helpers/string.h"
#include "shared/source/helpers/surface_format_info.h"
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/program/kernel_info.h"
#include "shared/source/utilities/arrayref.h"

#include "level_zero/core/source/debugger/debugger_l0.h"
#include "level_zero/core/source/device/device.h"
#include "level_zero/core/source/device/device_imp.h"
#include "level_zero/core/source/driver/driver_handle_imp.h"
#include "level_zero/core/source/image/image.h"
#include "level_zero/core/source/image/image_format_desc_helper.h"
#include "level_zero/core/source/module/module.h"
#include "level_zero/core/source/module/module_imp.h"
#include "level_zero/core/source/printf_handler/printf_handler.h"
#include "level_zero/core/source/sampler/sampler.h"

#include <memory>

namespace L0 {
enum class SamplerPatchValues : uint32_t {
    DefaultSampler = 0x00,
    AddressNone = 0x00,
    AddressClamp = 0x01,
    AddressClampToEdge = 0x02,
    AddressRepeat = 0x03,
    AddressMirroredRepeat = 0x04,
    AddressMirroredRepeat101 = 0x05,
    NormalizedCoordsFalse = 0x00,
    NormalizedCoordsTrue = 0x08
};

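// Maps a Level Zero sampler addressing mode to the corresponding patch value.
// Note the mapping as written: ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER patches
// AddressClampToEdge and ZE_SAMPLER_ADDRESS_MODE_CLAMP patches AddressClamp;
// unrecognized modes trigger a debug break and fall back to AddressNone.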
inline SamplerPatchValues getAddrMode(ze_sampler_address_mode_t addressingMode) {
    switch (addressingMode) {
    case ZE_SAMPLER_ADDRESS_MODE_REPEAT:
        return SamplerPatchValues::AddressRepeat;
    case ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER:
        return SamplerPatchValues::AddressClampToEdge;
    case ZE_SAMPLER_ADDRESS_MODE_CLAMP:
        return SamplerPatchValues::AddressClamp;
    case ZE_SAMPLER_ADDRESS_MODE_NONE:
        return SamplerPatchValues::AddressNone;
    case ZE_SAMPLER_ADDRESS_MODE_MIRROR:
        return SamplerPatchValues::AddressMirroredRepeat;
    default:
        DEBUG_BREAK_IF(true);
    }
    return SamplerPatchValues::AddressNone;
}

KernelImmutableData::KernelImmutableData(L0::Device *l0device) : device(l0device) {}

KernelImmutableData::~KernelImmutableData() {
    if (nullptr != isaGraphicsAllocation) {
        this->getDevice()->getNEODevice()->getMemoryManager()->freeGraphicsMemory(&*isaGraphicsAllocation);
        isaGraphicsAllocation.release();
    }
    crossThreadDataTemplate.reset();
    surfaceStateHeapTemplate.reset();
    dynamicStateHeapTemplate.reset();
}

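// Patches an implicit buffer (e.g., the global constants/variables surface or
// private memory) into kernel state: writes the GPU address into cross-thread
// data when a payload slot exists, and, for bindful arguments, encodes a buffer
// surface state at the given offset in the surface state heap.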
inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                     uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) {
    if (false == crossThreadData.empty()) {
        NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
    }

    if ((false == surfaceStateHeap.empty()) && (NEO::isValidOffset(ptr.bindful))) {
        auto surfaceState = surfaceStateHeap.begin() + ptr.bindful;
        auto addressToPatch = allocation.getGpuAddress();
        size_t sizeToPatch = allocation.getUnderlyingBufferSize();

        auto &hwInfo = device.getHardwareInfo();
        auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);

        NEO::EncodeSurfaceStateArgs args;
        args.outMemory = surfaceState;
        args.size = sizeToPatch;
        args.graphicsAddress = addressToPatch;
        args.gmmHelper = device.getGmmHelper();
        args.allocation = &allocation;
        args.useGlobalAtomics = useGlobalAtomics;
        args.numAvailableDevices = device.getNumGenericSubDevices();
        args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
        args.mocs = hwHelper.getMocsIndex(*args.gmmHelper, true, false) << 1;

        hwHelper.encodeBufferSurfaceState(args);
    }
}

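// One-time, per-device setup of the kernel's immutable state: allocates the ISA
// graphics allocation, registers debug data when a debugger is attached, copies
// the cross-thread data, surface state heap and dynamic state heap templates out
// of the kernel info, and patches the implicit global-constants/global-variables
// surfaces.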
void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device,
                                     uint32_t computeUnitsUsedForScratch,
                                     NEO::GraphicsAllocation *globalConstBuffer,
                                     NEO::GraphicsAllocation *globalVarBuffer, bool internalKernel) {

    UNRECOVERABLE_IF(kernelInfo == nullptr);
    this->kernelInfo = kernelInfo;
    this->kernelDescriptor = &kernelInfo->kernelDescriptor;

    DeviceImp *deviceImp = static_cast<DeviceImp *>(device);
    auto neoDevice = deviceImp->getActiveDevice();
    auto memoryManager = neoDevice->getMemoryManager();

    auto kernelIsaSize = kernelInfo->heapInfo.KernelHeapSize;
    UNRECOVERABLE_IF(kernelIsaSize == 0);
    UNRECOVERABLE_IF(!kernelInfo->heapInfo.pKernelHeap);
    const auto allocType = internalKernel ? NEO::GraphicsAllocation::AllocationType::KERNEL_ISA_INTERNAL : NEO::GraphicsAllocation::AllocationType::KERNEL_ISA;

    auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(
        {neoDevice->getRootDeviceIndex(), kernelIsaSize, allocType, neoDevice->getDeviceBitfield()});
    UNRECOVERABLE_IF(allocation == nullptr);

    isaGraphicsAllocation.reset(allocation);

    if (neoDevice->getDebugger() && kernelInfo->kernelDescriptor.external.debugData.get()) {
        createRelocatedDebugData(globalConstBuffer, globalVarBuffer);
        if (device->getL0Debugger()) {
            device->getL0Debugger()->registerElf(kernelInfo->kernelDescriptor.external.debugData.get(), allocation);
        }
    }

    this->crossThreadDataSize = this->kernelDescriptor->kernelAttributes.crossThreadDataSize;

    ArrayRef<uint8_t> crossThreadDataArrayRef;
    if (crossThreadDataSize != 0) {
        crossThreadDataTemplate.reset(new uint8_t[crossThreadDataSize]);

        if (kernelInfo->crossThreadData) {
            memcpy_s(crossThreadDataTemplate.get(), crossThreadDataSize,
                     kernelInfo->crossThreadData, crossThreadDataSize);
        } else {
            memset(crossThreadDataTemplate.get(), 0x00, crossThreadDataSize);
        }

        crossThreadDataArrayRef = ArrayRef<uint8_t>(this->crossThreadDataTemplate.get(), this->crossThreadDataSize);

        NEO::patchNonPointer<uint32_t>(crossThreadDataArrayRef,
                                       kernelDescriptor->payloadMappings.implicitArgs.simdSize, kernelDescriptor->kernelAttributes.simdSize);
    }

    if (kernelInfo->heapInfo.SurfaceStateHeapSize != 0) {
        this->surfaceStateHeapSize = kernelInfo->heapInfo.SurfaceStateHeapSize;
        surfaceStateHeapTemplate.reset(new uint8_t[surfaceStateHeapSize]);

        memcpy_s(surfaceStateHeapTemplate.get(), surfaceStateHeapSize,
                 kernelInfo->heapInfo.pSsh, surfaceStateHeapSize);
    }

    if (kernelInfo->heapInfo.DynamicStateHeapSize != 0) {
        this->dynamicStateHeapSize = kernelInfo->heapInfo.DynamicStateHeapSize;
        dynamicStateHeapTemplate.reset(new uint8_t[dynamicStateHeapSize]);

        memcpy_s(dynamicStateHeapTemplate.get(), dynamicStateHeapSize,
                 kernelInfo->heapInfo.pDsh, dynamicStateHeapSize);
    }

    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(surfaceStateHeapTemplate.get(), getSurfaceStateHeapSize());

    if (NEO::isValidOffset(kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
        UNRECOVERABLE_IF(nullptr == globalConstBuffer);

        patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                                 static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
                                 *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
        this->residencyContainer.push_back(globalConstBuffer);
    } else if (nullptr != globalConstBuffer) {
        this->residencyContainer.push_back(globalConstBuffer);
    }

    if (NEO::isValidOffset(kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
        UNRECOVERABLE_IF(globalVarBuffer == nullptr);

        patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                                 static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
                                 *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
        this->residencyContainer.push_back(globalVarBuffer);
    } else if (nullptr != globalVarBuffer) {
        this->residencyContainer.push_back(globalVarBuffer);
    }
}

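// Decodes the kernel's debug-data ELF and, when it contains debug-info
// relocations, produces a relocated copy whose text/global/const segment
// addresses match the actual GPU allocations, so that debug info resolves
// against the addresses the kernel really runs at.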
void KernelImmutableData::createRelocatedDebugData(NEO::GraphicsAllocation *globalConstBuffer,
                                                   NEO::GraphicsAllocation *globalVarBuffer) {
    NEO::Linker::SegmentInfo globalData;
    NEO::Linker::SegmentInfo constData;
    if (globalVarBuffer) {
        globalData.gpuAddress = globalVarBuffer->getGpuAddress();
        globalData.segmentSize = globalVarBuffer->getUnderlyingBufferSize();
    }
    if (globalConstBuffer) {
        constData.gpuAddress = globalConstBuffer->getGpuAddress();
        constData.segmentSize = globalConstBuffer->getUnderlyingBufferSize();
    }

    if (kernelInfo->kernelDescriptor.external.debugData.get()) {
        std::string outErrReason;
        std::string outWarning;
        auto decodedElf = NEO::Elf::decodeElf<NEO::Elf::EI_CLASS_64>(ArrayRef<const uint8_t>(reinterpret_cast<const uint8_t *>(kernelInfo->kernelDescriptor.external.debugData->vIsa),
                                                                                             kernelInfo->kernelDescriptor.external.debugData->vIsaSize),
                                                                     outErrReason, outWarning);

        if (decodedElf.getDebugInfoRelocations().size() > 1) {
            auto size = kernelInfo->kernelDescriptor.external.debugData->vIsaSize;
            kernelInfo->kernelDescriptor.external.relocatedDebugData = std::make_unique<uint8_t[]>(size);

            memcpy_s(kernelInfo->kernelDescriptor.external.relocatedDebugData.get(), size, kernelInfo->kernelDescriptor.external.debugData->vIsa, kernelInfo->kernelDescriptor.external.debugData->vIsaSize);

            NEO::Linker::SegmentInfo textSegment = {getIsaGraphicsAllocation()->getGpuAddress(),
                                                    getIsaGraphicsAllocation()->getUnderlyingBufferSize()};

            NEO::Linker::applyDebugDataRelocations(decodedElf, ArrayRef<uint8_t>(kernelInfo->kernelDescriptor.external.relocatedDebugData.get(), size),
                                                   textSegment, globalData, constData);
        }
    }
}

uint32_t KernelImmutableData::getIsaSize() const {
    return static_cast<uint32_t>(isaGraphicsAllocation->getUnderlyingBufferSize());
}

KernelImp::KernelImp(Module *module) : module(module) {}

KernelImp::~KernelImp() {
    if (nullptr != privateMemoryGraphicsAllocation) {
        module->getDevice()->getNEODevice()->getMemoryManager()->freeGraphicsMemory(privateMemoryGraphicsAllocation);
    }

    if (perThreadDataForWholeThreadGroup != nullptr) {
        alignedFree(perThreadDataForWholeThreadGroup);
    }
    if (printfBuffer != nullptr) {
        // not allowed to call a virtual function in a destructor, so calling printOutput directly
        PrintfHandler::printOutput(kernelImmData, this->printfBuffer, module->getDevice());
        module->getDevice()->getNEODevice()->getMemoryManager()->freeGraphicsMemory(printfBuffer);
    }
    slmArgSizes.clear();
    crossThreadData.reset();
    surfaceStateHeapData.reset();
    dynamicStateHeapData.reset();
}

ze_result_t KernelImp::setArgumentValue(uint32_t argIndex, size_t argSize,
                                        const void *pArgValue) {
    if (argIndex >= kernelArgHandlers.size()) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }
    return (this->*kernelArgHandlers[argIndex])(argIndex, argSize, pArgValue);
}

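// Patches per-dispatch payload entries derived from the group count: global work
// size (groupCount * groupSize per dimension), number of work groups, and work
// dimensionality. workDim is the highest dimension with more than one work item;
// e.g. a dispatch of {4, 2, 1} groups with group size {8, 1, 1} yields workDim = 2.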
void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    uint32_t globalWorkSize[3] = {groupCountX * groupSize[0], groupCountY * groupSize[1],
                                  groupCountZ * groupSize[2]};
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkSize, globalWorkSize);

    uint32_t groupCount[3] = {groupCountX, groupCountY, groupCountZ};
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.numWorkGroups, groupCount);

    uint32_t workDim = 1;
    if (groupCountZ * groupSize[2] > 1) {
        workDim = 3;
    } else if (groupCountY * groupSize[1] > 1) {
        workDim = 2;
    }
    auto workDimOffset = desc.payloadMappings.dispatchTraits.workDim;
    if (NEO::isValidOffset(workDimOffset)) {
        auto destinationBuffer = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
        NEO::patchNonPointer(destinationBuffer, desc.payloadMappings.dispatchTraits.workDim, workDim);
    }

    if (pImplicitArgs) {
        pImplicitArgs->numWorkDim = workDim;

        pImplicitArgs->globalSizeX = globalWorkSize[0];
        pImplicitArgs->globalSizeY = globalWorkSize[1];
        pImplicitArgs->globalSizeZ = globalWorkSize[2];

        pImplicitArgs->groupCountX = groupCount[0];
        pImplicitArgs->groupCountY = groupCount[1];
        pImplicitArgs->groupCountZ = groupCount[2];
    }
}

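// Validates and applies a work-group size, then derives the per-thread-group
// execution parameters. numThreadsPerThreadGroup is the work-item count rounded
// up to whole SIMD threads; e.g. 20 work items at SIMD16 need ceil(20/16) = 2
// hardware threads, and the last, partial thread gets an execution mask of
// maxNBitValue(20 & 15) = 0xf. When the runtime must generate local IDs, the
// per-thread-group buffer is (re)allocated and filled here as well.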
ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
                                    uint32_t groupSizeZ) {
    if ((0 == groupSizeX) || (0 == groupSizeY) || (0 == groupSizeZ)) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    auto numChannels = kernelImmData->getDescriptor().kernelAttributes.numLocalIdChannels;
    Vec3<size_t> groupSize{groupSizeX, groupSizeY, groupSizeZ};
    auto itemsInGroup = Math::computeTotalElementsCount(groupSize);

    if (itemsInGroup > module->getMaxGroupSize()) {
        DEBUG_BREAK_IF(true);
        return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
    }

    this->groupSize[0] = groupSizeX;
    this->groupSize[1] = groupSizeY;
    this->groupSize[2] = groupSizeZ;
    const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
    for (uint32_t i = 0u; i < 3u; i++) {
        if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != 0 &&
            kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != this->groupSize[i]) {
            NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                                  "Invalid group size {%d, %d, %d} specified, requiredWorkGroupSize = {%d, %d, %d}\n",
                                  this->groupSize[0], this->groupSize[1], this->groupSize[2],
                                  kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0],
                                  kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1],
                                  kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2]);
            return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
        }
    }

    auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
    this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
    patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);

    auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
    threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes));
    if (!threadExecutionMask) {
        threadExecutionMask = static_cast<uint32_t>(maxNBitValue((simdSize == 1) ? 32 : simdSize));
    }
    evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);

    if (kernelRequiresGenerationOfLocalIdsByRuntime) {
        auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
        uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
            static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
                simdSize, grfSize, numChannels, itemsInGroup));
        if (perThreadDataSizeForWholeThreadGroupNeeded >
            perThreadDataSizeForWholeThreadGroupAllocated) {
            alignedFree(perThreadDataForWholeThreadGroup);
            perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
            perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
        }
        perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;

        if (numChannels > 0) {
            UNRECOVERABLE_IF(3 != numChannels);
            NEO::generateLocalIDs(
                perThreadDataForWholeThreadGroup,
                static_cast<uint16_t>(simdSize),
                std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
                                         static_cast<uint16_t>(groupSizeY),
                                         static_cast<uint16_t>(groupSizeZ)}},
                std::array<uint8_t, 3>{{0, 1, 2}},
                false, grfSize);
        }

        this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
    }
    return ZE_RESULT_SUCCESS;
}

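// Suggests a work-group size for the given global sizes. With the
// EnableComputeWorkSizeND debug flag the N-dimensional heuristic is used
// (taking SLM usage, image usage, and threads per subslice into account);
// otherwise the 1D/squared/2D helpers are chosen based on the dimensionality
// of the dispatch.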
ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY,
                                        uint32_t globalSizeZ, uint32_t *groupSizeX,
                                        uint32_t *groupSizeY, uint32_t *groupSizeZ) {
    size_t retGroupSize[3] = {};
    auto maxWorkGroupSize = module->getMaxGroupSize();
    auto simd = kernelImmData->getDescriptor().kernelAttributes.simdSize;
    size_t workItems[3] = {globalSizeX, globalSizeY, globalSizeZ};
    uint32_t dim = (globalSizeY > 1U) ? 2 : 1U;
    dim = (globalSizeZ > 1U) ? 3 : dim;

    if (NEO::DebugManager.flags.EnableComputeWorkSizeND.get()) {
        auto usesImages = getImmutableData()->getDescriptor().kernelAttributes.flags.usesImages;
        auto neoDevice = module->getDevice()->getNEODevice();
        const auto hwInfo = &neoDevice->getHardwareInfo();
        const auto &deviceInfo = neoDevice->getDeviceInfo();
        uint32_t numThreadsPerSubSlice = (uint32_t)deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU;
        uint32_t localMemSize = (uint32_t)deviceInfo.localMemSize;

        NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(),
                                 hwInfo, numThreadsPerSubSlice, localMemSize,
                                 usesImages, false);
        NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim);
    } else {
        if (1U == dim) {
            NEO::computeWorkgroupSize1D(maxWorkGroupSize, retGroupSize, workItems, simd);
        } else if (NEO::DebugManager.flags.EnableComputeWorkSizeSquared.get() && (2U == dim)) {
            NEO::computeWorkgroupSizeSquared(maxWorkGroupSize, retGroupSize, workItems, simd, dim);
        } else {
            NEO::computeWorkgroupSize2D(maxWorkGroupSize, retGroupSize, workItems, simd);
        }
    }

    *groupSizeX = static_cast<uint32_t>(retGroupSize[0]);
    *groupSizeY = static_cast<uint32_t>(retGroupSize[1]);
    *groupSizeZ = static_cast<uint32_t>(retGroupSize[2]);

    return ZE_RESULT_SUCCESS;
}

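// Computes the maximum number of work groups that can run concurrently for a
// cooperative dispatch, limited by available hardware threads, SLM capacity
// (capabilityTable.slmSize appears to be per-DSS in KB, hence the
// dssCount * KB scaling), and barrier registers, then lets the HW helper
// adjust the result for the target engine group.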
ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType,
                                                       bool isEngineInstanced) {
    UNRECOVERABLE_IF(0 == groupSize[0]);
    UNRECOVERABLE_IF(0 == groupSize[1]);
    UNRECOVERABLE_IF(0 == groupSize[2]);

    auto &hardwareInfo = module->getDevice()->getHwInfo();

    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
    }
    auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    auto &descriptor = kernelImmData->getDescriptor();
    auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
        hardwareInfo.platform.eProductFamily,
        descriptor.kernelAttributes.numGrfRequired,
        hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount);

    auto barrierCount = descriptor.kernelAttributes.barrierCount;
    const uint32_t workDim = 3;
    const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
    *totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(descriptor.kernelAttributes.simdSize,
                                                               availableThreadCount,
                                                               dssCount,
                                                               dssCount * KB * hardwareInfo.capabilityTable.slmSize,
                                                               hwHelper.alignSlmSize(slmArgsTotalSize + descriptor.kernelAttributes.slmInlineSize),
                                                               static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
                                                               hwHelper.getBarriersCountFromHasBarriers(barrierCount),
                                                               workDim,
                                                               localWorkSize);
    *totalGroupCount = hwHelper.adjustMaxWorkGroupCount(*totalGroupCount, engineGroupType, hardwareInfo, isEngineInstanced);
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setIndirectAccess(ze_kernel_indirect_access_flags_t flags) {
    if (NEO::DebugManager.flags.DisableIndirectAccess.get() == 1 || this->kernelHasIndirectAccess == false) {
        return ZE_RESULT_SUCCESS;
    }

    if (flags & ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE) {
        this->unifiedMemoryControls.indirectDeviceAllocationsAllowed = true;
    }
    if (flags & ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST) {
        this->unifiedMemoryControls.indirectHostAllocationsAllowed = true;
    }
    if (flags & ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED) {
        this->unifiedMemoryControls.indirectSharedAllocationsAllowed = true;
    }

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getIndirectAccess(ze_kernel_indirect_access_flags_t *flags) {
    *flags = 0;
    if (this->unifiedMemoryControls.indirectDeviceAllocationsAllowed) {
        *flags |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
    }
    if (this->unifiedMemoryControls.indirectHostAllocationsAllowed) {
        *flags |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
    }
    if (this->unifiedMemoryControls.indirectSharedAllocationsAllowed) {
        *flags |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
    }

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getSourceAttributes(uint32_t *pSize, char **pString) {
    auto &desc = kernelImmData->getDescriptor();
    if (pString == nullptr) {
        *pSize = (uint32_t)desc.kernelMetadata.kernelLanguageAttributes.length() + 1;
    } else {
        strncpy_s(*pString, desc.kernelMetadata.kernelLanguageAttributes.length() + 1,
                  desc.kernelMetadata.kernelLanguageAttributes.c_str(),
                  desc.kernelMetadata.kernelLanguageAttributes.length() + 1);
    }
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (kernelImmData->getDescriptor().payloadMappings.explicitArgs.size() <= argIndex) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex];

    for (const auto &element : arg.as<NEO::ArgDescValue>().elements) {
        if (element.sourceOffset < argSize) {
            size_t maxBytesToCopy = argSize - element.sourceOffset;
            size_t bytesToCopy = std::min(static_cast<size_t>(element.size), maxBytesToCopy);

            auto pDst = ptrOffset(crossThreadData.get(), element.offset);
            if (argVal) {
                auto pSrc = ptrOffset(argVal, element.sourceOffset);
                memcpy_s(pDst, element.size, pSrc, bytesToCopy);
            } else {
                uint64_t val = 0;
                memcpy_s(pDst, element.size,
                         reinterpret_cast<void *>(&val), bytesToCopy);
            }
        } else {
            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
        }
    }
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle_t argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescImage>();
    if (argVal == nullptr) {
        residencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

    const auto image = Image::fromHandle(argVal);
    image->copyRedescribedSurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful);
    residencyContainer[argIndex] = image->getAllocation();

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgBufferWithAlloc(uint32_t argIndex, uintptr_t argVal, NEO::GraphicsAllocation *allocation) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
    const auto val = argVal;

    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg, val);
    if (NEO::isValidOffset(arg.bindful) || NEO::isValidOffset(arg.bindless)) {
        setBufferSurfaceState(argIndex, reinterpret_cast<void *>(val), allocation);
    }

    auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(allocation->getGpuAddress()));
    if (allocData) {
        bool argWasUncacheable = isArgUncached[argIndex];
        bool argIsUncacheable = allocData->allocationFlagsProperty.flags.locallyUncachedResource;
        if (argWasUncacheable == false && argIsUncacheable) {
            kernelRequiresUncachedMocsCount++;
        } else if (argWasUncacheable && argIsUncacheable == false) {
            kernelRequiresUncachedMocsCount--;
        }
        this->setKernelArgUncached(argIndex, argIsUncacheable);
    }

    residencyContainer[argIndex] = allocation;

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgUnknown(uint32_t argIndex, size_t argSize, const void *argVal) {
    return ZE_RESULT_SUCCESS;
}

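// Sets a buffer argument. SLM (local) pointer arguments do not bind memory;
// instead the running SLM offset is recomputed: each subsequent local argument
// is aligned to its required SLM alignment, patched with the current offset,
// and the total is rounded up to a whole KB. For global/host pointers the
// backing allocation is resolved through the SVM manager, with a peer
// allocation substituted when the address belongs to another device.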
ze_result_t KernelImp::setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal) {
    const auto &allArgs = kernelImmData->getDescriptor().payloadMappings.explicitArgs;
    const auto &currArg = allArgs[argIndex];
    if (currArg.getTraits().getAddressQualifier() == NEO::KernelArgMetadata::AddrLocal) {
        slmArgSizes[argIndex] = static_cast<uint32_t>(argSize);
        UNRECOVERABLE_IF(NEO::isUndefinedOffset(currArg.as<NEO::ArgDescPointer>().slmOffset));
        auto slmOffset = *reinterpret_cast<uint32_t *>(crossThreadData.get() + currArg.as<NEO::ArgDescPointer>().slmOffset);
        slmOffset += static_cast<uint32_t>(argSize);
        ++argIndex;
        while (argIndex < kernelImmData->getDescriptor().payloadMappings.explicitArgs.size()) {
            if (allArgs[argIndex].getTraits().getAddressQualifier() != NEO::KernelArgMetadata::AddrLocal) {
                ++argIndex;
                continue;
            }
            const auto &nextArg = allArgs[argIndex].as<NEO::ArgDescPointer>();
            UNRECOVERABLE_IF(0 == nextArg.requiredSlmAlignment);
            slmOffset = alignUp<uint32_t>(slmOffset, nextArg.requiredSlmAlignment);
            NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), nextArg.slmOffset, slmOffset);

            slmOffset += static_cast<uint32_t>(slmArgSizes[argIndex]);
            ++argIndex;
        }
        slmArgsTotalSize = static_cast<uint32_t>(alignUp(slmOffset, KB));
        return ZE_RESULT_SUCCESS;
    }

    if (nullptr == argVal) {
        residencyContainer[argIndex] = nullptr;
        const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
        uintptr_t nullBufferValue = 0;
        NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg, nullBufferValue);
        return ZE_RESULT_SUCCESS;
    }

    auto requestedAddress = *reinterpret_cast<void *const *>(argVal);
    uintptr_t gpuAddress = 0u;
    NEO::GraphicsAllocation *alloc = module->getDevice()->getDriverHandle()->getDriverSystemMemoryAllocation(requestedAddress,
                                                                                                             1u,
                                                                                                             module->getDevice()->getRootDeviceIndex(),
                                                                                                             &gpuAddress);
    DeviceImp *device = static_cast<DeviceImp *>(this->module->getDevice());
    DriverHandleImp *driverHandle = static_cast<DriverHandleImp *>(device->getDriverHandle());
    auto allocData = driverHandle->getSvmAllocsManager()->getSVMAlloc(requestedAddress);
    if (driverHandle->isRemoteResourceNeeded(requestedAddress, alloc, allocData, device)) {
        if (allocData == nullptr) {
            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
        }

        uint64_t pbase = allocData->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress();
        uint64_t offset = (uint64_t)requestedAddress - pbase;

        alloc = driverHandle->getPeerAllocation(device, allocData, reinterpret_cast<void *>(pbase), &gpuAddress);
        if (alloc == nullptr) {
            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
        }
        gpuAddress += offset;
    }

    return setArgBufferWithAlloc(argIndex, gpuAddress, alloc);
}

ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (argVal == nullptr) {
        residencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

    const auto &hwInfo = module->getDevice()->getNEODevice()->getHardwareInfo();
    auto isMediaBlockImage = (hwInfo.capabilityTable.supportsMediaBlock &&
                              kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].getExtendedTypeInfo().isMediaBlockImage);
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescImage>();
    const auto image = Image::fromHandle(*static_cast<const ze_image_handle_t *>(argVal));

    if (kernelImmData->getDescriptor().kernelAttributes.imageAddressingMode == NEO::KernelDescriptor::Bindless) {
        image->copySurfaceStateToSSH(patchBindlessSurfaceState(image->getAllocation(), arg.bindless), 0u, isMediaBlockImage);
    } else {
        image->copySurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful, isMediaBlockImage);
    }

    residencyContainer[argIndex] = image->getAllocation();

    auto imageInfo = image->getImageInfo();
    auto clChannelType = getClChannelDataType(image->getImageDesc().format);
    auto clChannelOrder = getClChannelOrder(image->getImageDesc().format);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.imgWidth, imageInfo.imgDesc.imageWidth);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.imgHeight, imageInfo.imgDesc.imageHeight);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.imgDepth, imageInfo.imgDesc.imageDepth);
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.numSamples, imageInfo.imgDesc.numSamples);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.arraySize, imageInfo.imgDesc.imageArraySize);
    NEO::patchNonPointer<cl_channel_type>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.channelDataType, clChannelType);
    NEO::patchNonPointer<cl_channel_order>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.channelOrder, clChannelOrder);
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.numMipLevels, imageInfo.imgDesc.numMipLevels);

    auto pixelSize = imageInfo.surfaceFormat->ImageElementSizeInBytes;
    NEO::patchNonPointer<uint64_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatBaseOffset, image->getAllocation()->getGpuAddress());
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatWidth, (imageInfo.imgDesc.imageWidth * pixelSize) - 1u);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatHeight, (imageInfo.imgDesc.imageHeight * pixelSize) - 1u);
    NEO::patchNonPointer<size_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.flatPitch, imageInfo.imgDesc.imageRowPitch - 1u);

    return ZE_RESULT_SUCCESS;
}

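// Sets a sampler argument: copies the sampler state into the dynamic state heap
// and patches sampler metadata into cross-thread data, including the snap
// workaround flag (enabled for CLAMP addressing combined with NEAREST filtering)
// and the addressing-mode/normalized-coordinates patch values.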
ze_result_t KernelImp::setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescSampler>();
    const auto sampler = Sampler::fromHandle(*static_cast<const ze_sampler_handle_t *>(argVal));
    sampler->copySamplerStateToDSH(dynamicStateHeapData.get(), dynamicStateHeapDataSize, arg.bindful);

    auto samplerDesc = sampler->getSamplerDesc();

    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.samplerSnapWa, (samplerDesc.addressMode == ZE_SAMPLER_ADDRESS_MODE_CLAMP && samplerDesc.filterMode == ZE_SAMPLER_FILTER_MODE_NEAREST) ? std::numeric_limits<uint32_t>::max() : 0u);
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.samplerAddressingMode, static_cast<uint32_t>(getAddrMode(samplerDesc.addressMode)));
    NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.samplerNormalizedCoords, samplerDesc.isNormalized ? static_cast<uint32_t>(SamplerPatchValues::NormalizedCoordsTrue) : static_cast<uint32_t>(SamplerPatchValues::NormalizedCoordsFalse));

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getKernelName(size_t *pSize, char *pName) {
    size_t kernelNameSize = this->kernelImmData->getDescriptor().kernelMetadata.kernelName.size() + 1;
    if (0 == *pSize || nullptr == pName) {
        *pSize = kernelNameSize;
        return ZE_RESULT_SUCCESS;
    }

    *pSize = std::min(*pSize, kernelNameSize);
    strncpy_s(pName, *pSize,
              this->kernelImmData->getDescriptor().kernelMetadata.kernelName.c_str(), kernelNameSize);

    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties) {
    const auto &kernelDescriptor = this->kernelImmData->getDescriptor();
    pKernelProperties->numKernelArgs = static_cast<uint32_t>(kernelDescriptor.payloadMappings.explicitArgs.size());
    pKernelProperties->requiredGroupSizeX = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
    pKernelProperties->requiredGroupSizeY = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
    pKernelProperties->requiredGroupSizeZ = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
    pKernelProperties->requiredNumSubGroups = kernelDescriptor.kernelMetadata.compiledSubGroupsNumber;
    pKernelProperties->requiredSubgroupSize = kernelDescriptor.kernelMetadata.requiredSubGroupSize;
    pKernelProperties->maxSubgroupSize = kernelDescriptor.kernelAttributes.simdSize;
    pKernelProperties->localMemSize = kernelDescriptor.kernelAttributes.slmInlineSize;
    pKernelProperties->privateMemSize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
    pKernelProperties->spillMemSize = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
    memset(pKernelProperties->uuid.kid, 0, ZE_MAX_KERNEL_UUID_SIZE);
    memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);

    uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getDevice()->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
    pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;

    void *pNext = pKernelProperties->pNext;
    while (pNext) {
        ze_base_desc_t *extendedProperties = reinterpret_cast<ze_base_desc_t *>(pKernelProperties->pNext);
        if (extendedProperties->stype == ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES) {
            ze_kernel_preferred_group_size_properties_t *preferredGroupSizeProperties =
                reinterpret_cast<ze_kernel_preferred_group_size_properties_t *>(extendedProperties);

            preferredGroupSizeProperties->preferredMultiple = this->kernelImmData->getKernelInfo()->getMaxSimdSize();
            auto &hwHelper = NEO::HwHelper::get(this->module->getDevice()->getHwInfo().platform.eRenderCoreFamily);
            if (hwHelper.isFusedEuDispatchEnabled(this->module->getDevice()->getHwInfo())) {
                preferredGroupSizeProperties->preferredMultiple *= 2;
            }
        }

        pNext = const_cast<void *>(extendedProperties->pNext);
    }

    return ZE_RESULT_SUCCESS;
}

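// Allocates the private memory surface for this kernel instance. The size is
// derived from the per-HW-thread private memory requirement scaled by the
// number of compute units used for scratch.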
NEO::GraphicsAllocation *KernelImp::allocatePrivateMemoryGraphicsAllocation() {
    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
    auto neoDevice = module->getDevice()->getNEODevice();

    auto privateSurfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(kernelAttributes.perHwThreadPrivateMemorySize,
                                                                       neoDevice->getDeviceInfo().computeUnitsUsedForScratch);

    UNRECOVERABLE_IF(privateSurfaceSize == 0);
    auto privateMemoryGraphicsAllocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
        {neoDevice->getRootDeviceIndex(), privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});

    UNRECOVERABLE_IF(privateMemoryGraphicsAllocation == nullptr);
    return privateMemoryGraphicsAllocation;
}

void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) {
    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
    auto device = module->getDevice();

    ArrayRef<uint8_t> crossThreadDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);

    patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                             static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                             *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics);
}

ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
    this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
    if (this->kernelImmData == nullptr) {
        return ZE_RESULT_ERROR_INVALID_KERNEL_NAME;
    }

    auto isaAllocation = this->kernelImmData->getIsaGraphicsAllocation();

    auto neoDevice = module->getDevice()->getNEODevice();
    auto &hwInfo = neoDevice->getHardwareInfo();
    auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    auto &kernelDescriptor = kernelImmData->getDescriptor();

    this->schedulingHintExpFlag = hwHelper.getDefaultThreadArbitrationPolicy();
    UNRECOVERABLE_IF(!this->kernelImmData->getKernelInfo()->heapInfo.pKernelHeap);

    if (isaAllocation->getAllocationType() == NEO::GraphicsAllocation::AllocationType::KERNEL_ISA_INTERNAL) {
        NEO::MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *isaAllocation),
                                                              *neoDevice,
                                                              isaAllocation,
                                                              0,
                                                              this->kernelImmData->getKernelInfo()->heapInfo.pKernelHeap,
                                                              static_cast<size_t>(this->kernelImmData->getKernelInfo()->heapInfo.KernelHeapSize));
    }

    for (const auto &argT : kernelDescriptor.payloadMappings.explicitArgs) {
        switch (argT.type) {
        default:
            this->kernelArgHandlers.push_back(&KernelImp::setArgUnknown);
            break;
        case NEO::ArgDescriptor::ArgTPointer:
            this->kernelArgHandlers.push_back(&KernelImp::setArgBuffer);
            break;
        case NEO::ArgDescriptor::ArgTImage:
            this->kernelArgHandlers.push_back(&KernelImp::setArgImage);
            break;
        case NEO::ArgDescriptor::ArgTSampler:
            this->kernelArgHandlers.push_back(&KernelImp::setArgSampler);
            break;
        case NEO::ArgDescriptor::ArgTValue:
            this->kernelArgHandlers.push_back(&KernelImp::setArgImmediate);
            break;
        }
    }

    slmArgSizes.resize(this->kernelArgHandlers.size(), 0);

    isArgUncached.resize(this->kernelArgHandlers.size(), 0);

    if (kernelImmData->getSurfaceStateHeapSize() > 0) {
        this->surfaceStateHeapData.reset(new uint8_t[kernelImmData->getSurfaceStateHeapSize()]);
        memcpy_s(this->surfaceStateHeapData.get(),
                 kernelImmData->getSurfaceStateHeapSize(),
                 kernelImmData->getSurfaceStateHeapTemplate(),
                 kernelImmData->getSurfaceStateHeapSize());
        this->surfaceStateHeapDataSize = kernelImmData->getSurfaceStateHeapSize();
    }

    if (kernelDescriptor.kernelAttributes.crossThreadDataSize != 0) {
        this->crossThreadData.reset(new uint8_t[kernelDescriptor.kernelAttributes.crossThreadDataSize]);
        memcpy_s(this->crossThreadData.get(),
                 kernelDescriptor.kernelAttributes.crossThreadDataSize,
                 kernelImmData->getCrossThreadDataTemplate(),
                 kernelDescriptor.kernelAttributes.crossThreadDataSize);
        this->crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;
    }

    if (kernelImmData->getDynamicStateHeapDataSize() != 0) {
        this->dynamicStateHeapData.reset(new uint8_t[kernelImmData->getDynamicStateHeapDataSize()]);
        memcpy_s(this->dynamicStateHeapData.get(),
                 kernelImmData->getDynamicStateHeapDataSize(),
                 kernelImmData->getDynamicStateHeapTemplate(),
                 kernelImmData->getDynamicStateHeapDataSize());
        this->dynamicStateHeapDataSize = kernelImmData->getDynamicStateHeapDataSize();
    }

    if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] > 0) {
        auto *reqdSize = kernelDescriptor.kernelAttributes.requiredWorkgroupSize;
        UNRECOVERABLE_IF(reqdSize[1] == 0);
        UNRECOVERABLE_IF(reqdSize[2] == 0);
        auto result = setGroupSize(reqdSize[0], reqdSize[1], reqdSize[2]);
        if (result != ZE_RESULT_SUCCESS) {
            return result;
        }
    } else {
        auto result = setGroupSize(kernelDescriptor.kernelAttributes.simdSize, 1, 1);
        if (result != ZE_RESULT_SUCCESS) {
            return result;
        }
    }

    residencyContainer.resize(this->kernelArgHandlers.size(), nullptr);

    auto &kernelAttributes = kernelDescriptor.kernelAttributes;
    if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
        this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
        this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
        this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
    }
    if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
        pImplicitArgs = std::make_unique<NEO::ImplicitArgs>();
        *pImplicitArgs = {};
        pImplicitArgs->structSize = sizeof(NEO::ImplicitArgs);
        pImplicitArgs->structVersion = 0;
        pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize;
    }

    this->createPrintfBuffer();

    this->setDebugSurface();

    residencyContainer.insert(residencyContainer.end(), kernelImmData->getResidencyContainer().begin(),
                              kernelImmData->getResidencyContainer().end());

    kernelHasIndirectAccess = kernelDescriptor.kernelAttributes.hasNonKernelArgLoad ||
                              kernelDescriptor.kernelAttributes.hasNonKernelArgStore ||
                              kernelDescriptor.kernelAttributes.hasNonKernelArgAtomic;

    if (this->usesRayTracing()) {
        if (this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.rtDispatchGlobals.pointerSize > 0) {
            uint32_t bvhLevels = NEO::RayTracingHelper::maxBvhLevels;
            neoDevice->initializeRayTracing(bvhLevels);
            auto rtDispatchGlobals = neoDevice->getRTDispatchGlobals(bvhLevels);
            if (rtDispatchGlobals == nullptr) {
                return ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY;
            }
            this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer());
            this->residencyContainer.push_back(rtDispatchGlobals);

            NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                              this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.rtDispatchGlobals,
                              static_cast<uintptr_t>(rtDispatchGlobals->getGpuAddressToPatch()));
        } else {
            neoDevice->initializeRayTracing(0);
            this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer());
        }
    }

    return ZE_RESULT_SUCCESS;
}

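// Allocates the printf output buffer when the kernel uses printf (or implicit
// args are present) and patches its GPU address into the printf surface slot
// and/or the implicit-args structure.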
void KernelImp::createPrintfBuffer() {
    if (this->kernelImmData->getDescriptor().kernelAttributes.flags.usesPrintf || pImplicitArgs) {
        this->printfBuffer = PrintfHandler::createPrintfBuffer(this->module->getDevice());
        this->residencyContainer.push_back(printfBuffer);
        if (this->kernelImmData->getDescriptor().kernelAttributes.flags.usesPrintf) {
            NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                              this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.printfSurfaceAddress,
                              static_cast<uintptr_t>(this->printfBuffer->getGpuAddressToPatch()));
        }
        if (pImplicitArgs) {
            pImplicitArgs->printfBufferPtr = printfBuffer->getGpuAddress();
        }
    }
}

void KernelImp::printPrintfOutput() {
    PrintfHandler::printOutput(kernelImmData, this->printfBuffer, module->getDevice());
}

bool KernelImp::usesSyncBuffer() {
    return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer;
}

void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
    this->residencyContainer.push_back(gfxAllocation);
    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                      this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress,
                      static_cast<uintptr_t>(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)));
}

void KernelImp::setDebugSurface() {
    auto device = module->getDevice();
    if (module->isDebugEnabled() && device->getNEODevice()->getDebugger()) {

        auto surfaceStateHeapRef = ArrayRef<uint8_t>(surfaceStateHeapData.get(), surfaceStateHeapDataSize);

        patchWithImplicitSurface(ArrayRef<uint8_t>(), surfaceStateHeapRef,
                                 0,
                                 *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress,
                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics);
    }
}

void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
    auto &hwHelper = NEO::HwHelper::get(this->module->getDevice()->getHwInfo().platform.eRenderCoreFamily);
    auto surfaceStateSize = hwHelper.getRenderSurfaceStateSize();
    NEO::BindlessHeapsHelper *bindlessHeapsHelper = this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper();
    auto ssInHeap = bindlessHeapsHelper->allocateSSInHeap(surfaceStateSize, alloc, NEO::BindlessHeapsHelper::GLOBAL_SSH);
    this->residencyContainer.push_back(ssInHeap.heapAllocation);
    auto patchLocation = ptrOffset(getCrossThreadData(), bindless);
    auto patchValue = hwHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(ssInHeap.surfaceStateOffset));
    patchWithRequiredSize(const_cast<uint8_t *>(patchLocation), sizeof(patchValue), patchValue);
    return ssInHeap.ssPtr;
}

void KernelImp::patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint32_t z) {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    uint32_t workgroupSize[3] = {x, y, z};
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize, workgroupSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize2, workgroupSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.enqueuedLocalWorkSize, workgroupSize);
    if (pImplicitArgs) {
        pImplicitArgs->localSizeX = x;
        pImplicitArgs->localSizeY = y;
        pImplicitArgs->localSizeZ = z;
    }
}

ze_result_t KernelImp::setGlobalOffsetExp(uint32_t offsetX,
                                          uint32_t offsetY,
                                          uint32_t offsetZ) {
    this->globalOffsets[0] = offsetX;
    this->globalOffsets[1] = offsetY;
    this->globalOffsets[2] = offsetZ;

    return ZE_RESULT_SUCCESS;
}

void KernelImp::patchGlobalOffset() {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets);
    if (pImplicitArgs) {
        pImplicitArgs->globalOffsetX = globalOffsets[0];
        pImplicitArgs->globalOffsetY = globalOffsets[1];
        pImplicitArgs->globalOffsetZ = globalOffsets[2];
    }
}

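// Factory entry point: looks up the product-specific allocator registered in
// kernelFactory, constructs the kernel, and runs initialize(); on failure the
// partially initialized kernel is destroyed and nullptr is returned.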
Kernel *Kernel::create(uint32_t productFamily, Module *module,
                       const ze_kernel_desc_t *desc, ze_result_t *res) {
    UNRECOVERABLE_IF(productFamily >= IGFX_MAX_PRODUCT);
    KernelAllocatorFn allocator = kernelFactory[productFamily];
    auto kernel = static_cast<KernelImp *>(allocator(module));
    *res = kernel->initialize(desc);
    if (*res) {
        kernel->destroy();
        return nullptr;
    }
    return kernel;
}

bool KernelImp::hasIndirectAllocationsAllowed() const {
    return (unifiedMemoryControls.indirectDeviceAllocationsAllowed ||
            unifiedMemoryControls.indirectHostAllocationsAllowed ||
            unifiedMemoryControls.indirectSharedAllocationsAllowed);
}

uint32_t KernelImp::getSlmTotalSize() const {
    return slmArgsTotalSize + getImmutableData()->getDescriptor().kernelAttributes.slmInlineSize;
}

ze_result_t KernelImp::setCacheConfig(ze_cache_config_flags_t flags) {
    cacheConfigFlags = flags;
    return ZE_RESULT_SUCCESS;
}

NEO::GraphicsAllocation *KernelImp::getIsaAllocation() const {
    return getImmutableData()->getIsaGraphicsAllocation();
}

ze_result_t KernelImp::setSchedulingHintExp(ze_scheduling_hint_exp_desc_t *pHint) {
    this->schedulingHintExpFlag = pHint->flags;
    return ZE_RESULT_SUCCESS;
}

uint32_t KernelImp::getSchedulingHintExp() {
    if (NEO::DebugManager.flags.OverrideThreadArbitrationPolicy.get() != -1) {
        this->schedulingHintExpFlag = static_cast<uint32_t>(NEO::DebugManager.flags.OverrideThreadArbitrationPolicy.get());
    }
    return this->schedulingHintExpFlag;
}

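// Returns the space needed in the indirect heap for implicit-args dispatch:
// the ImplicitArgs structure itself plus a cache-line-aligned block of
// runtime-generated local IDs (3 channels) for the current group size.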
uint32_t KernelImp::getSizeForImplicitArgsPatching() const {
    if (!pImplicitArgs) {
        return 0;
    }
    auto implicitArgsSize = static_cast<uint32_t>(sizeof(NEO::ImplicitArgs));
    const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
    auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
    Vec3<size_t> groupSize{this->groupSize[0], this->groupSize[1], this->groupSize[2]};
    auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
    uint32_t localIdsSizeNeeded =
        alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
                    kernelDescriptor.kernelAttributes.simdSize, grfSize, 3u, itemsInGroup)),
                MemoryConstants::cacheLineSize);
    return implicitArgsSize + localIdsSizeNeeded;
}

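// Writes the implicit-args payload at pOut: first the runtime-generated local
// IDs in the kernel's work-group dimension order, then the ImplicitArgs
// structure itself; pOut is advanced past everything written.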
void KernelImp::patchImplicitArgs(void *&pOut) const {
    if (!pImplicitArgs) {
        return;
    }
    const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
    auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
    NEO::generateLocalIDs(
        pOut,
        static_cast<uint16_t>(kernelDescriptor.kernelAttributes.simdSize),
        std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSize[0]),
                                 static_cast<uint16_t>(groupSize[1]),
                                 static_cast<uint16_t>(groupSize[2])}},
        std::array<uint8_t, 3>{{
            kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
            kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
            kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2],
        }},
        false, grfSize);
    auto sizeForLocalIdsProgramming = getSizeForImplicitArgsPatching() - sizeof(NEO::ImplicitArgs);
    pOut = ptrOffset(pOut, sizeForLocalIdsProgramming);
    memcpy_s(pOut, sizeof(NEO::ImplicitArgs), pImplicitArgs.get(), sizeof(NEO::ImplicitArgs));
    pOut = ptrOffset(pOut, sizeof(NEO::ImplicitArgs));
}
} // namespace L0