1 /*
2 * Copyright (C) 2018-2021 Intel Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 */
7
8 #include "shared/source/program/kernel_info.h"
9
10 #include "shared/source/device/device.h"
11 #include "shared/source/device_binary_format/patchtokens_decoder.h"
12 #include "shared/source/helpers/aligned_memory.h"
13 #include "shared/source/helpers/blit_commands_helper.h"
14 #include "shared/source/helpers/hw_helper.h"
15 #include "shared/source/helpers/kernel_helpers.h"
16 #include "shared/source/helpers/ptr_math.h"
17 #include "shared/source/helpers/string.h"
18 #include "shared/source/memory_manager/memory_manager.h"
19
20 #include <cstdint>
21 #include <cstring>
22 #include <map>
23 #include <sstream>
24 #include <unordered_map>
25
26 namespace NEO {
27
// Pairs a kernel argument type-qualifier name with its encoded numeric value.
// NOTE(review): the qualifier strings and the bit meaning of the value are
// defined by the metadata producer — not visible from this file; confirm there.
struct KernelArgumentType {
    const char *argTypeQualifier;   // textual qualifier taken from kernel metadata
    uint64_t argTypeQualifierValue; // encoded representation of the same qualifier
};
32
WorkSizeInfo(uint32_t maxWorkGroupSize,bool hasBarriers,uint32_t simdSize,uint32_t slmTotalSize,const HardwareInfo * hwInfo,uint32_t numThreadsPerSubSlice,uint32_t localMemSize,bool imgUsed,bool yTiledSurface)33 WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, const HardwareInfo *hwInfo, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface) {
34 this->maxWorkGroupSize = maxWorkGroupSize;
35 this->hasBarriers = hasBarriers;
36 this->simdSize = simdSize;
37 this->slmTotalSize = slmTotalSize;
38 this->coreFamily = hwInfo->platform.eRenderCoreFamily;
39 this->numThreadsPerSubSlice = numThreadsPerSubSlice;
40 this->localMemSize = localMemSize;
41 this->imgUsed = imgUsed;
42 this->yTiledSurfaces = yTiledSurface;
43
44 setMinWorkGroupSize(hwInfo);
45 }
46
setIfUseImg(const KernelInfo & kernelInfo)47 void WorkSizeInfo::setIfUseImg(const KernelInfo &kernelInfo) {
48 for (const auto &arg : kernelInfo.kernelDescriptor.payloadMappings.explicitArgs) {
49 if (arg.is<ArgDescriptor::ArgTImage>()) {
50 imgUsed = true;
51 yTiledSurfaces = true;
52 return;
53 }
54 }
55 }
setMinWorkGroupSize(const HardwareInfo * hwInfo)56 void WorkSizeInfo::setMinWorkGroupSize(const HardwareInfo *hwInfo) {
57 minWorkGroupSize = 0;
58 if (hasBarriers) {
59 uint32_t maxBarriersPerHSlice = (coreFamily >= IGFX_GEN9_CORE) ? 32 : 16;
60 minWorkGroupSize = numThreadsPerSubSlice * simdSize / maxBarriersPerHSlice;
61 }
62 if (slmTotalSize > 0) {
63 UNRECOVERABLE_IF(localMemSize < slmTotalSize);
64 minWorkGroupSize = std::max(maxWorkGroupSize / ((localMemSize / slmTotalSize)), minWorkGroupSize);
65 }
66
67 const auto &hwHelper = HwHelper::get(hwInfo->platform.eRenderCoreFamily);
68 if (hwHelper.isFusedEuDispatchEnabled(*hwInfo)) {
69 minWorkGroupSize *= 2;
70 }
71 }
checkRatio(const size_t workItems[3])72 void WorkSizeInfo::checkRatio(const size_t workItems[3]) {
73 if (slmTotalSize > 0) {
74 useRatio = true;
75 targetRatio = log((float)workItems[0]) - log((float)workItems[1]);
76 useStrictRatio = false;
77 } else if (yTiledSurfaces == true) {
78 useRatio = true;
79 targetRatio = YTilingRatioValue;
80 useStrictRatio = true;
81 }
82 }
83
// Frees the cross-thread data buffer owned by this KernelInfo.
// The buffer is an array allocation, hence delete[]; a null pointer is safe.
KernelInfo::~KernelInfo() {
    delete[] crossThreadData;
}
87
getSamplerStateArrayCount() const88 size_t KernelInfo::getSamplerStateArrayCount() const {
89 return kernelDescriptor.payloadMappings.samplerTable.numSamplers;
90 }
getSamplerStateArraySize(const HardwareInfo & hwInfo) const91 size_t KernelInfo::getSamplerStateArraySize(const HardwareInfo &hwInfo) const {
92 size_t samplerStateArraySize = getSamplerStateArrayCount() * HwHelper::get(hwInfo.platform.eRenderCoreFamily).getSamplerStateSize();
93 return samplerStateArraySize;
94 }
95
getBorderColorStateSize() const96 size_t KernelInfo::getBorderColorStateSize() const {
97 size_t borderColorSize = 0;
98 if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0U) {
99 borderColorSize = kernelDescriptor.payloadMappings.samplerTable.tableOffset - kernelDescriptor.payloadMappings.samplerTable.borderColor;
100 }
101 return borderColorSize;
102 }
103
getBorderColorOffset() const104 size_t KernelInfo::getBorderColorOffset() const {
105 size_t borderColorOffset = 0;
106 if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0U) {
107 borderColorOffset = kernelDescriptor.payloadMappings.samplerTable.borderColor;
108 }
109 return borderColorOffset;
110 }
111
getConstantBufferSize() const112 uint32_t KernelInfo::getConstantBufferSize() const {
113 return kernelDescriptor.kernelAttributes.crossThreadDataSize;
114 }
getArgNumByName(const char * name) const115 int32_t KernelInfo::getArgNumByName(const char *name) const {
116 int32_t argNum = 0;
117 for (const auto &argMeta : kernelDescriptor.explicitArgsExtendedMetadata) {
118 if (argMeta.argName.compare(name) == 0) {
119 return argNum;
120 }
121 ++argNum;
122 }
123 return -1;
124 }
125
createKernelAllocation(const Device & device,bool internalIsa)126 bool KernelInfo::createKernelAllocation(const Device &device, bool internalIsa) {
127 UNRECOVERABLE_IF(kernelAllocation);
128 auto kernelIsaSize = heapInfo.KernelHeapSize;
129 const auto allocType = internalIsa ? GraphicsAllocation::AllocationType::KERNEL_ISA_INTERNAL : GraphicsAllocation::AllocationType::KERNEL_ISA;
130 kernelAllocation = device.getMemoryManager()->allocateGraphicsMemoryWithProperties({device.getRootDeviceIndex(), kernelIsaSize, allocType, device.getDeviceBitfield()});
131 if (!kernelAllocation) {
132 return false;
133 }
134
135 auto &hwInfo = device.getHardwareInfo();
136 auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
137
138 return MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *kernelAllocation),
139 device, kernelAllocation, 0, heapInfo.pKernelHeap,
140 static_cast<size_t>(kernelIsaSize));
141 }
142
apply(const DeviceInfoKernelPayloadConstants & constants)143 void KernelInfo::apply(const DeviceInfoKernelPayloadConstants &constants) {
144 if (nullptr == this->crossThreadData) {
145 return;
146 }
147
148 const auto &implicitArgs = kernelDescriptor.payloadMappings.implicitArgs;
149 const auto privateMemorySize = static_cast<uint32_t>(KernelHelper::getPrivateSurfaceSize(kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize,
150 constants.computeUnitsUsedForScratch));
151
152 auto setIfValidOffset = [&](auto value, NEO::CrossThreadDataOffset offset) {
153 if (isValidOffset(offset)) {
154 *ptrOffset(reinterpret_cast<decltype(value) *>(crossThreadData), offset) = value;
155 }
156 };
157 setIfValidOffset(reinterpret_cast<uintptr_t>(constants.slmWindow), implicitArgs.localMemoryStatelessWindowStartAddres);
158 setIfValidOffset(constants.slmWindowSize, implicitArgs.localMemoryStatelessWindowSize);
159 setIfValidOffset(privateMemorySize, implicitArgs.privateMemorySize);
160 setIfValidOffset(constants.maxWorkGroupSize, implicitArgs.maxWorkGroupSize);
161 }
162
concatenateKernelNames(ArrayRef<KernelInfo * > kernelInfos)163 std::string concatenateKernelNames(ArrayRef<KernelInfo *> kernelInfos) {
164 std::string semiColonDelimitedKernelNameStr;
165
166 for (const auto &kernelInfo : kernelInfos) {
167 if (!semiColonDelimitedKernelNameStr.empty()) {
168 semiColonDelimitedKernelNameStr += ';';
169 }
170 semiColonDelimitedKernelNameStr += kernelInfo->kernelDescriptor.kernelMetadata.kernelName;
171 }
172
173 return semiColonDelimitedKernelNameStr;
174 }
175
176 } // namespace NEO
177