1 /*
2  * Copyright (C) 2018-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #include "shared/source/program/kernel_info.h"
9 
10 #include "shared/source/device/device.h"
11 #include "shared/source/device_binary_format/patchtokens_decoder.h"
12 #include "shared/source/helpers/aligned_memory.h"
13 #include "shared/source/helpers/blit_commands_helper.h"
14 #include "shared/source/helpers/hw_helper.h"
15 #include "shared/source/helpers/kernel_helpers.h"
16 #include "shared/source/helpers/ptr_math.h"
17 #include "shared/source/helpers/string.h"
18 #include "shared/source/memory_manager/memory_manager.h"
19 
20 #include <cstdint>
21 #include <cstring>
22 #include <map>
23 #include <sstream>
24 #include <unordered_map>
25 
26 namespace NEO {
27 
// Associates the textual form of a kernel-argument type qualifier with a
// 64-bit numeric value.
struct KernelArgumentType {
    const char *argTypeQualifier;        // qualifier as a C string (not owned by this struct)
    std::uint64_t argTypeQualifierValue; // numeric value paired with the qualifier
};
32 
WorkSizeInfo(uint32_t maxWorkGroupSize,bool hasBarriers,uint32_t simdSize,uint32_t slmTotalSize,const HardwareInfo * hwInfo,uint32_t numThreadsPerSubSlice,uint32_t localMemSize,bool imgUsed,bool yTiledSurface)33 WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, const HardwareInfo *hwInfo, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface) {
34     this->maxWorkGroupSize = maxWorkGroupSize;
35     this->hasBarriers = hasBarriers;
36     this->simdSize = simdSize;
37     this->slmTotalSize = slmTotalSize;
38     this->coreFamily = hwInfo->platform.eRenderCoreFamily;
39     this->numThreadsPerSubSlice = numThreadsPerSubSlice;
40     this->localMemSize = localMemSize;
41     this->imgUsed = imgUsed;
42     this->yTiledSurfaces = yTiledSurface;
43 
44     setMinWorkGroupSize(hwInfo);
45 }
46 
setIfUseImg(const KernelInfo & kernelInfo)47 void WorkSizeInfo::setIfUseImg(const KernelInfo &kernelInfo) {
48     for (const auto &arg : kernelInfo.kernelDescriptor.payloadMappings.explicitArgs) {
49         if (arg.is<ArgDescriptor::ArgTImage>()) {
50             imgUsed = true;
51             yTiledSurfaces = true;
52             return;
53         }
54     }
55 }
setMinWorkGroupSize(const HardwareInfo * hwInfo)56 void WorkSizeInfo::setMinWorkGroupSize(const HardwareInfo *hwInfo) {
57     minWorkGroupSize = 0;
58     if (hasBarriers) {
59         uint32_t maxBarriersPerHSlice = (coreFamily >= IGFX_GEN9_CORE) ? 32 : 16;
60         minWorkGroupSize = numThreadsPerSubSlice * simdSize / maxBarriersPerHSlice;
61     }
62     if (slmTotalSize > 0) {
63         UNRECOVERABLE_IF(localMemSize < slmTotalSize);
64         minWorkGroupSize = std::max(maxWorkGroupSize / ((localMemSize / slmTotalSize)), minWorkGroupSize);
65     }
66 
67     const auto &hwHelper = HwHelper::get(hwInfo->platform.eRenderCoreFamily);
68     if (hwHelper.isFusedEuDispatchEnabled(*hwInfo)) {
69         minWorkGroupSize *= 2;
70     }
71 }
checkRatio(const size_t workItems[3])72 void WorkSizeInfo::checkRatio(const size_t workItems[3]) {
73     if (slmTotalSize > 0) {
74         useRatio = true;
75         targetRatio = log((float)workItems[0]) - log((float)workItems[1]);
76         useStrictRatio = false;
77     } else if (yTiledSurfaces == true) {
78         useRatio = true;
79         targetRatio = YTilingRatioValue;
80         useStrictRatio = true;
81     }
82 }
83 
~KernelInfo()84 KernelInfo::~KernelInfo() {
85     delete[] crossThreadData;
86 }
87 
getSamplerStateArrayCount() const88 size_t KernelInfo::getSamplerStateArrayCount() const {
89     return kernelDescriptor.payloadMappings.samplerTable.numSamplers;
90 }
getSamplerStateArraySize(const HardwareInfo & hwInfo) const91 size_t KernelInfo::getSamplerStateArraySize(const HardwareInfo &hwInfo) const {
92     size_t samplerStateArraySize = getSamplerStateArrayCount() * HwHelper::get(hwInfo.platform.eRenderCoreFamily).getSamplerStateSize();
93     return samplerStateArraySize;
94 }
95 
getBorderColorStateSize() const96 size_t KernelInfo::getBorderColorStateSize() const {
97     size_t borderColorSize = 0;
98     if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0U) {
99         borderColorSize = kernelDescriptor.payloadMappings.samplerTable.tableOffset - kernelDescriptor.payloadMappings.samplerTable.borderColor;
100     }
101     return borderColorSize;
102 }
103 
getBorderColorOffset() const104 size_t KernelInfo::getBorderColorOffset() const {
105     size_t borderColorOffset = 0;
106     if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0U) {
107         borderColorOffset = kernelDescriptor.payloadMappings.samplerTable.borderColor;
108     }
109     return borderColorOffset;
110 }
111 
getConstantBufferSize() const112 uint32_t KernelInfo::getConstantBufferSize() const {
113     return kernelDescriptor.kernelAttributes.crossThreadDataSize;
114 }
getArgNumByName(const char * name) const115 int32_t KernelInfo::getArgNumByName(const char *name) const {
116     int32_t argNum = 0;
117     for (const auto &argMeta : kernelDescriptor.explicitArgsExtendedMetadata) {
118         if (argMeta.argName.compare(name) == 0) {
119             return argNum;
120         }
121         ++argNum;
122     }
123     return -1;
124 }
125 
createKernelAllocation(const Device & device,bool internalIsa)126 bool KernelInfo::createKernelAllocation(const Device &device, bool internalIsa) {
127     UNRECOVERABLE_IF(kernelAllocation);
128     auto kernelIsaSize = heapInfo.KernelHeapSize;
129     const auto allocType = internalIsa ? GraphicsAllocation::AllocationType::KERNEL_ISA_INTERNAL : GraphicsAllocation::AllocationType::KERNEL_ISA;
130     kernelAllocation = device.getMemoryManager()->allocateGraphicsMemoryWithProperties({device.getRootDeviceIndex(), kernelIsaSize, allocType, device.getDeviceBitfield()});
131     if (!kernelAllocation) {
132         return false;
133     }
134 
135     auto &hwInfo = device.getHardwareInfo();
136     auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
137 
138     return MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *kernelAllocation),
139                                                             device, kernelAllocation, 0, heapInfo.pKernelHeap,
140                                                             static_cast<size_t>(kernelIsaSize));
141 }
142 
apply(const DeviceInfoKernelPayloadConstants & constants)143 void KernelInfo::apply(const DeviceInfoKernelPayloadConstants &constants) {
144     if (nullptr == this->crossThreadData) {
145         return;
146     }
147 
148     const auto &implicitArgs = kernelDescriptor.payloadMappings.implicitArgs;
149     const auto privateMemorySize = static_cast<uint32_t>(KernelHelper::getPrivateSurfaceSize(kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize,
150                                                                                              constants.computeUnitsUsedForScratch));
151 
152     auto setIfValidOffset = [&](auto value, NEO::CrossThreadDataOffset offset) {
153         if (isValidOffset(offset)) {
154             *ptrOffset(reinterpret_cast<decltype(value) *>(crossThreadData), offset) = value;
155         }
156     };
157     setIfValidOffset(reinterpret_cast<uintptr_t>(constants.slmWindow), implicitArgs.localMemoryStatelessWindowStartAddres);
158     setIfValidOffset(constants.slmWindowSize, implicitArgs.localMemoryStatelessWindowSize);
159     setIfValidOffset(privateMemorySize, implicitArgs.privateMemorySize);
160     setIfValidOffset(constants.maxWorkGroupSize, implicitArgs.maxWorkGroupSize);
161 }
162 
concatenateKernelNames(ArrayRef<KernelInfo * > kernelInfos)163 std::string concatenateKernelNames(ArrayRef<KernelInfo *> kernelInfos) {
164     std::string semiColonDelimitedKernelNameStr;
165 
166     for (const auto &kernelInfo : kernelInfos) {
167         if (!semiColonDelimitedKernelNameStr.empty()) {
168             semiColonDelimitedKernelNameStr += ';';
169         }
170         semiColonDelimitedKernelNameStr += kernelInfo->kernelDescriptor.kernelMetadata.kernelName;
171     }
172 
173     return semiColonDelimitedKernelNameStr;
174 }
175 
176 } // namespace NEO
177