/*
 * Copyright (C) 2020-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/kernel/debug_data.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include "shared/source/kernel/kernel_arg_metadata.h"
#include "shared/source/utilities/arrayref.h"
#include "shared/source/utilities/stackvec.h"

#include <cinttypes>
#include <cstddef>
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

namespace NEO {

using StringMap = std::unordered_map<uint32_t, std::string>;
using InstructionsSegmentOffset = uint16_t;

struct ExtendedInfoBase {
    virtual ~ExtendedInfoBase() = default;
};

struct KernelDescriptor {
    enum AddressingMode : uint8_t {
        AddrNone,
        Stateless,
        Bindful,
        Bindless,
        BindfulAndStateless,
        BindlessAndStateless
    };

    KernelDescriptor() = default;
    virtual ~KernelDescriptor() = default;
    virtual bool hasRTCalls() const;

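    // Recomputes kernelAttributes.crossThreadDataSize so that it covers the highest
    // valid cross-thread data offset referenced by the dispatch traits, implicit args
    // and explicit args below, aligned up to 32 bytes. The value only ever grows; a
    // previously larger crossThreadDataSize is kept.
    //
    // Illustrative usage sketch (not part of this header): code that has populated
    // payloadMappings would typically call
    //     descriptor.updateCrossThreadDataSize();
    // before sizing the cross-thread data buffer.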
    void updateCrossThreadDataSize() {
        uint32_t crossThreadDataSize = 0;
        for (uint32_t i = 0; i < 3; i++) {
            if (isValidOffset(payloadMappings.dispatchTraits.globalWorkOffset[i])) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkOffset[i] + sizeof(uint32_t));
            }
            if (isValidOffset(payloadMappings.dispatchTraits.globalWorkSize[i])) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkSize[i] + sizeof(uint32_t));
            }
            if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize[i])) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize[i] + sizeof(uint32_t));
            }
            if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize2[i])) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize2[i] + sizeof(uint32_t));
            }
            if (isValidOffset(payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i])) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i] + sizeof(uint32_t));
            }
            if (isValidOffset(payloadMappings.dispatchTraits.numWorkGroups[i])) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.numWorkGroups[i] + sizeof(uint32_t));
            }
        }

        if (isValidOffset(payloadMappings.dispatchTraits.workDim)) {
            crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.workDim + sizeof(uint32_t));
        }

        StackVec<ArgDescPointer *, 8> implicitArgsVec({&payloadMappings.implicitArgs.printfSurfaceAddress,
                                                       &payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
                                                       &payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
                                                       &payloadMappings.implicitArgs.privateMemoryAddress,
                                                       &payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress,
                                                       &payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress,
                                                       &payloadMappings.implicitArgs.systemThreadSurfaceAddress,
                                                       &payloadMappings.implicitArgs.syncBufferAddress});

        for (size_t i = 0; i < implicitArgsVec.size(); i++) {
            if (isValidOffset(implicitArgsVec[i]->bindless)) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->bindless + sizeof(uint32_t));
            }

            if (isValidOffset(implicitArgsVec[i]->stateless)) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->stateless + implicitArgsVec[i]->pointerSize);
            }
        }

        StackVec<CrossThreadDataOffset *, 7> implicitArgsVec2({&payloadMappings.implicitArgs.privateMemorySize,
                                                               &payloadMappings.implicitArgs.maxWorkGroupSize,
                                                               &payloadMappings.implicitArgs.simdSize,
                                                               &payloadMappings.implicitArgs.deviceSideEnqueueParentEvent,
                                                               &payloadMappings.implicitArgs.preferredWkgMultiple,
                                                               &payloadMappings.implicitArgs.localMemoryStatelessWindowSize,
                                                               &payloadMappings.implicitArgs.localMemoryStatelessWindowStartAddres});

        for (size_t i = 0; i < implicitArgsVec2.size(); i++) {
            if (isValidOffset(*implicitArgsVec2[i])) {
                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, *implicitArgsVec2[i] + sizeof(uint32_t));
            }
        }

        for (size_t i = 0; i < payloadMappings.explicitArgs.size(); i++) {

            switch (payloadMappings.explicitArgs[i].type) {
            case ArgDescriptor::ArgType::ArgTImage: {
                auto &argImage = payloadMappings.explicitArgs[i].as<ArgDescImage>(false);
                if (isValidOffset(argImage.bindless)) {
                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argImage.bindless + sizeof(uint32_t));
                }
            } break;
            case ArgDescriptor::ArgType::ArgTPointer: {
                auto &argPtr = payloadMappings.explicitArgs[i].as<ArgDescPointer>(false);
                if (isValidOffset(argPtr.bindless)) {
                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.bindless + sizeof(uint32_t));
                }
                if (isValidOffset(argPtr.stateless)) {
                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.stateless + argPtr.pointerSize);
                }
            } break;
            case ArgDescriptor::ArgType::ArgTSampler: {
                auto &argSampler = payloadMappings.explicitArgs[i].as<ArgDescSampler>(false);
                UNRECOVERABLE_IF(isValidOffset(argSampler.bindless));
            } break;
            case ArgDescriptor::ArgType::ArgTValue: {
                auto &argVal = payloadMappings.explicitArgs[i].as<ArgDescValue>(false);
                for (size_t i = 0; i < argVal.elements.size(); i++) {
                    UNRECOVERABLE_IF(!isValidOffset(argVal.elements[i].offset));
                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argVal.elements[i].offset + argVal.elements[i].size);
                }
            } break;
            default:
                break;
            }
        }

        this->kernelAttributes.crossThreadDataSize = std::max<uint16_t>(this->kernelAttributes.crossThreadDataSize, static_cast<uint16_t>(alignUp(crossThreadDataSize, 32)));
    }

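    // Per-kernel attributes: memory sizes, required workgroup geometry, addressing
    // modes and packed boolean properties of the kernel.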
    struct KernelAttributes {
        KernelAttributes() { flags.packed = 0U; }

        uint32_t slmInlineSize = 0U;
        uint32_t perThreadScratchSize[2] = {0U, 0U};
        uint32_t perHwThreadPrivateMemorySize = 0U;
        uint32_t perThreadSystemThreadSurfaceSize = 0U;
        uint16_t requiredWorkgroupSize[3] = {0U, 0U, 0U};
        uint16_t crossThreadDataSize = 0U;
        uint16_t inlineDataPayloadSize = 0U;
        uint16_t perThreadDataSize = 0U;
        uint16_t numArgsToPatch = 0U;
        uint16_t numGrfRequired = 0U;
        uint8_t barrierCount = 0u;
        bool hasNonKernelArgLoad = true;
        bool hasNonKernelArgStore = true;
        bool hasNonKernelArgAtomic = true;

        AddressingMode bufferAddressingMode = BindfulAndStateless;
        AddressingMode imageAddressingMode = Bindful;
        AddressingMode samplerAddressingMode = Bindful;

        uint8_t workgroupWalkOrder[3] = {0, 1, 2};
        uint8_t workgroupDimensionsOrder[3] = {0, 1, 2};

        uint8_t gpuPointerSize = 0;
        uint8_t simdSize = 8;
        uint8_t numLocalIdChannels = 0;
        uint8_t localId[3] = {0U, 0U, 0U};

        bool supportsBuffersBiggerThan4Gb() const {
            return Stateless == bufferAddressingMode;
        }

        bool usesBarriers() const {
            return 0 != barrierCount;
        }

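        // Boolean kernel properties stored as single-bit fields; the overlapping
        // "packed" member lets all flags be cleared at once (see the constructor above).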
        union {
            struct {
                bool usesSpecialPipelineSelectMode : 1;
                bool usesStringMapForPrintf : 1;
                bool usesPrintf : 1;
                bool usesFencesForReadWriteImages : 1;
                bool usesFlattenedLocalIds : 1;
                bool usesPrivateMemory : 1;
                bool usesVme : 1;
                bool usesImages : 1;
                bool usesSamplers : 1;
                bool usesDeviceSideEnqueue : 1;
                bool usesSyncBuffer : 1;
                bool useGlobalAtomics : 1;
                bool usesStatelessWrites : 1;
                bool passInlineData : 1;
                bool perThreadDataHeaderIsPresent : 1;
                bool perThreadDataUnusedGrfIsPresent : 1;
                bool requiresDisabledMidThreadPreemption : 1;
                bool requiresSubgroupIndependentForwardProgress : 1;
                bool requiresWorkgroupWalkOrder : 1;
                bool requiresImplicitArgs : 1;
                bool useStackCalls : 1;
            };
            uint32_t packed;
        } flags;
        static_assert(sizeof(KernelAttributes::flags) == sizeof(KernelAttributes::flags.packed), "");
    } kernelAttributes;

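    // Offsets into the kernel's instructions segment for alternative entry points.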
    struct {
        InstructionsSegmentOffset skipPerThreadDataLoad = 0U;
        InstructionsSegmentOffset skipSetFFIDGP = 0U;
        InstructionsSegmentOffset systemKernel = 0U;
    } entryPoints;

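    // Describes where kernel payload data gets patched: cross-thread data offsets for
    // dispatch traits and implicit/explicit arguments, plus binding table and sampler
    // table locations in the surface/dynamic state heaps.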
    struct PayloadMappings {
        struct {
            CrossThreadDataOffset globalWorkOffset[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
            CrossThreadDataOffset globalWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
            CrossThreadDataOffset localWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
            CrossThreadDataOffset localWorkSize2[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};

            CrossThreadDataOffset enqueuedLocalWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
            CrossThreadDataOffset numWorkGroups[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
            CrossThreadDataOffset workDim = undefined<CrossThreadDataOffset>;
        } dispatchTraits;

        struct {
            SurfaceStateHeapOffset tableOffset = undefined<SurfaceStateHeapOffset>;
            uint8_t numEntries = 0;
        } bindingTable;

        struct {
            DynamicStateHeapOffset tableOffset = undefined<DynamicStateHeapOffset>;
            DynamicStateHeapOffset borderColor = undefined<DynamicStateHeapOffset>;
            uint8_t numSamplers = 0;
        } samplerTable;

        StackVec<ArgDescriptor, 16> explicitArgs;

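        // Implicit (compiler-provided) arguments and their cross-thread data offsets.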
        struct {
            ArgDescPointer printfSurfaceAddress;
            ArgDescPointer globalVariablesSurfaceAddress;
            ArgDescPointer globalConstantsSurfaceAddress;
            ArgDescPointer privateMemoryAddress;
            ArgDescPointer deviceSideEnqueueEventPoolSurfaceAddress;
            ArgDescPointer deviceSideEnqueueDefaultQueueSurfaceAddress;
            ArgDescPointer systemThreadSurfaceAddress;
            ArgDescPointer syncBufferAddress;
            ArgDescPointer rtDispatchGlobals;
            CrossThreadDataOffset privateMemorySize = undefined<CrossThreadDataOffset>;
            CrossThreadDataOffset maxWorkGroupSize = undefined<CrossThreadDataOffset>;
            CrossThreadDataOffset simdSize = undefined<CrossThreadDataOffset>;
            CrossThreadDataOffset deviceSideEnqueueParentEvent = undefined<CrossThreadDataOffset>;
            CrossThreadDataOffset preferredWkgMultiple = undefined<CrossThreadDataOffset>;
            CrossThreadDataOffset localMemoryStatelessWindowSize = undefined<CrossThreadDataOffset>;
            CrossThreadDataOffset localMemoryStatelessWindowStartAddres = undefined<CrossThreadDataOffset>;
        } implicitArgs;

        std::vector<std::unique_ptr<ArgDescriptorExtended>> explicitArgsExtendedDescriptors;
    } payloadMappings;

    std::vector<ArgTypeMetadataExtended> explicitArgsExtendedMetadata;

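    // Metadata that is not required for dispatch itself: names, language attributes,
    // printf strings and device-side enqueue bookkeeping.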
    struct {
        std::string kernelName;
        std::string kernelLanguageAttributes;
        StringMap printfStringsMap;
        std::vector<std::pair<uint32_t, uint32_t>> deviceSideEnqueueChildrenKernelsIdOffset;
        uint32_t deviceSideEnqueueBlockInterfaceDescriptorOffset = 0U;

        struct ByValueArgument {
            ArgDescValue::Element byValueElement;
            uint16_t argNum;
        };
        StackVec<ByValueArgument, 32> allByValueKernelArguments;

        uint16_t compiledSubGroupsNumber = 0U;
        uint8_t requiredSubGroupSize = 0U;
    } kernelMetadata;

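    // Externally produced data kept alongside the descriptor (debug info, GTPin data).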
    struct {
        std::unique_ptr<DebugData> debugData;
        std::unique_ptr<uint8_t[]> relocatedDebugData;
        const void *igcInfoForGtpin = nullptr;
    } external;

    std::vector<uint8_t> generatedHeaps;
    std::unique_ptr<ExtendedInfoBase> extendedInfo;
};

} // namespace NEO