1 /*
2  * Copyright (C) 2020-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #pragma once
9 
10 #include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
11 #include "shared/source/unified_memory/unified_memory.h"
12 
13 #include "level_zero/core/source/kernel/kernel.h"
14 
15 #include <memory>
16 
17 namespace L0 {
18 
19 struct KernelImp : Kernel {
20     KernelImp(Module *module);
21 
22     ~KernelImp() override;
23 
destroyKernelImp24     ze_result_t destroy() override {
25         delete this;
26         return ZE_RESULT_SUCCESS;
27     }
28 
29     ze_result_t setIndirectAccess(ze_kernel_indirect_access_flags_t flags) override;
30     ze_result_t getIndirectAccess(ze_kernel_indirect_access_flags_t *flags) override;
31     ze_result_t getSourceAttributes(uint32_t *pSize, char **pString) override;
32 
33     ze_result_t getProperties(ze_kernel_properties_t *pKernelProperties) override;
34 
35     ze_result_t setArgumentValue(uint32_t argIndex, size_t argSize, const void *pArgValue) override;
36 
37     void setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) override;
38 
39     ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
40                              uint32_t groupSizeZ) override;
41 
42     ze_result_t suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY, uint32_t globalSizeZ,
43                                  uint32_t *groupSizeX, uint32_t *groupSizeY,
44                                  uint32_t *groupSizeZ) override;
45 
46     ze_result_t getKernelName(size_t *pSize, char *pName) override;
47 
48     ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount, NEO::EngineGroupType engineGroupType,
49                                                 bool isEngineInstanced) override;
50 
getCrossThreadDataKernelImp51     const uint8_t *getCrossThreadData() const override { return crossThreadData.get(); }
getCrossThreadDataSizeKernelImp52     uint32_t getCrossThreadDataSize() const override { return crossThreadDataSize; }
53 
getResidencyContainerKernelImp54     const std::vector<NEO::GraphicsAllocation *> &getResidencyContainer() const override {
55         return residencyContainer;
56     }
57 
58     ze_result_t setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal);
59 
60     ze_result_t setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal);
61 
62     ze_result_t setArgUnknown(uint32_t argIndex, size_t argSize, const void *argVal);
63 
64     ze_result_t setArgRedescribedImage(uint32_t argIndex, ze_image_handle_t argVal) override;
65 
66     ze_result_t setArgBufferWithAlloc(uint32_t argIndex, uintptr_t argVal, NEO::GraphicsAllocation *allocation) override;
67 
68     ze_result_t setArgImage(uint32_t argIndex, size_t argSize, const void *argVal);
69 
70     ze_result_t setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal);
71 
72     virtual void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) = 0;
73 
74     ze_result_t initialize(const ze_kernel_desc_t *desc);
75 
getPerThreadDataKernelImp76     const uint8_t *getPerThreadData() const override { return perThreadDataForWholeThreadGroup; }
getPerThreadDataSizeForWholeThreadGroupKernelImp77     uint32_t getPerThreadDataSizeForWholeThreadGroup() const override { return perThreadDataSizeForWholeThreadGroup; }
78 
getPerThreadDataSizeKernelImp79     uint32_t getPerThreadDataSize() const override { return perThreadDataSize; }
getNumThreadsPerThreadGroupKernelImp80     uint32_t getNumThreadsPerThreadGroup() const override { return numThreadsPerThreadGroup; }
getThreadExecutionMaskKernelImp81     uint32_t getThreadExecutionMask() const override { return threadExecutionMask; }
82 
getPrintfBufferAllocationKernelImp83     NEO::GraphicsAllocation *getPrintfBufferAllocation() override { return this->printfBuffer; }
84     void printPrintfOutput() override;
85 
86     bool usesSyncBuffer() override;
87     void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override;
88 
getSurfaceStateHeapDataKernelImp89     const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); }
getSurfaceStateHeapDataSizeKernelImp90     uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; }
91 
getDynamicStateHeapDataKernelImp92     const uint8_t *getDynamicStateHeapData() const override { return dynamicStateHeapData.get(); }
93 
getImmutableDataKernelImp94     const KernelImmutableData *getImmutableData() const override { return kernelImmData; }
95 
getUnifiedMemoryControlsKernelImp96     UnifiedMemoryControls getUnifiedMemoryControls() const override { return unifiedMemoryControls; }
97     bool hasIndirectAllocationsAllowed() const override;
98 
getKernelDescriptorKernelImp99     const NEO::KernelDescriptor &getKernelDescriptor() const override {
100         return kernelImmData->getDescriptor();
101     }
getGroupSizeKernelImp102     const uint32_t *getGroupSize() const override {
103         return groupSize;
104     }
105     uint32_t getSlmTotalSize() const override;
106 
getSlmPolicyKernelImp107     NEO::SlmPolicy getSlmPolicy() const override {
108         if (cacheConfigFlags & ZE_CACHE_CONFIG_FLAG_LARGE_SLM) {
109             return NEO::SlmPolicy::SlmPolicyLargeSlm;
110         } else if (cacheConfigFlags & ZE_CACHE_CONFIG_FLAG_LARGE_DATA) {
111             return NEO::SlmPolicy::SlmPolicyLargeData;
112         } else {
113             return NEO::SlmPolicy::SlmPolicyNone;
114         }
115     }
116 
117     NEO::GraphicsAllocation *getIsaAllocation() const override;
118 
getRequiredWorkgroupOrderKernelImp119     uint32_t getRequiredWorkgroupOrder() const override { return requiredWorkgroupOrder; }
requiresGenerationOfLocalIdsByRuntimeKernelImp120     bool requiresGenerationOfLocalIdsByRuntime() const override { return kernelRequiresGenerationOfLocalIdsByRuntime; }
getKernelRequiresUncachedMocsKernelImp121     bool getKernelRequiresUncachedMocs() { return (kernelRequiresUncachedMocsCount > 0); }
getKernelRequiresQueueUncachedMocsKernelImp122     bool getKernelRequiresQueueUncachedMocs() { return (kernelRequiresQueueUncachedMocsCount > 0); }
setKernelArgUncachedKernelImp123     void setKernelArgUncached(uint32_t index, bool val) { isArgUncached[index] = val; }
124 
getGlobalOffsetsKernelImp125     uint32_t *getGlobalOffsets() override {
126         return this->globalOffsets;
127     }
128     ze_result_t setGlobalOffsetExp(uint32_t offsetX, uint32_t offsetY, uint32_t offsetZ) override;
129     void patchGlobalOffset() override;
130 
131     ze_result_t setCacheConfig(ze_cache_config_flags_t flags) override;
usesRayTracingKernelImp132     bool usesRayTracing() {
133         return kernelImmData->getDescriptor().hasRTCalls();
134     }
135 
getProfileInfoKernelImp136     ze_result_t getProfileInfo(zet_profile_properties_t *pProfileProperties) override {
137         pProfileProperties->flags = 0;
138         pProfileProperties->numTokens = 0;
139         return ZE_RESULT_SUCCESS;
140     }
141 
hasIndirectAccessKernelImp142     bool hasIndirectAccess() {
143         return kernelHasIndirectAccess;
144     }
145 
146     NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() override;
147     void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) override;
148 
getPrivateMemoryGraphicsAllocationKernelImp149     NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() override {
150         return privateMemoryGraphicsAllocation;
151     }
152 
153     ze_result_t setSchedulingHintExp(ze_scheduling_hint_exp_desc_t *pHint) override;
154     uint32_t getSchedulingHintExp();
155 
getImplicitArgsKernelImp156     NEO::ImplicitArgs *getImplicitArgs() const override { return pImplicitArgs.get(); }
157     uint32_t getSizeForImplicitArgsPatching() const override;
158     void patchImplicitArgs(void *&pOut) const override;
159 
160   protected:
161     KernelImp() = default;
162 
163     void patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint32_t z);
164 
165     NEO::GraphicsAllocation *privateMemoryGraphicsAllocation = nullptr;
166 
167     void createPrintfBuffer();
168     void setDebugSurface();
169     virtual void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) = 0;
170     void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless);
171 
172     const KernelImmutableData *kernelImmData = nullptr;
173     Module *module = nullptr;
174 
175     typedef ze_result_t (KernelImp::*KernelArgHandler)(uint32_t argIndex, size_t argSize, const void *argVal);
176     std::vector<KernelImp::KernelArgHandler> kernelArgHandlers;
177     std::vector<NEO::GraphicsAllocation *> residencyContainer;
178 
179     NEO::GraphicsAllocation *printfBuffer = nullptr;
180 
181     uint32_t groupSize[3] = {0u, 0u, 0u};
182     uint32_t numThreadsPerThreadGroup = 1u;
183     uint32_t threadExecutionMask = 0u;
184 
185     std::unique_ptr<uint8_t[]> crossThreadData = nullptr;
186     uint32_t crossThreadDataSize = 0;
187 
188     std::unique_ptr<uint8_t[]> surfaceStateHeapData = nullptr;
189     uint32_t surfaceStateHeapDataSize = 0;
190 
191     std::unique_ptr<uint8_t[]> dynamicStateHeapData = nullptr;
192     uint32_t dynamicStateHeapDataSize = 0;
193 
194     uint8_t *perThreadDataForWholeThreadGroup = nullptr;
195     uint32_t perThreadDataSizeForWholeThreadGroupAllocated = 0;
196     uint32_t perThreadDataSizeForWholeThreadGroup = 0u;
197     uint32_t perThreadDataSize = 0u;
198 
199     UnifiedMemoryControls unifiedMemoryControls;
200     std::vector<uint32_t> slmArgSizes;
201     uint32_t slmArgsTotalSize = 0U;
202     uint32_t requiredWorkgroupOrder = 0u;
203 
204     bool kernelRequiresGenerationOfLocalIdsByRuntime = true;
205     uint32_t kernelRequiresUncachedMocsCount = false;
206     uint32_t kernelRequiresQueueUncachedMocsCount = false;
207     std::vector<bool> isArgUncached;
208 
209     uint32_t globalOffsets[3] = {};
210 
211     ze_cache_config_flags_t cacheConfigFlags = 0u;
212 
213     bool kernelHasIndirectAccess = true;
214 
215     uint32_t schedulingHintExpFlag = 0u;
216     std::unique_ptr<NEO::ImplicitArgs> pImplicitArgs;
217 };
218 
219 } // namespace L0
220