1/*
2 * Copyright (C) 2019-2021 Intel Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 */
7
8#include "shared/source/command_container/command_encoder.h"
9#include "shared/source/command_stream/csr_definitions.h"
10#include "shared/source/command_stream/preemption.h"
11#include "shared/source/debug_settings/debug_settings_manager.h"
12#include "shared/source/helpers/address_patch.h"
13#include "shared/source/helpers/aligned_memory.h"
14#include "shared/source/helpers/basic_math.h"
15#include "shared/source/helpers/hw_helper.h"
16#include "shared/source/helpers/local_id_gen.h"
17#include "shared/source/helpers/ptr_math.h"
18#include "shared/source/helpers/string.h"
19#include "shared/source/indirect_heap/indirect_heap.h"
20
21#include "opencl/source/cl_device/cl_device.h"
22#include "opencl/source/context/context.h"
23#include "opencl/source/helpers/dispatch_info.h"
24#include "opencl/source/kernel/kernel.h"
25#include "opencl/source/program/block_kernel_manager.h"
26#include "opencl/source/scheduler/scheduler_kernel.h"
27
28#include <cstring>
29
30namespace NEO {
31
32template <typename GfxFamily>
33size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(const Kernel &kernel) {
34    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
35    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
36    const auto &samplerTable = kernel.getKernelInfo().kernelDescriptor.payloadMappings.samplerTable;
37
38    auto samplerCount = samplerTable.numSamplers;
39    auto totalSize = samplerCount
40                         ? alignUp(samplerCount * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE)
41                         : 0;
42
43    auto borderColorSize = samplerTable.borderColor;
44    borderColorSize = alignUp(borderColorSize + EncodeStates<GfxFamily>::alignIndirectStatePointer - 1,
45                              EncodeStates<GfxFamily>::alignIndirectStatePointer);
46
47    totalSize += borderColorSize + additionalSizeRequiredDsh();
48
49    DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.isVmeKernel()));
50
51    return alignUp(totalSize, EncodeStates<GfxFamily>::alignInterfaceDescriptorData);
52}
53
54template <typename GfxFamily>
55size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel,
56                                                             size_t localWorkSize) {
57    typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
58    const auto &kernelInfo = kernel.getKernelInfo();
59
60    auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
61    uint32_t grfSize = sizeof(typename GfxFamily::GRF);
62    auto size = kernel.getCrossThreadDataSize() +
63                getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize);
64
65    if (kernel.getImplicitArgs()) {
66        size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, localWorkSize), MemoryConstants::cacheLineSize);
67    }
68    return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
69}
70
71template <typename GfxFamily>
72size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredSSH(const Kernel &kernel) {
73    typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;
74    auto sizeSSH = kernel.getSurfaceStateHeapSize();
75    sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;
76    return sizeSSH;
77}
78
79template <typename SizeGetterT, typename... ArgsT>
80size_t getSizeRequired(const MultiDispatchInfo &multiDispatchInfo, SizeGetterT &&getSize, ArgsT... args) {
81    size_t totalSize = 0;
82    auto it = multiDispatchInfo.begin();
83    for (auto e = multiDispatchInfo.end(); it != e; ++it) {
84        totalSize = alignUp(totalSize, MemoryConstants::cacheLineSize);
85        totalSize += getSize(*it, std::forward<ArgsT>(args)...);
86    }
87    totalSize = alignUp(totalSize, MemoryConstants::pageSize);
88    return totalSize;
89}
90
91template <typename GfxFamily>
92size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(
93    const MultiDispatchInfo &multiDispatchInfo) {
94    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredDSH(*dispatchInfo.getKernel()); });
95}
96
97template <typename GfxFamily>
98size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
99    const MultiDispatchInfo &multiDispatchInfo) {
100    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(
101                                                                                         *dispatchInfo.getKernel(),
102                                                                                         Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); });
103}
104
105template <typename GfxFamily>
106size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
107    const MultiDispatchInfo &multiDispatchInfo) {
108    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); });
109}
110
111template <typename GfxFamily>
112size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kernel &kernel) {
113    typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;
114
115    size_t totalSize = 0;
116    BlockKernelManager *blockManager = kernel.getProgram()->getBlockKernelManager();
117    uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
118    uint32_t maxBindingTableCount = 0;
119
120    totalSize = BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE - 1;
121
122    for (uint32_t i = 0; i < blockCount; i++) {
123        const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
124        totalSize += pBlockInfo->heapInfo.SurfaceStateHeapSize;
125        totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
126
127        maxBindingTableCount = std::max(maxBindingTableCount, static_cast<uint32_t>(pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.numEntries));
128    }
129
130    SchedulerKernel &scheduler = kernel.getContext().getSchedulerKernel();
131
132    totalSize += getSizeRequiredSSH(scheduler);
133
134    totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;
135    totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
136
137    return totalSize;
138}
139
140template <typename GfxFamily>
141size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
142    const IndirectHeap &indirectHeap,
143    uint64_t offsetInterfaceDescriptor,
144    uint64_t kernelStartOffset,
145    size_t sizeCrossThreadData,
146    size_t sizePerThreadData,
147    size_t bindingTablePointer,
148    size_t offsetSamplerState,
149    uint32_t numSamplers,
150    uint32_t threadsPerThreadGroup,
151    const Kernel &kernel,
152    uint32_t bindingTablePrefetchSize,
153    PreemptionMode preemptionMode,
154    INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
155    const Device &device) {
156    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
157    using SHARED_LOCAL_MEMORY_SIZE = typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
158
159    const auto &hardwareInfo = device.getHardwareInfo();
160
161    // Allocate some memory for the interface descriptor
162    auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
163    auto interfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;
164
165    // Program the kernel start pointer
166    interfaceDescriptor.setKernelStartPointerHigh(kernelStartOffset >> 32);
167    interfaceDescriptor.setKernelStartPointer((uint32_t)kernelStartOffset);
168
169    // # of threads in thread group should be based on LWS.
170    interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
171
172    interfaceDescriptor.setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
173
174    auto slmTotalSize = kernel.getSlmTotalSize();
175
176    setGrfInfo(&interfaceDescriptor, kernel, sizeCrossThreadData, sizePerThreadData);
177    EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, hardwareInfo, threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone);
178
179    interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
180
181    interfaceDescriptor.setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));
182
183    EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize);
184
185    auto programmableIDSLMSize =
186        static_cast<SHARED_LOCAL_MEMORY_SIZE>(HwHelperHw<GfxFamily>::get().computeSlmValues(hardwareInfo, slmTotalSize));
187
188    if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) {
189        programmableIDSLMSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get());
190    }
191
192    interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize);
193    EncodeDispatchKernel<GfxFamily>::programBarrierEnable(interfaceDescriptor,
194                                                          kernel.getKernelInfo().kernelDescriptor.kernelAttributes.barrierCount,
195                                                          hardwareInfo);
196
197    PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(&interfaceDescriptor, preemptionMode);
198    EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, hardwareInfo);
199
200    *pInterfaceDescriptor = interfaceDescriptor;
201    return (size_t)offsetInterfaceDescriptor;
202}
203
204template <typename GfxFamily>
205size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
206    LinearStream &commandStream,
207    IndirectHeap &dsh,
208    IndirectHeap &ioh,
209    IndirectHeap &ssh,
210    Kernel &kernel,
211    uint64_t kernelStartOffset,
212    uint32_t simd,
213    const size_t localWorkSize[3],
214    const uint64_t offsetInterfaceDescriptorTable,
215    uint32_t &interfaceDescriptorIndex,
216    PreemptionMode preemptionMode,
217    WALKER_TYPE *walkerCmd,
218    INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
219    bool localIdsGenerationByRuntime,
220    const Device &device) {
221
222    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
223
224    auto rootDeviceIndex = device.getRootDeviceIndex();
225
226    DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
227    auto inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
228
229    // Copy the kernel over to the ISH
230    const auto &kernelInfo = kernel.getKernelInfo();
231
232    ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
233
234    auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries,
235                                                                                                  kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(),
236                                                                                                  kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
237
238    // Copy our sampler state if it exists
239    const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
240    uint32_t samplerCount = 0;
241    uint32_t samplerStateOffset = 0;
242    if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) {
243        samplerCount = samplerTable.numSamplers;
244        samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset,
245                                                                       samplerCount, samplerTable.borderColor,
246                                                                       kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
247                                                                       device.getHardwareInfo());
248    }
249
250    auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
251    auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
252    auto numChannels = static_cast<uint32_t>(kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels);
253
254    auto pImplicitArgs = kernel.getImplicitArgs();
255    if (pImplicitArgs) {
256        constexpr uint32_t grfSize = sizeof(typename GfxFamily::GRF);
257        auto offsetLocalIds = sendPerThreadData(
258            ioh,
259            simd,
260            grfSize,
261            3u, // all channels for implicit args
262            std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSize[0]), static_cast<uint16_t>(localWorkSize[1]), static_cast<uint16_t>(localWorkSize[2])}},
263            {{kernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
264              kernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
265              kernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
266            kernel.usesOnlyImages());
267
268        pImplicitArgs->localIdTablePtr = offsetLocalIds + ioh.getGraphicsAllocation()->getGpuAddress();
269    }
270
271    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
272
273    size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
274        ioh, kernel, inlineDataProgrammingRequired,
275        walkerCmd, sizeCrossThreadData);
276
277    size_t sizePerThreadDataTotal = 0;
278    size_t sizePerThreadData = 0;
279
280    HardwareCommandsHelper<GfxFamily>::programPerThreadData(
281        sizePerThreadData,
282        localIdsGenerationByRuntime,
283        ioh,
284        simd,
285        numChannels,
286        localWorkSize,
287        kernel,
288        sizePerThreadDataTotal,
289        localWorkItems,
290        rootDeviceIndex);
291
292    uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
293
294    auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
295    if (resetBindingTablePrefetch(kernel)) {
296        bindingTablePrefetchSize = 0;
297    }
298
299    HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
300        dsh,
301        offsetInterfaceDescriptor,
302        kernelStartOffset,
303        sizeCrossThreadData,
304        sizePerThreadData,
305        dstBindingTablePointer,
306        samplerStateOffset,
307        samplerCount,
308        threadsPerThreadGroup,
309        kernel,
310        bindingTablePrefetchSize,
311        preemptionMode,
312        inlineInterfaceDescriptor,
313        device);
314
315    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
316        PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap, dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor, PatchInfoAllocationType::DynamicStateHeap);
317        kernel.getPatchInfoDataList().push_back(patchInfoData);
318    }
319
320    // Program media state flush to set interface descriptor offset
321    sendMediaStateFlush(
322        commandStream,
323        interfaceDescriptorIndex);
324
325    DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
326    walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
327    setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
328
329    auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
330                                      WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
331    walkerCmd->setIndirectDataLength(indirectDataLength);
332
333    return offsetCrossThreadData;
334}
335
336template <typename GfxFamily>
337void HardwareCommandsHelper<GfxFamily>::updatePerThreadDataTotal(
338    size_t &sizePerThreadData,
339    uint32_t &simd,
340    uint32_t &numChannels,
341    size_t &sizePerThreadDataTotal,
342    size_t &localWorkItems) {
343    uint32_t grfSize = sizeof(typename GfxFamily::GRF);
344    sizePerThreadData = getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
345
346    uint32_t localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, grfSize, numChannels);
347    localIdSizePerThread = std::max(localIdSizePerThread, grfSize);
348
349    sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread;
350    DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
351}
352
353template <typename GfxFamily>
354bool HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
355    auto checkKernelForInlineData = true;
356    if (DebugManager.flags.EnablePassInlineData.get() != -1) {
357        checkKernelForInlineData = !!DebugManager.flags.EnablePassInlineData.get();
358    }
359    if (checkKernelForInlineData) {
360        return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.passInlineData;
361    }
362    return false;
363}
364
365template <typename GfxFamily>
366bool HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel) {
367    return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.numLocalIdChannels > 0;
368}
369
370} // namespace NEO
371