1/*
2 * Copyright (C) 2019-2021 Intel Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 */
7
8#include "shared/source/command_container/command_encoder.h"
9#include "shared/source/command_stream/stream_properties.h"
10
11#include "opencl/source/cl_device/cl_device.h"
12#include "opencl/source/device_queue/device_queue_hw_base.inl"
13#include "opencl/source/program/block_kernel_manager.h"
14
15namespace NEO {
16
17template <typename GfxFamily>
18size_t DeviceQueueHw<GfxFamily>::getMinimumSlbSize() {
19    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
20    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
21    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
22
23    return sizeof(MEDIA_STATE_FLUSH) +
24           sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) +
25           sizeof(PIPE_CONTROL) +
26           sizeof(GPGPU_WALKER) +
27           sizeof(MEDIA_STATE_FLUSH) +
28           sizeof(PIPE_CONTROL) +
29           DeviceQueueHw<GfxFamily>::getCSPrefetchSize();
30}
31
32template <typename GfxFamily>
33void DeviceQueueHw<GfxFamily>::buildSlbDummyCommands() {
34    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
35    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
36    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
37
38    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
39    auto slbEndOffset = igilCmdQueue->m_controls.m_SLBENDoffsetInBytes;
40    size_t commandsSize = getMinimumSlbSize() + getWaCommandsSize();
41    size_t numEnqueues = numberOfDeviceEnqueues;
42
43    // buildSlbDummyCommands is called from resetDeviceQueue() - reset slbCS each time
44    slbCS.replaceBuffer(slbBuffer->getUnderlyingBuffer(), slbBuffer->getUnderlyingBufferSize());
45
46    if (slbEndOffset >= 0) {
47        DEBUG_BREAK_IF(slbEndOffset % commandsSize != 0);
48        //We always overwrite at most one enqueue space with BB_START command pointing to cleanup section
49        //if SLBENDoffset is the at the end then BB_START added after scheduler did not corrupt anything so no need to regenerate
50        numEnqueues = (slbEndOffset == static_cast<int>(commandsSize)) ? 0 : 1;
51        slbCS.getSpace(slbEndOffset);
52    }
53
54    for (size_t i = 0; i < numEnqueues; i++) {
55        auto mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>();
56        *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush;
57
58        addArbCheckCmdWa();
59
60        addMiAtomicCmdWa((uint64_t)&igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder);
61
62        auto mediaIdLoadSpace = slbCS.getSpaceForCmd<MEDIA_INTERFACE_DESCRIPTOR_LOAD>();
63        auto mediaIdLoad = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
64        mediaIdLoad.setInterfaceDescriptorTotalLength(2048);
65
66        auto dataStartAddress = colorCalcStateSize;
67        mediaIdLoad.setInterfaceDescriptorDataStartAddress(dataStartAddress + sizeof(INTERFACE_DESCRIPTOR_DATA) * schedulerIDIndex);
68        *mediaIdLoadSpace = mediaIdLoad;
69
70        addLriCmdWa(true);
71
72        if (isProfilingEnabled()) {
73            addPipeControlCmdWa();
74            auto pipeControl = slbCS.getSpaceForCmd<PIPE_CONTROL>();
75            initPipeControl(pipeControl);
76
77        } else {
78            auto noop = slbCS.getSpace(sizeof(PIPE_CONTROL));
79            memset(noop, 0x0, sizeof(PIPE_CONTROL));
80            addPipeControlCmdWa(true);
81        }
82
83        auto gpgpuWalkerSpace = slbCS.getSpaceForCmd<GPGPU_WALKER>();
84        auto gpgpuWalker = GfxFamily::cmdInitGpgpuWalker;
85        gpgpuWalker.setSimdSize(GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD16);
86        gpgpuWalker.setThreadGroupIdXDimension(1);
87        gpgpuWalker.setThreadGroupIdYDimension(1);
88        gpgpuWalker.setThreadGroupIdZDimension(1);
89        gpgpuWalker.setRightExecutionMask(0xFFFFFFFF);
90        gpgpuWalker.setBottomExecutionMask(0xFFFFFFFF);
91        *gpgpuWalkerSpace = gpgpuWalker;
92
93        mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>();
94        *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush;
95
96        addArbCheckCmdWa();
97
98        addPipeControlCmdWa();
99
100        auto pipeControl2 = slbCS.getSpaceForCmd<PIPE_CONTROL>();
101        initPipeControl(pipeControl2);
102
103        addLriCmdWa(false);
104
105        auto prefetch = slbCS.getSpace(getCSPrefetchSize());
106        memset(prefetch, 0x0, getCSPrefetchSize());
107    }
108
109    // always the same BBStart position (after 128 enqueues)
110    auto bbStartOffset = (commandsSize * 128) - slbCS.getUsed();
111    slbCS.getSpace(bbStartOffset);
112
113    auto bbStartSpace = slbCS.getSpaceForCmd<MI_BATCH_BUFFER_START>();
114    auto bbStart = GfxFamily::cmdInitBatchBufferStart;
115    auto slbPtr = reinterpret_cast<uintptr_t>(slbBuffer->getUnderlyingBuffer());
116    bbStart.setBatchBufferStartAddress(slbPtr);
117    *bbStartSpace = bbStart;
118
119    igilCmdQueue->m_controls.m_CleanupSectionSize = 0;
120    igilQueue->m_controls.m_CleanupSectionAddress = 0;
121}
122
123template <typename GfxFamily>
124void DeviceQueueHw<GfxFamily>::addMediaStateClearCmds() {
125    typedef typename GfxFamily::MEDIA_VFE_STATE MEDIA_VFE_STATE;
126
127    addPipeControlCmdWa();
128
129    auto pipeControlSpace = slbCS.getSpaceForCmd<PIPE_CONTROL>();
130    auto pipeControl = GfxFamily::cmdInitPipeControl;
131    pipeControl.setGenericMediaStateClear(true);
132    pipeControl.setCommandStreamerStallEnable(true);
133    addDcFlushToPipeControlWa(&pipeControl);
134    *pipeControlSpace = pipeControl;
135
136    auto pVfeState = PreambleHelper<GfxFamily>::getSpaceForVfeState(&slbCS, device->getHardwareInfo(), EngineGroupType::RenderCompute);
137    StreamProperties emptyProperties{};
138    PreambleHelper<GfxFamily>::programVfeState(pVfeState, device->getHardwareInfo(), 0u, 0, device->getSharedDeviceInfo().maxFrontEndThreads, emptyProperties);
139}
140
141template <typename GfxFamily>
142size_t DeviceQueueHw<GfxFamily>::getMediaStateClearCmdsSize() {
143    using MEDIA_VFE_STATE = typename GfxFamily::MEDIA_VFE_STATE;
144    // PC with GenreicMediaStateClear + WA PC
145    size_t size = 2 * sizeof(PIPE_CONTROL);
146
147    // VFE state cmds
148    size += sizeof(PIPE_CONTROL);
149    size += sizeof(MEDIA_VFE_STATE);
150    return size;
151}
152
153template <typename GfxFamily>
154void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) {
155    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
156    void *pDSH = dynamicStateHeap.getCpuBase();
157    // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries.
158    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
159    igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2);
160
161    // Parent's dsh is located after ColorCalcState and 2 ID tables
162    igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp(static_cast<uint32_t>(parentKernel->getDynamicStateHeapSize()), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
163    igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize();
164
165    igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart;
166    igilCmdQueue->m_controls.m_ParentDSHOffset = offsetDsh;
167
168    uint32_t blockIndex = parentIDCount;
169
170    pDSH = ptrOffset(pDSH, colorCalcStateSize);
171
172    INTERFACE_DESCRIPTOR_DATA *pIDDestination = static_cast<INTERFACE_DESCRIPTOR_DATA *>(pDSH);
173
174    BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager();
175    uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
176
177    uint32_t maxBindingTableCount = 0;
178    uint32_t totalBlockSSHSize = 0;
179
180    igilCmdQueue->m_controls.m_StartBlockID = blockIndex;
181
182    for (uint32_t i = 0; i < blockCount; i++) {
183        const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
184
185        auto blockKernelStartPointer = getBlockKernelStartPointer(getDevice(), pBlockInfo, isCcsUsed);
186
187        auto bindingTableCount = static_cast<uint32_t>(pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.numEntries);
188        maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount);
189
190        totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
191
192        surfaceStateHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
193        auto btOffset = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(surfaceStateHeap, bindingTableCount,
194                                                                                        pBlockInfo->heapInfo.pSsh,
195                                                                                        pBlockInfo->heapInfo.SurfaceStateHeapSize,
196                                                                                        bindingTableCount,
197                                                                                        pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset);
198
199        parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast<uint32_t>(btOffset));
200
201        // Determine SIMD size
202        uint32_t simd = pBlockInfo->getMaxSimdSize();
203
204        uint32_t idOffset = pBlockInfo->kernelDescriptor.kernelMetadata.deviceSideEnqueueBlockInterfaceDescriptorOffset;
205        const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast<const INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset));
206
207        pIDDestination[blockIndex + i] = *pBlockID;
208        pIDDestination[blockIndex + i].setKernelStartPointerHigh(blockKernelStartPointer >> 32);
209        pIDDestination[blockIndex + i].setKernelStartPointer(static_cast<uint32_t>(blockKernelStartPointer));
210        pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
211        EncodeDispatchKernel<GfxFamily>::programBarrierEnable(pIDDestination[blockIndex + i],
212                                                              pBlockInfo->kernelDescriptor.kernelAttributes.barrierCount,
213                                                              device->getHardwareInfo());
214
215        // Set offset to sampler states, block's DHSOffset is added by scheduler
216        pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast<uint32_t>(pBlockInfo->getBorderColorStateSize()));
217
218        auto numChannels = pBlockInfo->kernelDescriptor.kernelAttributes.numLocalIdChannels;
219        auto grfSize = device->getDeviceInfo().grfSize;
220        auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
221        auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / grfSize);
222
223        // HW requires a minimum of 1 GRF of perThreadData for each thread in a thread group
224        // when sizeCrossThreadData != 0
225        numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
226        pIDDestination[blockIndex + i].setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
227    }
228
229    igilCmdQueue->m_controls.m_BTmaxSize = alignUp(maxBindingTableCount * (uint32_t)sizeof(BINDING_TABLE_STATE), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
230    igilCmdQueue->m_controls.m_BTbaseOffset = alignUp((uint32_t)surfaceStateHeap.getUsed(), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
231    igilCmdQueue->m_controls.m_CurrentSSHoffset = igilCmdQueue->m_controls.m_BTbaseOffset;
232}
233
234} // namespace NEO
235