1/*
2 * Copyright (C) 2019-2021 Intel Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 */
7
8#pragma once
9#include "shared/source/helpers/hw_helper.h"
10#include "shared/source/helpers/pipe_control_args.h"
11#include "shared/source/helpers/preamble.h"
12#include "shared/source/helpers/string.h"
13#include "shared/source/memory_manager/memory_manager.h"
14#include "shared/source/utilities/tag_allocator.h"
15
16#include "opencl/source/command_queue/gpgpu_walker.h"
17#include "opencl/source/device_queue/device_queue_hw.h"
18#include "opencl/source/helpers/hardware_commands_helper.h"
19
20namespace NEO {
21template <typename GfxFamily>
22void DeviceQueueHw<GfxFamily>::allocateSlbBuffer() {
23    auto slbSize = getMinimumSlbSize() + getWaCommandsSize();
24    slbSize *= 128; //num of enqueues
25    slbSize += sizeof(MI_BATCH_BUFFER_START);
26    slbSize = alignUp(slbSize, MemoryConstants::pageSize);
27    slbSize += DeviceQueueHw<GfxFamily>::getExecutionModelCleanupSectionSize();
28    slbSize += (4 * MemoryConstants::pageSize); // +4 pages spec restriction
29    slbSize = alignUp(slbSize, MemoryConstants::pageSize);
30
31    slbBuffer = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), slbSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, device->getDeviceBitfield()});
32}
33
34template <typename GfxFamily>
35void DeviceQueueHw<GfxFamily>::resetDeviceQueue() {
36    auto &caps = device->getDeviceInfo();
37    auto igilEventPool = reinterpret_cast<IGIL_EventPool *>(eventPoolBuffer->getUnderlyingBuffer());
38
39    memset(eventPoolBuffer->getUnderlyingBuffer(), 0x0, eventPoolBuffer->getUnderlyingBufferSize());
40    igilEventPool->m_TimestampResolution = static_cast<float>(device->getProfilingTimerResolution());
41    igilEventPool->m_size = caps.maxOnDeviceEvents;
42
43    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
44    igilQueue = igilCmdQueue;
45
46    igilCmdQueue->m_controls.m_StackSize =
47        static_cast<uint32_t>((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
48    igilCmdQueue->m_controls.m_StackTop =
49        static_cast<uint32_t>((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
50    igilCmdQueue->m_controls.m_PreviousHead = IGIL_DEVICE_QUEUE_HEAD_INIT;
51    igilCmdQueue->m_controls.m_IDTAfterFirstPhase = 1;
52    igilCmdQueue->m_controls.m_CurrentIDToffset = 1;
53    igilCmdQueue->m_controls.m_PreviousStorageTop = static_cast<uint32_t>(queueStorageBuffer->getUnderlyingBufferSize());
54    igilCmdQueue->m_controls.m_PreviousStackTop =
55        static_cast<uint32_t>((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
56    igilCmdQueue->m_controls.m_DebugNextBlockID = 0xFFFFFFFF;
57    igilCmdQueue->m_controls.m_QstorageSize = static_cast<uint32_t>(queueStorageBuffer->getUnderlyingBufferSize());
58    igilCmdQueue->m_controls.m_QstorageTop = static_cast<uint32_t>(queueStorageBuffer->getUnderlyingBufferSize());
59    igilCmdQueue->m_controls.m_IsProfilingEnabled = static_cast<uint32_t>(isProfilingEnabled());
60    igilCmdQueue->m_controls.m_IsSimulation = static_cast<uint32_t>(device->isSimulation());
61
62    igilCmdQueue->m_controls.m_LastScheduleEventNumber = 0;
63    igilCmdQueue->m_controls.m_PreviousNumberOfQueues = 0;
64    igilCmdQueue->m_controls.m_EnqueueMarkerScheduled = 0;
65    igilCmdQueue->m_controls.m_SecondLevelBatchOffset = 0;
66    igilCmdQueue->m_controls.m_TotalNumberOfQueues = 0;
67    igilCmdQueue->m_controls.m_EventTimestampAddress = 0;
68    igilCmdQueue->m_controls.m_ErrorCode = 0;
69    igilCmdQueue->m_controls.m_CurrentScheduleEventNumber = 0;
70    igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder = 0x00;
71    igilCmdQueue->m_controls.m_DebugNextBlockGWS = 0;
72
73    // set first stack element in surface at value "1", it protects Scheduler in corner case when StackTop is empty after Child execution
74    auto stack = static_cast<uint32_t *>(stackBuffer->getUnderlyingBuffer());
75    stack += ((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
76    *stack = 1;
77
78    igilCmdQueue->m_head = IGIL_DEVICE_QUEUE_HEAD_INIT;
79    igilCmdQueue->m_size = static_cast<uint32_t>(queueBuffer->getUnderlyingBufferSize() - sizeof(IGIL_CommandQueue));
80    igilCmdQueue->m_magic = IGIL_MAGIC_NUMBER;
81
82    igilCmdQueue->m_controls.m_SchedulerEarlyReturn = DebugManager.flags.SchedulerSimulationReturnInstance.get();
83    igilCmdQueue->m_controls.m_SchedulerEarlyReturnCounter = 0;
84
85    buildSlbDummyCommands();
86
87    igilCmdQueue->m_controls.m_SLBENDoffsetInBytes = -1;
88
89    igilCmdQueue->m_controls.m_CriticalSection = ExecutionModelCriticalSection::Free;
90
91    resetDSH();
92}
93
94template <typename GfxFamily>
95void DeviceQueueHw<GfxFamily>::initPipeControl(PIPE_CONTROL *pc) {
96    auto cmd = GfxFamily::cmdInitPipeControl;
97    cmd.setStateCacheInvalidationEnable(0x1);
98    cmd.setDcFlushEnable(true);
99    cmd.setPipeControlFlushEnable(true);
100    cmd.setTextureCacheInvalidationEnable(true);
101    cmd.setCommandStreamerStallEnable(true);
102
103    *pc = cmd;
104}
105
106template <typename GfxFamily>
107void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) {
108    // CleanUp Section
109    auto offset = slbCS.getUsed();
110    auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset;
111    slbCS.getSpace(alignmentSize);
112    offset = slbCS.getUsed();
113
114    igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed());
115    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true);
116
117    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
118
119    if (hwTimeStamp != nullptr) {
120        uint64_t timeStampAddress = hwTimeStamp->getGpuAddress() + offsetof(HwTimeStamps, ContextCompleteTS);
121        igilQueue->m_controls.m_EventTimestampAddress = timeStampAddress;
122
123        addProfilingEndCmds(timeStampAddress);
124
125        //enable preemption
126        addLriCmd(false);
127    }
128
129    uint64_t criticalSectionAddress = (uint64_t)&igilQueue->m_controls.m_CriticalSection;
130    PipeControlArgs args;
131    MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
132        slbCS,
133        PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
134        criticalSectionAddress,
135        ExecutionModelCriticalSection::Free,
136        device->getHardwareInfo(),
137        args);
138
139    MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
140        slbCS,
141        PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
142        tagAddress,
143        taskCount,
144        device->getHardwareInfo(),
145        args);
146
147    addMediaStateClearCmds();
148
149    auto pBBE = slbCS.getSpaceForCmd<MI_BATCH_BUFFER_END>();
150    *pBBE = GfxFamily::cmdInitBatchBufferEnd;
151
152    igilQueue->m_controls.m_CleanupSectionSize = (uint32_t)(slbCS.getUsed() - offset);
153}
154
155template <typename GfxFamily>
156void DeviceQueueHw<GfxFamily>::resetDSH() {
157    if (heaps[IndirectHeap::DYNAMIC_STATE]) {
158        heaps[IndirectHeap::DYNAMIC_STATE]->replaceBuffer(heaps[IndirectHeap::DYNAMIC_STATE]->getCpuBase(), heaps[IndirectHeap::DYNAMIC_STATE]->getMaxAvailableSpace());
159        heaps[IndirectHeap::DYNAMIC_STATE]->getSpace(colorCalcStateSize);
160    }
161}
162
163template <typename GfxFamily>
164IndirectHeap *DeviceQueueHw<GfxFamily>::getIndirectHeap(IndirectHeap::Type type) {
165    UNRECOVERABLE_IF(type != IndirectHeap::DYNAMIC_STATE);
166
167    if (!heaps[type]) {
168        heaps[type] = new IndirectHeap(dshBuffer);
169        // get space for colorCalc and 2 ID tables at the beginning
170        heaps[type]->getSpace(colorCalcStateSize);
171    }
172    return heaps[type];
173}
174
175template <typename GfxFamily>
176size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &scheduler) {
177    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
178    size_t offset = dshBuffer->getUnderlyingBufferSize() - scheduler.getCurbeSize() - 4096; // Page size padding
179
180    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
181    igilCmdQueue->m_controls.m_SchedulerDSHOffset = (uint32_t)offset;
182    igilCmdQueue->m_controls.m_SchedulerConstantBufferSize = (uint32_t)scheduler.getCurbeSize();
183
184    return offset;
185}
186
187template <typename GfxFamily>
188void DeviceQueueHw<GfxFamily>::dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) {
189    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(commandStream,
190                                                    *this,
191                                                    preemptionMode,
192                                                    scheduler,
193                                                    ssh,
194                                                    dsh,
195                                                    isCcsUsed);
196    return;
197}
198
199template <typename GfxFamily>
200size_t DeviceQueueHw<GfxFamily>::getCSPrefetchSize() {
201    return 512;
202}
203
204template <typename GfxFamily>
205void DeviceQueueHw<GfxFamily>::addLriCmd(bool setArbCheck) {
206    // CTXT_PREMP_DBG offset
207    constexpr uint32_t registerAddress = 0x2248u;
208    uint32_t value = 0u;
209    if (setArbCheck) {
210        // set only bit 8 (Preempt On MI_ARB_CHK Only)
211        value = 0x00000100;
212    }
213
214    LriHelper<GfxFamily>::program(&slbCS,
215                                  registerAddress,
216                                  value,
217                                  false);
218}
219
220template <typename GfxFamily>
221size_t DeviceQueueHw<GfxFamily>::getExecutionModelCleanupSectionSize() {
222    size_t totalSize = 0;
223    totalSize += sizeof(PIPE_CONTROL) +
224                 2 * sizeof(MI_LOAD_REGISTER_REG) +
225                 sizeof(MI_LOAD_REGISTER_IMM) +
226                 sizeof(PIPE_CONTROL) +
227                 sizeof(MI_MATH) +
228                 NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);
229
230    totalSize += getProfilingEndCmdsSize();
231    totalSize += getMediaStateClearCmdsSize();
232
233    totalSize += 4 * sizeof(PIPE_CONTROL);
234    totalSize += sizeof(MI_BATCH_BUFFER_END);
235    return totalSize;
236}
237
238template <typename GfxFamily>
239size_t DeviceQueueHw<GfxFamily>::getProfilingEndCmdsSize() {
240    size_t size = 0;
241    size += sizeof(PIPE_CONTROL) + sizeof(MI_STORE_REGISTER_MEM);
242    size += sizeof(MI_LOAD_REGISTER_IMM);
243    return size;
244}
245
246template <typename GfxFamily>
247void DeviceQueueHw<GfxFamily>::addDcFlushToPipeControlWa(PIPE_CONTROL *pc) {}
248
249template <typename GfxFamily>
250uint64_t DeviceQueueHw<GfxFamily>::getBlockKernelStartPointer(const Device &device, const KernelInfo *blockInfo, bool isCcsUsed) {
251    auto blockAllocation = blockInfo->getGraphicsAllocation();
252    DEBUG_BREAK_IF(!blockAllocation);
253
254    auto blockKernelStartPointer = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;
255
256    auto &hardwareInfo = device.getHardwareInfo();
257    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
258
259    if (blockAllocation && isCcsUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
260        blockKernelStartPointer += blockInfo->kernelDescriptor.entryPoints.skipSetFFIDGP;
261    }
262    return blockKernelStartPointer;
263}
264
265} // namespace NEO
266