/*
 * Copyright (C) 2019-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/helpers/string.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/utilities/tag_allocator.h"

#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/device_queue/device_queue_hw.h"
#include "opencl/source/helpers/hardware_commands_helper.h"

namespace NEO {

// Allocates the SLB buffer that holds the commands for device-side enqueues.
// Size = (minimal per-enqueue commands + workaround commands) * 128 enqueues,
// plus a batch-buffer-start, the execution-model cleanup section, and a
// 4-page pad, page-aligned at each step.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::allocateSlbBuffer() {
    // Per-enqueue footprint: minimal SLB commands plus workaround commands.
    auto slbSize = getMinimumSlbSize() + getWaCommandsSize();
    slbSize *= 128; //num of enqueues
    slbSize += sizeof(MI_BATCH_BUFFER_START);
    slbSize = alignUp(slbSize, MemoryConstants::pageSize);
    // Reserve room for the cleanup section appended by addExecutionModelCleanUpSection.
    slbSize += DeviceQueueHw<GfxFamily>::getExecutionModelCleanupSectionSize();
    slbSize += (4 * MemoryConstants::pageSize); // +4 pages spec restriction
    slbSize = alignUp(slbSize, MemoryConstants::pageSize);

    slbBuffer = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), slbSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, device->getDeviceBitfield()});
}

// Re-initializes the device queue to its post-creation state: zeroes the event
// pool, resets every IGIL command-queue control field, writes the stack
// sentinel, rebuilds the SLB dummy commands and rewinds the dynamic state heap.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::resetDeviceQueue() {
    auto &caps = device->getDeviceInfo();
    auto igilEventPool = reinterpret_cast<IGIL_EventPool *>(eventPoolBuffer->getUnderlyingBuffer());

    // Clear the whole event pool, then set its header fields.
    memset(eventPoolBuffer->getUnderlyingBuffer(), 0x0, eventPoolBuffer->getUnderlyingBufferSize());
    igilEventPool->m_TimestampResolution = static_cast<float>(device->getProfilingTimerResolution());
    igilEventPool->m_size = caps.maxOnDeviceEvents;

    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
    igilQueue = igilCmdQueue;

    // Stack size/top are counted in cl_uint elements; the initial top is the
    // index of the buffer's last element (element count - 1).
    igilCmdQueue->m_controls.m_StackSize =
        static_cast<uint32_t>((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    igilCmdQueue->m_controls.m_StackTop =
        static_cast<uint32_t>((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    igilCmdQueue->m_controls.m_PreviousHead = IGIL_DEVICE_QUEUE_HEAD_INIT;
    igilCmdQueue->m_controls.m_IDTAfterFirstPhase = 1;
    igilCmdQueue->m_controls.m_CurrentIDToffset = 1;
    igilCmdQueue->m_controls.m_PreviousStorageTop = static_cast<uint32_t>(queueStorageBuffer->getUnderlyingBufferSize());
    igilCmdQueue->m_controls.m_PreviousStackTop =
        static_cast<uint32_t>((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    igilCmdQueue->m_controls.m_DebugNextBlockID = 0xFFFFFFFF;
    igilCmdQueue->m_controls.m_QstorageSize = static_cast<uint32_t>(queueStorageBuffer->getUnderlyingBufferSize());
    igilCmdQueue->m_controls.m_QstorageTop = static_cast<uint32_t>(queueStorageBuffer->getUnderlyingBufferSize());
    igilCmdQueue->m_controls.m_IsProfilingEnabled = static_cast<uint32_t>(isProfilingEnabled());
    igilCmdQueue->m_controls.m_IsSimulation = static_cast<uint32_t>(device->isSimulation());

    // Counters and bookkeeping fields start at zero.
    igilCmdQueue->m_controls.m_LastScheduleEventNumber = 0;
    igilCmdQueue->m_controls.m_PreviousNumberOfQueues = 0;
    igilCmdQueue->m_controls.m_EnqueueMarkerScheduled = 0;
    igilCmdQueue->m_controls.m_SecondLevelBatchOffset = 0;
    igilCmdQueue->m_controls.m_TotalNumberOfQueues = 0;
    igilCmdQueue->m_controls.m_EventTimestampAddress = 0;
    igilCmdQueue->m_controls.m_ErrorCode = 0;
    igilCmdQueue->m_controls.m_CurrentScheduleEventNumber = 0;
    igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder = 0x00;
    igilCmdQueue->m_controls.m_DebugNextBlockGWS = 0;

    // set first stack element in surface at value "1", it protects Scheduler in corner case when StackTop is empty after Child execution
    auto stack = static_cast<uint32_t *>(stackBuffer->getUnderlyingBuffer());
    stack += ((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    *stack = 1;

    igilCmdQueue->m_head = IGIL_DEVICE_QUEUE_HEAD_INIT;
    // Usable queue payload excludes the IGIL_CommandQueue header itself.
    igilCmdQueue->m_size = static_cast<uint32_t>(queueBuffer->getUnderlyingBufferSize() - sizeof(IGIL_CommandQueue));
    igilCmdQueue->m_magic = IGIL_MAGIC_NUMBER;

    // Optional scheduler-simulation early return, driven by a debug flag.
    igilCmdQueue->m_controls.m_SchedulerEarlyReturn = DebugManager.flags.SchedulerSimulationReturnInstance.get();
    igilCmdQueue->m_controls.m_SchedulerEarlyReturnCounter = 0;

    buildSlbDummyCommands();

    igilCmdQueue->m_controls.m_SLBENDoffsetInBytes = -1;

    igilCmdQueue->m_controls.m_CriticalSection = ExecutionModelCriticalSection::Free;

    resetDSH();
}

// Fills *pc with the PIPE_CONTROL programming used by the device queue:
// CS stall plus state/texture cache invalidation, DC flush and pipe-control
// flush enable.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::initPipeControl(PIPE_CONTROL *pc) {
    auto cmd = GfxFamily::cmdInitPipeControl;
    cmd.setStateCacheInvalidationEnable(0x1);
    cmd.setDcFlushEnable(true);
    cmd.setPipeControlFlushEnable(true);
    cmd.setTextureCacheInvalidationEnable(true);
    cmd.setCommandStreamerStallEnable(true);

    *pc = cmd;
}

// Appends the execution-model cleanup section to the SLB stream: page-aligns
// the stream, publishes the section's GPU address to the IGIL controls, emits
// the LSQC ROPERF workaround and, when a HW timestamp node is supplied, the
// profiling-end commands followed by an LRI that re-enables preemption; it
// then adds the post-sync pipe controls, media state clear, batch-buffer-end
// and records the section size.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) {
    // CleanUp Section
    auto offset = slbCS.getUsed();
    auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset;
    slbCS.getSpace(alignmentSize);
    offset = slbCS.getUsed();

    // Publish where the cleanup section starts so the scheduler can jump to it.
    igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed());
    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true);

    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    if (hwTimeStamp != nullptr) {
        // Publish the context-complete timestamp address for profiling.
        uint64_t timeStampAddress = hwTimeStamp->getGpuAddress() + offsetof(HwTimeStamps, ContextCompleteTS);
        igilQueue->m_controls.m_EventTimestampAddress = timeStampAddress;

        addProfilingEndCmds(timeStampAddress);

        //enable preemption
        addLriCmd(false);
    }

    // Release the scheduler critical section: PIPE_CONTROL with a post-sync
    // immediate write of ExecutionModelCriticalSection::Free into the IGIL
    // m_CriticalSection field.
    uint64_t criticalSectionAddress = (uint64_t)&igilQueue->m_controls.m_CriticalSection;
    PipeControlArgs args;
    MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
        slbCS,
        PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
        criticalSectionAddress,
        ExecutionModelCriticalSection::Free,
        device->getHardwareInfo(),
        args);

    // Signal completion to the host: post-sync write of taskCount at tagAddress.
    MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
        slbCS,
        PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
        tagAddress,
        taskCount,
        device->getHardwareInfo(),
        args);

    addMediaStateClearCmds();

    auto pBBE = slbCS.getSpaceForCmd<MI_BATCH_BUFFER_END>();
    *pBBE = GfxFamily::cmdInitBatchBufferEnd;

    // Record how many bytes the cleanup section occupies, from the aligned start.
    igilQueue->m_controls.m_CleanupSectionSize = (uint32_t)(slbCS.getUsed() - offset);
}

// Rewinds the dynamic state heap (if created) to empty and re-reserves the
// color-calc state area at its start.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::resetDSH() {
    if (heaps[IndirectHeap::DYNAMIC_STATE]) {
        heaps[IndirectHeap::DYNAMIC_STATE]->replaceBuffer(heaps[IndirectHeap::DYNAMIC_STATE]->getCpuBase(), heaps[IndirectHeap::DYNAMIC_STATE]->getMaxAvailableSpace());
        heaps[IndirectHeap::DYNAMIC_STATE]->getSpace(colorCalcStateSize);
    }
}

// Lazily creates (on first use) and returns the queue's indirect heap over
// dshBuffer. Only DYNAMIC_STATE is supported; any other type is fatal.
template <typename GfxFamily>
IndirectHeap *DeviceQueueHw<GfxFamily>::getIndirectHeap(IndirectHeap::Type type) {
    UNRECOVERABLE_IF(type != IndirectHeap::DYNAMIC_STATE);

    if (!heaps[type]) {
        heaps[type] = new IndirectHeap(dshBuffer);
        // get space for colorCalc and 2 ID tables at the beginning
        heaps[type]->getSpace(colorCalcStateSize);
    }
    return heaps[type];
}

// Computes the DSH offset at which the scheduler's cross-thread data (curbe)
// is placed - at the end of the DSH buffer minus one page of padding - and
// publishes the offset and curbe size to the IGIL queue controls.
template <typename GfxFamily>
size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &scheduler) {
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    size_t offset = dshBuffer->getUnderlyingBufferSize() - scheduler.getCurbeSize() - 4096; // Page size padding

    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
    igilCmdQueue->m_controls.m_SchedulerDSHOffset = (uint32_t)offset;
    igilCmdQueue->m_controls.m_SchedulerConstantBufferSize = (uint32_t)scheduler.getCurbeSize();

    return offset;
}

// Thin wrapper: dispatches the scheduler kernel through GpgpuWalkerHelper.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) {
    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(commandStream,
                                                    *this,
                                                    preemptionMode,
                                                    scheduler,
                                                    ssh,
                                                    dsh,
                                                    isCcsUsed);
    return;
}

// Command-streamer prefetch cushion, in bytes.
template <typename GfxFamily>
size_t DeviceQueueHw<GfxFamily>::getCSPrefetchSize() {
    return 512;
}

// Emits an LRI to the CTXT_PREMP_DBG register (0x2248) into the SLB stream:
// 0x100 (bit 8, "Preempt On MI_ARB_CHK Only") when setArbCheck is true,
// otherwise 0.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::addLriCmd(bool setArbCheck) {
    // CTXT_PREMP_DBG offset
    constexpr uint32_t registerAddress = 0x2248u;
    uint32_t value = 0u;
    if (setArbCheck) {
        // set only bit 8 (Preempt On MI_ARB_CHK Only)
        value = 0x00000100;
    }

    LriHelper<GfxFamily>::program(&slbCS,
                                  registerAddress,
                                  value,
                                  false);
}

// Worst-case byte size of the cleanup section built by
// addExecutionModelCleanUpSection; must stay in sync with the commands emitted
// there (LSQC workaround, profiling end, media state clear, pipe controls,
// batch buffer end).
template <typename GfxFamily>
size_t DeviceQueueHw<GfxFamily>::getExecutionModelCleanupSectionSize() {
    size_t totalSize = 0;
    // LSQC ROPERF workaround footprint: pipe controls, register moves and MI_MATH.
    totalSize += sizeof(PIPE_CONTROL) +
                 2 * sizeof(MI_LOAD_REGISTER_REG) +
                 sizeof(MI_LOAD_REGISTER_IMM) +
                 sizeof(PIPE_CONTROL) +
                 sizeof(MI_MATH) +
                 NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);

    totalSize += getProfilingEndCmdsSize();
    totalSize += getMediaStateClearCmdsSize();

    // Post-sync pipe controls emitted at the end of the cleanup section.
    totalSize += 4 * sizeof(PIPE_CONTROL);
    totalSize += sizeof(MI_BATCH_BUFFER_END);
    return totalSize;
}

// Byte size of the profiling-end command sequence:
// PIPE_CONTROL + MI_STORE_REGISTER_MEM + MI_LOAD_REGISTER_IMM.
template <typename GfxFamily>
size_t DeviceQueueHw<GfxFamily>::getProfilingEndCmdsSize() {
    size_t size = 0;
    size += sizeof(PIPE_CONTROL) + sizeof(MI_STORE_REGISTER_MEM);
    size += sizeof(MI_LOAD_REGISTER_IMM);
    return size;
}

// Intentionally empty default: no DC-flush workaround at this level.
// NOTE(review): presumably overridden/specialized per GFX family where the WA
// is required - confirm against the family-specific sources.
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::addDcFlushToPipeControlWa(PIPE_CONTROL *pc) {}

// Returns the patchable GPU start address of a block kernel's ISA allocation.
// When dispatching on CCS and the hardware requires it, the returned pointer
// is advanced by the kernel's skipSetFFIDGP entry-point offset.
template <typename GfxFamily>
uint64_t DeviceQueueHw<GfxFamily>::getBlockKernelStartPointer(const Device &device, const KernelInfo *blockInfo, bool isCcsUsed) {
    auto blockAllocation = blockInfo->getGraphicsAllocation();
    DEBUG_BREAK_IF(!blockAllocation);

    // Fall back to 0 if the allocation is missing (debug builds break above).
    auto blockKernelStartPointer = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;

    auto &hardwareInfo = device.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);

    if (blockAllocation && isCcsUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
        blockKernelStartPointer += blockInfo->kernelDescriptor.entryPoints.skipSetFFIDGP;
    }
    return blockKernelStartPointer;
}

} // namespace NEO