1/* 2 * Copyright (C) 2019-2021 Intel Corporation 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 */ 7 8#include "shared/source/command_container/command_encoder.h" 9#include "shared/source/command_stream/stream_properties.h" 10 11#include "opencl/source/cl_device/cl_device.h" 12#include "opencl/source/device_queue/device_queue_hw_base.inl" 13#include "opencl/source/program/block_kernel_manager.h" 14 15namespace NEO { 16 17template <typename GfxFamily> 18size_t DeviceQueueHw<GfxFamily>::getMinimumSlbSize() { 19 using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; 20 using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; 21 using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; 22 23 return sizeof(MEDIA_STATE_FLUSH) + 24 sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + 25 sizeof(PIPE_CONTROL) + 26 sizeof(GPGPU_WALKER) + 27 sizeof(MEDIA_STATE_FLUSH) + 28 sizeof(PIPE_CONTROL) + 29 DeviceQueueHw<GfxFamily>::getCSPrefetchSize(); 30} 31 32template <typename GfxFamily> 33void DeviceQueueHw<GfxFamily>::buildSlbDummyCommands() { 34 using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; 35 using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; 36 using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; 37 38 auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer()); 39 auto slbEndOffset = igilCmdQueue->m_controls.m_SLBENDoffsetInBytes; 40 size_t commandsSize = getMinimumSlbSize() + getWaCommandsSize(); 41 size_t numEnqueues = numberOfDeviceEnqueues; 42 43 // buildSlbDummyCommands is called from resetDeviceQueue() - reset slbCS each time 44 slbCS.replaceBuffer(slbBuffer->getUnderlyingBuffer(), slbBuffer->getUnderlyingBufferSize()); 45 46 if (slbEndOffset >= 0) { 47 DEBUG_BREAK_IF(slbEndOffset % commandsSize != 0); 48 //We always overwrite at most one enqueue space with BB_START command pointing to cleanup section 49 //if SLBENDoffset is the at the end then BB_START added after scheduler did not corrupt anything so no need to regenerate 50 numEnqueues = (slbEndOffset == static_cast<int>(commandsSize)) ? 0 : 1; 51 slbCS.getSpace(slbEndOffset); 52 } 53 54 for (size_t i = 0; i < numEnqueues; i++) { 55 auto mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>(); 56 *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush; 57 58 addArbCheckCmdWa(); 59 60 addMiAtomicCmdWa((uint64_t)&igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder); 61 62 auto mediaIdLoadSpace = slbCS.getSpaceForCmd<MEDIA_INTERFACE_DESCRIPTOR_LOAD>(); 63 auto mediaIdLoad = GfxFamily::cmdInitMediaInterfaceDescriptorLoad; 64 mediaIdLoad.setInterfaceDescriptorTotalLength(2048); 65 66 auto dataStartAddress = colorCalcStateSize; 67 mediaIdLoad.setInterfaceDescriptorDataStartAddress(dataStartAddress + sizeof(INTERFACE_DESCRIPTOR_DATA) * schedulerIDIndex); 68 *mediaIdLoadSpace = mediaIdLoad; 69 70 addLriCmdWa(true); 71 72 if (isProfilingEnabled()) { 73 addPipeControlCmdWa(); 74 auto pipeControl = slbCS.getSpaceForCmd<PIPE_CONTROL>(); 75 initPipeControl(pipeControl); 76 77 } else { 78 auto noop = slbCS.getSpace(sizeof(PIPE_CONTROL)); 79 memset(noop, 0x0, sizeof(PIPE_CONTROL)); 80 addPipeControlCmdWa(true); 81 } 82 83 auto gpgpuWalkerSpace = slbCS.getSpaceForCmd<GPGPU_WALKER>(); 84 auto gpgpuWalker = GfxFamily::cmdInitGpgpuWalker; 85 gpgpuWalker.setSimdSize(GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD16); 86 gpgpuWalker.setThreadGroupIdXDimension(1); 87 gpgpuWalker.setThreadGroupIdYDimension(1); 88 gpgpuWalker.setThreadGroupIdZDimension(1); 89 gpgpuWalker.setRightExecutionMask(0xFFFFFFFF); 90 gpgpuWalker.setBottomExecutionMask(0xFFFFFFFF); 91 *gpgpuWalkerSpace = gpgpuWalker; 92 93 mediaStateFlush = slbCS.getSpaceForCmd<MEDIA_STATE_FLUSH>(); 94 *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush; 95 96 addArbCheckCmdWa(); 97 98 addPipeControlCmdWa(); 99 100 auto pipeControl2 = slbCS.getSpaceForCmd<PIPE_CONTROL>(); 101 initPipeControl(pipeControl2); 102 103 addLriCmdWa(false); 104 105 auto prefetch = slbCS.getSpace(getCSPrefetchSize()); 106 memset(prefetch, 0x0, getCSPrefetchSize()); 107 } 108 109 // always the same BBStart position (after 128 enqueues) 110 auto bbStartOffset = (commandsSize * 128) - slbCS.getUsed(); 111 slbCS.getSpace(bbStartOffset); 112 113 auto bbStartSpace = slbCS.getSpaceForCmd<MI_BATCH_BUFFER_START>(); 114 auto bbStart = GfxFamily::cmdInitBatchBufferStart; 115 auto slbPtr = reinterpret_cast<uintptr_t>(slbBuffer->getUnderlyingBuffer()); 116 bbStart.setBatchBufferStartAddress(slbPtr); 117 *bbStartSpace = bbStart; 118 119 igilCmdQueue->m_controls.m_CleanupSectionSize = 0; 120 igilQueue->m_controls.m_CleanupSectionAddress = 0; 121} 122 123template <typename GfxFamily> 124void DeviceQueueHw<GfxFamily>::addMediaStateClearCmds() { 125 typedef typename GfxFamily::MEDIA_VFE_STATE MEDIA_VFE_STATE; 126 127 addPipeControlCmdWa(); 128 129 auto pipeControlSpace = slbCS.getSpaceForCmd<PIPE_CONTROL>(); 130 auto pipeControl = GfxFamily::cmdInitPipeControl; 131 pipeControl.setGenericMediaStateClear(true); 132 pipeControl.setCommandStreamerStallEnable(true); 133 addDcFlushToPipeControlWa(&pipeControl); 134 *pipeControlSpace = pipeControl; 135 136 auto pVfeState = PreambleHelper<GfxFamily>::getSpaceForVfeState(&slbCS, device->getHardwareInfo(), EngineGroupType::RenderCompute); 137 StreamProperties emptyProperties{}; 138 PreambleHelper<GfxFamily>::programVfeState(pVfeState, device->getHardwareInfo(), 0u, 0, device->getSharedDeviceInfo().maxFrontEndThreads, emptyProperties); 139} 140 141template <typename GfxFamily> 142size_t DeviceQueueHw<GfxFamily>::getMediaStateClearCmdsSize() { 143 using MEDIA_VFE_STATE = typename GfxFamily::MEDIA_VFE_STATE; 144 // PC with GenreicMediaStateClear + WA PC 145 size_t size = 2 * sizeof(PIPE_CONTROL); 146 147 // VFE state cmds 148 size += sizeof(PIPE_CONTROL); 149 size += sizeof(MEDIA_VFE_STATE); 150 return size; 151} 152 153template <typename GfxFamily> 154void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) { 155 using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; 156 void *pDSH = dynamicStateHeap.getCpuBase(); 157 // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries. 158 auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer()); 159 igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2); 160 161 // Parent's dsh is located after ColorCalcState and 2 ID tables 162 igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp(static_cast<uint32_t>(parentKernel->getDynamicStateHeapSize()), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); 163 igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize(); 164 165 igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart; 166 igilCmdQueue->m_controls.m_ParentDSHOffset = offsetDsh; 167 168 uint32_t blockIndex = parentIDCount; 169 170 pDSH = ptrOffset(pDSH, colorCalcStateSize); 171 172 INTERFACE_DESCRIPTOR_DATA *pIDDestination = static_cast<INTERFACE_DESCRIPTOR_DATA *>(pDSH); 173 174 BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager(); 175 uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount()); 176 177 uint32_t maxBindingTableCount = 0; 178 uint32_t totalBlockSSHSize = 0; 179 180 igilCmdQueue->m_controls.m_StartBlockID = blockIndex; 181 182 for (uint32_t i = 0; i < blockCount; i++) { 183 const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i); 184 185 auto blockKernelStartPointer = getBlockKernelStartPointer(getDevice(), pBlockInfo, isCcsUsed); 186 187 auto bindingTableCount = static_cast<uint32_t>(pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.numEntries); 188 maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount); 189 190 totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); 191 192 surfaceStateHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); 193 auto btOffset = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(surfaceStateHeap, bindingTableCount, 194 pBlockInfo->heapInfo.pSsh, 195 pBlockInfo->heapInfo.SurfaceStateHeapSize, 196 bindingTableCount, 197 pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset); 198 199 parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast<uint32_t>(btOffset)); 200 201 // Determine SIMD size 202 uint32_t simd = pBlockInfo->getMaxSimdSize(); 203 204 uint32_t idOffset = pBlockInfo->kernelDescriptor.kernelMetadata.deviceSideEnqueueBlockInterfaceDescriptorOffset; 205 const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast<const INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset)); 206 207 pIDDestination[blockIndex + i] = *pBlockID; 208 pIDDestination[blockIndex + i].setKernelStartPointerHigh(blockKernelStartPointer >> 32); 209 pIDDestination[blockIndex + i].setKernelStartPointer(static_cast<uint32_t>(blockKernelStartPointer)); 210 pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); 211 EncodeDispatchKernel<GfxFamily>::programBarrierEnable(pIDDestination[blockIndex + i], 212 pBlockInfo->kernelDescriptor.kernelAttributes.barrierCount, 213 device->getHardwareInfo()); 214 215 // Set offset to sampler states, block's DHSOffset is added by scheduler 216 pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast<uint32_t>(pBlockInfo->getBorderColorStateSize())); 217 218 auto numChannels = pBlockInfo->kernelDescriptor.kernelAttributes.numLocalIdChannels; 219 auto grfSize = device->getDeviceInfo().grfSize; 220 auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, grfSize, numChannels); 221 auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / grfSize); 222 223 // HW requires a minimum of 1 GRF of perThreadData for each thread in a thread group 224 // when sizeCrossThreadData != 0 225 numGrfPerThreadData = std::max(numGrfPerThreadData, 1u); 226 pIDDestination[blockIndex + i].setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); 227 } 228 229 igilCmdQueue->m_controls.m_BTmaxSize = alignUp(maxBindingTableCount * (uint32_t)sizeof(BINDING_TABLE_STATE), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE); 230 igilCmdQueue->m_controls.m_BTbaseOffset = alignUp((uint32_t)surfaceStateHeap.getUsed(), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE); 231 igilCmdQueue->m_controls.m_CurrentSSHoffset = igilCmdQueue->m_controls.m_BTbaseOffset; 232} 233 234} // namespace NEO 235