1/* 2 * Copyright (C) 2019-2021 Intel Corporation 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 */ 7 8#include "shared/source/command_container/command_encoder.h" 9#include "shared/source/command_stream/csr_definitions.h" 10#include "shared/source/command_stream/preemption.h" 11#include "shared/source/debug_settings/debug_settings_manager.h" 12#include "shared/source/helpers/address_patch.h" 13#include "shared/source/helpers/aligned_memory.h" 14#include "shared/source/helpers/basic_math.h" 15#include "shared/source/helpers/hw_helper.h" 16#include "shared/source/helpers/local_id_gen.h" 17#include "shared/source/helpers/ptr_math.h" 18#include "shared/source/helpers/string.h" 19#include "shared/source/indirect_heap/indirect_heap.h" 20 21#include "opencl/source/cl_device/cl_device.h" 22#include "opencl/source/context/context.h" 23#include "opencl/source/helpers/dispatch_info.h" 24#include "opencl/source/kernel/kernel.h" 25#include "opencl/source/program/block_kernel_manager.h" 26#include "opencl/source/scheduler/scheduler_kernel.h" 27 28#include <cstring> 29 30namespace NEO { 31 32template <typename GfxFamily> 33size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(const Kernel &kernel) { 34 using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA; 35 using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; 36 const auto &samplerTable = kernel.getKernelInfo().kernelDescriptor.payloadMappings.samplerTable; 37 38 auto samplerCount = samplerTable.numSamplers; 39 auto totalSize = samplerCount 40 ? alignUp(samplerCount * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE) 41 : 0; 42 43 auto borderColorSize = samplerTable.borderColor; 44 borderColorSize = alignUp(borderColorSize + EncodeStates<GfxFamily>::alignIndirectStatePointer - 1, 45 EncodeStates<GfxFamily>::alignIndirectStatePointer); 46 47 totalSize += borderColorSize + additionalSizeRequiredDsh(); 48 49 DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.isVmeKernel())); 50 51 return alignUp(totalSize, EncodeStates<GfxFamily>::alignInterfaceDescriptorData); 52} 53 54template <typename GfxFamily> 55size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel, 56 size_t localWorkSize) { 57 typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE; 58 const auto &kernelInfo = kernel.getKernelInfo(); 59 60 auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; 61 uint32_t grfSize = sizeof(typename GfxFamily::GRF); 62 auto size = kernel.getCrossThreadDataSize() + 63 getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize); 64 65 if (kernel.getImplicitArgs()) { 66 size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, localWorkSize), MemoryConstants::cacheLineSize); 67 } 68 return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); 69} 70 71template <typename GfxFamily> 72size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredSSH(const Kernel &kernel) { 73 typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE; 74 auto sizeSSH = kernel.getSurfaceStateHeapSize(); 75 sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0; 76 return sizeSSH; 77} 78 79template <typename SizeGetterT, typename... ArgsT> 80size_t getSizeRequired(const MultiDispatchInfo &multiDispatchInfo, SizeGetterT &&getSize, ArgsT... args) { 81 size_t totalSize = 0; 82 auto it = multiDispatchInfo.begin(); 83 for (auto e = multiDispatchInfo.end(); it != e; ++it) { 84 totalSize = alignUp(totalSize, MemoryConstants::cacheLineSize); 85 totalSize += getSize(*it, std::forward<ArgsT>(args)...); 86 } 87 totalSize = alignUp(totalSize, MemoryConstants::pageSize); 88 return totalSize; 89} 90 91template <typename GfxFamily> 92size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH( 93 const MultiDispatchInfo &multiDispatchInfo) { 94 return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredDSH(*dispatchInfo.getKernel()); }); 95} 96 97template <typename GfxFamily> 98size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH( 99 const MultiDispatchInfo &multiDispatchInfo) { 100 return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH( 101 *dispatchInfo.getKernel(), 102 Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); }); 103} 104 105template <typename GfxFamily> 106size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH( 107 const MultiDispatchInfo &multiDispatchInfo) { 108 return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); }); 109} 110 111template <typename GfxFamily> 112size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kernel &kernel) { 113 typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE; 114 115 size_t totalSize = 0; 116 BlockKernelManager *blockManager = kernel.getProgram()->getBlockKernelManager(); 117 uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount()); 118 uint32_t maxBindingTableCount = 0; 119 120 totalSize = BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE - 1; 121 122 for (uint32_t i = 0; i < blockCount; i++) { 123 const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i); 124 totalSize += pBlockInfo->heapInfo.SurfaceStateHeapSize; 125 totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); 126 127 maxBindingTableCount = std::max(maxBindingTableCount, static_cast<uint32_t>(pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.numEntries)); 128 } 129 130 SchedulerKernel &scheduler = kernel.getContext().getSchedulerKernel(); 131 132 totalSize += getSizeRequiredSSH(scheduler); 133 134 totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries; 135 totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); 136 137 return totalSize; 138} 139 140template <typename GfxFamily> 141size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData( 142 const IndirectHeap &indirectHeap, 143 uint64_t offsetInterfaceDescriptor, 144 uint64_t kernelStartOffset, 145 size_t sizeCrossThreadData, 146 size_t sizePerThreadData, 147 size_t bindingTablePointer, 148 size_t offsetSamplerState, 149 uint32_t numSamplers, 150 uint32_t threadsPerThreadGroup, 151 const Kernel &kernel, 152 uint32_t bindingTablePrefetchSize, 153 PreemptionMode preemptionMode, 154 INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor, 155 const Device &device) { 156 using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; 157 using SHARED_LOCAL_MEMORY_SIZE = typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; 158 159 const auto &hardwareInfo = device.getHardwareInfo(); 160 161 // Allocate some memory for the interface descriptor 162 auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor); 163 auto interfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData; 164 165 // Program the kernel start pointer 166 interfaceDescriptor.setKernelStartPointerHigh(kernelStartOffset >> 32); 167 interfaceDescriptor.setKernelStartPointer((uint32_t)kernelStartOffset); 168 169 // # of threads in thread group should be based on LWS. 170 interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); 171 172 interfaceDescriptor.setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); 173 174 auto slmTotalSize = kernel.getSlmTotalSize(); 175 176 setGrfInfo(&interfaceDescriptor, kernel, sizeCrossThreadData, sizePerThreadData); 177 EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, hardwareInfo, threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone); 178 179 interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer)); 180 181 interfaceDescriptor.setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState)); 182 183 EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize); 184 185 auto programmableIDSLMSize = 186 static_cast<SHARED_LOCAL_MEMORY_SIZE>(HwHelperHw<GfxFamily>::get().computeSlmValues(hardwareInfo, slmTotalSize)); 187 188 if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) { 189 programmableIDSLMSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get()); 190 } 191 192 interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize); 193 EncodeDispatchKernel<GfxFamily>::programBarrierEnable(interfaceDescriptor, 194 kernel.getKernelInfo().kernelDescriptor.kernelAttributes.barrierCount, 195 hardwareInfo); 196 197 PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(&interfaceDescriptor, preemptionMode); 198 EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, hardwareInfo); 199 200 *pInterfaceDescriptor = interfaceDescriptor; 201 return (size_t)offsetInterfaceDescriptor; 202} 203 204template <typename GfxFamily> 205size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState( 206 LinearStream &commandStream, 207 IndirectHeap &dsh, 208 IndirectHeap &ioh, 209 IndirectHeap &ssh, 210 Kernel &kernel, 211 uint64_t kernelStartOffset, 212 uint32_t simd, 213 const size_t localWorkSize[3], 214 const uint64_t offsetInterfaceDescriptorTable, 215 uint32_t &interfaceDescriptorIndex, 216 PreemptionMode preemptionMode, 217 WALKER_TYPE *walkerCmd, 218 INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor, 219 bool localIdsGenerationByRuntime, 220 const Device &device) { 221 222 using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; 223 224 auto rootDeviceIndex = device.getRootDeviceIndex(); 225 226 DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32); 227 auto inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel); 228 229 // Copy the kernel over to the ISH 230 const auto &kernelInfo = kernel.getKernelInfo(); 231 232 ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); 233 234 auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries, 235 kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(), 236 kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset()); 237 238 // Copy our sampler state if it exists 239 const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable; 240 uint32_t samplerCount = 0; 241 uint32_t samplerStateOffset = 0; 242 if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) { 243 samplerCount = samplerTable.numSamplers; 244 samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset, 245 samplerCount, samplerTable.borderColor, 246 kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(), 247 device.getHardwareInfo()); 248 } 249 250 auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; 251 auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems)); 252 auto numChannels = static_cast<uint32_t>(kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels); 253 254 auto pImplicitArgs = kernel.getImplicitArgs(); 255 if (pImplicitArgs) { 256 constexpr uint32_t grfSize = sizeof(typename GfxFamily::GRF); 257 auto offsetLocalIds = sendPerThreadData( 258 ioh, 259 simd, 260 grfSize, 261 3u, // all channels for implicit args 262 std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSize[0]), static_cast<uint16_t>(localWorkSize[1]), static_cast<uint16_t>(localWorkSize[2])}}, 263 {{kernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0], 264 kernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1], 265 kernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}}, 266 kernel.usesOnlyImages()); 267 268 pImplicitArgs->localIdTablePtr = offsetLocalIds + ioh.getGraphicsAllocation()->getGpuAddress(); 269 } 270 271 uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(); 272 273 size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData( 274 ioh, kernel, inlineDataProgrammingRequired, 275 walkerCmd, sizeCrossThreadData); 276 277 size_t sizePerThreadDataTotal = 0; 278 size_t sizePerThreadData = 0; 279 280 HardwareCommandsHelper<GfxFamily>::programPerThreadData( 281 sizePerThreadData, 282 localIdsGenerationByRuntime, 283 ioh, 284 simd, 285 numChannels, 286 localWorkSize, 287 kernel, 288 sizePerThreadDataTotal, 289 localWorkItems, 290 rootDeviceIndex); 291 292 uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA); 293 294 auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates())); 295 if (resetBindingTablePrefetch(kernel)) { 296 bindingTablePrefetchSize = 0; 297 } 298 299 HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData( 300 dsh, 301 offsetInterfaceDescriptor, 302 kernelStartOffset, 303 sizeCrossThreadData, 304 sizePerThreadData, 305 dstBindingTablePointer, 306 samplerStateOffset, 307 samplerCount, 308 threadsPerThreadGroup, 309 kernel, 310 bindingTablePrefetchSize, 311 preemptionMode, 312 inlineInterfaceDescriptor, 313 device); 314 315 if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { 316 PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap, dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor, PatchInfoAllocationType::DynamicStateHeap); 317 kernel.getPatchInfoDataList().push_back(patchInfoData); 318 } 319 320 // Program media state flush to set interface descriptor offset 321 sendMediaStateFlush( 322 commandStream, 323 interfaceDescriptorIndex); 324 325 DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); 326 walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData)); 327 setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex); 328 329 auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal), 330 WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); 331 walkerCmd->setIndirectDataLength(indirectDataLength); 332 333 return offsetCrossThreadData; 334} 335 336template <typename GfxFamily> 337void HardwareCommandsHelper<GfxFamily>::updatePerThreadDataTotal( 338 size_t &sizePerThreadData, 339 uint32_t &simd, 340 uint32_t &numChannels, 341 size_t &sizePerThreadDataTotal, 342 size_t &localWorkItems) { 343 uint32_t grfSize = sizeof(typename GfxFamily::GRF); 344 sizePerThreadData = getPerThreadSizeLocalIDs(simd, grfSize, numChannels); 345 346 uint32_t localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, grfSize, numChannels); 347 localIdSizePerThread = std::max(localIdSizePerThread, grfSize); 348 349 sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread; 350 DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group 351} 352 353template <typename GfxFamily> 354bool HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) { 355 auto checkKernelForInlineData = true; 356 if (DebugManager.flags.EnablePassInlineData.get() != -1) { 357 checkKernelForInlineData = !!DebugManager.flags.EnablePassInlineData.get(); 358 } 359 if (checkKernelForInlineData) { 360 return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.passInlineData; 361 } 362 return false; 363} 364 365template <typename GfxFamily> 366bool HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel) { 367 return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.numLocalIdChannels > 0; 368} 369 370} // namespace NEO 371