1 /* 2 * Copyright (C) 2020-2021 Intel Corporation 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 */ 7 8 #pragma once 9 10 #include "shared/source/helpers/aligned_memory.h" 11 #include "shared/source/helpers/debug_helpers.h" 12 #include "shared/source/kernel/debug_data.h" 13 #include "shared/source/kernel/kernel_arg_descriptor.h" 14 #include "shared/source/kernel/kernel_arg_metadata.h" 15 #include "shared/source/utilities/arrayref.h" 16 #include "shared/source/utilities/stackvec.h" 17 18 #include <cinttypes> 19 #include <cstddef> 20 #include <limits> 21 #include <memory> 22 #include <unordered_map> 23 #include <vector> 24 25 namespace NEO { 26 27 using StringMap = std::unordered_map<uint32_t, std::string>; 28 using InstructionsSegmentOffset = uint16_t; 29 30 struct ExtendedInfoBase { 31 virtual ~ExtendedInfoBase() = default; 32 }; 33 34 struct KernelDescriptor { 35 enum AddressingMode : uint8_t { 36 AddrNone, 37 Stateless, 38 Bindful, 39 Bindless, 40 BindfulAndStateless, 41 BindlessAndStateless 42 }; 43 44 KernelDescriptor() = default; 45 virtual ~KernelDescriptor() = default; 46 virtual bool hasRTCalls() const; 47 updateCrossThreadDataSizeKernelDescriptor48 void updateCrossThreadDataSize() { 49 uint32_t crossThreadDataSize = 0; 50 for (uint32_t i = 0; i < 3; i++) { 51 if (isValidOffset(payloadMappings.dispatchTraits.globalWorkOffset[i])) { 52 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkOffset[i] + sizeof(uint32_t)); 53 } 54 if (isValidOffset(payloadMappings.dispatchTraits.globalWorkSize[i])) { 55 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkSize[i] + sizeof(uint32_t)); 56 } 57 if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize[i])) { 58 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize[i] + sizeof(uint32_t)); 59 } 60 if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize2[i])) { 61 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize2[i] + sizeof(uint32_t)); 62 } 63 if (isValidOffset(payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i])) { 64 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i] + sizeof(uint32_t)); 65 } 66 if (isValidOffset(payloadMappings.dispatchTraits.numWorkGroups[i])) { 67 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.numWorkGroups[i] + sizeof(uint32_t)); 68 } 69 } 70 71 if (isValidOffset(payloadMappings.dispatchTraits.workDim)) { 72 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.workDim + sizeof(uint32_t)); 73 } 74 75 StackVec<ArgDescPointer *, 8> implicitArgsVec({&payloadMappings.implicitArgs.printfSurfaceAddress, 76 &payloadMappings.implicitArgs.globalVariablesSurfaceAddress, 77 &payloadMappings.implicitArgs.globalConstantsSurfaceAddress, 78 &payloadMappings.implicitArgs.privateMemoryAddress, 79 &payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress, 80 &payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress, 81 &payloadMappings.implicitArgs.systemThreadSurfaceAddress, 82 &payloadMappings.implicitArgs.syncBufferAddress}); 83 84 for (size_t i = 0; i < implicitArgsVec.size(); i++) { 85 if (isValidOffset(implicitArgsVec[i]->bindless)) { 86 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->bindless + sizeof(uint32_t)); 87 } 88 89 if (isValidOffset(implicitArgsVec[i]->stateless)) { 90 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->stateless + implicitArgsVec[i]->pointerSize); 91 } 92 } 93 94 StackVec<CrossThreadDataOffset *, 7> implicitArgsVec2({&payloadMappings.implicitArgs.privateMemorySize, 95 &payloadMappings.implicitArgs.maxWorkGroupSize, 96 &payloadMappings.implicitArgs.simdSize, 97 &payloadMappings.implicitArgs.deviceSideEnqueueParentEvent, 98 &payloadMappings.implicitArgs.preferredWkgMultiple, 99 &payloadMappings.implicitArgs.localMemoryStatelessWindowSize, 100 &payloadMappings.implicitArgs.localMemoryStatelessWindowStartAddres}); 101 102 for (size_t i = 0; i < implicitArgsVec2.size(); i++) { 103 if (isValidOffset(*implicitArgsVec2[i])) { 104 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, *implicitArgsVec2[i] + sizeof(uint32_t)); 105 } 106 } 107 108 for (size_t i = 0; i < payloadMappings.explicitArgs.size(); i++) { 109 110 switch (payloadMappings.explicitArgs[i].type) { 111 case ArgDescriptor::ArgType::ArgTImage: { 112 auto &argImage = payloadMappings.explicitArgs[i].as<ArgDescImage>(false); 113 if (isValidOffset(argImage.bindless)) { 114 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argImage.bindless + sizeof(uint32_t)); 115 } 116 } break; 117 case ArgDescriptor::ArgType::ArgTPointer: { 118 auto &argPtr = payloadMappings.explicitArgs[i].as<ArgDescPointer>(false); 119 if (isValidOffset(argPtr.bindless)) { 120 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.bindless + sizeof(uint32_t)); 121 } 122 if (isValidOffset(argPtr.stateless)) { 123 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.stateless + argPtr.pointerSize); 124 } 125 } break; 126 case ArgDescriptor::ArgType::ArgTSampler: { 127 auto &argSampler = payloadMappings.explicitArgs[i].as<ArgDescSampler>(false); 128 UNRECOVERABLE_IF(isValidOffset(argSampler.bindless)); 129 } break; 130 case ArgDescriptor::ArgType::ArgTValue: { 131 auto &argVal = payloadMappings.explicitArgs[i].as<ArgDescValue>(false); 132 for (size_t i = 0; i < argVal.elements.size(); i++) { 133 UNRECOVERABLE_IF(!isValidOffset(argVal.elements[i].offset)); 134 crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argVal.elements[i].offset + argVal.elements[i].size); 135 } 136 } break; 137 default: 138 break; 139 } 140 } 141 142 this->kernelAttributes.crossThreadDataSize = std::max<uint16_t>(this->kernelAttributes.crossThreadDataSize, static_cast<uint16_t>(alignUp(crossThreadDataSize, 32))); 143 } 144 145 struct KernelAttributes { KernelAttributesKernelDescriptor::KernelAttributes146 KernelAttributes() { flags.packed = 0U; } 147 148 uint32_t slmInlineSize = 0U; 149 uint32_t perThreadScratchSize[2] = {0U, 0U}; 150 uint32_t perHwThreadPrivateMemorySize = 0U; 151 uint32_t perThreadSystemThreadSurfaceSize = 0U; 152 uint16_t requiredWorkgroupSize[3] = {0U, 0U, 0U}; 153 uint16_t crossThreadDataSize = 0U; 154 uint16_t inlineDataPayloadSize = 0U; 155 uint16_t perThreadDataSize = 0U; 156 uint16_t numArgsToPatch = 0U; 157 uint16_t numGrfRequired = 0U; 158 uint8_t barrierCount = 0u; 159 bool hasNonKernelArgLoad = true; 160 bool hasNonKernelArgStore = true; 161 bool hasNonKernelArgAtomic = true; 162 163 AddressingMode bufferAddressingMode = BindfulAndStateless; 164 AddressingMode imageAddressingMode = Bindful; 165 AddressingMode samplerAddressingMode = Bindful; 166 167 uint8_t workgroupWalkOrder[3] = {0, 1, 2}; 168 uint8_t workgroupDimensionsOrder[3] = {0, 1, 2}; 169 170 uint8_t gpuPointerSize = 0; 171 uint8_t simdSize = 8; 172 uint8_t numLocalIdChannels = 0; 173 uint8_t localId[3] = {0U, 0U, 0U}; 174 supportsBuffersBiggerThan4GbKernelDescriptor::KernelAttributes175 bool supportsBuffersBiggerThan4Gb() const { 176 return Stateless == bufferAddressingMode; 177 } 178 usesBarriersKernelDescriptor::KernelAttributes179 bool usesBarriers() const { 180 return 0 != barrierCount; 181 } 182 183 union { 184 struct { 185 bool usesSpecialPipelineSelectMode : 1; 186 bool usesStringMapForPrintf : 1; 187 bool usesPrintf : 1; 188 bool usesFencesForReadWriteImages : 1; 189 bool usesFlattenedLocalIds; 190 bool usesPrivateMemory : 1; 191 bool usesVme : 1; 192 bool usesImages : 1; 193 bool usesSamplers : 1; 194 bool usesDeviceSideEnqueue : 1; 195 bool usesSyncBuffer : 1; 196 bool useGlobalAtomics : 1; 197 bool usesStatelessWrites : 1; 198 bool passInlineData : 1; 199 bool perThreadDataHeaderIsPresent : 1; 200 bool perThreadDataUnusedGrfIsPresent : 1; 201 bool requiresDisabledMidThreadPreemption : 1; 202 bool requiresSubgroupIndependentForwardProgress : 1; 203 bool requiresWorkgroupWalkOrder : 1; 204 bool requiresImplicitArgs : 1; 205 bool useStackCalls : 1; 206 }; 207 uint32_t packed; 208 } flags; 209 static_assert(sizeof(KernelAttributes::flags) == sizeof(KernelAttributes::flags.packed), ""); 210 } kernelAttributes; 211 212 struct { 213 InstructionsSegmentOffset skipPerThreadDataLoad = 0U; 214 InstructionsSegmentOffset skipSetFFIDGP = 0U; 215 InstructionsSegmentOffset systemKernel = 0U; 216 } entryPoints; 217 218 struct PayloadMappings { 219 struct { 220 CrossThreadDataOffset globalWorkOffset[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>}; 221 CrossThreadDataOffset globalWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>}; 222 CrossThreadDataOffset localWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>}; 223 CrossThreadDataOffset localWorkSize2[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>}; 224 225 CrossThreadDataOffset enqueuedLocalWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>}; 226 CrossThreadDataOffset numWorkGroups[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>}; 227 CrossThreadDataOffset workDim = undefined<CrossThreadDataOffset>; 228 } dispatchTraits; 229 230 struct { 231 SurfaceStateHeapOffset tableOffset = undefined<SurfaceStateHeapOffset>; 232 uint8_t numEntries = 0; 233 } bindingTable; 234 235 struct { 236 DynamicStateHeapOffset tableOffset = undefined<DynamicStateHeapOffset>; 237 DynamicStateHeapOffset borderColor = undefined<DynamicStateHeapOffset>; 238 uint8_t numSamplers = 0; 239 } samplerTable; 240 241 StackVec<ArgDescriptor, 16> explicitArgs; 242 243 struct { 244 ArgDescPointer printfSurfaceAddress; 245 ArgDescPointer globalVariablesSurfaceAddress; 246 ArgDescPointer globalConstantsSurfaceAddress; 247 ArgDescPointer privateMemoryAddress; 248 ArgDescPointer deviceSideEnqueueEventPoolSurfaceAddress; 249 ArgDescPointer deviceSideEnqueueDefaultQueueSurfaceAddress; 250 ArgDescPointer systemThreadSurfaceAddress; 251 ArgDescPointer syncBufferAddress; 252 ArgDescPointer rtDispatchGlobals; 253 CrossThreadDataOffset privateMemorySize = undefined<CrossThreadDataOffset>; 254 CrossThreadDataOffset maxWorkGroupSize = undefined<CrossThreadDataOffset>; 255 CrossThreadDataOffset simdSize = undefined<CrossThreadDataOffset>; 256 CrossThreadDataOffset deviceSideEnqueueParentEvent = undefined<CrossThreadDataOffset>; 257 CrossThreadDataOffset preferredWkgMultiple = undefined<CrossThreadDataOffset>; 258 CrossThreadDataOffset localMemoryStatelessWindowSize = undefined<CrossThreadDataOffset>; 259 CrossThreadDataOffset localMemoryStatelessWindowStartAddres = undefined<CrossThreadDataOffset>; 260 } implicitArgs; 261 262 std::vector<std::unique_ptr<ArgDescriptorExtended>> explicitArgsExtendedDescriptors; 263 } payloadMappings; 264 265 std::vector<ArgTypeMetadataExtended> explicitArgsExtendedMetadata; 266 267 struct { 268 std::string kernelName; 269 std::string kernelLanguageAttributes; 270 StringMap printfStringsMap; 271 std::vector<std::pair<uint32_t, uint32_t>> deviceSideEnqueueChildrenKernelsIdOffset; 272 uint32_t deviceSideEnqueueBlockInterfaceDescriptorOffset = 0U; 273 274 struct ByValueArgument { 275 ArgDescValue::Element byValueElement; 276 uint16_t argNum; 277 }; 278 StackVec<ByValueArgument, 32> allByValueKernelArguments; 279 280 uint16_t compiledSubGroupsNumber = 0U; 281 uint8_t requiredSubGroupSize = 0U; 282 } kernelMetadata; 283 284 struct { 285 std::unique_ptr<DebugData> debugData; 286 std::unique_ptr<uint8_t[]> relocatedDebugData; 287 const void *igcInfoForGtpin = nullptr; 288 } external; 289 290 std::vector<uint8_t> generatedHeaps; 291 std::unique_ptr<ExtendedInfoBase> extendedInfo; 292 }; 293 294 } // namespace NEO 295