1 /*========================== begin_copyright_notice ============================ 2 3 Copyright (C) 2017-2021 Intel Corporation 4 5 SPDX-License-Identifier: MIT 6 7 ============================= end_copyright_notice ===========================*/ 8 9 #pragma once 10 11 #include "IGC/common/StringMacros.hpp" 12 #include "usc.h" 13 #include "usc_gen7.h" 14 #include "usc_gen9.h" 15 #include "common/Stats.hpp" 16 #include "common/Types.hpp" 17 #include "common/allocator.h" 18 #include "common/igc_resourceDimTypes.h" 19 // hack 20 #include "common/debug/Debug.hpp" 21 #include "common/debug/Dump.hpp" 22 #include <set> 23 #include <string.h> 24 #include <sstream> 25 #include "Compiler/CISACodeGen/ShaderUnits.hpp" 26 #include "Compiler/CISACodeGen/Platform.hpp" 27 #include "Compiler/CISACodeGen/DriverInfo.hpp" 28 #include "Compiler/CISACodeGen/helper.h" 29 #include "Compiler/MetaDataApi/MetaDataApi.h" 30 #include "Compiler/MetaDataApi/IGCMetaDataHelper.h" 31 #include "Compiler/CodeGenContextWrapper.hpp" 32 #include "visa/include/RelocationInfo.h" 33 #include "ZEBinWriter/zebin/source/autogen/ZEInfo.hpp" 34 35 #include "../AdaptorOCL/OCL/sp/spp_g8.h" 36 #include "../GenISAIntrinsics/GenIntrinsics.h" 37 #include "../GenISAIntrinsics/GenIntrinsicInst.h" 38 #include "common/LLVMWarningsPush.hpp" 39 #include <llvm/IR/LLVMContext.h> 40 #include <llvm/ADT/DenseMap.h> 41 #include <llvm/ADT/StringMap.h> 42 #include <llvm/ADT/StringRef.h> 43 #include <llvm/IR/IRBuilder.h> 44 #include "llvm/IR/Function.h" 45 #include "llvm/IR/ValueMap.h" 46 #include "llvm/IR/AssemblyAnnotationWriter.h" 47 #include "common/LLVMWarningsPop.hpp" 48 #include "CodeGenPublicEnums.h" 49 #include "AdaptorOCL/TranslationBlock.h" 50 #include "common/MDFrameWork.h" 51 #include "CompilerStats.h" 52 #include <unordered_set> 53 #include "Probe/Assertion.h" 54 #include <optional> 55 #include <Metrics/IGCMetric.h> 56 57 /************************************************************************ 58 This file contains the interface structure and functions to communicate 59 between front ends and code generator 60 ************************************************************************/ 61 62 namespace llvm 63 { 64 class Module; 65 class Function; 66 } 67 68 #define MAX_VSHADER_INPUT_REGISTERS_PACKAGEABLE 32 69 static const unsigned int g_c_Max_PS_attributes = 32; 70 71 namespace IGC 72 { 73 class CodeGenContext; 74 class PixelShaderContext; 75 class ComputeShaderContext; 76 77 struct SProgramOutput 78 { 79 public: 80 typedef std::vector<vISA::ZESymEntry> SymbolListTy; 81 typedef std::vector<vISA::ZERelocEntry> RelocListTy; 82 typedef std::vector<vISA::ZEFuncAttribEntry> FuncAttrListTy; 83 // function scope symbols 84 struct ZEBinFuncSymbolTable { 85 SymbolListTy function; // function symbols 86 SymbolListTy sampler; // sampler symbols 87 SymbolListTy local; // local symbols 88 }; 89 90 public: 91 void* m_programBin = nullptr; //<! Must be 16 byte aligned, and padded to a 64 byte boundary 92 unsigned int m_programSize = 0; //<! Number of bytes of program data (including padding) 93 unsigned int m_unpaddedProgramSize = 0; //<! program size without padding used for binary linking 94 unsigned int m_startReg = 0; //<! Which GRF to start with 95 unsigned int m_scratchSpaceUsedBySpills = 0; //<! amount of scratch space needed for shader spilling 96 unsigned int m_scratchSpaceUsedByShader = 0; //<! amount of scratch space needed by shader 97 unsigned int m_scratchSpaceUsedByGtpin = 0; //<! amount of scratch space used by gtpin 98 void* m_debugData = nullptr; //<! elf file containing debug information for the kernel (source->genIsa) 99 unsigned int m_debugDataSize = 0; //<! size of the elf file containing debug information 100 // TODO: m_debugDataGenISA and m_debugDataGenISASize 101 // are not really needed, consider removal 102 void* m_debugDataGenISA = nullptr; //<! GenISA debug data (VISA -> GenISA) 103 unsigned int m_debugDataGenISASize = 0; //<! Number of bytes of GenISA debug data 104 unsigned int m_InstructionCount = 0; 105 unsigned int m_BasicBlockCount = 0; 106 void* m_gtpinBuffer = nullptr; // Will be populated by VISA only when special switch is passed by gtpin 107 unsigned int m_gtpinBufferSize = 0; 108 void* m_funcSymbolTable = nullptr; 109 unsigned int m_funcSymbolTableSize = 0; 110 unsigned int m_funcSymbolTableEntries = 0; 111 ZEBinFuncSymbolTable m_symbols; // duplicated information of m_funcSymbolTable, for zebin 112 void* m_funcRelocationTable = nullptr; 113 unsigned int m_funcRelocationTableSize = 0; 114 unsigned int m_funcRelocationTableEntries = 0; 115 RelocListTy m_relocs; // duplicated information of m_funcRelocationTable, for zebin 116 void* m_funcAttributeTable = nullptr; 117 unsigned int m_funcAttributeTableSize = 0; 118 unsigned int m_funcAttributeTableEntries = 0; 119 FuncAttrListTy m_funcAttrs; // duplicated information of m_funcAttributeTable, for zebin 120 unsigned int m_offsetToSkipPerThreadDataLoad = 0; 121 uint32_t m_offsetToSkipSetFFIDGP = 0; 122 bool m_roundPower2KBytes = false; 123 bool m_UseScratchSpacePrivateMemory = true; 124 unsigned int m_scratchSpaceSizeLimit = 0; 125 unsigned int m_numGRFTotal = 128; 126 std::string m_VISAAsm; 127 128 // Optional statistics 129 std::optional<uint64_t> m_NumGRFSpill; 130 std::optional<uint64_t> m_NumGRFFill; 131 std::optional<uint64_t> m_NumSends; 132 std::optional<uint64_t> m_NumCycles; 133 std::optional<uint64_t> m_NumSendStallCycles; 134 135 DestroySProgramOutput136 void Destroy() 137 { 138 if (m_programBin) 139 { 140 IGC::aligned_free(m_programBin); 141 } 142 if (m_debugData) 143 { 144 IGC::aligned_free(m_debugData); 145 } 146 if (m_debugDataGenISA) 147 { 148 IGC::aligned_free(m_debugDataGenISA); 149 } 150 if (m_funcAttributeTable) 151 { 152 IGC::aligned_free(m_funcAttributeTable); 153 } 154 } 155 initSProgramOutput156 void init(bool roundPower2KBytes, unsigned int scratchSpaceSizeLimitT, bool useScratchSpacePrivateMemory) 157 { 158 m_roundPower2KBytes = roundPower2KBytes; 159 m_scratchSpaceSizeLimit = scratchSpaceSizeLimitT; 160 m_UseScratchSpacePrivateMemory = useScratchSpacePrivateMemory; 161 } 162 163 //InSlot0 164 //Todo: rename later getScratchSpaceUsageInSlot0SProgramOutput165 unsigned int getScratchSpaceUsageInSlot0() const 166 { 167 unsigned int privateMemoryScratchSpaceSize = 168 getScratchSpaceUsageInSlot1() > 0 || getScratchSpaceUsageInStateless() > 0 ? 0 : m_scratchSpaceUsedByShader; 169 unsigned int result = roundSize(m_scratchSpaceUsedBySpills + m_scratchSpaceUsedByGtpin + privateMemoryScratchSpaceSize); 170 IGC_ASSERT(result <= m_scratchSpaceSizeLimit); 171 return result; 172 } 173 getScratchSpaceUsageInSlot1SProgramOutput174 unsigned int getScratchSpaceUsageInSlot1() const 175 { 176 unsigned int result = 0; 177 //FIXME: temporarily disable slot1, enable it again when IGC is ready to handle r0.5+1 178 // result = roundSize(m_UseScratchSpacePrivateMemory ? m_scratchSpaceUsedByShader : 0); 179 IGC_ASSERT(result <= m_scratchSpaceSizeLimit); 180 return result; 181 } 182 getScratchSpaceUsageInStatelessSProgramOutput183 unsigned int getScratchSpaceUsageInStateless() const 184 { 185 return roundSize(!m_UseScratchSpacePrivateMemory ? m_scratchSpaceUsedByShader : 0); 186 } 187 setScratchSpaceUsedByShaderSProgramOutput188 void setScratchSpaceUsedByShader(unsigned int scratchSpaceUsedByShader) 189 { 190 m_scratchSpaceUsedByShader = scratchSpaceUsedByShader; 191 } 192 private: roundSizeSProgramOutput193 unsigned int roundSize(unsigned int size) const 194 { 195 if (m_roundPower2KBytes) 196 { 197 size = roundPower2KBbyte(size); 198 } 199 return size; 200 } 201 roundPower2KBbyteSProgramOutput202 unsigned int roundPower2KBbyte(unsigned int size) const 203 { 204 return (size ? iSTD::RoundPower2(iSTD::Max(int_cast<DWORD>(size), static_cast<DWORD>(sizeof(KILOBYTE)))) : 0); 205 } 206 207 // XeHP_SDV+ : we round to one of values: pow(2, (0, 6, 7, 8...18)) roundPower2ByteSProgramOutput208 unsigned int roundPower2Byte(unsigned int size) const 209 { 210 unsigned int ret = (size ? iSTD::RoundPower2(int_cast<DWORD>(size)) : 0); 211 //round any value in (0,32] to 64 BYTEs 212 ret = ((ret > 0 && ret <= 32) ? 64 : ret); 213 return ret; 214 } 215 }; 216 217 enum InstrStatTypes 218 { 219 SROA_PROMOTED, 220 LICM_STAT, 221 TOTAL_TYPES 222 }; 223 enum InstrStatStage 224 { 225 BEGIN, 226 END, 227 EXCEED_THRESHOLD, 228 TOTAL_STAGE 229 }; 230 231 struct SInstrTypes 232 { 233 bool CorrelatedValuePropagationEnable; 234 bool hasLoop; 235 bool hasMultipleBB; 236 bool hasCmp; 237 bool hasSwitch; 238 bool hasPhi; 239 bool hasLoadStore; 240 bool hasCall; 241 bool hasIndirectCall; 242 bool hasInlineAsm; 243 bool hasInlineAsmPointerAccess; 244 bool hasIndirectBranch; 245 bool hasFunctionAddressTaken; 246 bool hasSel; 247 bool hasPointer; 248 bool hasLocalLoadStore; 249 bool hasGlobalLoad; // has (stateless) loads from global addresspace 250 bool hasGlobalStore; // has (stateless) stores to global addresspace 251 bool hasStorageBufferLoad; // has (stateful) loads from storage buffers (UAV/SSBO) 252 bool hasStorageBufferStore; // has (stateful) stores to storage buffers (UAV/SSBO) 253 bool hasSubroutines; 254 bool hasPrimitiveAlloca; 255 bool hasNonPrimitiveAlloca; 256 bool hasReadOnlyArray; 257 bool hasBuiltin; 258 bool hasFRem; 259 bool psHasSideEffect; //<! only relevant to pixel shader, has other memory writes besides RTWrite 260 bool hasGenericAddressSpacePointers; 261 bool hasDebugInfo; //<! true only if module contains debug info !llvm.dbg.cu 262 bool hasAtomics; 263 bool hasBarrier; //<! true if module has thread group barrier 264 bool hasDiscard; 265 bool hasTypedRead; 266 bool hasTypedwrite; 267 bool mayHaveIndirectOperands; //<! true if code may have indirect operands like r5[a0]. 268 // true if shader may have indirect texture or buffer. 269 // Note: does not check for indirect sampler 270 bool mayHaveIndirectResources; 271 bool hasUniformAssumptions; 272 bool hasWaveIntrinsics; 273 bool hasPullBary; 274 bool sampleCmpToDiscardOptimizationPossible; 275 unsigned int sampleCmpToDiscardOptimizationSlot; 276 unsigned int numSample; 277 unsigned int numBB; 278 unsigned int numLoopInsts; 279 unsigned int numOfLoop; 280 unsigned int numInsts; //<! measured after optimization, used as a compiler heuristic 281 unsigned int numAllocaInsts; 282 unsigned int numPsInputs; 283 bool hasDynamicGenericLoadStore; 284 bool hasUnmaskedRegion; 285 unsigned int numGlobalInsts; 286 unsigned int numLocalInsts; 287 }; 288 289 struct SSimplePushInfo 290 { 291 // Constant buffer Binding Table Index or Surface State Offset. 292 // Valid only if 'isStateless' is false. 293 // If 'isBindless' is false then 'm_cbIdx' contains a Binding Table 294 // Index otherwise it contains a Surface State Offset in 64-byte units. 295 uint m_cbIdx = 0; 296 // m_pushableAddressGrfOffset and m_pushableOffsetGrfOffset are GRF 297 // offsets (in DWORDS) in the runtime data pushed to the shader. These 298 // fields are valid only if greater or equal to 0. If a field is valid 299 // it means that the runtime data from the GRF offset was used in 300 // the buffer address calculation. 301 // These fields must contain values provided by frontend in 302 // pushInfo.pushableAddresses metadata. 303 // m_pushableAddressGrfOffset is only valid when isStateless is true. 304 // m_pushableOffsetGrfOffset is only valid when isStateless or 305 // isBindless is true. 306 // When isStateless is true runtime data at m_pushableAddressGrfOffset 307 // contains a 64bit canonicalized address. Data starting at 308 // m_pushableOffsetGrfOffset contains 32bit offset relative to the 64bit 309 // starting address. 310 // PushAnalysiss pass matches the following pattern: 311 // uint8_t* pShaderRuntimeData ={...}; // to be pushed 312 // uint64_t pushableAddress = 313 // *(uint64_t*)(pShaderRuntimeData + 4*pushableAddressGrfOffset); 314 // if (pushableOffsetGrfOffset >=0) { 315 // pushableAddress += 316 // *(uint32_t*)(pShaderRuntimeData + 4*pushableOffsetGrfOffset); 317 // } 318 // pushableAddress += m_offset; 319 // 320 // m_pushableOffsetGrfOffset is also used when isBindless is true and 321 // contains the GRF offset that was used to calculate the Surface State 322 // Offset of the buffer. It must contain one of the values provided by 323 // frontend in pushInfo.bindlessPushInfo metadata. 324 int m_pushableAddressGrfOffset = -1; 325 int m_pushableOffsetGrfOffset = -1; 326 // Immediate offset in bytes add to the start of the simple push region. 327 uint m_offset = 0; 328 // Data size in bytes, must be a multiple of GRF size 329 uint m_size = 0; 330 bool isStateless = false; 331 bool isBindless = false; 332 }; 333 334 struct ConstantPayloadInfo 335 { 336 int DerivedConstantsOffset = -1; 337 }; 338 339 340 struct SResInfoFoldingOutput 341 { 342 uint32_t textureID; 343 bool value[4]; 344 }; 345 346 enum SIMDInfoBit 347 { 348 SIMD_SELECTED, // 0: if the SIMD is selected. If 1, all the other bits are ignored. 349 SIMD_RETRY, // 1: is a retry 350 SIMD_SKIP_HW, // 2: skip this SIMD due to HW restriction / WA. 351 SIMD_SKIP_REGPRES, // 3: skip this SIMD due to register pressure early out. 352 SIMD_SKIP_SPILL, // 4: skip this SIMD due to spill or high chance of spilling. 353 SIMD_SKIP_STALL, // 5: skip this SIMD due to stall cycle or thread occupancy heuristic. 354 SIMD_SKIP_THGRPSIZE, // 6: skip due to threadGroupSize heuristic(CS / OCL only). 355 SIMD_SKIP_PERF // 7: skip this SIMD due to performance concern (dx12 + discard, MRT, etc) or other reasons. 356 }; 357 358 enum SIMDInfoOffset 359 { 360 SIMD8_OFFSET = 0, 361 SIMD16_OFFSET = 8, 362 SIMD32_OFFSET = 16, 363 }; 364 365 struct SKernelProgram 366 { 367 SProgramOutput simd1; 368 SProgramOutput simd8; 369 SProgramOutput simd16; 370 SProgramOutput simd32; 371 unsigned int bindingTableEntryCount = 0; 372 373 char* gatherMap = nullptr; 374 unsigned int gatherMapSize = 0; 375 unsigned int ConstantBufferLength = 0; 376 unsigned int ConstantBufferMask = 0; 377 unsigned int MaxNumberOfThreads = 0; 378 bool isMessageTargetDataCacheDataPort = false; 379 380 unsigned int NOSBufferSize = 0; 381 unsigned int ConstantBufferLoaded = 0; 382 uint64_t UavLoaded = 0; 383 unsigned int ShaderResourceLoaded[4]; 384 unsigned int RenderTargetLoaded = 0; 385 386 bool hasControlFlow = false; 387 unsigned int bufferSlot = 0; 388 unsigned int statelessCBPushedSize = 0; 389 390 std::vector<SResInfoFoldingOutput> m_ResInfoFoldingOutput; 391 // GenUpdateCB outputs 392 void* m_ConstantBufferReplaceShaderPatterns = nullptr; 393 uint m_ConstantBufferReplaceShaderPatternsSize = 0; 394 uint m_ConstantBufferUsageMask = 0; 395 uint m_ConstantBufferReplaceSize = 0; 396 397 SSimplePushInfo simplePushInfoArr[g_c_maxNumberOfBufferPushed]; 398 399 uint64_t SIMDInfo; 400 }; 401 402 struct SPixelShaderKernelProgram : SKernelProgram 403 { 404 405 USC::GFX3DSTATE_SF_ATTRIBUTE_ACTIVE_COMPONENT attributeActiveComponent[g_c_Max_PS_attributes]; 406 DWORD m_AccessedBySampleC[4]; 407 408 unsigned int nbOfSFOutput; 409 unsigned int renderTargetMask; 410 unsigned int constantInterpolationEnableMask; 411 unsigned int primIdLocation; 412 unsigned int pointCoordLocation; 413 unsigned int samplerCount; 414 unsigned int BindingTableEntryBitmap; 415 unsigned int sampleCmpToDiscardOptimizationSlot; 416 417 unsigned char OutputUseMask[USC::NUM_PSHADER_OUTPUT_REGISTERS]; 418 419 bool needPerspectiveBary; 420 bool needPerspectiveCentroidBary; 421 bool needPerspectiveSampleBary; 422 bool needNonPerspectiveBary; 423 bool needNonPerspectiveCentroidBary; 424 bool needNonPerspectiveSampleBary; 425 bool needSourceDepth; 426 bool needSourceW; 427 bool hasInputCoverageMask; 428 bool hasPullBary; 429 bool killPixel; 430 bool outputDepth; 431 bool outputStencil; 432 bool isPerSample; 433 bool oMask; 434 bool VectorMask; 435 436 bool hasPrimID; 437 bool hasPointCoord; 438 bool isCoarsePS; 439 bool hasCoarsePixelSize; 440 bool hasSampleOffset; 441 bool hasZWDelta; 442 bool needPerspectiveBaryPlane; 443 bool needNonPerspectiveBaryPlane; 444 bool posXYOffsetEnable; 445 bool blendToFillEnabled; 446 bool forceEarlyZ; 447 448 bool sampleCmpToDiscardOptimizationPossible; 449 450 bool needPSSync; 451 }; 452 453 /// Gen10+, corresponds to 3DSTATE_VF_SGVS_2 as described below 454 struct SVertexFetchSGVExtendedParameters 455 { 456 struct 457 { 458 bool enabled = false; //<! XPn Enable = XPn Source Select = (*) 459 unsigned int location = 0; //<! Linear offset of the 32bit component in VUE 460 } extendedParameters[3] = {}; //<! Order of elements: XP0, XP1, XP2 461 }; 462 463 struct SVertexShaderKernelProgram : SKernelProgram 464 { 465 /// corresponds to 3DSTATE_VS Vertex URB Entry Read Length 466 OctEltUnit VertexURBEntryReadLength; 467 /// corresponds to 3DSTATE_VS Vertex URB Entry Read Offset 468 OctEltUnit VertexURBEntryReadOffset; 469 /// corresponds to 3DSTATE_VS VS Vertex URB Entry Output Length 470 OctEltUnit VertexURBEntryOutputReadLength; 471 /// corresponds to 3DSTATE_VS VS Vertex URB Entry Output Offset 472 OctEltUnit VertexURBEntryOutputReadOffset; 473 /// corresponds to 3DSTATE_SBE Vertex URB Entry Read Offset 474 OctEltUnit SBEURBReadOffset; 475 OctEltUnit URBAllocationSize; 476 QuadEltUnit MaxNumInputRegister; 477 478 bool enableElementComponentPacking; 479 /// corresponds to 3DSTATE_VF_COMPONENT_PACKING 480 unsigned char ElementComponentDeliverMask[32]; 481 /// vertex ID information 482 bool hasVertexID; 483 unsigned int vertexIdLocation; 484 /// instance ID information 485 bool hasInstanceID; 486 unsigned int instanceIdLocation; 487 bool singleInstanceVertexShader; 488 /// corresponds to 3DSTATE_VF_SGVS_2 489 SVertexFetchSGVExtendedParameters vertexFetchSGVExtendedParameters; 490 //RTAI and VPAI 491 bool DeclaresRTAIndex; 492 bool DeclaresVPAIndex; 493 494 DWORD m_AccessedBySampleC[4]; 495 bool HasClipCullAsOutput; 496 497 498 unsigned int BindingTableEntryBitmap; 499 unsigned int m_SamplerCount; 500 }; 501 502 struct SGeometryShaderKernelProgram : SKernelProgram 503 { 504 // Gen 7 defined ones 505 USC::GFX3DPRIMITIVE_TOPOLOGY_TYPE OutputTopology; 506 unsigned int SamplerCount; 507 QuadEltUnit OutputVertexSize; 508 OctEltUnit VertexEntryReadLength; // URB Entry Read Length 509 OctEltUnit VertexEntryReadOffset; 510 bool IncludeVertexHandles; 511 USC::GFX3DSTATE_CONTROL_DATA_FORMAT ControlDataHeaderFormat; 512 OctEltUnit ControlDataHeaderSize; 513 unsigned int DefaultStreamID; 514 unsigned int InstanceCount; 515 USC::GFX3DSTATE_GEOMETRY_SHADER_DISPATCH_MODE DispatchMode; 516 bool IncludePrimitiveIDEnable; 517 bool ReorderEnable; 518 bool DiscardAdjacencyEnable; 519 OctEltUnit SBEVertexURBEntryReadOffset; 520 URBAllocationUnit URBAllocationSize; 521 unsigned int UserClipDistancesMask; 522 unsigned int UserCullDistancesMask; 523 unsigned int MaxOutputVertexCount; 524 unsigned int BindingTableEntryBitmap; 525 526 bool DeclaresVPAIndex; 527 bool DeclaresRTAIndex; 528 529 USC::GFX3DSTATE_PROGRAM_FLOW SingleProgramFlow; 530 bool GSEnable; 531 532 // Gen 8 defined ones 533 unsigned int ExpectedVertexCount; 534 unsigned int StaticOutputVertexCount; 535 OctEltUnit GSVertexURBEntryOutputReadOffset; 536 OctEltUnit GSVertexURBEntryOutputReadLength; 537 538 bool StaticOutput; 539 540 DWORD m_AccessedBySampleC[4]; 541 542 bool m_bCanEnableRectList; 543 }; 544 545 struct SComputeShaderKernelProgram : SKernelProgram 546 { 547 USC::GFX3DSTATE_FLOATING_POINT_MODE FloatingPointMode; 548 USC::GFX3DSTATE_PROGRAM_FLOW SingleProgramFlow; 549 550 unsigned int SamplerCount; 551 unsigned int BindingTableEntryCount; 552 unsigned int CurbeReadOffset; 553 unsigned int CurbeReadLength; 554 unsigned int PhysicalThreadsInGroup; 555 556 bool BarrierUsed; 557 558 USC::GFX3DSTATE_ROUNDING_MODE RoundingMode; 559 560 unsigned int BarrierReturnGRFOffset; 561 562 int GtwBypass; 563 int GtwResetTimer; 564 565 unsigned int URBEntriesNum; 566 unsigned int URBEntryAllocationSize; 567 unsigned int CurbeTotalDataLength; 568 569 USC::GFXMEDIA_GPUWALKER_SIMD SimdWidth; 570 571 unsigned int ThreadGroupSize; 572 unsigned int SlmSize; 573 574 void* ThreadPayloadData; 575 576 unsigned int CSHThreadDispatchChannel; 577 578 bool CompiledForIndirectPayload; 579 580 bool DispatchAlongY; 581 582 unsigned int ThreadGroupModifier_X; 583 unsigned int ThreadGroupModifier_Y; 584 585 /* Output related to only the PingPong Textures */ 586 bool SecondCompile; 587 bool IsRowMajor; 588 bool PerformSecondCompile; 589 590 unsigned int NumChannelsUsed; 591 bool DisableMidThreadPreemption; 592 593 DWORD m_AccessedBySampleC[4]; 594 }; 595 596 struct SHullShaderKernelProgram : SKernelProgram 597 { 598 bool IncludeVertexHandles; 599 OctEltUnit URBAllocationSize; 600 OctEltUnit PatchConstantURBSize; 601 OctEltUnit VertexURBEntryReadLength; 602 OctEltUnit VertexURBEntryReadOffset; 603 bool IncludePrimitiveIDEnable; 604 HullShaderDispatchModes DispatchMode; 605 unsigned int InstanceCount; 606 DWORD m_AccessedBySampleC[4]; 607 unsigned int BindingTableEntryBitmap; 608 }; 609 610 struct SDomainShaderKernelProgram : SKernelProgram 611 { 612 OctEltUnit URBAllocationSize; 613 OctEltUnit VertexURBEntryReadLength; 614 OctEltUnit VertexURBEntryReadOffset; 615 OctEltUnit VertexURBEntryOutputLength; 616 OctEltUnit VertexURBEntryOutputReadOffset; 617 bool ComputeWAttribute; 618 DomainShaderDispatchModes DispatchMode; 619 SProgramOutput simd8DualPatch; 620 bool DeclaresRTAIndex; 621 bool DeclaresVPAIndex; 622 bool HasClipCullAsOutput; 623 bool HasPrimitiveIDInput; 624 DWORD m_AccessedBySampleC[4]; 625 unsigned int BindingTableEntryBitmap; 626 }; 627 628 629 struct SOpenCLKernelInfo 630 { 631 struct SResourceInfo 632 { 633 enum { RES_UAV, RES_SRV, RES_OTHER } Type; 634 int Index; 635 }; 636 SOpenCLKernelInfoSOpenCLKernelInfo637 SOpenCLKernelInfo() {}; 638 639 std::string m_kernelName = {}; 640 QWORD m_ShaderHashCode = {}; 641 642 std::vector<std::unique_ptr<iOpenCL::PointerInputAnnotation>> m_pointerInput; 643 std::vector<std::shared_ptr<iOpenCL::PointerArgumentAnnotation>> m_pointerArgument; 644 std::vector<std::unique_ptr<iOpenCL::LocalArgumentAnnotation>> m_localPointerArgument; 645 std::vector<std::unique_ptr<iOpenCL::SamplerInputAnnotation>> m_samplerInput; 646 std::vector<std::unique_ptr<iOpenCL::SamplerArgumentAnnotation>> m_samplerArgument; 647 std::vector<std::unique_ptr<iOpenCL::ConstantInputAnnotation>> m_constantInputAnnotation; 648 std::vector<std::unique_ptr<iOpenCL::ConstantArgumentAnnotation>> m_constantArgumentAnnotation; 649 std::vector<std::unique_ptr<iOpenCL::ImageArgumentAnnotation>> m_imageInputAnnotations; 650 std::vector<std::unique_ptr<iOpenCL::KernelArgumentInfoAnnotation>> m_kernelArgInfo; 651 std::vector<std::unique_ptr<iOpenCL::PrintfStringAnnotation>> m_printfStringAnnotations; 652 653 std::unique_ptr<iOpenCL::PrintfBufferAnnotation> m_printfBufferAnnotation = nullptr; 654 std::unique_ptr<iOpenCL::SyncBufferAnnotation> m_syncBufferAnnotation = nullptr; 655 std::unique_ptr<iOpenCL::StartGASAnnotation> m_startGAS = nullptr; 656 std::unique_ptr<iOpenCL::WindowSizeGASAnnotation> m_WindowSizeGAS = nullptr; 657 std::unique_ptr<iOpenCL::PrivateMemSizeAnnotation> m_PrivateMemSize = nullptr; 658 std::string m_kernelAttributeInfo = {}; 659 660 bool m_HasInlineVmeSamplers = false; 661 662 // This maps argument numbers to BTI and sampler indices 663 // (e.g. kernel argument 3, which is is an image_2d, may be mapped to BTI 6) 664 std::map<DWORD, unsigned int> m_argIndexMap = {}; 665 666 std::map<unsigned int, std::shared_ptr<iOpenCL::PointerArgumentAnnotation>> m_argOffsetMap = {}; 667 668 iOpenCL::ThreadPayload m_threadPayload = {}; 669 670 iOpenCL::ExecutionEnivronment m_executionEnivronment = {}; 671 672 iOpenCL::KernelTypeProgramBinaryInfo m_kernelTypeInfo = {}; 673 674 SKernelProgram m_kernelProgram = {}; 675 676 // Information for zebin 677 // Cross-thread payload arguments 678 zebin::PayloadArgumentsTy m_zePayloadArgs; 679 // BTI information for payload arguments 680 zebin::BindingTableIndicesTy m_zeBTIArgs; 681 682 // Analysis result of if there are non-kernel-argument ld/st in the kernel 683 // If all false, we can avoid expensive memory setting of each kernel during runtime 684 int m_hasNonKernelArgLoad = -1; 685 int m_hasNonKernelArgStore = -1; 686 int m_hasNonKernelArgAtomic = -1; 687 }; 688 689 690 struct SOpenCLProgramInfo 691 { 692 struct ZEBinRelocTable 693 { 694 std::vector<vISA::ZERelocEntry> globalReloc; 695 std::vector<vISA::ZERelocEntry> globalConstReloc; 696 }; 697 // program scope symbols 698 struct ZEBinProgramSymbolTable 699 { 700 using SymbolSeq = std::vector<vISA::ZESymEntry>; 701 SymbolSeq global; // global symbols 702 SymbolSeq globalConst; // global constant symbols 703 SymbolSeq globalStringConst; // global string constant symbols 704 }; 705 struct LegacySymbolTable 706 { 707 void* m_buffer = nullptr; 708 unsigned int m_size = 0; 709 unsigned int m_entries = 0; 710 }; 711 712 std::unique_ptr<iOpenCL::InitConstantAnnotation> m_initConstantAnnotation; 713 std::unique_ptr<iOpenCL::InitConstantAnnotation> m_initConstantStringAnnotation; 714 std::unique_ptr<iOpenCL::InitGlobalAnnotation> m_initGlobalAnnotation; 715 std::vector<std::unique_ptr<iOpenCL::ConstantPointerAnnotation> > m_initConstantPointerAnnotation; 716 std::vector<std::unique_ptr<iOpenCL::GlobalPointerAnnotation> > m_initGlobalPointerAnnotation; 717 std::vector<std::unique_ptr<iOpenCL::KernelTypeProgramBinaryInfo> > m_initKernelTypeAnnotation; 718 719 ZEBinRelocTable m_GlobalPointerAddressRelocAnnotation; 720 ZEBinProgramSymbolTable m_zebinSymbolTable; 721 LegacySymbolTable m_legacySymbolTable; 722 }; 723 724 class CBTILayout 725 { 726 public: 727 unsigned int GetSystemThreadBindingTableIndex(void) const; 728 unsigned int GetBindingTableEntryCount(void) const; 729 unsigned int GetTextureIndex(unsigned int index) const; 730 unsigned int GetUavIndex(unsigned int index) const; 731 unsigned int GetRenderTargetIndex(unsigned int index) const; 732 unsigned int GetConstantBufferIndex(unsigned int index) const; GetTextureIndexSize()733 unsigned int GetTextureIndexSize() const { return m_pLayout->maxResourceIdx - m_pLayout->minResourceIdx; } GetUavIndexSize()734 unsigned int GetUavIndexSize() const { return m_pLayout->maxUAVIdx - m_pLayout->minUAVIdx; } GetRenderTargetIndexSize()735 unsigned int GetRenderTargetIndexSize() const { return m_pLayout->maxColorBufferIdx - m_pLayout->minColorBufferIdx; } GetConstantBufferIndexSize()736 unsigned int GetConstantBufferIndexSize() const { return m_pLayout->maxConstantBufferIdx - m_pLayout->minConstantBufferIdx; } 737 unsigned int GetNullSurfaceIdx() const; 738 unsigned int GetTGSMIndex() const; 739 unsigned int GetScratchSurfaceBindingTableIndex() const; 740 unsigned int GetStatelessBindingTableIndex() const; 741 unsigned int GetImmediateConstantBufferOffset() const; 742 unsigned int GetDrawIndirectBufferIndex() const; GetBtLayout()743 const USC::SShaderStageBTLayout* GetBtLayout() const { return m_pLayout; }; GetColorBufferMappingTable()744 const std::vector<unsigned char>& GetColorBufferMappingTable() const { return m_ColorBufferMappings; } 745 CBTILayout(const USC::SShaderStageBTLayout * pLayout)746 CBTILayout(const USC::SShaderStageBTLayout* pLayout) : m_pLayout(pLayout) 747 {} 748 CBTILayout(const USC::SShaderStageBTLayout * pLayout,const std::vector<unsigned char> & colorBufferMappings)749 CBTILayout( 750 const USC::SShaderStageBTLayout* pLayout, 751 const std::vector<unsigned char>& colorBufferMappings) : 752 m_pLayout(pLayout), 753 m_ColorBufferMappings(colorBufferMappings) 754 {} 755 756 protected: 757 const USC::SShaderStageBTLayout* m_pLayout; 758 759 // Vulkan front end provides a separate vector with color buffer mappings. 760 const std::vector<unsigned char> m_ColorBufferMappings; 761 }; 762 763 // This is insanely ugly, but it's the pretties solution we could 764 // think of that preserves the GFX code. 765 // This is temporary and will go away once image access between 766 // OCL and GFX is unified. 767 // This happens because in GFX the layout comes from the driver and is 768 // immutable, while in OCL we need to change the layout mid-codegen. 769 class COCLBTILayout : public CBTILayout 770 { 771 public: COCLBTILayout(const USC::SShaderStageBTLayout * pLayout)772 COCLBTILayout(const USC::SShaderStageBTLayout* pLayout) : CBTILayout(pLayout) 773 {} 774 775 USC::SShaderStageBTLayout* getModifiableLayout(); 776 }; 777 778 class RetryManager 779 { 780 public: 781 RetryManager(); 782 ~RetryManager(); 783 784 bool AdvanceState(); 785 bool AllowLICM(); 786 bool AllowPromotePrivateMemory(); 787 bool AllowPreRAScheduler(); 788 bool AllowVISAPreRAScheduler(); 789 bool AllowCodeSinking(); 790 bool AllowSimd32Slicing(); 791 bool AllowLargeURBWrite(); 792 void SetFirstStateId(int id); 793 bool IsFirstTry(); 794 bool IsLastTry(); 795 unsigned GetRetryId() const; 796 797 void Enable(); 798 void Disable(); 799 800 void SetSpillSize(unsigned int spillSize); 801 unsigned int GetLastSpillSize(); 802 unsigned int numInstructions = 0; 803 /// the set of OCL kernels that need to recompile 804 std::set<std::string> kernelSet; 805 806 void ClearSpillParams(); 807 // save entry for given SIMD mode, to avoid recompile for next retry. 808 void SaveSIMDEntry(SIMDMode simdMode, CShader* shader); 809 CShader* GetSIMDEntry(SIMDMode simdMode); 810 bool AnyKernelSpills(); 811 812 // Try to pickup the simd mode & kernel based on heuristics and fill 813 // programOutput. If returning true, then stop the further retry. 814 bool PickupKernels(CodeGenContext* cgCtx); 815 816 private: 817 unsigned stateId; 818 // For debugging purposes, it can be useful to start on a particular 819 // ID rather than id 0. 820 unsigned firstStateId; 821 822 unsigned getStateCnt(); 823 824 /// internal knob to disable retry manager. 825 bool enabled; 826 827 unsigned lastSpillSize = 0; 828 829 // cache the compiled kernel during retry 830 CShader* m_simdEntries[3]; 831 832 CShader* PickCSEntryForcedFromDriver(SIMDMode& simdMode, 833 unsigned char forcedSIMDModeFromDriver); 834 CShader* PickCSEntryByRegKey(SIMDMode& simdMode, ComputeShaderContext* cgCtx); 835 CShader* PickCSEntryEarly(SIMDMode& simdMode, 836 ComputeShaderContext* cgCtx); 837 CShader* PickCSEntryFinally(SIMDMode& simdMode); 838 void FreeAllocatedMemForNotPickedCS(SIMDMode simdMode); 839 bool PickupCS(ComputeShaderContext* cgCtx); 840 }; 841 842 /// this class adds intrinsic cache to LLVM context 843 class LLVMContextWrapper : public llvm::LLVMContext 844 { 845 LLVMContextWrapper(LLVMContextWrapper&) = delete; 846 LLVMContextWrapper& operator =(LLVMContextWrapper&) = delete; 847 848 public: 849 LLVMContextWrapper(bool createResourceDimTypes = true); 850 /// ref count the LLVMContext as now CodeGenContext owns it 851 unsigned int refCount = 0; 852 /// IntrinsicIDCache - Cache of intrinsic pointer to numeric ID mappings 853 /// requested in this context 854 typedef llvm::ValueMap<const llvm::Function*, unsigned> SafeIntrinsicIDCacheTy; 855 SafeIntrinsicIDCacheTy m_SafeIntrinsicIDCache; 856 void AddRef(); 857 void Release(); 858 }; 859 860 861 class CodeGenContext 862 { 863 public: 864 /// input: hash key 865 ShaderHash hash; 866 ShaderType type; 867 /// input: Platform features supported 868 const CPlatform& platform; 869 /// input: binding table layout used by the driver 870 const CBTILayout& btiLayout; 871 /// information about the driver 872 const CDriverInfo& m_DriverInfo; 873 /// output: driver instrumentation 874 TimeStats* m_compilerTimeStats = nullptr; 875 ShaderStats* m_sumShaderStats = nullptr; 876 /// output: list of buffer IDs which are promoted to direct AS 877 // Map of promoted buffer ids with their respective buffer offsets if needed. Buffer offset will be -1 if no need of buffer offset 878 std::map<unsigned, int> m_buffersPromotedToDirectAS; 879 // float 16, float32 and float64 denorm mode 880 Float_DenormMode m_floatDenormMode16 = FLOAT_DENORM_FLUSH_TO_ZERO; 881 Float_DenormMode m_floatDenormMode32 = FLOAT_DENORM_FLUSH_TO_ZERO; 882 Float_DenormMode m_floatDenormMode64 = FLOAT_DENORM_FLUSH_TO_ZERO; 883 884 PushConstantMode m_pushConstantMode = PushConstantMode::DEFAULT; 885 886 SInstrTypes m_instrTypes; 887 888 ///// used for instruction statistic before/after pass 889 int instrStat[TOTAL_TYPES][TOTAL_STAGE]; 890 891 // Module flag for subroutines/stackcalls enabled 892 bool m_enableSubroutine = false; 893 // Module flag for function pointers enabled 894 bool m_enableFunctionPointer = false; 895 // Module flag for when we need to compile multiple SIMD sizes to support SIMD variants 896 bool m_enableSimdVariantCompilation = false; 897 898 // Adding multiversioning to partially redundant samples, if AIL is on. 899 bool m_enableSampleMultiversioning = false; 900 901 // Do not generate gen binary, emit vISA only. 902 bool m_compileToVISAOnly = false; 903 904 bool m_src1RemovedForBlendOpt = false; 905 llvm::AssemblyAnnotationWriter* annotater = nullptr; 906 907 RetryManager m_retryManager; 908 909 IGCMetrics::IGCMetric metrics; 910 911 // shader stat for opt customization 912 uint32_t m_tempCount = 0; 913 uint32_t m_sampler = 0; 914 uint32_t m_inputCount = 0; 915 uint32_t m_dxbcCount = 0; 916 uint32_t m_ConstantBufferCount = 0; 917 uint32_t m_numGradientSinked = 0; 918 std::vector<unsigned> m_indexableTempSize; 919 bool m_highPsRegisterPressure = 0; 920 921 // Record previous simd for code patching 922 CShader* m_prevShader = nullptr; 923 924 // For IR dump after pass 925 unsigned m_numPasses = 0; 926 bool m_threadCombiningOptDone = false; 927 928 void* m_ConstantBufferReplaceShaderPatterns = nullptr; 929 uint m_ConstantBufferReplaceShaderPatternsSize = 0; 930 uint m_ConstantBufferUsageMask = 0; 931 uint m_ConstantBufferReplaceSize = 0; 932 // tracking next available GRF offset for constants payload 933 unsigned int m_constantPayloadNextAvailableGRFOffset = 0; 934 ConstantPayloadInfo m_constantPayloadOffsets; 935 936 void* gtpin_init = nullptr; 937 bool m_hasLegacyDebugInfo = false; 938 bool m_hasEmu64BitInsts = false; 939 940 CompilerStats m_Stats; 941 // Flag for staged compilation 942 CG_FLAG_t m_CgFlag = FLAG_CG_ALL_SIMDS; 943 // Staging context passing from Stage 1 for compile continuation 944 CG_CTX_t* m_StagingCtx = nullptr; 945 // We determine whether generating SIMD32 based on SIMD16's result 946 // For staged compilation, we record if SIMD32 will be generated in Stage1, and 947 // pass it to Stage2. 948 bool m_doSimd32Stage2 = false; 949 bool m_doSimd16Stage2 = false; 950 std::string m_savedBitcodeString; 951 SInstrTypes m_savedInstrTypes; 952 953 bool m_hasVendorExtension = false; 954 bool PsHighSimdDisable = false; 955 956 std::vector<int> m_hsIdxMap; 957 std::vector<int> m_dsIdxMap; 958 std::vector<int> m_gsIdxMap; 959 std::vector<int> m_hsNonDefaultIdxMap; 960 std::vector<int> m_dsNonDefaultIdxMap; 961 std::vector<int> m_gsNonDefaultIdxMap; 962 std::vector<int> m_psIdxMap; 963 DWORD dsInSize = 0; 964 DWORD LtoUsedMask = 0; 965 uint64_t m_SIMDInfo; 966 private: 967 //For storing error message 968 std::stringstream oclErrorMessage; 969 //For storing warning message 970 std::stringstream oclWarningMessage; 971 972 protected: 973 // Objects pointed to by these pointers are owned by this class. 974 LLVMContextWrapper* llvmCtxWrapper; 975 /// input: LLVM module 976 IGCLLVM::Module* module = nullptr; 977 /// input: IGC MetaData Utils 978 IGC::IGCMD::MetaDataUtils* m_pMdUtils = nullptr; 979 IGC::ModuleMetaData* modMD = nullptr; 980 981 virtual void setFlagsPerCtx(); 982 public: 983 CodeGenContext( 984 ShaderType _type, ///< shader type 985 const CBTILayout& _bitLayout, ///< binding table layout to be used in code gen 986 const CPlatform& _platform, ///< IGC HW platform description 987 const CDriverInfo& driverInfo, ///< Queries to know runtime features support 988 const bool createResourceDimTypes = true, 989 LLVMContextWrapper* LLVMContext = nullptr)///< LLVM context to use, if null a new one will be created type(_type)990 : type(_type), platform(_platform), btiLayout(_bitLayout), m_DriverInfo(driverInfo), 991 llvmCtxWrapper(LLVMContext), m_SIMDInfo(0) 992 { 993 if (llvmCtxWrapper == nullptr) 994 { 995 initLLVMContextWrapper(createResourceDimTypes); 996 } 997 else 998 { 999 llvmCtxWrapper->AddRef(); 1000 } 1001 1002 m_indexableTempSize.resize(64); 1003 1004 for (uint i = 0; i < TOTAL_TYPES; i++) 1005 { 1006 for (uint j = 0; j < TOTAL_STAGE; j++) 1007 { 1008 instrStat[i][j] = 0; 1009 } 1010 } 1011 1012 // Per context flag adjustment 1013 setFlagsPerCtx(); 1014 } 1015 1016 CodeGenContext(CodeGenContext&) = delete; 1017 CodeGenContext& operator =(CodeGenContext&) = delete; 1018 1019 void initLLVMContextWrapper(bool createResourceDimTypes = true); 1020 llvm::LLVMContext* getLLVMContext() const; 1021 IGC::IGCMD::MetaDataUtils* getMetaDataUtils() const; 1022 IGCLLVM::Module* getModule() const; 1023 1024 void setModule(llvm::Module* m); 1025 // Several clients explicitly delete module without resetting module to null. 1026 // This causes the issue later when the dtor is invoked (trying to delete a 1027 // dangling pointer again). This function is used to replace any explicit 1028 // delete in order to prevent deleting dangling pointers happening. 1029 void deleteModule(); 1030 IGC::ModuleMetaData* getModuleMetaData() const; 1031 unsigned int getRegisterPointerSizeInBits(unsigned int AS) const; 1032 bool enableFunctionCall() const; 1033 void CheckEnableSubroutine(llvm::Module& M); 1034 virtual void InitVarMetaData(); 1035 virtual ~CodeGenContext(); 1036 void clear(); 1037 void EmitError(std::ostream &OS, const char* errorstr, const llvm::Value *context) const; 1038 void EmitError(const char* errorstr, const llvm::Value *context); 1039 void EmitWarning(const char* warningstr); HasError()1040 inline bool HasError() const { return !this->oclErrorMessage.str().empty(); } HasWarning()1041 inline bool HasWarning() const { return !this->oclWarningMessage.str().empty(); } GetWarning()1042 inline const std::string GetWarning() { return this->oclWarningMessage.str(); } GetError()1043 inline const std::string GetError() { return this->oclErrorMessage.str(); } GetErrorAndWarning()1044 inline const std::string GetErrorAndWarning() { return GetWarning() + GetError(); } 1045 1046 CompOptions& getCompilerOption(); 1047 virtual void resetOnRetry(); 1048 virtual uint32_t getNumThreadsPerEU() const; 1049 virtual uint32_t getNumGRFPerThread() const; 1050 virtual bool forceGlobalMemoryAllocation() const; 1051 virtual bool allocatePrivateAsGlobalBuffer() const; 1052 virtual bool hasNoLocalToGenericCast() const; 1053 virtual bool hasNoPrivateToGenericCast() const; 1054 virtual int16_t getVectorCoalescingControl() const; 1055 bool isPOSH() const; 1056 Stats()1057 CompilerStats& Stats() 1058 { 1059 return m_Stats; 1060 } 1061 GetSIMDInfoOffset(SIMDMode simd,ShaderDispatchMode mode)1062 unsigned int GetSIMDInfoOffset(SIMDMode simd, ShaderDispatchMode mode) 1063 { 1064 unsigned int offset = 0; 1065 1066 switch (mode) { 1067 case ShaderDispatchMode::NOT_APPLICABLE: 1068 switch (simd) { 1069 case SIMDMode::SIMD8: 1070 offset = SIMD8_OFFSET; 1071 break; 1072 case SIMDMode::SIMD16: 1073 offset = SIMD16_OFFSET; 1074 break; 1075 case SIMDMode::SIMD32: 1076 offset = SIMD32_OFFSET; 1077 break; 1078 default: 1079 break; 1080 } 1081 break; 1082 1083 default: 1084 break; 1085 } 1086 return offset; 1087 } 1088 SetSIMDInfo(SIMDInfoBit bit,SIMDMode simd,ShaderDispatchMode mode)1089 void SetSIMDInfo(SIMDInfoBit bit, SIMDMode simd, ShaderDispatchMode mode) 1090 { 1091 unsigned int offset = GetSIMDInfoOffset(simd, mode); 1092 m_SIMDInfo |= (uint64_t)1 << (bit + offset); 1093 } 1094 ClearSIMDInfo(SIMDMode simd,ShaderDispatchMode mode)1095 void ClearSIMDInfo(SIMDMode simd, ShaderDispatchMode mode) 1096 { 1097 unsigned int offset = GetSIMDInfoOffset(simd, mode); 1098 m_SIMDInfo &= ~(0xff << offset); 1099 } 1100 GetSIMDInfo()1101 uint64_t GetSIMDInfo() { return m_SIMDInfo; } 1102 knownSIMDSize()1103 virtual llvm::Optional<SIMDMode> knownSIMDSize() const { 1104 return llvm::None; 1105 } 1106 1107 // This can be paired with `EncodeAS4GFXResource()` to get a unique 1108 // index. getUniqueIndirectIdx()1109 uint32_t getUniqueIndirectIdx() 1110 { 1111 return getModuleMetaData()->CurUniqueIndirectIdx++; 1112 } 1113 1114 // Frontends may elect to compute indices in their own way. If so, 1115 // they should call this at the end to mark the max index they have 1116 // reserved so that later passes can ensure that `getUniqueIndirectIdx()` 1117 // won't collide with any indices from the frontend. setUniqueIndirectIdx(uint32_t NewVal)1118 void setUniqueIndirectIdx(uint32_t NewVal) 1119 { 1120 uint32_t &CurVal = getModuleMetaData()->CurUniqueIndirectIdx; 1121 CurVal = std::max(CurVal, NewVal); 1122 } 1123 }; 1124 1125 class VertexShaderContext : public CodeGenContext 1126 { 1127 public: 1128 // output: shader information 1129 SVertexShaderKernelProgram programOutput; 1130 VertexShaderContext( 1131 const CBTILayout& btiLayout, ///< binding table layout to be used in code gen 1132 const CPlatform& platform, ///< IGC HW platform description 1133 const CDriverInfo& driverInfo, 1134 const bool createResourceDimTypes = true, 1135 LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created CodeGenContext(ShaderType::VERTEX_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1136 : CodeGenContext(ShaderType::VERTEX_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper), 1137 programOutput() 1138 { 1139 } 1140 1141 }; 1142 1143 class PixelShaderContext : public CodeGenContext 1144 { 1145 public: 1146 // output: shader information 1147 SPixelShaderKernelProgram programOutput; 1148 PixelShaderContext( 1149 const CBTILayout& btiLayout, ///< binding table layout to be used in code gen 1150 const CPlatform& platform, ///< IGC HW platform description 1151 const CDriverInfo& driverInfo, 1152 const bool createResourceDimTypes = true, 1153 LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created CodeGenContext(ShaderType::PIXEL_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1154 : CodeGenContext(ShaderType::PIXEL_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper), 1155 programOutput() 1156 { 1157 } 1158 }; 1159 1160 class GeometryShaderContext : public CodeGenContext 1161 { 1162 public: 1163 // output: shader information 1164 SGeometryShaderKernelProgram programOutput; 1165 GeometryShaderContext( 1166 const CBTILayout& btiLayout, ///< binding table layout to be used in code gen 1167 const CPlatform& platform, ///< IGC HW platform description 1168 const CDriverInfo& driverInfo, 1169 const bool createResourceDimTypes = true, 1170 LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created CodeGenContext(ShaderType::GEOMETRY_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1171 : CodeGenContext(ShaderType::GEOMETRY_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper), 1172 programOutput() 1173 { 1174 } 1175 }; 1176 1177 struct SComputeShaderSecondCompileInput 1178 { 1179 bool secondCompile; 1180 bool isRowMajor; 1181 int numChannelsUsed; 1182 int runtimeVal_LoopCount; 1183 int runtimeVal_ResWidthOrHeight; 1184 int runtimeVal_ConstBufferSize; 1185 SComputeShaderSecondCompileInputSComputeShaderSecondCompileInput1186 SComputeShaderSecondCompileInput() 1187 : secondCompile(false) 1188 , isRowMajor(false) 1189 , numChannelsUsed(0) 1190 , runtimeVal_LoopCount(0) 1191 , runtimeVal_ResWidthOrHeight(0) 1192 , runtimeVal_ConstBufferSize(0) 1193 {} 1194 }; 1195 1196 class ComputeShaderContext : public CodeGenContext 1197 { 1198 public: 1199 SComputeShaderKernelProgram programOutput; 1200 bool isSecondCompile; 1201 bool m_IsPingPongSecond; 1202 unsigned m_slmSize; 1203 bool numWorkGroupsUsed; 1204 bool m_ForceOneSIMD = false; 1205 1206 ComputeShaderContext( 1207 const CBTILayout& btiLayout, ///< binding table layout to be used in code gen 1208 const CPlatform& platform, ///< IGC HW platform description 1209 const CDriverInfo& driverInfo, 1210 const bool createResourceDimTypes = true, 1211 LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created CodeGenContext(ShaderType::COMPUTE_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1212 : CodeGenContext(ShaderType::COMPUTE_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper), 1213 programOutput() 1214 { 1215 isSecondCompile = false; 1216 m_IsPingPongSecond = false; 1217 m_slmSize = 0; 1218 numWorkGroupsUsed = false; 1219 m_threadGroupSize_X = 0; 1220 m_threadGroupSize_Y = 0; 1221 m_threadGroupSize_Z = 0; 1222 } 1223 1224 /** get shader's thread group size */ 1225 unsigned GetThreadGroupSize(); GetThreadGroupSizeX()1226 unsigned GetThreadGroupSizeX() { return m_threadGroupSize_X; } GetThreadGroupSizeY()1227 unsigned GetThreadGroupSizeY() { return m_threadGroupSize_Y; } GetThreadGroupSizeZ()1228 unsigned GetThreadGroupSizeZ() { return m_threadGroupSize_Z; } 1229 unsigned GetSlmSizePerSubslice(); 1230 unsigned GetSlmSize() const; 1231 float GetThreadOccupancy(SIMDMode simdMode); 1232 /** get smallest SIMD mode allowed based on thread group size */ 1233 SIMDMode GetLeastSIMDModeAllowed(); 1234 /** get largest SIMD mode for performance based on thread group size */ 1235 SIMDMode GetMaxSIMDMode(); 1236 1237 float GetSpillThreshold() const; 1238 private: 1239 unsigned m_threadGroupSize_X; 1240 unsigned m_threadGroupSize_Y; 1241 unsigned m_threadGroupSize_Z; 1242 }; 1243 1244 class HullShaderContext : public CodeGenContext 1245 { 1246 public: 1247 // output: shader information 1248 SHullShaderKernelProgram programOutput; 1249 HullShaderContext( 1250 const CBTILayout& btiLayout, ///< binding table layout to be used in code gen 1251 const CPlatform& platform, ///< IGC HW platform description 1252 const CDriverInfo& driverInfo, 1253 const bool createResourceDimTypes = true, 1254 LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created CodeGenContext(ShaderType::HULL_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1255 : CodeGenContext(ShaderType::HULL_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper), 1256 programOutput() 1257 { 1258 } 1259 }; 1260 1261 class DomainShaderContext : public CodeGenContext 1262 { 1263 public: 1264 // output: shader information 1265 SDomainShaderKernelProgram programOutput; 1266 DomainShaderContext( 1267 const CBTILayout& btiLayout, ///< binding table layout to be used in code gen 1268 const CPlatform& platform, ///< IGC HW platform description 1269 const CDriverInfo& driverInfo, 1270 const bool createResourceDimTypes = true, 1271 LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created CodeGenContext(ShaderType::DOMAIN_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1272 : CodeGenContext(ShaderType::DOMAIN_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper), 1273 programOutput() 1274 { 1275 } 1276 }; 1277 class OpenCLProgramContext : public CodeGenContext 1278 { 1279 public: 1280 // We should probably replace all of this with proper option parsing, 1281 // like RS does 1282 class InternalOptions 1283 { 1284 public: InternalOptions(const TC::STB_TranslateInputArgs * pInputArgs)1285 InternalOptions(const TC::STB_TranslateInputArgs* pInputArgs) : 1286 KernelDebugEnable(false), 1287 IncludeSIPCSR(false), 1288 IncludeSIPKernelDebug(false), 1289 IntelGreaterThan4GBBufferRequired(false), 1290 Use32BitPtrArith(false), 1291 IncludeSIPKernelDebugWithLocalMemory(false), 1292 IntelHasPositivePointerOffset(false), 1293 IntelHasBufferOffsetArg(false), 1294 IntelBufferOffsetArgOptional(true), 1295 IntelHasSubDWAlignedPtrArg(false) 1296 { 1297 if (pInputArgs == nullptr) 1298 return; 1299 1300 if (pInputArgs->pInternalOptions != nullptr) 1301 { 1302 parseOptions(pInputArgs->pInternalOptions); 1303 } 1304 1305 // Internal options are passed in via pOptions as well. 1306 if (pInputArgs->pOptions != nullptr) 1307 { 1308 parseOptions(pInputArgs->pOptions); 1309 } 1310 } 1311 1312 bool KernelDebugEnable; 1313 bool IncludeSIPCSR; 1314 bool IncludeSIPKernelDebug; 1315 bool IntelGreaterThan4GBBufferRequired; 1316 bool IntelDisableA64WA = false; 1317 bool IntelForceEnableA64WA = false; 1318 bool Use32BitPtrArith = false; 1319 bool IncludeSIPKernelDebugWithLocalMemory; 1320 1321 bool GTPinReRA = false; 1322 bool GTPinGRFInfo = false; 1323 bool GTPinScratchAreaSize = false; 1324 uint32_t GTPinScratchAreaSizeValue = 0; 1325 1326 // stateless to stateful optimization 1327 bool IntelHasPositivePointerOffset; // default: false 1328 bool IntelHasBufferOffsetArg; // default: false 1329 bool IntelBufferOffsetArgOptional; // default: true 1330 bool IntelHasSubDWAlignedPtrArg; 1331 // default: false, meaning kernel's sub-DW ptrArgs (char*, short*) are DW-aligned. 1332 // This default is stronger than the natural alignment implied by char*/short*. But 1333 // for historical reason, we have this. 1334 1335 bool replaceGlobalOffsetsByZero = false; 1336 bool IntelEnablePreRAScheduling = true; 1337 bool PromoteStatelessToBindless = false; 1338 bool PreferBindlessImages = false; 1339 bool UseBindlessMode = false; 1340 bool UseBindlessPrintf = false; 1341 bool UseBindlessLegacyMode = true; 1342 bool EnableZEBinary = false; 1343 bool NoSpill = false; 1344 1345 // Generic address related 1346 bool HasNoLocalToGeneric = false; 1347 bool ForceGlobalMemoryAllocation = false; 1348 1349 // -1 : initial value that means it is not set from cmdline 1350 // 0-5: valid values set from the cmdline 1351 int16_t VectorCoalescingControl = -1; 1352 1353 bool Intel128GRFPerThread = false; 1354 bool Intel256GRFPerThread = false; 1355 bool IntelNumThreadPerEU = false; 1356 uint32_t numThreadsPerEU = 0; 1357 1358 private: 1359 void parseOptions(const char* IntOptStr); 1360 }; 1361 1362 class Options 1363 { 1364 public: Options(const TC::STB_TranslateInputArgs * pInputArgs)1365 Options(const TC::STB_TranslateInputArgs* pInputArgs) : 1366 CorrectlyRoundedSqrt(false), 1367 NoSubgroupIFP(false), 1368 UniformWGS(false) 1369 { 1370 if (pInputArgs == nullptr) 1371 return; 1372 1373 if (pInputArgs->pOptions == nullptr) 1374 return; 1375 1376 // Build options are of the form -cl-xxxx and -ze-xxxx 1377 // So we skip these prefixes when reading the options to be agnostic of their source 1378 1379 // Runtime passes internal options via pOptions as well, and those 1380 // internal options will be handled by InternalOptions class (parseOptions). 1381 // !!! When adding a new internal option, please add it into internalOptions class!!! 1382 // (Might combine both Options and InternalOptions into a single class!) 1383 const char* options = pInputArgs->pOptions; 1384 if (strstr(options, "-fp32-correctly-rounded-divide-sqrt")) 1385 { 1386 CorrectlyRoundedSqrt = true; 1387 } 1388 1389 if (strstr(options, "-no-subgroup-ifp")) 1390 { 1391 NoSubgroupIFP = true; 1392 } 1393 1394 if (strstr(options, "-uniform-work-group-size")) 1395 { 1396 // Note that this is only available for -cl-std >= 2.0. 1397 // This will be checked before we place this into the 1398 // the module metadata. 1399 UniformWGS = true; 1400 } 1401 if (strstr(options, "-take-global-address")) 1402 { 1403 EnableTakeGlobalAddress = true; 1404 } 1405 if (strstr(options, "-library-compilation")) 1406 { 1407 IsLibraryCompilation = true; 1408 } 1409 if (const char* op = strstr(options, "-intel-reqd-eu-thread-count")) 1410 { 1411 IntelRequiredEUThreadCount = true; 1412 // Take an integer value after this option 1413 // atoi(..) ignores leading white spaces and characters after the actual number 1414 requiredEUThreadCount = atoi(op + strlen("-intel-reqd-eu-thread-count=")); 1415 } 1416 } 1417 1418 bool CorrectlyRoundedSqrt; 1419 bool NoSubgroupIFP; 1420 bool UniformWGS; 1421 bool EnableTakeGlobalAddress = false; 1422 bool IsLibraryCompilation = false; 1423 bool IntelRequiredEUThreadCount = false; 1424 uint32_t requiredEUThreadCount = 0; 1425 }; 1426 1427 // output: shader information 1428 iOpenCL::CGen8OpenCLProgram m_programOutput; 1429 SOpenCLProgramInfo m_programInfo; 1430 const InternalOptions m_InternalOptions; 1431 const Options m_Options; 1432 bool isSpirV; 1433 float m_ProfilingTimerResolution; 1434 bool m_ShouldUseNonCoherentStatelessBTI; 1435 uint32_t m_numUAVs = 0; 1436 1437 OpenCLProgramContext( 1438 const COCLBTILayout& btiLayout, 1439 const CPlatform& platform, 1440 const TC::STB_TranslateInputArgs* pInputArgs, 1441 const CDriverInfo& driverInfo, 1442 LLVMContextWrapper* llvmContext = nullptr, 1443 bool shouldUseNonCoherentStatelessBTI = false, 1444 const bool createResourceDimTypes = true) CodeGenContext(ShaderType::OPENCL_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmContext)1445 : CodeGenContext(ShaderType::OPENCL_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmContext), 1446 m_programOutput(platform.getPlatformInfo(), *this), 1447 m_InternalOptions(pInputArgs), 1448 m_Options(pInputArgs), 1449 isSpirV(false), 1450 m_ShouldUseNonCoherentStatelessBTI(shouldUseNonCoherentStatelessBTI) 1451 { 1452 } 1453 bool isSPIRV() const; 1454 void setAsSPIRV(); 1455 float getProfilingTimerResolution(); 1456 uint32_t getNumGRFPerThread() const override; 1457 uint32_t getNumThreadsPerEU() const override; 1458 bool forceGlobalMemoryAllocation() const override; 1459 bool allocatePrivateAsGlobalBuffer() const override; 1460 bool hasNoLocalToGenericCast() const override; 1461 bool hasNoPrivateToGenericCast() const override; 1462 int16_t getVectorCoalescingControl() const override; 1463 private: 1464 llvm::DenseMap<llvm::Function*, std::string> m_hashes_per_kernel; 1465 }; 1466 1467 void CodeGen(PixelShaderContext* ctx); 1468 void CodeGen(ComputeShaderContext* ctx); 1469 void CodeGen(DomainShaderContext* ctx); 1470 void CodeGen(HullShaderContext* ctx); 1471 void CodeGen(VertexShaderContext* ctx); 1472 void CodeGen(GeometryShaderContext* ctx); 1473 void CodeGen(OpenCLProgramContext* ctx); 1474 1475 void OptimizeIR(CodeGenContext* ctx); 1476 1477 /** 1478 * Fold derived constants. Load CB data from CBptr with index & offset, 1479 * calculate the new data based on LLVM bitcode and store results to pNewCB. 1480 * Then driver will push pNewCB to thread payload. 1481 */ 1482 void FoldDerivedConstant(char* bitcode, uint bitcodeSize, void* CBptr[15], 1483 std::function<void(uint[4], uint, uint, bool)> getResInfoCB, uint* pNewCB); 1484 } // end IGC namespace 1485