1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #pragma once
10 
11 #include "IGC/common/StringMacros.hpp"
12 #include "usc.h"
13 #include "usc_gen7.h"
14 #include "usc_gen9.h"
15 #include "common/Stats.hpp"
16 #include "common/Types.hpp"
17 #include "common/allocator.h"
18 #include "common/igc_resourceDimTypes.h"
19 // hack
20 #include "common/debug/Debug.hpp"
21 #include "common/debug/Dump.hpp"
22 #include <set>
23 #include <string.h>
24 #include <sstream>
25 #include "Compiler/CISACodeGen/ShaderUnits.hpp"
26 #include "Compiler/CISACodeGen/Platform.hpp"
27 #include "Compiler/CISACodeGen/DriverInfo.hpp"
28 #include "Compiler/CISACodeGen/helper.h"
29 #include "Compiler/MetaDataApi/MetaDataApi.h"
30 #include "Compiler/MetaDataApi/IGCMetaDataHelper.h"
31 #include "Compiler/CodeGenContextWrapper.hpp"
32 #include "visa/include/RelocationInfo.h"
33 #include "ZEBinWriter/zebin/source/autogen/ZEInfo.hpp"
34 
35 #include "../AdaptorOCL/OCL/sp/spp_g8.h"
36 #include "../GenISAIntrinsics/GenIntrinsics.h"
37 #include "../GenISAIntrinsics/GenIntrinsicInst.h"
38 #include "common/LLVMWarningsPush.hpp"
39 #include <llvm/IR/LLVMContext.h>
40 #include <llvm/ADT/DenseMap.h>
41 #include <llvm/ADT/StringMap.h>
42 #include <llvm/ADT/StringRef.h>
43 #include <llvm/IR/IRBuilder.h>
44 #include "llvm/IR/Function.h"
45 #include "llvm/IR/ValueMap.h"
46 #include "llvm/IR/AssemblyAnnotationWriter.h"
47 #include "common/LLVMWarningsPop.hpp"
48 #include "CodeGenPublicEnums.h"
49 #include "AdaptorOCL/TranslationBlock.h"
50 #include "common/MDFrameWork.h"
51 #include "CompilerStats.h"
52 #include <unordered_set>
53 #include "Probe/Assertion.h"
54 #include <optional>
55 #include <Metrics/IGCMetric.h>
56 
57 /************************************************************************
58 This file contains the interface structure and functions to communicate
59 between front ends and code generator
60 ************************************************************************/
61 
// Forward declarations of LLVM IR classes. "llvm/IR/Function.h" is also
// included above, so the Function declaration is redundant but harmless.
namespace llvm
{
    class Function;
    class Module;
}
67 
// Limit named for packageable vertex-shader input registers; its users are
// outside this part of the file.
#define MAX_VSHADER_INPUT_REGISTERS_PACKAGEABLE 32

// Number of pixel-shader attribute slots; used as the array bound of
// SPixelShaderKernelProgram::attributeActiveComponent.
static constexpr unsigned int g_c_Max_PS_attributes = 32u;
70 
71 namespace IGC
72 {
73     class CodeGenContext;        // forward declaration; defined outside this part of the file
74     class PixelShaderContext;    // forward declaration; defined outside this part of the file
75     class ComputeShaderContext;  // forward declaration; defined outside this part of the file
76 
77     struct SProgramOutput
78     {
79     public:
80         typedef std::vector<vISA::ZESymEntry> SymbolListTy;
81         typedef std::vector<vISA::ZERelocEntry> RelocListTy;
82         typedef std::vector<vISA::ZEFuncAttribEntry> FuncAttrListTy;
83         // function scope symbols
84         struct ZEBinFuncSymbolTable {
85             SymbolListTy function;          // function symbols
86             SymbolListTy sampler;           // sampler symbols
87             SymbolListTy local;             // local symbols
88         };
89 
90     public:
91         void* m_programBin = nullptr;     //<! Must be 16 byte aligned, and padded to a 64 byte boundary
92         unsigned int    m_programSize = 0;    //<! Number of bytes of program data (including padding)
93         unsigned int    m_unpaddedProgramSize = 0;      //<! program size without padding used for binary linking
94         unsigned int    m_startReg = 0;                 //<! Which GRF to start with
95         unsigned int    m_scratchSpaceUsedBySpills = 0; //<! amount of scratch space needed for shader spilling
96         unsigned int    m_scratchSpaceUsedByShader = 0; //<! amount of scratch space needed by shader
97         unsigned int    m_scratchSpaceUsedByGtpin = 0; //<! amount of scratch space used by gtpin
98         void*           m_debugData = nullptr;      //<! elf file containing debug information for the kernel (source->genIsa)
99         unsigned int    m_debugDataSize = 0;        //<! size of the elf file containing debug information
100         // TODO: m_debugDataGenISA and m_debugDataGenISASize
101         // are not really needed, consider removal
102         void* m_debugDataGenISA = nullptr;          //<! GenISA debug data (VISA -> GenISA)
103         unsigned int    m_debugDataGenISASize = 0;      //<! Number of bytes of GenISA debug data
104         unsigned int    m_InstructionCount = 0;
105         unsigned int    m_BasicBlockCount = 0;
106         void* m_gtpinBuffer = nullptr;              // Will be populated by VISA only when special switch is passed by gtpin
107         unsigned int    m_gtpinBufferSize = 0;
108         void* m_funcSymbolTable = nullptr;
109         unsigned int    m_funcSymbolTableSize = 0;
110         unsigned int    m_funcSymbolTableEntries = 0;
111         ZEBinFuncSymbolTable m_symbols;           // duplicated information of m_funcSymbolTable, for zebin
112         void* m_funcRelocationTable = nullptr;
113         unsigned int    m_funcRelocationTableSize = 0;
114         unsigned int    m_funcRelocationTableEntries = 0;
115         RelocListTy     m_relocs;                  // duplicated information of m_funcRelocationTable, for zebin
116         void* m_funcAttributeTable = nullptr;
117         unsigned int    m_funcAttributeTableSize = 0;
118         unsigned int    m_funcAttributeTableEntries = 0;
119         FuncAttrListTy  m_funcAttrs;               // duplicated information of m_funcAttributeTable, for zebin
120         unsigned int    m_offsetToSkipPerThreadDataLoad = 0;
121         uint32_t        m_offsetToSkipSetFFIDGP = 0;
122         bool            m_roundPower2KBytes = false;
123         bool            m_UseScratchSpacePrivateMemory = true;
124         unsigned int m_scratchSpaceSizeLimit = 0;
125         unsigned int m_numGRFTotal = 128;
126         std::string m_VISAAsm;
127 
128         // Optional statistics
129         std::optional<uint64_t> m_NumGRFSpill;
130         std::optional<uint64_t> m_NumGRFFill;
131         std::optional<uint64_t> m_NumSends;
132         std::optional<uint64_t> m_NumCycles;
133         std::optional<uint64_t> m_NumSendStallCycles;
134 
135 
DestroySProgramOutput136         void Destroy()
137         {
138             if (m_programBin)
139             {
140                 IGC::aligned_free(m_programBin);
141             }
142             if (m_debugData)
143             {
144                 IGC::aligned_free(m_debugData);
145             }
146             if (m_debugDataGenISA)
147             {
148                 IGC::aligned_free(m_debugDataGenISA);
149             }
150             if (m_funcAttributeTable)
151             {
152                 IGC::aligned_free(m_funcAttributeTable);
153             }
154         }
155 
initSProgramOutput156         void init(bool roundPower2KBytes, unsigned int scratchSpaceSizeLimitT, bool useScratchSpacePrivateMemory)
157         {
158             m_roundPower2KBytes = roundPower2KBytes;
159             m_scratchSpaceSizeLimit = scratchSpaceSizeLimitT;
160             m_UseScratchSpacePrivateMemory = useScratchSpacePrivateMemory;
161         }
162 
163         //InSlot0
164         //Todo: rename later
getScratchSpaceUsageInSlot0SProgramOutput165         unsigned int getScratchSpaceUsageInSlot0() const
166         {
167             unsigned int privateMemoryScratchSpaceSize =
168                 getScratchSpaceUsageInSlot1() > 0 || getScratchSpaceUsageInStateless() > 0 ? 0 : m_scratchSpaceUsedByShader;
169             unsigned int result = roundSize(m_scratchSpaceUsedBySpills + m_scratchSpaceUsedByGtpin + privateMemoryScratchSpaceSize);
170             IGC_ASSERT(result <= m_scratchSpaceSizeLimit);
171             return result;
172         }
173 
getScratchSpaceUsageInSlot1SProgramOutput174         unsigned int getScratchSpaceUsageInSlot1() const
175         {
176             unsigned int result = 0;
177             //FIXME: temporarily disable slot1, enable it again when IGC is ready to handle r0.5+1
178             // result = roundSize(m_UseScratchSpacePrivateMemory ? m_scratchSpaceUsedByShader : 0);
179             IGC_ASSERT(result <= m_scratchSpaceSizeLimit);
180             return result;
181         }
182 
getScratchSpaceUsageInStatelessSProgramOutput183         unsigned int getScratchSpaceUsageInStateless() const
184         {
185             return roundSize(!m_UseScratchSpacePrivateMemory ? m_scratchSpaceUsedByShader : 0);
186         }
187 
setScratchSpaceUsedByShaderSProgramOutput188         void setScratchSpaceUsedByShader(unsigned int scratchSpaceUsedByShader)
189         {
190             m_scratchSpaceUsedByShader = scratchSpaceUsedByShader;
191         }
192     private:
roundSizeSProgramOutput193         unsigned int roundSize(unsigned int size) const
194         {
195             if (m_roundPower2KBytes)
196             {
197                 size = roundPower2KBbyte(size);
198             }
199             return size;
200         }
201 
roundPower2KBbyteSProgramOutput202         unsigned int roundPower2KBbyte(unsigned int size) const
203         {
204             return (size ? iSTD::RoundPower2(iSTD::Max(int_cast<DWORD>(size), static_cast<DWORD>(sizeof(KILOBYTE)))) : 0);
205         }
206 
207         // XeHP_SDV+ : we round to one of values: pow(2, (0, 6, 7, 8...18))
roundPower2ByteSProgramOutput208         unsigned int roundPower2Byte(unsigned int size) const
209         {
210             unsigned int ret = (size ? iSTD::RoundPower2(int_cast<DWORD>(size)) : 0);
211             //round any value in (0,32] to 64 BYTEs
212             ret = ((ret > 0 && ret <= 32) ? 64 : ret);
213             return ret;
214         }
215     };
216 
217     enum InstrStatTypes
218     {
219         SROA_PROMOTED,
220         LICM_STAT,
221         TOTAL_TYPES
222     };
223     enum InstrStatStage
224     {
225         BEGIN,
226         END,
227         EXCEED_THRESHOLD,
228         TOTAL_STAGE
229     };
230 
231     struct SInstrTypes
232     {
            // Flags and counters recording which kinds of instructions and
            // constructs were seen (per the member comments, at module level).
            // No member has a default initializer, so a default-constructed
            // instance holds indeterminate values until its owner zeroes or
            // fills it. Field order is left untouched because callers may
            // depend on the layout.
233         bool CorrelatedValuePropagationEnable;
234         bool hasLoop;
235         bool hasMultipleBB;
236         bool hasCmp;
237         bool hasSwitch;
238         bool hasPhi;
239         bool hasLoadStore;
240         bool hasCall;
241         bool hasIndirectCall;
242         bool hasInlineAsm;
243         bool hasInlineAsmPointerAccess;
244         bool hasIndirectBranch;
245         bool hasFunctionAddressTaken;
246         bool hasSel;
247         bool hasPointer;
248         bool hasLocalLoadStore;
249         bool hasGlobalLoad; // has (stateless) loads from global addresspace
250         bool hasGlobalStore; // has (stateless) stores to global addresspace
251         bool hasStorageBufferLoad; // has (stateful) loads from storage buffers (UAV/SSBO)
252         bool hasStorageBufferStore; // has (stateful) stores to storage buffers (UAV/SSBO)
253         bool hasSubroutines;
254         bool hasPrimitiveAlloca;
255         bool hasNonPrimitiveAlloca;
256         bool hasReadOnlyArray;
257         bool hasBuiltin;
258         bool hasFRem;
259         bool psHasSideEffect;     //<! only relevant to pixel shader, has other memory writes besides RTWrite
260         bool hasGenericAddressSpacePointers;
261         bool hasDebugInfo;        //<! true only if module contains debug info !llvm.dbg.cu
262         bool hasAtomics;
263         bool hasBarrier;        //<! true if module has thread group barrier
264         bool hasDiscard;
265         bool hasTypedRead;
266         bool hasTypedwrite;       // NOTE(review): casing differs from hasTypedRead; renaming would break callers
267         bool mayHaveIndirectOperands;  //<! true if code may have indirect operands like r5[a0].
268         // true if shader may have indirect texture or buffer.
269         // Note: does not check for indirect sampler
270         bool mayHaveIndirectResources;
271         bool hasUniformAssumptions;
272         bool hasWaveIntrinsics;
273         bool hasPullBary;
274         bool sampleCmpToDiscardOptimizationPossible;
275         unsigned int sampleCmpToDiscardOptimizationSlot;
276         unsigned int numSample;
277         unsigned int numBB;
278         unsigned int numLoopInsts;
279         unsigned int numOfLoop;
280         unsigned int numInsts;    //<! measured after optimization, used as a compiler heuristic
281         unsigned int numAllocaInsts;
282         unsigned int numPsInputs;
283         bool hasDynamicGenericLoadStore;
284         bool hasUnmaskedRegion;
285         unsigned int numGlobalInsts;
286         unsigned int numLocalInsts;
287     };
288 
289     struct SSimplePushInfo
290     {
            // Describes one buffer region delivered through "simple push".
            // SKernelProgram keeps an array of these (simplePushInfoArr).
            // Every member has a default initializer.
291         // Constant buffer Binding Table Index or Surface State Offset.
292         // Valid only if 'isStateless' is false.
293         // If 'isBindless' is false then 'm_cbIdx' contains a Binding Table
294         // Index; otherwise it contains a Surface State Offset in 64-byte units.
295         uint m_cbIdx = 0;
296         // m_pushableAddressGrfOffset and m_pushableOffsetGrfOffset are GRF
297         // offsets (in DWORDS) in the runtime data pushed to the shader. These
298         // fields are valid only if greater than or equal to 0. If a field is
299         // valid, it means that the runtime data at that GRF offset was used in
300         // the buffer address calculation.
301         // These fields must contain values provided by the frontend in
302         // pushInfo.pushableAddresses metadata.
303         // m_pushableAddressGrfOffset is only valid when isStateless is true.
304         // m_pushableOffsetGrfOffset is only valid when isStateless or
305         // isBindless is true.
306         // When isStateless is true, runtime data at m_pushableAddressGrfOffset
307         // contains a 64bit canonicalized address. Data starting at
308         // m_pushableOffsetGrfOffset contains a 32bit offset relative to the
309         // 64bit starting address.
310         // The PushAnalysis pass matches the following pattern:
311         //   uint8_t* pShaderRuntimeData ={...}; // to be pushed
312         //   uint64_t pushableAddress =
313         //     *(uint64_t*)(pShaderRuntimeData + 4*pushableAddressGrfOffset);
314         //   if (pushableOffsetGrfOffset >=0) {
315         //     pushableAddress +=
316         //       *(uint32_t*)(pShaderRuntimeData + 4*pushableOffsetGrfOffset);
317         //   }
318         //   pushableAddress += m_offset;
319         //
320         // m_pushableOffsetGrfOffset is also used when isBindless is true and
321         // contains the GRF offset that was used to calculate the Surface State
322         // Offset of the buffer. It must contain one of the values provided by
323         // the frontend in pushInfo.bindlessPushInfo metadata.
324         int m_pushableAddressGrfOffset = -1;
325         int m_pushableOffsetGrfOffset = -1;
326         // Immediate offset in bytes added to the start of the simple push region.
327         uint m_offset = 0;
328         // Data size in bytes, must be a multiple of GRF size
329         uint m_size = 0;
330         bool isStateless = false;
331         bool isBindless = false;
332     };
333 
334     struct ConstantPayloadInfo
335     {
            // Defaults to -1; presumably -1 means "no derived-constants offset
            // assigned" - confirm against the code that reads this field.
336         int  DerivedConstantsOffset = -1;
337     };
338 
339 
340     struct SResInfoFoldingOutput
341     {
            // One entry of SKernelProgram::m_ResInfoFoldingOutput. Neither
            // member has a default initializer.
342         uint32_t textureID;
343         bool value[4];   // NOTE(review): meaning of the four flags is not visible here - confirm with the producer
344     };
345 
346     enum SIMDInfoBit
347     {
348         SIMD_SELECTED,       // 0: if the SIMD is selected. If 1, all the other bits are ignored.
349         SIMD_RETRY,          // 1: is a retry
350         SIMD_SKIP_HW,        // 2: skip this SIMD due to HW restriction / WA.
351         SIMD_SKIP_REGPRES,   // 3: skip this SIMD due to register pressure early out.
352         SIMD_SKIP_SPILL,     // 4: skip this SIMD due to spill or high chance of spilling.
353         SIMD_SKIP_STALL,     // 5: skip this SIMD due to stall cycle or thread occupancy heuristic.
354         SIMD_SKIP_THGRPSIZE, // 6: skip due to threadGroupSize heuristic(CS / OCL only).
355         SIMD_SKIP_PERF       // 7: skip this SIMD due to performance concern (dx12 + discard, MRT, etc) or other reasons.
356     };
357 
358     enum SIMDInfoOffset
359     {
360         SIMD8_OFFSET = 0,
361         SIMD16_OFFSET = 8,
362         SIMD32_OFFSET = 16,
363     };
364 
365     struct SKernelProgram
366     {
367         SProgramOutput simd1;
368         SProgramOutput simd8;
369         SProgramOutput simd16;
370         SProgramOutput simd32;
371         unsigned int bindingTableEntryCount = 0;
372 
373         char* gatherMap = nullptr;
374         unsigned int gatherMapSize = 0;
375         unsigned int ConstantBufferLength = 0;
376         unsigned int ConstantBufferMask   = 0;
377         unsigned int MaxNumberOfThreads   = 0;
378         bool         isMessageTargetDataCacheDataPort = false;
379 
380         unsigned int NOSBufferSize = 0;
381         unsigned int ConstantBufferLoaded = 0;
382         uint64_t     UavLoaded = 0;
383         unsigned int ShaderResourceLoaded[4];
384         unsigned int RenderTargetLoaded = 0;
385 
386         bool         hasControlFlow = false;
387         unsigned int bufferSlot = 0;
388         unsigned int statelessCBPushedSize = 0;
389 
390         std::vector<SResInfoFoldingOutput> m_ResInfoFoldingOutput;
391         // GenUpdateCB outputs
392         void*       m_ConstantBufferReplaceShaderPatterns = nullptr;
393         uint        m_ConstantBufferReplaceShaderPatternsSize = 0;
394         uint        m_ConstantBufferUsageMask = 0;
395         uint        m_ConstantBufferReplaceSize = 0;
396 
397         SSimplePushInfo simplePushInfoArr[g_c_maxNumberOfBufferPushed];
398 
399         uint64_t    SIMDInfo;
400     };
401 
402     struct SPixelShaderKernelProgram : SKernelProgram
403     {
            // Pixel-shader-specific outputs added on top of SKernelProgram.
            // None of the members declared here has a default initializer, so
            // they are indeterminate until the owner zeroes or fills them.
404
405         USC::GFX3DSTATE_SF_ATTRIBUTE_ACTIVE_COMPONENT attributeActiveComponent[g_c_Max_PS_attributes];
406         DWORD m_AccessedBySampleC[4];
407
408         unsigned int nbOfSFOutput;
409         unsigned int renderTargetMask;
410         unsigned int constantInterpolationEnableMask;
411         unsigned int primIdLocation;
412         unsigned int pointCoordLocation;
413         unsigned int samplerCount;
414         unsigned int BindingTableEntryBitmap;
415         unsigned int sampleCmpToDiscardOptimizationSlot;   // same name as the SInstrTypes field
416
417         unsigned char OutputUseMask[USC::NUM_PSHADER_OUTPUT_REGISTERS];
418
419         bool needPerspectiveBary;
420         bool needPerspectiveCentroidBary;
421         bool needPerspectiveSampleBary;
422         bool needNonPerspectiveBary;
423         bool needNonPerspectiveCentroidBary;
424         bool needNonPerspectiveSampleBary;
425         bool needSourceDepth;
426         bool needSourceW;
427         bool hasInputCoverageMask;
428         bool hasPullBary;                                   // same name as the SInstrTypes field
429         bool killPixel;
430         bool outputDepth;
431         bool outputStencil;
432         bool isPerSample;
433         bool oMask;
434         bool VectorMask;
435
436         bool hasPrimID;
437         bool hasPointCoord;
438         bool isCoarsePS;
439         bool hasCoarsePixelSize;
440         bool hasSampleOffset;
441         bool hasZWDelta;
442         bool needPerspectiveBaryPlane;
443         bool needNonPerspectiveBaryPlane;
444         bool posXYOffsetEnable;
445         bool blendToFillEnabled;
446         bool forceEarlyZ;
447
448         bool sampleCmpToDiscardOptimizationPossible;       // same name as the SInstrTypes field
449
450         bool needPSSync;
451     };
452 
453     /// Gen10+, corresponds to 3DSTATE_VF_SGVS_2 as described below
453     /// Gen10+, corresponds to 3DSTATE_VF_SGVS_2 as described below
454     struct SVertexFetchSGVExtendedParameters
455     {
            // Three identically shaped entries, all default-initialized to
            // "disabled at location 0".
456         struct
457         {
458             bool enabled = false;      //<! XPn Enable = XPn Source Select = (*)
459             unsigned int location = 0; //<! Linear offset of the 32bit component in VUE
460         } extendedParameters[3] = {};  //<! Order of elements: XP0, XP1, XP2
461     };
462 
463     struct SVertexShaderKernelProgram : SKernelProgram
464     {
            // Vertex-shader-specific outputs added on top of SKernelProgram.
            // Except for vertexFetchSGVExtendedParameters (whose type
            // initializes itself), the members declared here have no default
            // initializers.
465         /// corresponds to 3DSTATE_VS Vertex URB Entry Read Length
466         OctEltUnit VertexURBEntryReadLength;
467         /// corresponds to 3DSTATE_VS Vertex URB Entry Read Offset
468         OctEltUnit VertexURBEntryReadOffset;
469         /// corresponds to 3DSTATE_VS VS Vertex URB Entry Output Length
470         OctEltUnit VertexURBEntryOutputReadLength;
471         /// corresponds to 3DSTATE_VS VS Vertex URB Entry Output Offset
472         OctEltUnit VertexURBEntryOutputReadOffset;
473         /// corresponds to 3DSTATE_SBE Vertex URB Entry Read Offset
474         OctEltUnit SBEURBReadOffset;
475         OctEltUnit URBAllocationSize;
476         QuadEltUnit MaxNumInputRegister;
477
478         bool enableElementComponentPacking;
479         /// corresponds to 3DSTATE_VF_COMPONENT_PACKING
            // NOTE(review): the bound 32 equals MAX_VSHADER_INPUT_REGISTERS_PACKAGEABLE;
            // confirm whether the macro is the intended size.
480         unsigned char ElementComponentDeliverMask[32];
481         /// vertex ID information
482         bool         hasVertexID;
483         unsigned int vertexIdLocation;
484         /// instance ID information
485         bool         hasInstanceID;
486         unsigned int instanceIdLocation;
487         bool         singleInstanceVertexShader;
488         /// corresponds to 3DSTATE_VF_SGVS_2
489         SVertexFetchSGVExtendedParameters vertexFetchSGVExtendedParameters;
490         //RTAI and VPAI
491         bool         DeclaresRTAIndex;
492         bool         DeclaresVPAIndex;
493
494         DWORD        m_AccessedBySampleC[4];
495         bool         HasClipCullAsOutput;
496
497
498         unsigned int BindingTableEntryBitmap;
499         unsigned int m_SamplerCount;
500     };
501 
502     struct SGeometryShaderKernelProgram : SKernelProgram
503     {
            // Geometry-shader-specific outputs added on top of SKernelProgram.
            // The members declared here have no default initializers.
504         // Gen 7 defined ones
505         USC::GFX3DPRIMITIVE_TOPOLOGY_TYPE OutputTopology;
506         unsigned int SamplerCount;
507         QuadEltUnit  OutputVertexSize;
508         OctEltUnit   VertexEntryReadLength;   // URB Entry Read Length
509         OctEltUnit   VertexEntryReadOffset;
510         bool         IncludeVertexHandles;
511         USC::GFX3DSTATE_CONTROL_DATA_FORMAT ControlDataHeaderFormat;
512         OctEltUnit   ControlDataHeaderSize;
513         unsigned int DefaultStreamID;
514         unsigned int InstanceCount;
515         USC::GFX3DSTATE_GEOMETRY_SHADER_DISPATCH_MODE DispatchMode;
516         bool         IncludePrimitiveIDEnable;
517         bool         ReorderEnable;
518         bool         DiscardAdjacencyEnable;
519         OctEltUnit   SBEVertexURBEntryReadOffset;
520         URBAllocationUnit URBAllocationSize;
521         unsigned int UserClipDistancesMask;
522         unsigned int UserCullDistancesMask;
523         unsigned int MaxOutputVertexCount;
524         unsigned int BindingTableEntryBitmap;
525
526         bool         DeclaresVPAIndex;
527         bool         DeclaresRTAIndex;
528
529         USC::GFX3DSTATE_PROGRAM_FLOW SingleProgramFlow;
530         bool GSEnable;
531
532         // Gen 8 defined ones
533         unsigned int ExpectedVertexCount;
534         unsigned int StaticOutputVertexCount;
535         OctEltUnit GSVertexURBEntryOutputReadOffset;
536         OctEltUnit GSVertexURBEntryOutputReadLength;
537
538         bool StaticOutput;
539
540         DWORD m_AccessedBySampleC[4];
541
542         bool m_bCanEnableRectList;
543     };
544 
545     struct SComputeShaderKernelProgram : SKernelProgram
546     {
            // Compute-shader-specific outputs added on top of SKernelProgram.
            // The members declared here have no default initializers.
547         USC::GFX3DSTATE_FLOATING_POINT_MODE FloatingPointMode;
548         USC::GFX3DSTATE_PROGRAM_FLOW        SingleProgramFlow;
549
550         unsigned int                        SamplerCount;
            // Distinct from the inherited SKernelProgram::bindingTableEntryCount
            // (the names differ only in the case of the first letter).
551         unsigned int                        BindingTableEntryCount;
552         unsigned int                        CurbeReadOffset;
553         unsigned int                        CurbeReadLength;
554         unsigned int                        PhysicalThreadsInGroup;
555
556         bool                                BarrierUsed;
557
558         USC::GFX3DSTATE_ROUNDING_MODE       RoundingMode;
559
560         unsigned int                        BarrierReturnGRFOffset;
561
562         int                                 GtwBypass;
563         int                                 GtwResetTimer;
564
565         unsigned int                        URBEntriesNum;
566         unsigned int                        URBEntryAllocationSize;
567         unsigned int                        CurbeTotalDataLength;
568
569         USC::GFXMEDIA_GPUWALKER_SIMD        SimdWidth;
570
571         unsigned int                        ThreadGroupSize;
572         unsigned int                        SlmSize;
573
            // NOTE(review): raw pointer with no initializer; ownership and
            // lifetime are not visible in this file - confirm with the producer.
574         void* ThreadPayloadData;
575
576         unsigned int                        CSHThreadDispatchChannel;
577
578         bool                                CompiledForIndirectPayload;
579
580         bool                                DispatchAlongY;
581
582         unsigned int                        ThreadGroupModifier_X;
583         unsigned int                        ThreadGroupModifier_Y;
584
585         /* Output related to only the PingPong Textures */
586         bool                                SecondCompile;
587         bool                                IsRowMajor;
588         bool                                PerformSecondCompile;
589
590         unsigned int                        NumChannelsUsed;
591         bool                                DisableMidThreadPreemption;
592
593         DWORD m_AccessedBySampleC[4];
594     };
595 
596     struct SHullShaderKernelProgram : SKernelProgram
597     {
            // Hull-shader-specific outputs added on top of SKernelProgram.
            // The members declared here have no default initializers.
598         bool                                IncludeVertexHandles;
599         OctEltUnit                          URBAllocationSize;
600         OctEltUnit                          PatchConstantURBSize;
601         OctEltUnit                          VertexURBEntryReadLength;
602         OctEltUnit                          VertexURBEntryReadOffset;
603         bool                                IncludePrimitiveIDEnable;
604         HullShaderDispatchModes             DispatchMode;
605         unsigned int                        InstanceCount;
606         DWORD m_AccessedBySampleC[4];
607         unsigned int                        BindingTableEntryBitmap;
608     };
609 
610     struct SDomainShaderKernelProgram : SKernelProgram
611     {
            // Domain-shader-specific outputs added on top of SKernelProgram.
            // Apart from simd8DualPatch (SProgramOutput initializes its own
            // members), the members declared here have no default initializers.
612         OctEltUnit                          URBAllocationSize;
613         OctEltUnit                          VertexURBEntryReadLength;
614         OctEltUnit                          VertexURBEntryReadOffset;
615         OctEltUnit                          VertexURBEntryOutputLength;
616         OctEltUnit                          VertexURBEntryOutputReadOffset;
617         bool                                ComputeWAttribute;
618         DomainShaderDispatchModes           DispatchMode;
            // Additional program output kept alongside the inherited simd1/8/16/32 outputs.
619         SProgramOutput                      simd8DualPatch;
620         bool                                DeclaresRTAIndex;
621         bool                                DeclaresVPAIndex;
622         bool                                HasClipCullAsOutput;
623         bool                                HasPrimitiveIDInput;
624         DWORD m_AccessedBySampleC[4];
625         unsigned int                        BindingTableEntryBitmap;
626     };
627 
628 
629     struct SOpenCLKernelInfo
630     {
            // Per-kernel OpenCL output: patch-token annotations, zebin payload
            // descriptions and the compiled SKernelProgram. Most annotation
            // containers hold std::unique_ptr elements, which makes this
            // struct move-only.
631         struct SResourceInfo
632         {
633             enum { RES_UAV, RES_SRV, RES_OTHER } Type;
634             int Index;
635         };
636
            // User-provided empty constructor; all members rely on their
            // in-class initializers.
SOpenCLKernelInfoSOpenCLKernelInfo637         SOpenCLKernelInfo() {};
638
639         std::string m_kernelName = {};
640         QWORD       m_ShaderHashCode = {};
641
642         std::vector<std::unique_ptr<iOpenCL::PointerInputAnnotation>>       m_pointerInput;
            // Held by shared_ptr (unlike its neighbours) because the same
            // element type is also stored in m_argOffsetMap below.
643         std::vector<std::shared_ptr<iOpenCL::PointerArgumentAnnotation>>    m_pointerArgument;
644         std::vector<std::unique_ptr<iOpenCL::LocalArgumentAnnotation>>      m_localPointerArgument;
645         std::vector<std::unique_ptr<iOpenCL::SamplerInputAnnotation>>       m_samplerInput;
646         std::vector<std::unique_ptr<iOpenCL::SamplerArgumentAnnotation>>    m_samplerArgument;
647         std::vector<std::unique_ptr<iOpenCL::ConstantInputAnnotation>>      m_constantInputAnnotation;
648         std::vector<std::unique_ptr<iOpenCL::ConstantArgumentAnnotation>>   m_constantArgumentAnnotation;
649         std::vector<std::unique_ptr<iOpenCL::ImageArgumentAnnotation>>      m_imageInputAnnotations;
650         std::vector<std::unique_ptr<iOpenCL::KernelArgumentInfoAnnotation>> m_kernelArgInfo;
651         std::vector<std::unique_ptr<iOpenCL::PrintfStringAnnotation>>       m_printfStringAnnotations;
652
653         std::unique_ptr<iOpenCL::PrintfBufferAnnotation>    m_printfBufferAnnotation = nullptr;
654         std::unique_ptr<iOpenCL::SyncBufferAnnotation>      m_syncBufferAnnotation = nullptr;
655         std::unique_ptr<iOpenCL::StartGASAnnotation>        m_startGAS = nullptr;
656         std::unique_ptr<iOpenCL::WindowSizeGASAnnotation>   m_WindowSizeGAS = nullptr;
657         std::unique_ptr<iOpenCL::PrivateMemSizeAnnotation>  m_PrivateMemSize = nullptr;
658         std::string                                         m_kernelAttributeInfo = {};
659
660         bool                                                m_HasInlineVmeSamplers = false;
661
662         // This maps argument numbers to BTI and sampler indices
663         // (e.g. kernel argument 3, which is an image_2d, may be mapped to BTI 6)
664         std::map<DWORD, unsigned int> m_argIndexMap = {};
665
            // NOTE(review): presumably keyed by the argument's payload offset - confirm with the producer.
666         std::map<unsigned int, std::shared_ptr<iOpenCL::PointerArgumentAnnotation>> m_argOffsetMap = {};
667
668         iOpenCL::ThreadPayload        m_threadPayload = {};
669
670         iOpenCL::ExecutionEnivronment m_executionEnivronment = {};
671
672         iOpenCL::KernelTypeProgramBinaryInfo m_kernelTypeInfo = {};
673
674         SKernelProgram                m_kernelProgram = {};
675
676         // Information for zebin
677         // Cross-thread payload arguments
678         zebin::PayloadArgumentsTy m_zePayloadArgs;
679         // BTI information for payload arguments
680         zebin::BindingTableIndicesTy m_zeBTIArgs;
681
682         // Analysis result of whether there are non-kernel-argument ld/st in the kernel.
683         // If all are false, we can avoid expensive memory setting of each kernel during runtime.
            // These are ints initialized to -1, not bools; presumably -1 means
            // "analysis not run" and 0/1 mean false/true - confirm with the writer.
684         int m_hasNonKernelArgLoad = -1;
685         int m_hasNonKernelArgStore = -1;
686         int m_hasNonKernelArgAtomic = -1;
687     };
688 
689 
690     struct SOpenCLProgramInfo
691     {
            // Program-scope (as opposed to per-kernel) OpenCL output:
            // constant/global initializer annotations, relocations and symbol
            // tables. Holds std::unique_ptr members, so the struct is move-only.
692         struct ZEBinRelocTable
693         {
694             std::vector<vISA::ZERelocEntry> globalReloc;
695             std::vector<vISA::ZERelocEntry> globalConstReloc;
696         };
697         // program scope symbols
698         struct ZEBinProgramSymbolTable
699         {
700             using SymbolSeq = std::vector<vISA::ZESymEntry>;
701             SymbolSeq global;            // global symbols
702             SymbolSeq globalConst;       // global constant symbols
703             SymbolSeq globalStringConst; // global string constant symbols
704         };
            // Raw buffer + size + entry count.
            // NOTE(review): who allocates and frees m_buffer is not visible here - confirm.
705         struct LegacySymbolTable
706         {
707             void* m_buffer = nullptr;
708             unsigned int m_size = 0;
709             unsigned int m_entries = 0;
710         };
711
712         std::unique_ptr<iOpenCL::InitConstantAnnotation> m_initConstantAnnotation;
713         std::unique_ptr<iOpenCL::InitConstantAnnotation> m_initConstantStringAnnotation;
714         std::unique_ptr<iOpenCL::InitGlobalAnnotation> m_initGlobalAnnotation;
715         std::vector<std::unique_ptr<iOpenCL::ConstantPointerAnnotation> > m_initConstantPointerAnnotation;
716         std::vector<std::unique_ptr<iOpenCL::GlobalPointerAnnotation> > m_initGlobalPointerAnnotation;
717         std::vector<std::unique_ptr<iOpenCL::KernelTypeProgramBinaryInfo> > m_initKernelTypeAnnotation;
718
719         ZEBinRelocTable m_GlobalPointerAddressRelocAnnotation;
720         ZEBinProgramSymbolTable m_zebinSymbolTable;
721         LegacySymbolTable m_legacySymbolTable;
722     };
723 
724     class CBTILayout
725     {
726     public:
727         unsigned int GetSystemThreadBindingTableIndex(void) const;
728         unsigned int GetBindingTableEntryCount(void) const;
729         unsigned int GetTextureIndex(unsigned int index) const;
730         unsigned int GetUavIndex(unsigned int index) const;
731         unsigned int GetRenderTargetIndex(unsigned int index) const;
732         unsigned int GetConstantBufferIndex(unsigned int index) const;
GetTextureIndexSize()733         unsigned int GetTextureIndexSize() const { return m_pLayout->maxResourceIdx - m_pLayout->minResourceIdx; }
GetUavIndexSize()734         unsigned int GetUavIndexSize() const { return m_pLayout->maxUAVIdx - m_pLayout->minUAVIdx; }
GetRenderTargetIndexSize()735         unsigned int GetRenderTargetIndexSize() const { return m_pLayout->maxColorBufferIdx - m_pLayout->minColorBufferIdx; }
GetConstantBufferIndexSize()736         unsigned int GetConstantBufferIndexSize() const { return m_pLayout->maxConstantBufferIdx - m_pLayout->minConstantBufferIdx; }
737         unsigned int GetNullSurfaceIdx() const;
738         unsigned int GetTGSMIndex() const;
739         unsigned int GetScratchSurfaceBindingTableIndex() const;
740         unsigned int GetStatelessBindingTableIndex() const;
741         unsigned int GetImmediateConstantBufferOffset() const;
742         unsigned int GetDrawIndirectBufferIndex() const;
GetBtLayout()743         const USC::SShaderStageBTLayout* GetBtLayout() const { return m_pLayout; };
GetColorBufferMappingTable()744         const std::vector<unsigned char>& GetColorBufferMappingTable() const { return m_ColorBufferMappings; }
745 
CBTILayout(const USC::SShaderStageBTLayout * pLayout)746         CBTILayout(const USC::SShaderStageBTLayout* pLayout) : m_pLayout(pLayout)
747         {}
748 
CBTILayout(const USC::SShaderStageBTLayout * pLayout,const std::vector<unsigned char> & colorBufferMappings)749         CBTILayout(
750             const USC::SShaderStageBTLayout* pLayout,
751             const std::vector<unsigned char>& colorBufferMappings) :
752             m_pLayout(pLayout),
753             m_ColorBufferMappings(colorBufferMappings)
754         {}
755 
756     protected:
757         const USC::SShaderStageBTLayout* m_pLayout;
758 
759         // Vulkan front end provides a separate vector with color buffer mappings.
760         const std::vector<unsigned char> m_ColorBufferMappings;
761     };
762 
    // This is insanely ugly, but it's the prettiest solution we could
    // think of that preserves the GFX code.
    // This is temporary and will go away once image access between
    // OCL and GFX is unified.
    // This happens because in GFX the layout comes from the driver and is
    // immutable, while in OCL we need to change the layout mid-codegen.
769     class COCLBTILayout : public CBTILayout
770     {
771     public:
COCLBTILayout(const USC::SShaderStageBTLayout * pLayout)772         COCLBTILayout(const USC::SShaderStageBTLayout* pLayout) : CBTILayout(pLayout)
773         {}
774 
775         USC::SShaderStageBTLayout* getModifiableLayout();
776     };
777 
778     class RetryManager
779     {
780     public:
781         RetryManager();
782         ~RetryManager();
783 
784         bool AdvanceState();
785         bool AllowLICM();
786         bool AllowPromotePrivateMemory();
787         bool AllowPreRAScheduler();
788         bool AllowVISAPreRAScheduler();
789         bool AllowCodeSinking();
790         bool AllowSimd32Slicing();
791         bool AllowLargeURBWrite();
792         void SetFirstStateId(int id);
793         bool IsFirstTry();
794         bool IsLastTry();
795         unsigned GetRetryId() const;
796 
797         void Enable();
798         void Disable();
799 
800         void SetSpillSize(unsigned int spillSize);
801         unsigned int GetLastSpillSize();
802         unsigned int numInstructions = 0;
803         /// the set of OCL kernels that need to recompile
804         std::set<std::string> kernelSet;
805 
806         void ClearSpillParams();
807         // save entry for given SIMD mode, to avoid recompile for next retry.
808         void SaveSIMDEntry(SIMDMode simdMode, CShader* shader);
809         CShader* GetSIMDEntry(SIMDMode simdMode);
810         bool AnyKernelSpills();
811 
812         // Try to pickup the simd mode & kernel based on heuristics and fill
813         // programOutput.  If returning true, then stop the further retry.
814         bool PickupKernels(CodeGenContext* cgCtx);
815 
816     private:
817         unsigned stateId;
818         // For debugging purposes, it can be useful to start on a particular
819         // ID rather than id 0.
820         unsigned firstStateId;
821 
822         unsigned getStateCnt();
823 
824         /// internal knob to disable retry manager.
825         bool enabled;
826 
827         unsigned lastSpillSize = 0;
828 
829         // cache the compiled kernel during retry
830         CShader* m_simdEntries[3];
831 
832         CShader* PickCSEntryForcedFromDriver(SIMDMode& simdMode,
833             unsigned char forcedSIMDModeFromDriver);
834         CShader* PickCSEntryByRegKey(SIMDMode& simdMode, ComputeShaderContext* cgCtx);
835         CShader* PickCSEntryEarly(SIMDMode& simdMode,
836             ComputeShaderContext* cgCtx);
837         CShader* PickCSEntryFinally(SIMDMode& simdMode);
838         void FreeAllocatedMemForNotPickedCS(SIMDMode simdMode);
839         bool PickupCS(ComputeShaderContext* cgCtx);
840     };
841 
842     /// this class adds intrinsic cache to LLVM context
843     class LLVMContextWrapper : public llvm::LLVMContext
844     {
845         LLVMContextWrapper(LLVMContextWrapper&) = delete;
846         LLVMContextWrapper& operator =(LLVMContextWrapper&) = delete;
847 
848     public:
849         LLVMContextWrapper(bool createResourceDimTypes = true);
850         /// ref count the LLVMContext as now CodeGenContext owns it
851         unsigned int refCount = 0;
852         /// IntrinsicIDCache - Cache of intrinsic pointer to numeric ID mappings
853         /// requested in this context
854         typedef llvm::ValueMap<const llvm::Function*, unsigned> SafeIntrinsicIDCacheTy;
855         SafeIntrinsicIDCacheTy m_SafeIntrinsicIDCache;
856         void AddRef();
857         void Release();
858     };
859 
860 
861     class CodeGenContext
862     {
863     public:
864         /// input: hash key
865         ShaderHash    hash;
866         ShaderType    type;
867         /// input: Platform features supported
868         const CPlatform& platform;
869         /// input: binding table layout used by the driver
870         const CBTILayout& btiLayout;
871         /// information about the driver
872         const CDriverInfo& m_DriverInfo;
873         /// output: driver instrumentation
874         TimeStats* m_compilerTimeStats = nullptr;
875         ShaderStats* m_sumShaderStats = nullptr;
876         /// output: list of buffer IDs which are promoted to direct AS
877         // Map of promoted buffer ids with their respective buffer offsets if needed. Buffer offset will be -1 if no need of buffer offset
878         std::map<unsigned, int> m_buffersPromotedToDirectAS;
879         // float 16, float32 and float64 denorm mode
880         Float_DenormMode    m_floatDenormMode16 = FLOAT_DENORM_FLUSH_TO_ZERO;
881         Float_DenormMode    m_floatDenormMode32 = FLOAT_DENORM_FLUSH_TO_ZERO;
882         Float_DenormMode    m_floatDenormMode64 = FLOAT_DENORM_FLUSH_TO_ZERO;
883 
884         PushConstantMode m_pushConstantMode = PushConstantMode::DEFAULT;
885 
886         SInstrTypes m_instrTypes;
887 
888         /////  used for instruction statistic before/after pass
889         int instrStat[TOTAL_TYPES][TOTAL_STAGE];
890 
891         // Module flag for subroutines/stackcalls enabled
892         bool m_enableSubroutine = false;
893         // Module flag for function pointers enabled
894         bool m_enableFunctionPointer = false;
895         // Module flag for when we need to compile multiple SIMD sizes to support SIMD variants
896         bool m_enableSimdVariantCompilation = false;
897 
898         // Adding multiversioning to partially redundant samples, if AIL is on.
899         bool m_enableSampleMultiversioning = false;
900 
901         // Do not generate gen binary, emit vISA only.
902         bool m_compileToVISAOnly = false;
903 
904         bool m_src1RemovedForBlendOpt = false;
905         llvm::AssemblyAnnotationWriter* annotater = nullptr;
906 
907         RetryManager m_retryManager;
908 
909         IGCMetrics::IGCMetric metrics;
910 
911         // shader stat for opt customization
912         uint32_t     m_tempCount = 0;
913         uint32_t     m_sampler = 0;
914         uint32_t     m_inputCount = 0;
915         uint32_t     m_dxbcCount = 0;
916         uint32_t     m_ConstantBufferCount = 0;
917         uint32_t     m_numGradientSinked = 0;
918         std::vector<unsigned> m_indexableTempSize;
919         bool         m_highPsRegisterPressure = 0;
920 
921         // Record previous simd for code patching
922         CShader* m_prevShader = nullptr;
923 
924         // For IR dump after pass
925         unsigned     m_numPasses = 0;
926         bool m_threadCombiningOptDone = false;
927 
928         void* m_ConstantBufferReplaceShaderPatterns = nullptr;
929         uint m_ConstantBufferReplaceShaderPatternsSize = 0;
930         uint m_ConstantBufferUsageMask = 0;
931         uint m_ConstantBufferReplaceSize = 0;
932         // tracking next available GRF offset for constants payload
933         unsigned int        m_constantPayloadNextAvailableGRFOffset = 0;
934         ConstantPayloadInfo m_constantPayloadOffsets;
935 
936         void* gtpin_init = nullptr;
937         bool m_hasLegacyDebugInfo = false;
938         bool m_hasEmu64BitInsts = false;
939 
940         CompilerStats m_Stats;
941         // Flag for staged compilation
942         CG_FLAG_t m_CgFlag = FLAG_CG_ALL_SIMDS;
943         // Staging context passing from Stage 1 for compile continuation
944         CG_CTX_t* m_StagingCtx = nullptr;
945         // We determine whether generating SIMD32 based on SIMD16's result
946         // For staged compilation, we record if SIMD32 will be generated in Stage1, and
947         // pass it to Stage2.
948         bool m_doSimd32Stage2 = false;
949         bool m_doSimd16Stage2 = false;
950         std::string m_savedBitcodeString;
951         SInstrTypes m_savedInstrTypes;
952 
953         bool m_hasVendorExtension = false;
954         bool PsHighSimdDisable = false;
955 
956         std::vector<int> m_hsIdxMap;
957         std::vector<int> m_dsIdxMap;
958         std::vector<int> m_gsIdxMap;
959         std::vector<int> m_hsNonDefaultIdxMap;
960         std::vector<int> m_dsNonDefaultIdxMap;
961         std::vector<int> m_gsNonDefaultIdxMap;
962         std::vector<int> m_psIdxMap;
963         DWORD dsInSize = 0;
964         DWORD LtoUsedMask = 0;
965         uint64_t m_SIMDInfo;
966     private:
967         //For storing error message
968         std::stringstream oclErrorMessage;
969         //For storing warning message
970         std::stringstream oclWarningMessage;
971 
972     protected:
973         // Objects pointed to by these pointers are owned by this class.
974         LLVMContextWrapper* llvmCtxWrapper;
975         /// input: LLVM module
976         IGCLLVM::Module* module = nullptr;
977         /// input: IGC MetaData Utils
978         IGC::IGCMD::MetaDataUtils* m_pMdUtils = nullptr;
979         IGC::ModuleMetaData* modMD = nullptr;
980 
981         virtual void setFlagsPerCtx();
982     public:
983         CodeGenContext(
984             ShaderType          _type,      ///< shader type
985             const CBTILayout& _bitLayout, ///< binding table layout to be used in code gen
986             const CPlatform& _platform,  ///< IGC HW platform description
987             const CDriverInfo& driverInfo, ///< Queries to know runtime features support
988             const bool          createResourceDimTypes = true,
989             LLVMContextWrapper* LLVMContext = nullptr)///< LLVM context to use, if null a new one will be created
type(_type)990             : type(_type), platform(_platform), btiLayout(_bitLayout), m_DriverInfo(driverInfo),
991             llvmCtxWrapper(LLVMContext), m_SIMDInfo(0)
992         {
993             if (llvmCtxWrapper == nullptr)
994             {
995                 initLLVMContextWrapper(createResourceDimTypes);
996             }
997             else
998             {
999                 llvmCtxWrapper->AddRef();
1000             }
1001 
1002             m_indexableTempSize.resize(64);
1003 
1004             for (uint i = 0; i < TOTAL_TYPES; i++)
1005             {
1006                 for (uint j = 0; j < TOTAL_STAGE; j++)
1007                 {
1008                     instrStat[i][j] = 0;
1009                 }
1010             }
1011 
1012             // Per context flag adjustment
1013             setFlagsPerCtx();
1014         }
1015 
1016         CodeGenContext(CodeGenContext&) = delete;
1017         CodeGenContext& operator =(CodeGenContext&) = delete;
1018 
1019         void initLLVMContextWrapper(bool createResourceDimTypes = true);
1020         llvm::LLVMContext* getLLVMContext() const;
1021         IGC::IGCMD::MetaDataUtils* getMetaDataUtils() const;
1022         IGCLLVM::Module* getModule() const;
1023 
1024         void setModule(llvm::Module* m);
1025         // Several clients explicitly delete module without resetting module to null.
1026         // This causes the issue later when the dtor is invoked (trying to delete a
1027         // dangling pointer again). This function is used to replace any explicit
1028         // delete in order to prevent deleting dangling pointers happening.
1029         void deleteModule();
1030         IGC::ModuleMetaData* getModuleMetaData() const;
1031         unsigned int getRegisterPointerSizeInBits(unsigned int AS) const;
1032         bool enableFunctionCall() const;
1033         void CheckEnableSubroutine(llvm::Module& M);
1034         virtual void InitVarMetaData();
1035         virtual ~CodeGenContext();
1036         void clear();
1037         void EmitError(std::ostream &OS, const char* errorstr, const llvm::Value *context) const;
1038         void EmitError(const char* errorstr, const llvm::Value *context);
1039         void EmitWarning(const char* warningstr);
HasError()1040         inline bool HasError() const { return !this->oclErrorMessage.str().empty(); }
HasWarning()1041         inline bool HasWarning() const { return !this->oclWarningMessage.str().empty(); }
GetWarning()1042         inline const std::string GetWarning() { return this->oclWarningMessage.str(); }
GetError()1043         inline const std::string GetError() { return this->oclErrorMessage.str(); }
GetErrorAndWarning()1044         inline const std::string GetErrorAndWarning() { return GetWarning() + GetError(); }
1045 
1046         CompOptions& getCompilerOption();
1047         virtual void resetOnRetry();
1048         virtual uint32_t getNumThreadsPerEU() const;
1049         virtual uint32_t getNumGRFPerThread() const;
1050         virtual bool forceGlobalMemoryAllocation() const;
1051         virtual bool allocatePrivateAsGlobalBuffer() const;
1052         virtual bool hasNoLocalToGenericCast() const;
1053         virtual bool hasNoPrivateToGenericCast() const;
1054         virtual int16_t getVectorCoalescingControl() const;
1055         bool isPOSH() const;
1056 
Stats()1057         CompilerStats& Stats()
1058         {
1059             return m_Stats;
1060         }
1061 
GetSIMDInfoOffset(SIMDMode simd,ShaderDispatchMode mode)1062         unsigned int GetSIMDInfoOffset(SIMDMode simd, ShaderDispatchMode mode)
1063         {
1064             unsigned int offset = 0;
1065 
1066             switch (mode) {
1067             case ShaderDispatchMode::NOT_APPLICABLE:
1068                 switch (simd) {
1069                 case SIMDMode::SIMD8:
1070                     offset = SIMD8_OFFSET;
1071                     break;
1072                 case SIMDMode::SIMD16:
1073                     offset = SIMD16_OFFSET;
1074                     break;
1075                 case SIMDMode::SIMD32:
1076                     offset = SIMD32_OFFSET;
1077                     break;
1078                 default:
1079                     break;
1080                 }
1081                 break;
1082 
1083             default:
1084                 break;
1085             }
1086             return offset;
1087         }
1088 
SetSIMDInfo(SIMDInfoBit bit,SIMDMode simd,ShaderDispatchMode mode)1089         void SetSIMDInfo(SIMDInfoBit bit, SIMDMode simd, ShaderDispatchMode mode)
1090         {
1091             unsigned int offset = GetSIMDInfoOffset(simd, mode);
1092             m_SIMDInfo |= (uint64_t)1 << (bit + offset);
1093         }
1094 
ClearSIMDInfo(SIMDMode simd,ShaderDispatchMode mode)1095         void ClearSIMDInfo(SIMDMode simd, ShaderDispatchMode mode)
1096         {
1097             unsigned int offset = GetSIMDInfoOffset(simd, mode);
1098             m_SIMDInfo &= ~(0xff << offset);
1099         }
1100 
GetSIMDInfo()1101         uint64_t GetSIMDInfo() { return m_SIMDInfo; }
1102 
knownSIMDSize()1103         virtual llvm::Optional<SIMDMode> knownSIMDSize() const {
1104             return llvm::None;
1105         }
1106 
1107         // This can be paired with `EncodeAS4GFXResource()` to get a unique
1108         // index.
getUniqueIndirectIdx()1109         uint32_t getUniqueIndirectIdx()
1110         {
1111             return getModuleMetaData()->CurUniqueIndirectIdx++;
1112         }
1113 
1114         // Frontends may elect to compute indices in their own way. If so,
1115         // they should call this at the end to mark the max index they have
1116         // reserved so that later passes can ensure that `getUniqueIndirectIdx()`
1117         // won't collide with any indices from the frontend.
setUniqueIndirectIdx(uint32_t NewVal)1118         void setUniqueIndirectIdx(uint32_t NewVal)
1119         {
1120             uint32_t &CurVal = getModuleMetaData()->CurUniqueIndirectIdx;
1121             CurVal = std::max(CurVal, NewVal);
1122         }
1123     };
1124 
1125     class VertexShaderContext : public CodeGenContext
1126     {
1127     public:
1128         // output: shader information
1129         SVertexShaderKernelProgram programOutput;
1130         VertexShaderContext(
1131             const CBTILayout& btiLayout, ///< binding table layout to be used in code gen
1132             const CPlatform& platform,  ///< IGC HW platform description
1133             const CDriverInfo& driverInfo,
1134             const bool          createResourceDimTypes = true,
1135             LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created
CodeGenContext(ShaderType::VERTEX_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1136             : CodeGenContext(ShaderType::VERTEX_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper),
1137             programOutput()
1138         {
1139         }
1140 
1141     };
1142 
1143     class PixelShaderContext : public CodeGenContext
1144     {
1145     public:
1146         // output: shader information
1147         SPixelShaderKernelProgram programOutput;
1148         PixelShaderContext(
1149             const CBTILayout& btiLayout, ///< binding table layout to be used in code gen
1150             const CPlatform& platform,  ///< IGC HW platform description
1151             const CDriverInfo& driverInfo,
1152             const bool          createResourceDimTypes = true,
1153             LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created
CodeGenContext(ShaderType::PIXEL_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1154             : CodeGenContext(ShaderType::PIXEL_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper),
1155             programOutput()
1156         {
1157         }
1158     };
1159 
1160     class GeometryShaderContext : public CodeGenContext
1161     {
1162     public:
1163         // output: shader information
1164         SGeometryShaderKernelProgram programOutput;
1165         GeometryShaderContext(
1166             const CBTILayout& btiLayout, ///< binding table layout to be used in code gen
1167             const CPlatform& platform,  ///< IGC HW platform description
1168             const CDriverInfo& driverInfo,
1169             const bool          createResourceDimTypes = true,
1170             LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created
CodeGenContext(ShaderType::GEOMETRY_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1171             : CodeGenContext(ShaderType::GEOMETRY_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper),
1172             programOutput()
1173         {
1174         }
1175     };
1176 
1177     struct SComputeShaderSecondCompileInput
1178     {
1179         bool secondCompile;
1180         bool isRowMajor;
1181         int numChannelsUsed;
1182         int runtimeVal_LoopCount;
1183         int runtimeVal_ResWidthOrHeight;
1184         int runtimeVal_ConstBufferSize;
1185 
SComputeShaderSecondCompileInputSComputeShaderSecondCompileInput1186         SComputeShaderSecondCompileInput()
1187             : secondCompile(false)
1188             , isRowMajor(false)
1189             , numChannelsUsed(0)
1190             , runtimeVal_LoopCount(0)
1191             , runtimeVal_ResWidthOrHeight(0)
1192             , runtimeVal_ConstBufferSize(0)
1193         {}
1194     };
1195 
1196     class ComputeShaderContext : public CodeGenContext
1197     {
1198     public:
1199         SComputeShaderKernelProgram programOutput;
1200         bool isSecondCompile;
1201         bool m_IsPingPongSecond;
1202         unsigned m_slmSize;
1203         bool numWorkGroupsUsed;
1204         bool m_ForceOneSIMD = false;
1205 
1206         ComputeShaderContext(
1207             const CBTILayout& btiLayout, ///< binding table layout to be used in code gen
1208             const CPlatform& platform,  ///< IGC HW platform description
1209             const CDriverInfo& driverInfo,
1210             const bool          createResourceDimTypes = true,
1211             LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created
CodeGenContext(ShaderType::COMPUTE_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1212             : CodeGenContext(ShaderType::COMPUTE_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper),
1213             programOutput()
1214         {
1215             isSecondCompile = false;
1216             m_IsPingPongSecond = false;
1217             m_slmSize = 0;
1218             numWorkGroupsUsed = false;
1219             m_threadGroupSize_X = 0;
1220             m_threadGroupSize_Y = 0;
1221             m_threadGroupSize_Z = 0;
1222         }
1223 
1224         /** get shader's thread group size */
1225         unsigned GetThreadGroupSize();
GetThreadGroupSizeX()1226         unsigned GetThreadGroupSizeX() { return m_threadGroupSize_X; }
GetThreadGroupSizeY()1227         unsigned GetThreadGroupSizeY() { return m_threadGroupSize_Y; }
GetThreadGroupSizeZ()1228         unsigned GetThreadGroupSizeZ() { return m_threadGroupSize_Z; }
1229         unsigned GetSlmSizePerSubslice();
1230         unsigned GetSlmSize() const;
1231         float GetThreadOccupancy(SIMDMode simdMode);
1232         /** get smallest SIMD mode allowed based on thread group size */
1233         SIMDMode GetLeastSIMDModeAllowed();
1234         /** get largest SIMD mode for performance based on thread group size */
1235         SIMDMode GetMaxSIMDMode();
1236 
1237         float GetSpillThreshold() const;
1238     private:
1239         unsigned m_threadGroupSize_X;
1240         unsigned m_threadGroupSize_Y;
1241         unsigned m_threadGroupSize_Z;
1242     };
1243 
1244     class HullShaderContext : public CodeGenContext
1245     {
1246     public:
1247         // output: shader information
1248         SHullShaderKernelProgram programOutput;
1249         HullShaderContext(
1250             const CBTILayout& btiLayout, ///< binding table layout to be used in code gen
1251             const CPlatform& platform,  ///< IGC HW platform description
1252             const CDriverInfo& driverInfo,
1253             const bool          createResourceDimTypes = true,
1254             LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created
CodeGenContext(ShaderType::HULL_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1255             : CodeGenContext(ShaderType::HULL_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper),
1256             programOutput()
1257         {
1258         }
1259     };
1260 
1261     class DomainShaderContext : public CodeGenContext
1262     {
1263     public:
1264         // output: shader information
1265         SDomainShaderKernelProgram programOutput;
1266         DomainShaderContext(
1267             const CBTILayout& btiLayout, ///< binding table layout to be used in code gen
1268             const CPlatform& platform,  ///< IGC HW platform description
1269             const CDriverInfo& driverInfo,
1270             const bool          createResourceDimTypes = true,
1271             LLVMContextWrapper* llvmCtxWrapper = nullptr) ///< LLVM context to use, if null a new one will be created
CodeGenContext(ShaderType::DOMAIN_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmCtxWrapper)1272             : CodeGenContext(ShaderType::DOMAIN_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmCtxWrapper),
1273             programOutput()
1274         {
1275         }
1276     };
1277     class OpenCLProgramContext : public CodeGenContext
1278     {
1279     public:
1280         // We should probably replace all of this with proper option parsing,
1281         // like RS does
1282         class InternalOptions
1283         {
1284         public:
InternalOptions(const TC::STB_TranslateInputArgs * pInputArgs)1285             InternalOptions(const TC::STB_TranslateInputArgs* pInputArgs) :
1286                 KernelDebugEnable(false),
1287                 IncludeSIPCSR(false),
1288                 IncludeSIPKernelDebug(false),
1289                 IntelGreaterThan4GBBufferRequired(false),
1290                 Use32BitPtrArith(false),
1291                 IncludeSIPKernelDebugWithLocalMemory(false),
1292                 IntelHasPositivePointerOffset(false),
1293                 IntelHasBufferOffsetArg(false),
1294                 IntelBufferOffsetArgOptional(true),
1295                 IntelHasSubDWAlignedPtrArg(false)
1296             {
1297                 if (pInputArgs == nullptr)
1298                     return;
1299 
1300                 if (pInputArgs->pInternalOptions != nullptr)
1301                 {
1302                     parseOptions(pInputArgs->pInternalOptions);
1303                 }
1304 
1305                 // Internal options are passed in via pOptions as well.
1306                 if (pInputArgs->pOptions != nullptr)
1307                 {
1308                     parseOptions(pInputArgs->pOptions);
1309                 }
1310             }
1311 
1312             bool KernelDebugEnable;
1313             bool IncludeSIPCSR;
1314             bool IncludeSIPKernelDebug;
1315             bool IntelGreaterThan4GBBufferRequired;
1316             bool IntelDisableA64WA = false;
1317             bool IntelForceEnableA64WA = false;
1318             bool Use32BitPtrArith = false;
1319             bool IncludeSIPKernelDebugWithLocalMemory;
1320 
1321             bool GTPinReRA = false;
1322             bool GTPinGRFInfo = false;
1323             bool GTPinScratchAreaSize = false;
1324             uint32_t GTPinScratchAreaSizeValue = 0;
1325 
1326             // stateless to stateful optimization
1327             bool IntelHasPositivePointerOffset; // default: false
1328             bool IntelHasBufferOffsetArg;       // default: false
1329             bool IntelBufferOffsetArgOptional;  // default: true
1330             bool IntelHasSubDWAlignedPtrArg;
1331                  // default: false, meaning kernel's sub-DW ptrArgs (char*, short*) are DW-aligned.
1332                  // This default is stronger than the natural alignment implied by char*/short*. But
1333                  // for historical reason, we have this.
1334 
1335             bool replaceGlobalOffsetsByZero = false;
1336             bool IntelEnablePreRAScheduling = true;
1337             bool PromoteStatelessToBindless = false;
1338             bool PreferBindlessImages = false;
1339             bool UseBindlessMode = false;
1340             bool UseBindlessPrintf = false;
1341             bool UseBindlessLegacyMode = true;
1342             bool EnableZEBinary = false;
1343             bool NoSpill = false;
1344 
1345             // Generic address related
1346             bool HasNoLocalToGeneric = false;
1347             bool ForceGlobalMemoryAllocation = false;
1348 
1349             // -1 : initial value that means it is not set from cmdline
1350             // 0-5: valid values set from the cmdline
1351             int16_t VectorCoalescingControl = -1;
1352 
1353             bool Intel128GRFPerThread = false;
1354             bool Intel256GRFPerThread = false;
1355             bool IntelNumThreadPerEU = false;
1356             uint32_t numThreadsPerEU = 0;
1357 
1358             private:
1359                 void parseOptions(const char* IntOptStr);
1360         };
1361 
1362         class Options
1363         {
1364         public:
Options(const TC::STB_TranslateInputArgs * pInputArgs)1365             Options(const TC::STB_TranslateInputArgs* pInputArgs) :
1366                 CorrectlyRoundedSqrt(false),
1367                 NoSubgroupIFP(false),
1368                 UniformWGS(false)
1369             {
1370                 if (pInputArgs == nullptr)
1371                     return;
1372 
1373                 if (pInputArgs->pOptions == nullptr)
1374                     return;
1375 
1376                 // Build options are of the form -cl-xxxx and -ze-xxxx
1377                 // So we skip these prefixes when reading the options to be agnostic of their source
1378 
1379                 // Runtime passes internal options via pOptions as well, and those
1380                 // internal options will be handled by InternalOptions class (parseOptions).
1381                 // !!! When adding a new internal option, please add it into internalOptions class!!!
1382                 // (Might combine both Options and InternalOptions into a single class!)
1383                 const char* options = pInputArgs->pOptions;
1384                 if (strstr(options, "-fp32-correctly-rounded-divide-sqrt"))
1385                 {
1386                     CorrectlyRoundedSqrt = true;
1387                 }
1388 
1389                 if (strstr(options, "-no-subgroup-ifp"))
1390                 {
1391                     NoSubgroupIFP = true;
1392                 }
1393 
1394                 if (strstr(options, "-uniform-work-group-size"))
1395                 {
1396                     // Note that this is only available for -cl-std >= 2.0.
1397                     // This will be checked before we place this into the
1398                     // the module metadata.
1399                     UniformWGS = true;
1400                 }
1401                 if (strstr(options, "-take-global-address"))
1402                 {
1403                     EnableTakeGlobalAddress = true;
1404                 }
1405                 if (strstr(options, "-library-compilation"))
1406                 {
1407                     IsLibraryCompilation = true;
1408                 }
1409                 if (const char* op = strstr(options, "-intel-reqd-eu-thread-count"))
1410                 {
1411                     IntelRequiredEUThreadCount = true;
1412                     // Take an integer value after this option
1413                     // atoi(..) ignores leading white spaces and characters after the actual number
1414                     requiredEUThreadCount = atoi(op + strlen("-intel-reqd-eu-thread-count="));
1415                 }
1416             }
1417 
1418             bool CorrectlyRoundedSqrt;
1419             bool NoSubgroupIFP;
1420             bool UniformWGS;
1421             bool EnableTakeGlobalAddress = false;
1422             bool IsLibraryCompilation = false;
1423             bool IntelRequiredEUThreadCount = false;
1424             uint32_t requiredEUThreadCount = 0;
1425         };
1426 
        // output: shader information
        iOpenCL::CGen8OpenCLProgram m_programOutput;
        SOpenCLProgramInfo m_programInfo;
        // Option sets built from the translate input arguments when the context
        // is constructed; both are const for the lifetime of the context.
        const InternalOptions m_InternalOptions;
        const Options m_Options;
        // Set to false by the constructor. Presumably the flag behind
        // isSPIRV()/setAsSPIRV() - their definitions are not in this header.
        bool isSpirV;
        // NOTE(review): presumably the value reported by
        // getProfilingTimerResolution(); units are not visible here - confirm.
        float m_ProfilingTimerResolution;
        // Copied from the constructor argument shouldUseNonCoherentStatelessBTI.
        bool m_ShouldUseNonCoherentStatelessBTI;
        // Starts at 0 via its default member initializer.
        uint32_t m_numUAVs = 0;
1436 
1437         OpenCLProgramContext(
1438             const COCLBTILayout& btiLayout,
1439             const CPlatform& platform,
1440             const TC::STB_TranslateInputArgs* pInputArgs,
1441             const CDriverInfo& driverInfo,
1442             LLVMContextWrapper* llvmContext = nullptr,
1443             bool shouldUseNonCoherentStatelessBTI = false,
1444             const bool createResourceDimTypes = true)
CodeGenContext(ShaderType::OPENCL_SHADER,btiLayout,platform,driverInfo,createResourceDimTypes,llvmContext)1445             : CodeGenContext(ShaderType::OPENCL_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmContext),
1446             m_programOutput(platform.getPlatformInfo(), *this),
1447             m_InternalOptions(pInputArgs),
1448             m_Options(pInputArgs),
1449             isSpirV(false),
1450             m_ShouldUseNonCoherentStatelessBTI(shouldUseNonCoherentStatelessBTI)
1451         {
1452         }
        // Accessors declared here and defined outside this header.
        // NOTE(review): isSPIRV()/setAsSPIRV() presumably read and set the
        // isSpirV member - confirm against the definitions.
        bool isSPIRV() const;
        void setAsSPIRV();
        float getProfilingTimerResolution();
        // Overrides of base-class virtuals (each is marked `override`);
        // their definitions are not part of this header.
        uint32_t getNumGRFPerThread() const override;
        uint32_t getNumThreadsPerEU() const override;
        bool forceGlobalMemoryAllocation() const override;
        bool allocatePrivateAsGlobalBuffer() const override;
        bool hasNoLocalToGenericCast() const override;
        bool hasNoPrivateToGenericCast() const override;
        int16_t getVectorCoalescingControl() const override;
    private:
        // Maps an llvm::Function* to a string; going by the name, a hash kept
        // per kernel function (TODO confirm where it is populated).
        llvm::DenseMap<llvm::Function*, std::string> m_hashes_per_kernel;
1465     };
1466 
    // Code generation entry points: one CodeGen overload per shader context
    // type. The definitions live outside this header.
    void CodeGen(PixelShaderContext* ctx);
    void CodeGen(ComputeShaderContext* ctx);
    void CodeGen(DomainShaderContext* ctx);
    void CodeGen(HullShaderContext* ctx);
    void CodeGen(VertexShaderContext* ctx);
    void CodeGen(GeometryShaderContext* ctx);
    void CodeGen(OpenCLProgramContext* ctx);

    // Takes the common CodeGenContext base rather than a specific shader
    // context. Presumably runs the IR optimization passes on ctx - defined
    // elsewhere, confirm there.
    void OptimizeIR(CodeGenContext* ctx);
1476 
    /**
     * Fold derived constants.  Loads constant-buffer (CB) data from CBptr using
     * an index and offset, calculates the new data based on the LLVM bitcode,
     * and stores the results to pNewCB.
     * The driver will then push pNewCB to the thread payload.
     *
     * @param bitcode       LLVM bitcode used to calculate the new data.
     * @param bitcodeSize   size of bitcode (presumably in bytes - TODO confirm).
     * @param CBptr         constant-buffer pointers, declared as an array of 15.
     * @param getResInfoCB  callback taking (uint[4], uint, uint, bool); its
     *                      contract is not visible here - see the definition.
     * @param pNewCB        destination for the calculated results.
     */
    void FoldDerivedConstant(char* bitcode, uint bitcodeSize, void* CBptr[15],
        std::function<void(uint[4], uint, uint, bool)> getResInfoCB, uint* pNewCB);
1484 } // end IGC namespace
1485