1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #ifndef G4_KERNEL_HPP
10 #define G4_KERNEL_HPP
11 
#include "G4_IR.hpp"
#include "FlowGraph.h"
#include "RelocationEntry.hpp"
#include "include/gtpin_IGC_interface.h"

#include <cstdint>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
24 
25 namespace vISA
26 {
// X-macro listing every RA_Type name. RA_TYPE(DO) expands to DO(<name>) once
// per entry, in the order written below, so this single list can drive several
// expansions (the RA_Type enum is produced from it via RA_TYPE(MAKE_ENUM)).
// NOTE(review): "RA" presumably stands for register allocation and "_BC" for a
// bank-conflict-aware variant - confirm against the allocator sources.
#define RA_TYPE(DO)                                                            \
  DO(TRIVIAL_BC_RA)                                                            \
  DO(TRIVIAL_RA)                                                               \
  DO(LOCAL_ROUND_ROBIN_BC_RA)                                                  \
  DO(LOCAL_ROUND_ROBIN_RA)                                                     \
  DO(LOCAL_FIRST_FIT_BC_RA)                                                    \
  DO(LOCAL_FIRST_FIT_RA)                                                       \
  DO(HYBRID_BC_RA)                                                             \
  DO(HYBRID_RA)                                                                \
  DO(GRAPH_COLORING_RR_BC_RA)                                                  \
  DO(GRAPH_COLORING_FF_BC_RA)                                                  \
  DO(GRAPH_COLORING_RR_RA)                                                     \
  DO(GRAPH_COLORING_FF_RA)                                                     \
  DO(GRAPH_COLORING_SPILL_RR_BC_RA)                                            \
  DO(GRAPH_COLORING_SPILL_FF_BC_RA)                                            \
  DO(GRAPH_COLORING_SPILL_RR_RA)                                               \
  DO(GRAPH_COLORING_SPILL_FF_RA)                                               \
  DO(GLOBAL_LINEAR_SCAN_RA)                                                    \
  DO(GLOBAL_LINEAR_SCAN_BC_RA)                                                 \
  DO(UNKNOWN_RA)
47 
// Enumeration generated from the RA_TYPE list: MAKE_ENUM is applied to every
// entry of that list, in list order.
// NOTE(review): MAKE_ENUM is not defined in this header; presumably it turns
// each name into an enumerator of the same spelling - verify at its definition.
enum RA_Type
{
    RA_TYPE(MAKE_ENUM)
};
52 
53 class G4_Kernel;
54 
55 class gtPinData
56 {
57 public:
58     enum RAPass
59     {
60         FirstRAPass = 0,
61         ReRAPass = 1
62     };
63 
gtPinData(G4_Kernel & k)64     gtPinData(G4_Kernel& k) : kernel(k) {whichRAPass = FirstRAPass;}
~gtPinData()65     ~gtPinData() { }
66 
operator new(size_t sz,Mem_Manager & m)67     void *operator new(size_t sz, Mem_Manager& m) { return m.alloc(sz); }
68 
markInst(G4_INST * i)69     void markInst(G4_INST* i) {
70         MUST_BE_TRUE(whichRAPass == FirstRAPass,
71             "Unexpectedly marking in re-RA pass.");
72         markedInsts.insert(i);
73     }
74 
75     void markInsts();
clearMarkedInsts()76     void clearMarkedInsts() { markedInsts.clear(); }
77     void removeUnmarkedInsts();
78 
isFirstRAPass() const79     bool isFirstRAPass() const { return whichRAPass == RAPass::FirstRAPass; }
isReRAPass() const80     bool isReRAPass() const { return whichRAPass == RAPass::ReRAPass; }
setRAPass(RAPass p)81     void setRAPass(RAPass p) { whichRAPass = p; }
82 
83     // All following functions work on byte granularity of GRF file
clearFreeGlobalRegs()84     void clearFreeGlobalRegs() { globalFreeRegs.clear(); }
getNumFreeGlobalRegs() const85     unsigned getNumFreeGlobalRegs() const { return (unsigned)globalFreeRegs.size(); }
getFreeGlobalReg(unsigned n) const86     unsigned getFreeGlobalReg(unsigned n) const { return globalFreeRegs[n]; }
addFreeGlobalReg(unsigned n)87     void addFreeGlobalReg(unsigned n) { globalFreeRegs.push_back(n); }
setFreeGlobalRegs(std::vector<unsigned> & vec)88     void setFreeGlobalRegs(std::vector<unsigned>& vec) {globalFreeRegs = vec;}
89 
90     // This function internally mallocs memory to hold buffer
91     // of free GRFs. It is meant to be freed by caller after
92     // last use of the buffer.
93     void* getFreeGRFInfo(unsigned& size);
94     void  setGTPinInit(void* buffer);
95 
getGTPinInit()96     gtpin::igc::igc_init_t* getGTPinInit() { return gtpin_init; }
97 
98     // return igc_info_t format buffer. caller casts it to igc_info_t.
99     void* getGTPinInfoBuffer(unsigned &bufferSize);
100 
setScratchNextFree(unsigned next)101     void setScratchNextFree(unsigned next) {
102         nextScratchFree = ((next + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>()) * numEltPerGRF<Type_UB>();
103     }
104     uint32_t getNumBytesScratchUse() const;
105 
setGTPinInitFromL0(bool val)106     void setGTPinInitFromL0(bool val) { gtpinInitFromL0 = val; }
isGTPinInitFromL0() const107     bool isGTPinInitFromL0() const { return gtpinInitFromL0; }
108 
109 private:
110     G4_Kernel& kernel;
111     std::set<G4_INST*> markedInsts;
112     RAPass whichRAPass;
113     // globalFreeRegs are in units of bytes in linearized register file.
114     // Data is assumed to be sorted in ascending order during insertion.
115     // Duplicates are not allowed.
116     std::vector<unsigned> globalFreeRegs;
117     // Member stores next free scratch slot
118     unsigned nextScratchFree = 0;
119 
120     bool gtpinInitFromL0 = false;
121     gtpin::igc::igc_init_t* gtpin_init = nullptr;
122 }; // class gtPinData
123 
124 class G4_BB;
125 class KernelDebugInfo;
126 class VarSplitPass;
127 
128 
129 class G4_Kernel
130 {
131 public:
132     using RelocationTableTy = std::vector<RelocationEntry>;
133 
134 private:
135     const char* name;
136     unsigned numRegTotal;
137     unsigned numThreads;
138     unsigned numSWSBTokens;
139     unsigned numAcc;
140     G4_ExecSize simdSize {0u}; // must start as 0
141     bool channelSliced = true;
142     bool hasAddrTaken;
143     bool regSharingHeuristics;
144     Options *m_options;
145     const Attributes* m_kernelAttrs;
146 
147     RA_Type RAType;
148     KernelDebugInfo* kernelDbgInfo = nullptr;
149     gtPinData* gtPinInfo = nullptr;
150 
151     uint32_t asmInstCount;
152     uint64_t kernelID;
153 
154     unsigned callerSaveLastGRF;
155 
156     bool m_hasIndirectCall = false;
157 
158     VarSplitPass* varSplitPass = nullptr;
159 
160     // map key is filename string with complete path.
161     // if first elem of pair is false, the file wasn't found.
162     // the second elem of pair stores the actual source line stream
163     // for each source file referenced by this kernel.
164     std::map<std::string, std::pair<bool, std::vector<std::string>>> debugSrcLineMap;
165 
166     // This must be explicitly set by kernel attributes later
167     VISATarget kernelType = VISA_3D;
168 
169     // stores all relocations to be performed after binary encoding
170     RelocationTableTy relocationTable;
171 
172     // the last output we dumped for this kernel and index of next dump
173     std::string            lastG4Asm;
174     int                    nextDumpIndex = 0;
175 
176     bool sharedDebugInfo = false;
177     bool sharedGTPinInfo = false;
178 
179     G4_BB* perThreadPayloadBB = nullptr;
180     G4_BB* crossThreadPayloadBB = nullptr;
181     // There's two entires prolog for setting FFID for compute shaders.
182     G4_BB* computeFFIDGP = nullptr;
183     G4_BB* computeFFIDGP1 = nullptr;
184 public:
185     FlowGraph              fg;
186     DECLARE_LIST           Declares;
187     DECLARE_LIST           callerRestoreDecls;
188 
189     unsigned char major_version;
190     unsigned char minor_version;
191 
192     G4_Kernel(INST_LIST_NODE_ALLOCATOR& alloc,
193         Mem_Manager& m, Options* options, Attributes* anAttr,
194         unsigned char major, unsigned char minor);
195     ~G4_Kernel();
196 
operator new(size_t sz,Mem_Manager & m)197     void *operator new(size_t sz, Mem_Manager& m) {return m.alloc(sz);}
198 
setBuilder(IR_Builder * pBuilder)199     void setBuilder(IR_Builder *pBuilder) {fg.setBuilder(pBuilder);}
200 
useRegSharingHeuristics() const201     bool useRegSharingHeuristics() const {
202         // Register sharing not enabled in presence of stack calls
203         return regSharingHeuristics && !m_hasIndirectCall &&
204             !fg.getIsStackCallFunc() && !fg.getHasStackCalls();
205     }
206 
setNumThreads(int nThreads)207     void     setNumThreads(int nThreads) { numThreads = nThreads; }
getNumThreads() const208     uint32_t getNumThreads() const { return numThreads; }
209 
getNumSWSBTokens() const210     uint32_t getNumSWSBTokens() const { return numSWSBTokens; }
211 
getNumAcc() const212     uint32_t getNumAcc() const { return numAcc; }
213 
setAsmCount(int count)214     void     setAsmCount(int count) { asmInstCount = count; }
getAsmCount() const215     uint32_t getAsmCount() const { return asmInstCount; }
216 
setKernelID(uint64_t ID)217     void     setKernelID(uint64_t ID) { kernelID = ID; }
getKernelID() const218     uint64_t getKernelID() const { return kernelID; }
219 
getOptions()220     Options *getOptions() { return m_options; }
getKernelAttrs() const221     const Attributes* getKernelAttrs() const { return m_kernelAttrs; }
getBoolKernelAttr(Attributes::ID aID) const222     bool getBoolKernelAttr(Attributes::ID aID) const {
223         return getKernelAttrs()->getBoolKernelAttr(aID);
224     }
getInt32KernelAttr(Attributes::ID aID) const225     int32_t getInt32KernelAttr(Attributes::ID aID) const {
226         return getKernelAttrs()->getInt32KernelAttr(aID);
227     }
getOption(vISAOptions opt) const228     bool getOption(vISAOptions opt) const { return m_options->getOption(opt); }
229     void computeChannelSlicing();
230     void calculateSimdSize();
getSimdSize()231     G4_ExecSize getSimdSize() { return simdSize; }
getChannelSlicing() const232     bool getChannelSlicing() const { return channelSliced; }
getSimdSizeWithSlicing()233     unsigned getSimdSizeWithSlicing() { return channelSliced ? simdSize/2 : simdSize; }
234 
setHasAddrTaken(bool val)235     void setHasAddrTaken(bool val) { hasAddrTaken = val; }
getHasAddrTaken()236     bool getHasAddrTaken() { return hasAddrTaken;  }
237 
setNumRegTotal(unsigned num)238     void setNumRegTotal(unsigned num) { numRegTotal = num; }
getNumRegTotal() const239     unsigned getNumRegTotal() const { return numRegTotal; }
240 
setName(const char * n)241     void setName(const char* n) { name = n; }
getName() const242     const char* getName() const { return name; }
243 
244     void updateKernelByNumThreads(int nThreads);
245 
246     void evalAddrExp();
247 
setRAType(RA_Type type)248     void setRAType(RA_Type type) { RAType = type; }
getRAType() const249     RA_Type getRAType() const { return RAType; }
250 
hasKernelDebugInfo() const251     bool hasKernelDebugInfo() const {return kernelDbgInfo;}
252     void setKernelDebugInfo(KernelDebugInfo* k);
253     KernelDebugInfo* getKernelDebugInfo();
254 
255     void setGTPinData(gtPinData* p);
hasGTPinInit() const256     bool hasGTPinInit() const {return gtPinInfo && gtPinInfo->getGTPinInit();}
getGTPinData()257     gtPinData* getGTPinData() {
258         if (!gtPinInfo)
259             allocGTPinData();
260 
261         return gtPinInfo;
262     }
allocGTPinData()263     void allocGTPinData() {gtPinInfo = new(fg.mem) gtPinData(*this);}
264 
getCallerSaveLastGRF() const265     unsigned getCallerSaveLastGRF() const { return callerSaveLastGRF; }
266 
267     // This function returns starting register number to use
268     // for allocating FE/BE stack/frame ptrs.
269     unsigned getStackCallStartReg() const;
270     unsigned calleeSaveStart() const;
271     unsigned getNumCalleeSaveRegs() const;
272 
273     // return the number of reserved GRFs for stack call ABI
274     // the reserved registers are at the end of the GRF file (e.g., r125-r127)
numReservedABIGRF() const275     uint32_t numReservedABIGRF() const {
276         return 3;
277     }
278 
279     // purpose of the GRFs reserved for stack call ABI
280     const int FPSPGRF = 0;
281     const int SpillHeaderGRF = 1;
282     const int ThreadHeaderGRF = 2;
283 
getFPSPGRF() const284     uint32_t getFPSPGRF() const{
285         return getStackCallStartReg() + FPSPGRF;
286     }
287 
getSpillHeaderGRF() const288     uint32_t getSpillHeaderGRF() const{
289         return getStackCallStartReg() + SpillHeaderGRF;
290     }
291 
getThreadHeaderGRF() const292     uint32_t getThreadHeaderGRF() const{
293         return getStackCallStartReg() + ThreadHeaderGRF;
294     }
295 
296     void renameAliasDeclares();
297 
hasIndirectCall() const298     bool hasIndirectCall() const {return m_hasIndirectCall;}
setHasIndirectCall()299     void setHasIndirectCall() {m_hasIndirectCall = true;}
300 
getRelocationTable()301     RelocationTableTy& getRelocationTable() {
302         return relocationTable;
303     }
304 
getRelocationTable() const305     const RelocationTableTy& getRelocationTable() const {
306         return relocationTable;
307     }
308 
309     void doRelocation(void* binary, uint32_t binarySize);
310 
311     G4_INST* getFirstNonLabelInst() const;
312 
313     std::string getDebugSrcLine(const std::string& filename, int lineNo);
314 
315     VarSplitPass* getVarSplitPass();
316 
getKernelType() const317     VISATarget getKernelType() const { return kernelType; }
setKernelType(VISATarget t)318     void setKernelType(VISATarget t) { kernelType = t; }
319 
320 
321     /// dump this kernel to the standard error
322     void dump(std::ostream &os = std::cerr) const;  // used in debugger
323 
324     // dumps .dot files (if enabled) and .g4 (if enabled)
325     void dumpToFile(const std::string &suffix);
326 
327     void emitDeviceAsm(std::ostream& output, const void * binary, uint32_t binarySize);
328 
329     void emitRegInfo();
330     void emitRegInfoKernel(std::ostream& output);
331 
hasPerThreadPayloadBB() const332     bool hasPerThreadPayloadBB() const { return perThreadPayloadBB != nullptr; }
getPerThreadPayloadBB() const333     G4_BB* getPerThreadPayloadBB() const { return perThreadPayloadBB; }
setPerThreadPayloadBB(G4_BB * bb)334     void setPerThreadPayloadBB(G4_BB* bb) { perThreadPayloadBB = bb; }
hasCrossThreadPayloadBB() const335     bool hasCrossThreadPayloadBB() const { return crossThreadPayloadBB != nullptr; }
getCrossThreadPayloadBB() const336     G4_BB* getCrossThreadPayloadBB() const { return crossThreadPayloadBB; }
setCrossThreadPayloadBB(G4_BB * bb)337     void setCrossThreadPayloadBB(G4_BB* bb) { crossThreadPayloadBB = bb; }
hasComputeFFIDProlog() const338     bool hasComputeFFIDProlog() const {
339         return computeFFIDGP != nullptr && computeFFIDGP1 != nullptr;
340     }
setComputeFFIDGPBB(G4_BB * bb)341     void setComputeFFIDGPBB(G4_BB* bb) { computeFFIDGP = bb; }
setComputeFFIDGP1BB(G4_BB * bb)342     void setComputeFFIDGP1BB(G4_BB* bb) { computeFFIDGP1 = bb; }
343 
344     unsigned getCrossThreadNextOff() const;
345     unsigned getPerThreadNextOff() const;
346     unsigned getComputeFFIDGPNextOff() const;
347     unsigned getComputeFFIDGP1NextOff() const;
348 
349 private:
350     G4_BB* getNextBB(G4_BB* bb) const;
351     unsigned getBinOffsetOfBB(G4_BB* bb) const;
352 
353     void setKernelParameters();
354 
355     void dumpDotFileInternal(const std::string &baseName);
356     void dumpG4Internal(const std::string &baseName);
357     void dumpG4InternalTo(std::ostream &os);
358 
359     // stuff pertaining to emitDeviceAsm
360     void emitDeviceAsmHeaderComment(std::ostream& os);
361     void emitDeviceAsmInstructionsIga(std::ostream& os, const void * binary, uint32_t binarySize);
362     void emitDeviceAsmInstructionsOldAsm(std::ostream& os);
363 
364 public:
365     // fused call wa
366     //     add  v10  -ip,  v20                    // ip_start_inst
367     //     add  v11   v10, 0x33 (-label_patch)    // patch inst
368     //     ...
369     //     call v11                               // ip_end_inst
370     // This map keeps  patch inst --> <ip_start_inst, ip_end_inst>
371     // Once encoding is decided, label_path = IP(ip_end_inst) - IP(ip_start_inst)
372     //
373     // m_labelPatchInsts: keep the info described above
374     // m_instToBBs:  convenient map to BBs that those insts belong to
375     // m_waCallInsts: call insts whose targets should be defined outside the smallEU branch.
376     // m_maskOffWAInsts: insts whose MaskOff needs to be changed for this WA.
377     std::unordered_map<G4_INST*, std::pair<G4_INST*, G4_INST*>> m_labelPatchInsts;
378     std::unordered_map<G4_INST*, G4_BB*> m_instToBBs;
379     std::list<G4_INST*> m_waCallInsts;
380     std::unordered_map <G4_INST*, G4_BB*> m_maskOffWAInsts;
setMaskOffset(G4_INST * I,G4_InstOption MO)381     void setMaskOffset(G4_INST* I, G4_InstOption MO) {
382         // For call WA
383         assert((I->getMaskOffset() + I->getExecSize()) <= 16);
384         assert(I->getPredicate() == nullptr && I->getCondMod() == nullptr);
385         I->setMaskOption(MO);
386     }
387 
388 }; // G4_Kernel
389 }
390 
391 
392 
393 #endif // G4_KERNEL_HPP
394