1 /*========================== begin_copyright_notice ============================ 2 3 Copyright (C) 2021 Intel Corporation 4 5 SPDX-License-Identifier: MIT 6 7 ============================= end_copyright_notice ===========================*/ 8 9 #ifndef G4_KERNEL_HPP 10 #define G4_KERNEL_HPP 11 12 #include "G4_IR.hpp" 13 #include "FlowGraph.h" 14 #include "RelocationEntry.hpp" 15 #include "include/gtpin_IGC_interface.h" 16 17 #include <cstdint> 18 #include <map> 19 #include <iostream> 20 #include <unordered_map> 21 #include <unordered_set> 22 #include <utility> 23 #include <vector> 24 25 namespace vISA 26 { 27 #define RA_TYPE(DO) \ 28 DO(TRIVIAL_BC_RA) \ 29 DO(TRIVIAL_RA) \ 30 DO(LOCAL_ROUND_ROBIN_BC_RA) \ 31 DO(LOCAL_ROUND_ROBIN_RA) \ 32 DO(LOCAL_FIRST_FIT_BC_RA) \ 33 DO(LOCAL_FIRST_FIT_RA) \ 34 DO(HYBRID_BC_RA) \ 35 DO(HYBRID_RA) \ 36 DO(GRAPH_COLORING_RR_BC_RA) \ 37 DO(GRAPH_COLORING_FF_BC_RA) \ 38 DO(GRAPH_COLORING_RR_RA) \ 39 DO(GRAPH_COLORING_FF_RA) \ 40 DO(GRAPH_COLORING_SPILL_RR_BC_RA) \ 41 DO(GRAPH_COLORING_SPILL_FF_BC_RA) \ 42 DO(GRAPH_COLORING_SPILL_RR_RA) \ 43 DO(GRAPH_COLORING_SPILL_FF_RA) \ 44 DO(GLOBAL_LINEAR_SCAN_RA) \ 45 DO(GLOBAL_LINEAR_SCAN_BC_RA) \ 46 DO(UNKNOWN_RA) 47 48 enum RA_Type 49 { 50 RA_TYPE(MAKE_ENUM) 51 }; 52 53 class G4_Kernel; 54 55 class gtPinData 56 { 57 public: 58 enum RAPass 59 { 60 FirstRAPass = 0, 61 ReRAPass = 1 62 }; 63 gtPinData(G4_Kernel & k)64 gtPinData(G4_Kernel& k) : kernel(k) {whichRAPass = FirstRAPass;} ~gtPinData()65 ~gtPinData() { } 66 operator new(size_t sz,Mem_Manager & m)67 void *operator new(size_t sz, Mem_Manager& m) { return m.alloc(sz); } 68 markInst(G4_INST * i)69 void markInst(G4_INST* i) { 70 MUST_BE_TRUE(whichRAPass == FirstRAPass, 71 "Unexpectedly marking in re-RA pass."); 72 markedInsts.insert(i); 73 } 74 75 void markInsts(); clearMarkedInsts()76 void clearMarkedInsts() { markedInsts.clear(); } 77 void removeUnmarkedInsts(); 78 isFirstRAPass() const79 bool isFirstRAPass() const { return whichRAPass == RAPass::FirstRAPass; } isReRAPass() const80 bool isReRAPass() const { return whichRAPass == RAPass::ReRAPass; } setRAPass(RAPass p)81 void setRAPass(RAPass p) { whichRAPass = p; } 82 83 // All following functions work on byte granularity of GRF file clearFreeGlobalRegs()84 void clearFreeGlobalRegs() { globalFreeRegs.clear(); } getNumFreeGlobalRegs() const85 unsigned getNumFreeGlobalRegs() const { return (unsigned)globalFreeRegs.size(); } getFreeGlobalReg(unsigned n) const86 unsigned getFreeGlobalReg(unsigned n) const { return globalFreeRegs[n]; } addFreeGlobalReg(unsigned n)87 void addFreeGlobalReg(unsigned n) { globalFreeRegs.push_back(n); } setFreeGlobalRegs(std::vector<unsigned> & vec)88 void setFreeGlobalRegs(std::vector<unsigned>& vec) {globalFreeRegs = vec;} 89 90 // This function internally mallocs memory to hold buffer 91 // of free GRFs. It is meant to be freed by caller after 92 // last use of the buffer. 93 void* getFreeGRFInfo(unsigned& size); 94 void setGTPinInit(void* buffer); 95 getGTPinInit()96 gtpin::igc::igc_init_t* getGTPinInit() { return gtpin_init; } 97 98 // return igc_info_t format buffer. caller casts it to igc_info_t. 99 void* getGTPinInfoBuffer(unsigned &bufferSize); 100 setScratchNextFree(unsigned next)101 void setScratchNextFree(unsigned next) { 102 nextScratchFree = ((next + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>()) * numEltPerGRF<Type_UB>(); 103 } 104 uint32_t getNumBytesScratchUse() const; 105 setGTPinInitFromL0(bool val)106 void setGTPinInitFromL0(bool val) { gtpinInitFromL0 = val; } isGTPinInitFromL0() const107 bool isGTPinInitFromL0() const { return gtpinInitFromL0; } 108 109 private: 110 G4_Kernel& kernel; 111 std::set<G4_INST*> markedInsts; 112 RAPass whichRAPass; 113 // globalFreeRegs are in units of bytes in linearized register file. 114 // Data is assumed to be sorted in ascending order during insertion. 115 // Duplicates are not allowed. 116 std::vector<unsigned> globalFreeRegs; 117 // Member stores next free scratch slot 118 unsigned nextScratchFree = 0; 119 120 bool gtpinInitFromL0 = false; 121 gtpin::igc::igc_init_t* gtpin_init = nullptr; 122 }; // class gtPinData 123 124 class G4_BB; 125 class KernelDebugInfo; 126 class VarSplitPass; 127 128 129 class G4_Kernel 130 { 131 public: 132 using RelocationTableTy = std::vector<RelocationEntry>; 133 134 private: 135 const char* name; 136 unsigned numRegTotal; 137 unsigned numThreads; 138 unsigned numSWSBTokens; 139 unsigned numAcc; 140 G4_ExecSize simdSize {0u}; // must start as 0 141 bool channelSliced = true; 142 bool hasAddrTaken; 143 bool regSharingHeuristics; 144 Options *m_options; 145 const Attributes* m_kernelAttrs; 146 147 RA_Type RAType; 148 KernelDebugInfo* kernelDbgInfo = nullptr; 149 gtPinData* gtPinInfo = nullptr; 150 151 uint32_t asmInstCount; 152 uint64_t kernelID; 153 154 unsigned callerSaveLastGRF; 155 156 bool m_hasIndirectCall = false; 157 158 VarSplitPass* varSplitPass = nullptr; 159 160 // map key is filename string with complete path. 161 // if first elem of pair is false, the file wasn't found. 162 // the second elem of pair stores the actual source line stream 163 // for each source file referenced by this kernel. 164 std::map<std::string, std::pair<bool, std::vector<std::string>>> debugSrcLineMap; 165 166 // This must be explicitly set by kernel attributes later 167 VISATarget kernelType = VISA_3D; 168 169 // stores all relocations to be performed after binary encoding 170 RelocationTableTy relocationTable; 171 172 // the last output we dumped for this kernel and index of next dump 173 std::string lastG4Asm; 174 int nextDumpIndex = 0; 175 176 bool sharedDebugInfo = false; 177 bool sharedGTPinInfo = false; 178 179 G4_BB* perThreadPayloadBB = nullptr; 180 G4_BB* crossThreadPayloadBB = nullptr; 181 // There's two entires prolog for setting FFID for compute shaders. 182 G4_BB* computeFFIDGP = nullptr; 183 G4_BB* computeFFIDGP1 = nullptr; 184 public: 185 FlowGraph fg; 186 DECLARE_LIST Declares; 187 DECLARE_LIST callerRestoreDecls; 188 189 unsigned char major_version; 190 unsigned char minor_version; 191 192 G4_Kernel(INST_LIST_NODE_ALLOCATOR& alloc, 193 Mem_Manager& m, Options* options, Attributes* anAttr, 194 unsigned char major, unsigned char minor); 195 ~G4_Kernel(); 196 operator new(size_t sz,Mem_Manager & m)197 void *operator new(size_t sz, Mem_Manager& m) {return m.alloc(sz);} 198 setBuilder(IR_Builder * pBuilder)199 void setBuilder(IR_Builder *pBuilder) {fg.setBuilder(pBuilder);} 200 useRegSharingHeuristics() const201 bool useRegSharingHeuristics() const { 202 // Register sharing not enabled in presence of stack calls 203 return regSharingHeuristics && !m_hasIndirectCall && 204 !fg.getIsStackCallFunc() && !fg.getHasStackCalls(); 205 } 206 setNumThreads(int nThreads)207 void setNumThreads(int nThreads) { numThreads = nThreads; } getNumThreads() const208 uint32_t getNumThreads() const { return numThreads; } 209 getNumSWSBTokens() const210 uint32_t getNumSWSBTokens() const { return numSWSBTokens; } 211 getNumAcc() const212 uint32_t getNumAcc() const { return numAcc; } 213 setAsmCount(int count)214 void setAsmCount(int count) { asmInstCount = count; } getAsmCount() const215 uint32_t getAsmCount() const { return asmInstCount; } 216 setKernelID(uint64_t ID)217 void setKernelID(uint64_t ID) { kernelID = ID; } getKernelID() const218 uint64_t getKernelID() const { return kernelID; } 219 getOptions()220 Options *getOptions() { return m_options; } getKernelAttrs() const221 const Attributes* getKernelAttrs() const { return m_kernelAttrs; } getBoolKernelAttr(Attributes::ID aID) const222 bool getBoolKernelAttr(Attributes::ID aID) const { 223 return getKernelAttrs()->getBoolKernelAttr(aID); 224 } getInt32KernelAttr(Attributes::ID aID) const225 int32_t getInt32KernelAttr(Attributes::ID aID) const { 226 return getKernelAttrs()->getInt32KernelAttr(aID); 227 } getOption(vISAOptions opt) const228 bool getOption(vISAOptions opt) const { return m_options->getOption(opt); } 229 void computeChannelSlicing(); 230 void calculateSimdSize(); getSimdSize()231 G4_ExecSize getSimdSize() { return simdSize; } getChannelSlicing() const232 bool getChannelSlicing() const { return channelSliced; } getSimdSizeWithSlicing()233 unsigned getSimdSizeWithSlicing() { return channelSliced ? simdSize/2 : simdSize; } 234 setHasAddrTaken(bool val)235 void setHasAddrTaken(bool val) { hasAddrTaken = val; } getHasAddrTaken()236 bool getHasAddrTaken() { return hasAddrTaken; } 237 setNumRegTotal(unsigned num)238 void setNumRegTotal(unsigned num) { numRegTotal = num; } getNumRegTotal() const239 unsigned getNumRegTotal() const { return numRegTotal; } 240 setName(const char * n)241 void setName(const char* n) { name = n; } getName() const242 const char* getName() const { return name; } 243 244 void updateKernelByNumThreads(int nThreads); 245 246 void evalAddrExp(); 247 setRAType(RA_Type type)248 void setRAType(RA_Type type) { RAType = type; } getRAType() const249 RA_Type getRAType() const { return RAType; } 250 hasKernelDebugInfo() const251 bool hasKernelDebugInfo() const {return kernelDbgInfo;} 252 void setKernelDebugInfo(KernelDebugInfo* k); 253 KernelDebugInfo* getKernelDebugInfo(); 254 255 void setGTPinData(gtPinData* p); hasGTPinInit() const256 bool hasGTPinInit() const {return gtPinInfo && gtPinInfo->getGTPinInit();} getGTPinData()257 gtPinData* getGTPinData() { 258 if (!gtPinInfo) 259 allocGTPinData(); 260 261 return gtPinInfo; 262 } allocGTPinData()263 void allocGTPinData() {gtPinInfo = new(fg.mem) gtPinData(*this);} 264 getCallerSaveLastGRF() const265 unsigned getCallerSaveLastGRF() const { return callerSaveLastGRF; } 266 267 // This function returns starting register number to use 268 // for allocating FE/BE stack/frame ptrs. 269 unsigned getStackCallStartReg() const; 270 unsigned calleeSaveStart() const; 271 unsigned getNumCalleeSaveRegs() const; 272 273 // return the number of reserved GRFs for stack call ABI 274 // the reserved registers are at the end of the GRF file (e.g., r125-r127) numReservedABIGRF() const275 uint32_t numReservedABIGRF() const { 276 return 3; 277 } 278 279 // purpose of the GRFs reserved for stack call ABI 280 const int FPSPGRF = 0; 281 const int SpillHeaderGRF = 1; 282 const int ThreadHeaderGRF = 2; 283 getFPSPGRF() const284 uint32_t getFPSPGRF() const{ 285 return getStackCallStartReg() + FPSPGRF; 286 } 287 getSpillHeaderGRF() const288 uint32_t getSpillHeaderGRF() const{ 289 return getStackCallStartReg() + SpillHeaderGRF; 290 } 291 getThreadHeaderGRF() const292 uint32_t getThreadHeaderGRF() const{ 293 return getStackCallStartReg() + ThreadHeaderGRF; 294 } 295 296 void renameAliasDeclares(); 297 hasIndirectCall() const298 bool hasIndirectCall() const {return m_hasIndirectCall;} setHasIndirectCall()299 void setHasIndirectCall() {m_hasIndirectCall = true;} 300 getRelocationTable()301 RelocationTableTy& getRelocationTable() { 302 return relocationTable; 303 } 304 getRelocationTable() const305 const RelocationTableTy& getRelocationTable() const { 306 return relocationTable; 307 } 308 309 void doRelocation(void* binary, uint32_t binarySize); 310 311 G4_INST* getFirstNonLabelInst() const; 312 313 std::string getDebugSrcLine(const std::string& filename, int lineNo); 314 315 VarSplitPass* getVarSplitPass(); 316 getKernelType() const317 VISATarget getKernelType() const { return kernelType; } setKernelType(VISATarget t)318 void setKernelType(VISATarget t) { kernelType = t; } 319 320 321 /// dump this kernel to the standard error 322 void dump(std::ostream &os = std::cerr) const; // used in debugger 323 324 // dumps .dot files (if enabled) and .g4 (if enabled) 325 void dumpToFile(const std::string &suffix); 326 327 void emitDeviceAsm(std::ostream& output, const void * binary, uint32_t binarySize); 328 329 void emitRegInfo(); 330 void emitRegInfoKernel(std::ostream& output); 331 hasPerThreadPayloadBB() const332 bool hasPerThreadPayloadBB() const { return perThreadPayloadBB != nullptr; } getPerThreadPayloadBB() const333 G4_BB* getPerThreadPayloadBB() const { return perThreadPayloadBB; } setPerThreadPayloadBB(G4_BB * bb)334 void setPerThreadPayloadBB(G4_BB* bb) { perThreadPayloadBB = bb; } hasCrossThreadPayloadBB() const335 bool hasCrossThreadPayloadBB() const { return crossThreadPayloadBB != nullptr; } getCrossThreadPayloadBB() const336 G4_BB* getCrossThreadPayloadBB() const { return crossThreadPayloadBB; } setCrossThreadPayloadBB(G4_BB * bb)337 void setCrossThreadPayloadBB(G4_BB* bb) { crossThreadPayloadBB = bb; } hasComputeFFIDProlog() const338 bool hasComputeFFIDProlog() const { 339 return computeFFIDGP != nullptr && computeFFIDGP1 != nullptr; 340 } setComputeFFIDGPBB(G4_BB * bb)341 void setComputeFFIDGPBB(G4_BB* bb) { computeFFIDGP = bb; } setComputeFFIDGP1BB(G4_BB * bb)342 void setComputeFFIDGP1BB(G4_BB* bb) { computeFFIDGP1 = bb; } 343 344 unsigned getCrossThreadNextOff() const; 345 unsigned getPerThreadNextOff() const; 346 unsigned getComputeFFIDGPNextOff() const; 347 unsigned getComputeFFIDGP1NextOff() const; 348 349 private: 350 G4_BB* getNextBB(G4_BB* bb) const; 351 unsigned getBinOffsetOfBB(G4_BB* bb) const; 352 353 void setKernelParameters(); 354 355 void dumpDotFileInternal(const std::string &baseName); 356 void dumpG4Internal(const std::string &baseName); 357 void dumpG4InternalTo(std::ostream &os); 358 359 // stuff pertaining to emitDeviceAsm 360 void emitDeviceAsmHeaderComment(std::ostream& os); 361 void emitDeviceAsmInstructionsIga(std::ostream& os, const void * binary, uint32_t binarySize); 362 void emitDeviceAsmInstructionsOldAsm(std::ostream& os); 363 364 public: 365 // fused call wa 366 // add v10 -ip, v20 // ip_start_inst 367 // add v11 v10, 0x33 (-label_patch) // patch inst 368 // ... 369 // call v11 // ip_end_inst 370 // This map keeps patch inst --> <ip_start_inst, ip_end_inst> 371 // Once encoding is decided, label_path = IP(ip_end_inst) - IP(ip_start_inst) 372 // 373 // m_labelPatchInsts: keep the info described above 374 // m_instToBBs: convenient map to BBs that those insts belong to 375 // m_waCallInsts: call insts whose targets should be defined outside the smallEU branch. 376 // m_maskOffWAInsts: insts whose MaskOff needs to be changed for this WA. 377 std::unordered_map<G4_INST*, std::pair<G4_INST*, G4_INST*>> m_labelPatchInsts; 378 std::unordered_map<G4_INST*, G4_BB*> m_instToBBs; 379 std::list<G4_INST*> m_waCallInsts; 380 std::unordered_map <G4_INST*, G4_BB*> m_maskOffWAInsts; setMaskOffset(G4_INST * I,G4_InstOption MO)381 void setMaskOffset(G4_INST* I, G4_InstOption MO) { 382 // For call WA 383 assert((I->getMaskOffset() + I->getExecSize()) <= 16); 384 assert(I->getPredicate() == nullptr && I->getCondMod() == nullptr); 385 I->setMaskOption(MO); 386 } 387 388 }; // G4_Kernel 389 } 390 391 392 393 #endif // G4_KERNEL_HPP 394