1 /*========================== begin_copyright_notice ============================ 2 3 Copyright (C) 2017-2021 Intel Corporation 4 5 SPDX-License-Identifier: MIT 6 7 ============================= end_copyright_notice ===========================*/ 8 9 #ifndef __OPTIMIZER_H__ 10 #define __OPTIMIZER_H__ 11 12 13 #include "BuildIR.h" 14 #include "RegAlloc.h" 15 #include "HWConformity.h" 16 #include "LocalScheduler/LocalScheduler_G4IR.h" 17 #include "LocalScheduler/SWSB_G4IR.h" 18 #include <unordered_set> 19 #include <optional> 20 21 typedef struct _AddrSubReg_Node{ 22 short immAddrOff = 0; 23 int subReg = 0; 24 INST_LIST_ITER iter; 25 bool canRemoveInst = false; 26 bool canUseImmed = false; 27 bool usedImmed = false; 28 } AddrSubReg_Node; 29 30 /** 31 * Below data structures are used for message header optimization 32 */ 33 typedef enum _HEADER_ORDER_ 34 { 35 HEADER_UNDEF = 0, 36 HEADER_FULL_REGISTER = 1, 37 HEADER_X = 2, 38 HEADER_Y = 3, 39 HEADER_SIZE = 4 40 } HEADER_ORDER; 41 42 namespace vISA 43 { 44 class DPASSrc2RSCache 45 { 46 public: 47 std::vector<int> GRFCache; 48 unsigned latestID; 49 bool firstDpas; 50 DPASSrc2RSCache()51 DPASSrc2RSCache() 52 { 53 latestID = 0; 54 firstDpas = true; 55 GRFCache.resize(16, -1); 56 } ~DPASSrc2RSCache()57 ~DPASSrc2RSCache() 58 { 59 } 60 }; 61 class MSGTable 62 { 63 public: 64 //below instructions are to be compared against the current send 65 G4_INST * send; // the reference send 66 G4_INST * a0Dot0; // def of a0.0 67 G4_INST * m; // def of m 68 G4_INST * mDot0; // def of m.0: X 69 G4_INST * mDot1; // def of m.1: Y 70 G4_INST * mDot2; // def of m.2: size 71 72 INST_LIST_ITER a0Dot0_it; 73 INST_LIST_ITER m_it; 74 INST_LIST_ITER mDot0_it; 75 INST_LIST_ITER mDot1_it; 76 INST_LIST_ITER mDot2_it; 77 78 bool invalid = false; 79 bool opt; // if the catched send is used 80 81 //below shows whether there are new defs to determine reuse or remove 82 //if redundant, then remove; otherwise, reuse the catched header 83 bool isXRedef; // X is used to define m.0 84 bool isYRedef; // Y is used to define m.1 85 bool isSizeRedef; // Size is used to define m.2 86 bool isR0Dot0Redef; // r0.0 is used to define m 87 HEADER_ORDER first; 88 89 void insertHeaderMovInst(G4_INST *, IR_Builder&, G4_BB *); 90 void reusePreviousHeader(G4_INST *, G4_INST *, G4_INST *, IR_Builder&); 91 ~MSGTable()92 ~MSGTable() {}; 93 94 }; // to remove redundant message headers 95 } 96 typedef std::list<vISA::MSGTable*> MSGTableList; 97 typedef std::list<vISA::MSGTable*>::iterator MSGTable_ITER; 98 99 #define MESSAGE_HEADER_THRESHOLD 1 100 101 typedef struct _DEFA0 102 { 103 vISA::G4_INST *pred = nullptr; 104 vISA::G4_INST *curr = nullptr; 105 INST_LIST_ITER predIt; 106 INST_LIST_ITER currIt; 107 bool isA0Redef = false; 108 } DEFA0; 109 110 /** 111 * end of data structures for message header optimization 112 */ 113 114 // auxiliary structure for inserting save and restore instructions 115 namespace vISA 116 { 117 118 class Optimizer 119 { 120 IR_Builder& builder; 121 G4_Kernel& kernel; 122 FlowGraph& fg; 123 124 vISA::Mem_Manager& mem; 125 // 126 // optimization phases 127 // 128 G4_SrcModifier mergeModifier(G4_Operand *def, G4_Operand *use); 129 void cleanMessageHeader(); 130 void sendFusion(); 131 void renameRegister(); 132 void localDefHoisting(); 133 void reassociateConst(); 134 void removePartialMovs(); 135 void localCopyPropagation(); 136 void localInstCombine(); 137 void optimizeLogicOperation(); 138 void cselPeepHoleOpt(); 139 void regAlloc(); 140 void insertFallThroughJump(); 141 void reverseOffsetProp( 142 AddrSubReg_Node addrRegInfo[8], 143 int subReg, 144 unsigned int srcNum, 145 INST_LIST_ITER lastIter, 146 INST_LIST_ITER iend 147 ); 148 void FoldAddrImmediate(); 149 bool foldCmpSel(G4_BB *BB, G4_INST *selInst, INST_LIST_ITER &selInst_II); 150 bool foldPseudoNot(G4_BB *bb, INST_LIST_ITER& iter); 151 bool createSmov(G4_BB *bb, G4_INST* flagMove, G4_INST* nextInst); 152 bool foldCmpToCondMod(G4_BB* BB, INST_LIST_ITER& iter); 153 void HWWorkaround(); 154 void preRA_HWWorkaround(); 155 G4_INST* evenlySplitDPASInst(INST_LIST_ITER iter, G4_BB* bb); 156 bool hasDPASSourceTwoReuse(DPASSrc2RSCache* src2GRFCache, G4_INST* inst); 157 void DPASWA(G4_BB* bb, INST_LIST_ITER ii, DPASSrc2RSCache* src2GRFCache); 158 void normalizeRegion(); 159 void initializePayload(); 160 void dumpPayload(); 161 void collectStats(); 162 void createR0Copy(); 163 164 void fixEndIfWhileLabels(); 165 void mergeScalarInst(); HWConformityChk()166 void HWConformityChk() { ::HWConformityChk(builder, kernel, mem); } removeRedundMov()167 void removeRedundMov() { fg.removeRedundMov(); } removeEmptyBlocks()168 void removeEmptyBlocks() { fg.removeEmptyBlocks(); } reassignBlockIDs()169 void reassignBlockIDs() { fg.reassignBlockIDs(); } evalAddrExp()170 void evalAddrExp() { kernel.evalAddrExp(); } preRA_Schedule()171 void preRA_Schedule() 172 { 173 if (kernel.useRegSharingHeuristics()) 174 { 175 preRA_RegSharing Sched(kernel, mem, /*rpe*/ nullptr); 176 Sched.run(); 177 } 178 else 179 { 180 preRA_Scheduler Sched(kernel, mem, /*rpe*/ nullptr); 181 Sched.run(); 182 } 183 } localSchedule()184 void localSchedule() 185 { 186 LocalScheduler lSched(kernel.fg, mem); 187 lSched.localScheduling(); 188 } 189 190 void adjustIndirectCallOffsetAfterSWSBSet(); 191 192 void addSWSBInfo(); 193 194 void lowerMadSequence(); 195 196 void LVN(); 197 198 void ifCvt(); 199 200 void ifCvtFCCall(); 201 202 void reRAPostSchedule(); 203 204 void dce(); 205 206 void accSubPostSchedule(); 207 208 void accSubBeforeRA(); 209 210 211 // return true if BuiltInR0 gets a different allocation than r0 212 bool R0CopyNeeded(); 213 214 private: 215 /* below member functions are used for message header opt */ 216 bool isHeaderOptCandidate(G4_INST *, G4_INST *); 217 bool isHeaderOptReuse(G4_INST *, G4_INST *); 218 bool headerOptValidityCheck(MSGTable *, MSGTable *); 219 bool isHeaderCachingCandidate(G4_INST *); 220 void messageHeaderReport(size_t, size_t, G4_Kernel&); 221 void optMessageHeaders(MSGTableList &, G4_BB* , DEFA0&); 222 void addEntryToMessageTable(G4_INST *, MSGTableList &, G4_BB*, INST_LIST_ITER, DEFA0 &); 223 bool chkNewDefBetweenSends(G4_INST *, MSGTableList& , DEFA0&); 224 /* below member functions are used for barrier header opt */ 225 void removeRedundantBarrierHeaders(G4_INST *, G4_SrcRegRegion*, bool); 226 bool isBarrierPattern(G4_INST *, G4_SrcRegRegion* &); 227 void hoistBarrierHeaderToTop(G4_SrcRegRegion*); 228 /* end of member functions for message header opt */ 229 void cleanupBindless(); 230 G4_Operand* updateSendsHeaderReuse(std::vector<std::vector<G4_INST*>> &, std::vector<G4_INST*> &, INST_LIST_ITER); 231 void countGRFUsage(); 232 void changeMoveType(); 233 void split4GRFVars(); 234 void legalizeType(); 235 void analyzeMove(); 236 237 void removeInstrinsics(); 238 239 void countBankConflicts(); 240 unsigned int numBankConflicts; 241 242 bool chkFwdOutputHazard(INST_LIST_ITER &, INST_LIST_ITER&); 243 bool chkFwdOutputHazard(G4_INST*, INST_LIST_ITER); 244 bool chkBwdOutputHazard(INST_LIST_ITER &, INST_LIST_ITER&); 245 bool chkBwdOutputHazard(G4_INST *, INST_LIST_ITER&); 246 bool chkBwdOutputHazard(G4_INST *, INST_LIST_ITER&, G4_INST *); 247 bool chkBwdWARdep(G4_INST*, INST_LIST_ITER); 248 bool chkBwdWAWdep(G4_INST*, INST_LIST_ITER); 249 250 // various HW WA 251 void addSwitchOptionToBB(G4_BB*, bool isSubroutine = false); 252 void linePlaneWA(G4_INST* inst); 253 void fixSendSrcRegion(G4_INST* inst); 254 void clearARFDependencies(); 255 void clearSendDependencies(); 256 void loadThreadPayload(); 257 void addFFIDProlog(); 258 void insertFenceBeforeEOT(); 259 void insertScratchReadBeforeEOT(); 260 void resetA0(); 261 void setA0toTdrForSendc(); 262 void replaceRetWithJmpi(); 263 void doNoMaskWA(); 264 void applyFusedCallWA(); 265 void finishFusedCallWA(); 266 void doNoMaskWA_postRA(); 267 void insertFenceAtEntry(); 268 void expandMulPostSchedule(); 269 void expandMadwPostSchedule(); 270 void fixReadSuppressioninFPU0(); 271 272 typedef std::vector<vISA::G4_INST*> InstListType; 273 // create instruction sequence to calculate call offset from ip 274 void expandIndirectCallWithRegTarget(); 275 // a helper function to create instruction sequence to replace indirect call with jmpi 276 void createInstForJmpiSequence(InstListType& insts, G4_INST* fcall); 277 // a hlper function to create the instructions to calculate the jump target offset, 278 // return G4_Declare of the new created jmp target 279 G4_Declare* createInstsForCallTargetOffset( 280 InstListType& insts, G4_INST* fcall, int64_t adjust_off); 281 // a helper function to create the instructions to get ip from call's dst 282 // This is a WA for platforms can't support ip register 283 // The give add_with_ip must be an add instruction with ip register as its src0 284 void replaceIPWithCall(InstListType& insts, G4_INST* add_with_ip); 285 286 void insertDummyMad(G4_BB* bb, INST_LIST_ITER inst_it); 287 288 void insertDummyCsel(G4_BB* bb, INST_LIST_ITER inst_it, bool newBB); 289 290 void insertDummyMov(G4_BB* bb, INST_LIST_ITER inst_it, G4_Operand* opnd); 291 void insertDummyMovForHWRSWADPAS(G4_BB* bb); 292 void insertDummyMovForHWRSWA(); 293 void insertHashMovs(); 294 void insertDummyCompactInst(); 295 void removeLifetimeOps(); 296 void recomputeBound(std::unordered_set<G4_Declare*>& declares); 297 298 void mapOrphans(); 299 void varSplit(); 300 void cloneSampleInst(); 301 302 /// Each optimization should be a member function of this class. 303 /// This defines a pass type as a pointer to member function. 304 typedef void (Optimizer::*PassType)(); 305 306 /// Data structure that collects information about passes. 307 struct PassInfo { 308 /// The pass to be executed for this kernel. 309 PassType Pass; 310 311 /// The member function name as a pass. 312 const char *Name; 313 314 /// The option that controls this pass. This might not be a one-to-one 315 /// relation between pass and option. For example, multiple passes 316 /// could be mapped to a single option, like vISA_EnableAlways. 317 vISAOptions Option; 318 319 /// Corresponding timer for this pass. When it is not a concrete 320 /// timer i.e. TIMER_NUM_TIMERS, then no time will be recorded. 321 TimerID Timer; 322 323 PassInfo(PassType P, const char *N, vISAOptions O, 324 TimerID T = TimerID::NUM_TIMERS) PassPassInfo325 : Pass(P), Name(N), Option(O), Timer(T) {} 326 PassInfoPassInfo327 PassInfo() : Pass(0), Name(0), Option(vISA_EnableAlways), 328 Timer(TimerID::NUM_TIMERS) {} 329 }; 330 331 bool foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& iter); 332 333 public: 334 /// Index enum for each pass in the pass array. 335 enum PassIndex { 336 PI_cleanMessageHeader = 0, 337 PI_sendFusion, 338 PI_renameRegister, 339 PI_localDefHoisting, 340 PI_localCopyPropagation, 341 PI_localInstCombine, 342 PI_removePartialMovs, 343 PI_cselPeepHoleOpt, 344 PI_optimizeLogicOperation, 345 PI_HWConformityChk, // always 346 PI_preRA_HWWorkaround, // always, each WA under specific control 347 PI_preRA_Schedule, 348 PI_regAlloc, // always 349 PI_removeLifetimeOps, // always 350 PI_countBankConflicts, 351 PI_removeRedundMov, // always 352 PI_removeEmptyBlocks, // always 353 PI_insertFallThroughJump, // always 354 PI_reassignBlockIDs, // always 355 PI_evalAddrExp, // always 356 PI_FoldAddrImmediate, 357 PI_localSchedule, 358 PI_HWWorkaround, // always 359 PI_fixEndIfWhileLabels, // always 360 PI_insertHashMovs, 361 PI_insertDummyMovForHWRSWA, 362 PI_insertDummyCompactInst, 363 PI_mergeScalarInst, 364 PI_lowerMadSequence, 365 PI_LVN, 366 PI_ifCvt, 367 PI_normalizeRegion, // always 368 PI_dumpPayload, 369 PI_collectStats, // always 370 PI_createR0Copy, 371 PI_initializePayload, 372 PI_cleanupBindless, 373 PI_countGRFUsage, 374 PI_changeMoveType, 375 PI_reRAPostSchedule, 376 PI_accSubBeforeRA, 377 PI_accSubPostSchedule, 378 PI_dce, 379 PI_reassociateConst, 380 PI_split4GRFVars, 381 PI_loadThreadPayload, 382 PI_addFFIDProlog, 383 PI_insertFenceBeforeEOT, 384 PI_insertScratchReadBeforeEOT, 385 PI_mapOrphans, 386 PI_varSplit, 387 PI_legalizeType, 388 PI_analyzeMove, 389 PI_removeInstrinsics, 390 PI_expandMulPostSchedule, 391 PI_addSWSBInfo, 392 PI_expandMadwPostSchedule, 393 PI_NUM_PASSES 394 }; 395 396 private: 397 /// Array of passes registered. 398 PassInfo Passes[PI_NUM_PASSES]; 399 400 // indicates whether RA has failed 401 bool RAFail; 402 403 /// Initialize all passes during the construction. 404 void initOptimizations(); 405 406 /// Common interface to execute a pass. 407 void runPass(PassIndex Index); 408 409 bool isCopyPropProfitable(G4_INST* movInst) const; 410 411 std::optional<INST_LIST_ITER> findFenceCommitPos(INST_LIST_ITER fence, G4_BB* bb) const; 412 413 bool addFenceCommit(INST_LIST_ITER iter, G4_BB* bb, bool scheduleFenceCommit); 414 415 public: Optimizer(vISA::Mem_Manager & m,IR_Builder & b,G4_Kernel & k,FlowGraph & f)416 Optimizer(vISA::Mem_Manager& m, IR_Builder& b, G4_Kernel& k, FlowGraph& f) : 417 builder(b), kernel(k), fg(f), mem(m), RAFail(false) 418 { 419 numBankConflicts = 0; 420 initOptimizations(); 421 } 422 int optimization(); 423 424 }; 425 426 } 427 428 #endif // __OPTIMIZER_H__ 429