/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#ifndef __OPTIMIZER_H__
#define __OPTIMIZER_H__


#include "BuildIR.h"
#include "RegAlloc.h"
#include "HWConformity.h"
#include "LocalScheduler/LocalScheduler_G4IR.h"
#include "LocalScheduler/SWSB_G4IR.h"
#include <unordered_set>
#include <optional>

typedef struct _AddrSubReg_Node{
    short immAddrOff = 0;
    int subReg = 0;
    INST_LIST_ITER iter;
    bool canRemoveInst = false;
    bool canUseImmed = false;
    bool usedImmed = false;
} AddrSubReg_Node;

/**
 *  The data structures below are used for message header optimization
 */
typedef enum _HEADER_ORDER_
{
    HEADER_UNDEF         = 0,
    HEADER_FULL_REGISTER = 1,
    HEADER_X             = 2,
    HEADER_Y             = 3,
    HEADER_SIZE          = 4
} HEADER_ORDER;

namespace vISA
{
class DPASSrc2RSCache
{
public:
    std::vector<int> GRFCache;
    unsigned latestID;
    bool firstDpas;

    DPASSrc2RSCache()
    {
        latestID = 0;
        firstDpas = true;
        GRFCache.resize(16, -1);
    }
    ~DPASSrc2RSCache()
    {
    }
};
class MSGTable
{
public:
    // the instructions below are compared against the current send
    G4_INST * send;     // the reference send
    G4_INST * a0Dot0;   // def of a0.0
    G4_INST * m;        // def of m
    G4_INST * mDot0;    // def of m.0: X
    G4_INST * mDot1;    // def of m.1: Y
    G4_INST * mDot2;    // def of m.2: size

    INST_LIST_ITER a0Dot0_it;
    INST_LIST_ITER m_it;
    INST_LIST_ITER mDot0_it;
    INST_LIST_ITER mDot1_it;
    INST_LIST_ITER mDot2_it;

    bool invalid = false;
    bool opt;             // whether the cached send is used

    // the flags below show whether there are new defs, which decide reuse vs. removal:
    // if redundant, remove the header; otherwise, reuse the cached header
    bool isXRedef;      // X is used to define m.0
    bool isYRedef;      // Y is used to define m.1
    bool isSizeRedef;   // Size is used to define m.2
    bool isR0Dot0Redef; // r0.0 is used to define m
    HEADER_ORDER first;

    void insertHeaderMovInst(G4_INST *, IR_Builder&, G4_BB *);
    void reusePreviousHeader(G4_INST *, G4_INST *, G4_INST *, IR_Builder&);

    ~MSGTable() {};

};  // to remove redundant message headers
}
typedef std::list<vISA::MSGTable*> MSGTableList;
typedef std::list<vISA::MSGTable*>::iterator MSGTable_ITER;

#define MESSAGE_HEADER_THRESHOLD 1

typedef struct _DEFA0
{
    vISA::G4_INST *pred = nullptr;
    vISA::G4_INST *curr = nullptr;
    INST_LIST_ITER predIt;
    INST_LIST_ITER currIt;
    bool isA0Redef = false;
} DEFA0;

/**
 *  end of data structures for message header optimization
 */

// auxiliary structure for inserting save and restore instructions
namespace vISA
{

class Optimizer
{
    IR_Builder& builder;
    G4_Kernel&  kernel;
    FlowGraph&  fg;

    vISA::Mem_Manager& mem;
    //
    // optimization phases
    //
    G4_SrcModifier mergeModifier(G4_Operand *def, G4_Operand *use);
    void cleanMessageHeader();
    void sendFusion();
    void renameRegister();
    void localDefHoisting();
    void reassociateConst();
    void removePartialMovs();
    void localCopyPropagation();
    void localInstCombine();
    void optimizeLogicOperation();
    void cselPeepHoleOpt();
    void regAlloc();
    void insertFallThroughJump();
    void reverseOffsetProp(
            AddrSubReg_Node addrRegInfo[8],
            int subReg,
            unsigned int srcNum,
            INST_LIST_ITER lastIter,
            INST_LIST_ITER iend
       );
    void FoldAddrImmediate();
    bool foldCmpSel(G4_BB *BB, G4_INST *selInst, INST_LIST_ITER &selInst_II);
    bool foldPseudoNot(G4_BB *bb, INST_LIST_ITER& iter);
    bool createSmov(G4_BB *bb, G4_INST* flagMove, G4_INST* nextInst);
    bool foldCmpToCondMod(G4_BB* BB, INST_LIST_ITER& iter);
    void HWWorkaround();
    void preRA_HWWorkaround();
    G4_INST* evenlySplitDPASInst(INST_LIST_ITER iter, G4_BB* bb);
    bool hasDPASSourceTwoReuse(DPASSrc2RSCache* src2GRFCache, G4_INST* inst);
    void DPASWA(G4_BB* bb, INST_LIST_ITER ii, DPASSrc2RSCache* src2GRFCache);
    void normalizeRegion();
    void initializePayload();
    void dumpPayload();
    void collectStats();
    void createR0Copy();

    void fixEndIfWhileLabels();
    void mergeScalarInst();
    void HWConformityChk() { ::HWConformityChk(builder, kernel, mem); }
    void removeRedundMov() { fg.removeRedundMov(); }
    void removeEmptyBlocks() { fg.removeEmptyBlocks(); }
    void reassignBlockIDs() { fg.reassignBlockIDs(); }
    void evalAddrExp() { kernel.evalAddrExp(); }
    void preRA_Schedule()
    {
        if (kernel.useRegSharingHeuristics())
        {
            preRA_RegSharing Sched(kernel, mem, /*rpe*/ nullptr);
            Sched.run();
        }
        else
        {
            preRA_Scheduler Sched(kernel, mem, /*rpe*/ nullptr);
            Sched.run();
        }
    }
    void localSchedule()
    {
        LocalScheduler lSched(kernel.fg, mem);
        lSched.localScheduling();
    }

    void adjustIndirectCallOffsetAfterSWSBSet();

    void addSWSBInfo();

    void lowerMadSequence();

    void LVN();

    void ifCvt();

    void ifCvtFCCall();

    void reRAPostSchedule();

    void dce();

    void accSubPostSchedule();

    void accSubBeforeRA();


    // return true if BuiltInR0 gets a different allocation than r0
    bool R0CopyNeeded();

private:
    /* below member functions are used for message header opt */
    bool isHeaderOptCandidate(G4_INST *, G4_INST *);
    bool isHeaderOptReuse(G4_INST *, G4_INST *);
    bool headerOptValidityCheck(MSGTable *, MSGTable *);
    bool isHeaderCachingCandidate(G4_INST *);
    void messageHeaderReport(size_t, size_t, G4_Kernel&);
    void optMessageHeaders(MSGTableList &, G4_BB*, DEFA0&);
    void addEntryToMessageTable(G4_INST *, MSGTableList &, G4_BB*, INST_LIST_ITER, DEFA0 &);
    bool chkNewDefBetweenSends(G4_INST *, MSGTableList&, DEFA0&);
    /* below member functions are used for barrier header opt */
    void removeRedundantBarrierHeaders(G4_INST *, G4_SrcRegRegion*, bool);
    bool isBarrierPattern(G4_INST *, G4_SrcRegRegion* &);
    void hoistBarrierHeaderToTop(G4_SrcRegRegion*);
    /* end of member functions for message header opt */
    void cleanupBindless();
    G4_Operand* updateSendsHeaderReuse(std::vector<std::vector<G4_INST*>> &, std::vector<G4_INST*> &, INST_LIST_ITER);
    void countGRFUsage();
    void changeMoveType();
    void split4GRFVars();
    void legalizeType();
    void analyzeMove();

    void removeInstrinsics();

    void countBankConflicts();
    unsigned int numBankConflicts;

    bool chkFwdOutputHazard(INST_LIST_ITER &, INST_LIST_ITER&);
    bool chkFwdOutputHazard(G4_INST*, INST_LIST_ITER);
    bool chkBwdOutputHazard(INST_LIST_ITER &, INST_LIST_ITER&);
    bool chkBwdOutputHazard(G4_INST *, INST_LIST_ITER&);
    bool chkBwdOutputHazard(G4_INST *, INST_LIST_ITER&, G4_INST *);
    bool chkBwdWARdep(G4_INST*, INST_LIST_ITER);
    bool chkBwdWAWdep(G4_INST*, INST_LIST_ITER);

    // various HW WA
    void addSwitchOptionToBB(G4_BB*, bool isSubroutine = false);
    void linePlaneWA(G4_INST* inst);
    void fixSendSrcRegion(G4_INST* inst);
    void clearARFDependencies();
    void clearSendDependencies();
    void loadThreadPayload();
    void addFFIDProlog();
    void insertFenceBeforeEOT();
    void insertScratchReadBeforeEOT();
    void resetA0();
    void setA0toTdrForSendc();
    void replaceRetWithJmpi();
    void doNoMaskWA();
    void applyFusedCallWA();
    void finishFusedCallWA();
    void doNoMaskWA_postRA();
    void insertFenceAtEntry();
    void expandMulPostSchedule();
    void expandMadwPostSchedule();
    void fixReadSuppressioninFPU0();

    typedef std::vector<vISA::G4_INST*> InstListType;
    // create the instruction sequence to calculate the call offset from ip
    void expandIndirectCallWithRegTarget();
    // a helper function to create the instruction sequence that replaces an indirect call with jmpi
    void createInstForJmpiSequence(InstListType& insts, G4_INST* fcall);
    // a helper function to create the instructions that calculate the jump target offset;
    // returns the G4_Declare of the newly created jump target
    G4_Declare* createInstsForCallTargetOffset(
        InstListType& insts, G4_INST* fcall, int64_t adjust_off);
    // a helper function to create the instructions to get ip from the call's dst.
    // This is a WA for platforms that can't support the ip register.
    // The given add_with_ip must be an add instruction with the ip register as its src0.
    void replaceIPWithCall(InstListType& insts, G4_INST* add_with_ip);

    void insertDummyMad(G4_BB* bb, INST_LIST_ITER inst_it);

    void insertDummyCsel(G4_BB* bb, INST_LIST_ITER inst_it, bool newBB);

    void insertDummyMov(G4_BB* bb, INST_LIST_ITER inst_it, G4_Operand* opnd);
    void insertDummyMovForHWRSWADPAS(G4_BB* bb);
    void insertDummyMovForHWRSWA();
    void insertHashMovs();
    void insertDummyCompactInst();
    void removeLifetimeOps();
    void recomputeBound(std::unordered_set<G4_Declare*>& declares);

    void mapOrphans();
    void varSplit();
    void cloneSampleInst();

    /// Each optimization should be a member function of this class.
    /// This defines a pass type as a pointer to member function.
    typedef void (Optimizer::*PassType)();
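    /// Illustrative sketch only (not part of this interface): because a pass
    /// is a pointer to an Optimizer member function, runPass() can dispatch a
    /// registered pass roughly like this (hypothetical dispatch code):
    ///     PassType P = &Optimizer::localCopyPropagation;
    ///     (this->*P)();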

    /// Data structure that collects information about passes.
    struct PassInfo {
        /// The pass to be executed for this kernel.
        PassType Pass;

        /// The member function name as a pass.
        const char *Name;

        /// The option that controls this pass. This might not be a one-to-one
        /// relation between pass and option. For example, multiple passes
        /// could be mapped to a single option, like vISA_EnableAlways.
        vISAOptions Option;

        /// Corresponding timer for this pass. When it is not a concrete
        /// timer i.e. TIMER_NUM_TIMERS, then no time will be recorded.
        TimerID Timer;

        PassInfo(PassType P, const char *N, vISAOptions O,
                 TimerID T = TimerID::NUM_TIMERS)
            : Pass(P), Name(N), Option(O), Timer(T) {}

        PassInfo() : Pass(0), Name(0), Option(vISA_EnableAlways),
            Timer(TimerID::NUM_TIMERS) {}
    };
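    /// Illustrative sketch only (hypothetical, not the actual registration
    /// code): initOptimizations() fills Passes[] by pairing each PassIndex
    /// with its member function, name, and controlling option, e.g.
    ///     Passes[PI_regAlloc] =
    ///         PassInfo(&Optimizer::regAlloc, "regAlloc", vISA_EnableAlways);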

    bool foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& iter);

public:
    /// Index enum for each pass in the pass array.
    enum PassIndex {
        PI_cleanMessageHeader = 0,
        PI_sendFusion,
        PI_renameRegister,
        PI_localDefHoisting,
        PI_localCopyPropagation,
        PI_localInstCombine,
        PI_removePartialMovs,
        PI_cselPeepHoleOpt,
        PI_optimizeLogicOperation,
        PI_HWConformityChk,            // always
        PI_preRA_HWWorkaround,         // always, each WA under specific control
        PI_preRA_Schedule,
        PI_regAlloc,                   // always
        PI_removeLifetimeOps,          // always
        PI_countBankConflicts,
        PI_removeRedundMov,            // always
        PI_removeEmptyBlocks,          // always
        PI_insertFallThroughJump,      // always
        PI_reassignBlockIDs,           // always
        PI_evalAddrExp,                // always
        PI_FoldAddrImmediate,
        PI_localSchedule,
        PI_HWWorkaround,               // always
        PI_fixEndIfWhileLabels,        // always
        PI_insertHashMovs,
        PI_insertDummyMovForHWRSWA,
        PI_insertDummyCompactInst,
        PI_mergeScalarInst,
        PI_lowerMadSequence,
        PI_LVN,
        PI_ifCvt,
        PI_normalizeRegion,            // always
        PI_dumpPayload,
        PI_collectStats,               // always
        PI_createR0Copy,
        PI_initializePayload,
        PI_cleanupBindless,
        PI_countGRFUsage,
        PI_changeMoveType,
        PI_reRAPostSchedule,
        PI_accSubBeforeRA,
        PI_accSubPostSchedule,
        PI_dce,
        PI_reassociateConst,
        PI_split4GRFVars,
        PI_loadThreadPayload,
        PI_addFFIDProlog,
        PI_insertFenceBeforeEOT,
        PI_insertScratchReadBeforeEOT,
        PI_mapOrphans,
        PI_varSplit,
        PI_legalizeType,
        PI_analyzeMove,
        PI_removeInstrinsics,
        PI_expandMulPostSchedule,
        PI_addSWSBInfo,
        PI_expandMadwPostSchedule,
        PI_NUM_PASSES
    };

private:
    /// Array of passes registered.
    PassInfo Passes[PI_NUM_PASSES];

    // indicates whether RA has failed
    bool RAFail;

    /// Initialize all passes during the construction.
    void initOptimizations();

    /// Common interface to execute a pass.
    void runPass(PassIndex Index);

    bool isCopyPropProfitable(G4_INST* movInst) const;

    std::optional<INST_LIST_ITER> findFenceCommitPos(INST_LIST_ITER fence, G4_BB* bb) const;

    bool addFenceCommit(INST_LIST_ITER iter, G4_BB* bb, bool scheduleFenceCommit);

public:
    Optimizer(vISA::Mem_Manager& m, IR_Builder& b, G4_Kernel& k, FlowGraph& f) :
        builder(b), kernel(k), fg(f), mem(m), RAFail(false)
    {
        numBankConflicts = 0;
        initOptimizations();
    }
    int optimization();

};

}

#endif // __OPTIMIZER_H__
429