/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/
#include "InstCombine.hpp"

#include <limits>
#include <functional>
#include <unordered_set>

using namespace vISA;

class InstCombiner
{
    IR_Builder& builder;
    // G4_Kernel&  kernel;
    FlowGraph&  fg;
    // Mem_Manager& mem;

    // In cases where we need to force GRF alignment of a scalar register
    // (e.g. for targeting send payloads) we don't want to blow up register
    // allocation, so we limit ourselves to just a few.  We should distinguish
    // between variables that are global and those that are local.
    static const int MAX_FORCE_ALIGNS = 2;
    //
    int forceAlignsLeft = MAX_FORCE_ALIGNS;
public:
    InstCombiner(IR_Builder& _builder, FlowGraph& _fg)
        : builder(_builder), fg(_fg)
    {
    }

    void run();
private:
    bool tryInstPropagate(INST_LIST_ITER iitr, INST_LIST_ITER eitr);
}; // InstCombiner


void vISA::InstCombine(
    IR_Builder& builder,
    FlowGraph&  fg)
{
    InstCombiner ic(builder, fg);
    ic.run();
}

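// Walk each basic block in instruction order: constant-fold and simplify
// each instruction, then try to propagate its result into its uses; a def
// that was fully propagated is erased from the block.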
void InstCombiner::run()
{
    for (BB_LIST_ITER bitr = fg.begin(); bitr != fg.end(); ++bitr) {
        G4_BB* bb = *bitr;

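        // assign sequential local ids so the propagation logic below can
        // compare instruction order within the block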
        bb->resetLocalIds();

        INST_LIST_ITER iitr = bb->begin(), eitr = bb->end();
        while (iitr != eitr)
        {
            G4_INST *defInst = *iitr;
            G4_Operand *defDst = defInst->getDst();
            if (!defDst) {
                iitr++;
                continue;
            }

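            // canonicalize the instruction first so that propagation sees
            // folded/simplified operands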
            builder.doConsFolding(defInst);
            builder.doSimplification(defInst);
            if (tryInstPropagate(iitr, eitr)) {
                INST_LIST_ITER tmp = iitr;
                iitr++;
                bb->erase(tmp);
            } else {
                iitr++;
            }
        } // for insts in bb
    } // for blocks
}

// TMP = def (...) A, B
// ...
// A = op ...  << redefines A
// ...
// ... = use ... TMP
//
// A write to one of the def's sources (a WAR dependence) between the def
// and its last use prevents us from propagating A and B forward.
static bool hasAntiDependenceBetweenLastUse(
    INST_LIST_ITER iitr, INST_LIST_ITER eitr)
{
    // find the furthest use
    G4_INST* inst = *iitr;
    G4_INST* lastUse = inst->use_front().first;
    for (USE_EDGE_LIST_ITER iter = inst->use_begin(),
        uend = inst->use_end(); iter != uend; ++iter)
    {
        G4_INST* useInst = iter->first;
        if (useInst->getLocalId() > lastUse->getLocalId()) {
            lastUse = useInst;
        }
    }
    INST_LIST_ITER forwardIter = iitr;
    forwardIter++;
    while (forwardIter != eitr && *forwardIter != lastUse) {
        if ((*forwardIter)->isWARdep(inst)) {
            return true;
        }
        forwardIter++;
    }
    MUST_BE_TRUE(forwardIter != eitr, "hit end of block without finding use");
    return false;
}

// Returns true if there is a lifetime.end for any of the sources of the
// defining inst between the def and the given use.
//
//     add T, A, B
//     ...
//     lifetime.end A << cannot propagate A past this
//     ...
//     op ..., T, ...
//
static bool hasLifetimeEndBetweenLastUse(
    INST_LIST_ITER iitr,
    INST_LIST_ITER eitr,
    const G4_INST *useInst,
    const G4_Operand *defSrc0,
    const G4_Operand *defSrc1 = nullptr,
    const G4_Operand *defSrc2 = nullptr)
{
    auto findDecl = [&](const G4_Operand *defSrc) {
        const G4_SrcRegRegion *defSrcRegRgn =
            defSrc && defSrc->isSrcRegRegion() ? defSrc->asSrcRegRegion() : nullptr;
        const G4_Declare* currInstDclSrc =
            defSrcRegRgn ? defSrcRegRgn->getBaseRegVarRootDeclare() : nullptr;
        return currInstDclSrc;
    };
    const G4_Declare* currInstDclSrc0 = findDecl(defSrc0);
    const G4_Declare* currInstDclSrc1 = findDecl(defSrc1);
    const G4_Declare* currInstDclSrc2 = findDecl(defSrc2);

    INST_LIST_ITER cpIter = iitr;
    cpIter++;
    while (*cpIter != useInst)
    {
        if ((*cpIter)->isLifeTimeEnd()) {
            // Check whether lifetime end is for same opnd
            const G4_Declare* lifetimeEndTopDcl = GetTopDclFromRegRegion((*cpIter)->getSrc(0));
            if (lifetimeEndTopDcl == currInstDclSrc0 ||
                lifetimeEndTopDcl == currInstDclSrc1 ||
                lifetimeEndTopDcl == currInstDclSrc2)
            {
                return true;
            }
        }
        cpIter++;
        if (cpIter == eitr) // end of block
            return false;
    }

    return false;
}


// integer folds:
//   add T, s0, s1; add *, *, T ==> add3 *, s0, s1, T
//   add T, s0, s1; add *, T, * ==> add3 *, T, s0, s1
//
//   add T, s0, immA; add *, T, immB ==> add *, s0, (immA+immB)
//   (we could do this via add3 and reduction again)
//
// TODO:
//   mul T, s0, s1 -> add *, T, * ==> mad  *, *, s0, s1
//   mul T, s0, s1 -> add *, *, T ==> mad  *, s0, s1, *
//
//   shl T, s0 << N -> add *, X, T ==> mad *, X, s0, 2^N
//
//   logic -> logic ==> bfn
bool InstCombiner::tryInstPropagate(
    INST_LIST_ITER iitr, INST_LIST_ITER eitr)
{
    G4_INST *defInst = *iitr;
    if (!defInst->canPropagateBinaryToTernary()) {
        return false;
    } else if (defInst->use_size() == 0) {
        return false; // probably unreachable, but keep for sanity's sake
    } else if (hasAntiDependenceBetweenLastUse(iitr, eitr)) {
        return false; // something clobbers one of the def() sources before the use()
    }

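    // the add3 fold below requires a platform that supports add3
    // (XeHP_SDV and later)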
    bool canFoldAdd3 = getGenxPlatform() >= XeHP_SDV;

    // defer folding until we can prove all uses can be done
    std::vector<std::function<void()>> applyUses;
    //
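    // declares forced to be GRF-aligned as part of folding; charged
    // against forceAlignsLeft once the folds are committed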
    std::unordered_set<G4_Declare*> grfForcedAlignments;
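    // uses already retargeted to add3; guards against folding into the
    // same use twice (see the FIXME below)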
    std::unordered_set<G4_INST *> usedAddsTargetedToAdd3;

    G4_Operand *defSrc0 = defInst->getSrc(0);
    G4_Operand *defSrc1 = defInst->getSrc(1);
    bool defIsSimd1WrEn = defInst->getExecSize() == 1 && defInst->isWriteEnableInst();
    // OK combinations of (W)/ExecSize between def and use:
    //     def (E)
    //     use (E)
    //
    // (W) def (1)
    //     use (E)
    //
    // (W) def (E)
    // (W) use (E)
    auto execInfoCanPropagate = [&](const G4_INST *useInst) {
        return defIsSimd1WrEn ||
            (defInst->isWriteEnableInst() == useInst->isWriteEnableInst() &&
                defInst->getExecLaneMask() == useInst->getExecLaneMask());
    };

    // copy def[fromDefSrcIx] to toUseInst[toUseSrcIx]
    auto copyOperand = [&](
        Gen4_Operand_Number fromDefSrcIx,
        G4_INST            *toUseInst,
        Gen4_Operand_Number toUseSrcIx)
    {
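        // Gen4_Operand_Number counts the dst as operand 0, so subtracting
        // one converts an Opnd_srcN value to a getSrc/setSrc index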
        G4_Operand *oldUseSrc = toUseInst->getSrc(toUseSrcIx - 1);
        if (oldUseSrc) {
            toUseInst->removeDefUse(toUseSrcIx);
        }
        G4_Operand *defSrc = defInst->getSrc(fromDefSrcIx - 1);
        G4_Operand *newUseSrc = builder.duplicateOperand(defSrc);
        toUseInst->setSrc(newUseSrc, toUseSrcIx - 1);
        // for all defs of defInst targeting fromDefSrcIx, copy those defs over
        defInst->copyDef(toUseInst, fromDefSrcIx, toUseSrcIx);
    };


    // check if each use can be combined
    for (USE_EDGE_LIST_ITER uitr = defInst->use_begin();
        uitr != defInst->use_end();
        uitr++)
    {
        G4_INST *useInst = uitr->first;
        // copies the def's src0/src1 into toUseSrcA/toUseSrcB of the use
        // (useInst is captured by value so deferred callbacks can safely
        // hold a copy of this lambda after the loop)
        auto copyOperandsToUseSrcs = [&, useInst](
            Gen4_Operand_Number toUseSrcA,
            Gen4_Operand_Number toUseSrcB)
        {
            copyOperand(Opnd_src0, useInst, toUseSrcA);
            copyOperand(Opnd_src1, useInst, toUseSrcB);
        };

        if (hasLifetimeEndBetweenLastUse(iitr, eitr, useInst, defSrc0, defSrc1)) {
            return false;
        }

        auto opsAre = [&] (G4_opcode defOp, G4_opcode useOp) {
            return defInst->opcode() == defOp && useInst->opcode() == useOp;
        };
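        // determine which source of the use reads the def's result; only
        // the src0/src1 positions can be folded below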
        G4_Operand *useSrcOpnd = nullptr, *useOtherSrcOpnd = nullptr;
        if (uitr->second == Opnd_src0) {
            useSrcOpnd = useInst->getSrc(0);
            useOtherSrcOpnd = useInst->getSrc(1);
        } else if (uitr->second == Opnd_src1) {
            useSrcOpnd = useInst->getSrc(1);
            useOtherSrcOpnd = useInst->getSrc(0);
        } else {
            return false;
        }

        // similar criteria to G4_INST::canPropagateTo
        if (useSrcOpnd->getType() != defInst->getDst()->getType()) {
            return false; // don't bother with type conversion
        } else if (useInst->isLifeTimeEnd()) {
            return false;
        } else if (useInst->getPredicate()) {
            return false; // punt on predication (could match predicates)
        } else if (useInst->getDst() == nullptr) {
            return false; // e.g. G4_pseudo_fcall
        } else if (!execInfoCanPropagate(useInst)) {
            return false; // e.g. oddball ExecSize/NoMask combinations
        }

        const bool ENABLE_ADD_FOLD = false; // TODO: incrementally enable
        if (ENABLE_ADD_FOLD && opsAre(G4_add, G4_add)) {
            // see if we can reassociate the source operands
            //   add T,   s0, immA
            //   add dst, T,  immB
            // =>
            //   add dst, s0, (immA + immB)
            // (there's an older reassoc pass, but we need to do it
            // here since other folds in this pass can generate the pattern
            // and we don't want to ping/pong between these passes)
            const Gen4_Operand_Number useSrcIx = (*uitr).second;
            bool foldedChainedAdds = false;
            if ((defSrc0->isImm() || defSrc1->isImm()) && useOtherSrcOpnd->isImm())
            {
                int64_t imm = useOtherSrcOpnd->asImm()->getImm();
                Gen4_Operand_Number defVarSrcIx = Opnd_src0;
                if (defSrc0->isImm()) {
                    imm += defSrc0->asImm()->getImm();
                    defVarSrcIx = Opnd_src1;
                } else {
                    imm += defSrc1->asImm()->getImm();
                    defVarSrcIx = Opnd_src0;
                }
                // only fold if the combined immediate still fits in 32 bits
                if (imm >= std::numeric_limits<int32_t>::min() &&
                    imm <= std::numeric_limits<int32_t>::max())
                {
                    foldedChainedAdds = true;
                    // capture the loop-scope values by value: this callback
                    // runs after the use loop (and these locals) are gone
                    applyUses.emplace_back(
                        [&, useInst, useSrcIx, imm, defVarSrcIx]() {
                            copyOperand(defVarSrcIx, useInst, useSrcIx);
                            G4_Imm *foldedImm =
                                builder.createImmWithLowerType(imm, useInst->getExecType());
                            unsigned ix = (useSrcIx == Opnd_src0 ? Opnd_src1 : Opnd_src0) - 1;
                            useInst->setSrc(foldedImm, ix);
                        });
                }
            } // chained add

            // if that fails, but we have an add3 operation then we can make
            //   add T,   s0, s1
            //   add dst, T,  s2
            // =>
            //   add3 dst, s0, s1, s2
            if (!foldedChainedAdds && canFoldAdd3) {
                if (usedAddsTargetedToAdd3.find(useInst) != usedAddsTargetedToAdd3.end()) {
                    // FIXME: this needs to handle folding to the same target add
                    //   add  D2  = D0  D1
                    //   add  ... = D2  D2
                    // This will show up as dual uses (src0 and src1)
                    // and will want to expand around both slots.
                    // So it will become (assuming the use in the src0 slot folds first):
                    //   add3 ... =  D0  D2  D1
                    // and then the next application clobbers this...
                    //
                    // TODO: apply a better identity here (turn it into a mad)
                    return false;
                }
                usedAddsTargetedToAdd3.insert(useInst);
                // again capture by value: this runs after the loop finishes
                applyUses.emplace_back(
                    [&, useInst, useSrcIx, copyOperandsToUseSrcs]() {
                        // promote the second add to an add3;
                        // replace src2 and the transitive operand
                        //   (which can be either src0 or src1)
                        useInst->setOpcode(G4_add3);
                        if (useSrcIx == Opnd_src1) {
                            copyOperandsToUseSrcs(Opnd_src1, Opnd_src2);
                            defInst->copyDef(useInst, Opnd_src1, Opnd_src1);
                        } else {
                            copyOperandsToUseSrcs(Opnd_src0, Opnd_src2);
                        }
                    });
            }
        } else {
            // unsupported pattern
            return false;
        }
    } // for uses

    // commit the changes
    for (auto &apply : applyUses) {
        apply();
    }

    // unlink our def and use pairs (defInst is being removed)
    defInst->removeDefUse(Opnd_src0);
    defInst->removeDefUse(Opnd_src1);
    forceAlignsLeft -= (int)grfForcedAlignments.size();

    return true;
}