/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/
#include "InstCombine.hpp"

#include <limits>
#include <functional>
#include <unordered_set>

using namespace vISA;


class InstCombiner
{
    IR_Builder& builder;
    // G4_Kernel& kernel;
    FlowGraph& fg;
    // Mem_Manager& mem;

    // On occasions where we need to force GRF alignment of a scalar register
    // (e.g. when targeting send payloads) we don't want to blow up register
    // allocation, so we limit ourselves to just a few. We should also
    // distinguish between variables that are global and those that are local.
    static const int MAX_FORCE_ALIGNS = 2;
    //
    int forceAlignsLeft = MAX_FORCE_ALIGNS;
public:
    InstCombiner(IR_Builder& _builder, FlowGraph& _fg)
        : builder(_builder), fg(_fg)
    {
    }

    void run();
private:
    bool tryInstPropagate(INST_LIST_ITER iitr, INST_LIST_ITER eitr);
}; // InstCombiner


void vISA::InstCombine(
    IR_Builder& builder,
    FlowGraph& fg)
{
    InstCombiner ic(builder, fg);
    ic.run();
}

void InstCombiner::run()
{
    for (BB_LIST_ITER bitr = fg.begin(); bitr != fg.end(); ++bitr) {
        G4_BB* bb = *bitr;

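        // number the instructions in the block; local ids give a cheap way
        // to compare instruction order (used by the last-use scan below)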
        bb->resetLocalIds();

        INST_LIST_ITER iitr = bb->begin(), eitr = bb->end();
        while (iitr != eitr)
        {
            G4_INST *defInst = *iitr;
            G4_Operand *defDst = defInst->getDst();
            if (!defDst) {
                iitr++;
                continue;
            }

            builder.doConsFolding(defInst);
            builder.doSimplification(defInst);
            if (tryInstPropagate(iitr, eitr)) {
                INST_LIST_ITER tmp = iitr;
                iitr++;
                bb->erase(tmp);
            } else {
                iitr++;
            }
        } // for insts in bb
    } // for blocks
}

// Looks for an anti-dependence (WAR) between a def and its last use that
// prevents us from forwarding, e.g.:
//   TMP = def (...) A, B
//   ...
//   A = op ...    << clobbers a source of the def; we cannot forward A or B
//   ...
//   ... = use ... TMP
static bool hasAntiDependenceBetweenLastUse(
    INST_LIST_ITER iitr, INST_LIST_ITER eitr)
{
    // find the furthest use
    G4_INST* inst = *iitr;
    G4_INST* lastUse = inst->use_front().first;
    for (USE_EDGE_LIST_ITER iter = inst->use_begin(),
        uend = inst->use_end(); iter != uend; ++iter)
    {
        G4_INST* useInst = iter->first;
        if (useInst->getLocalId() > lastUse->getLocalId()) {
            lastUse = useInst;
        }
    }
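    // walk forward to the last use, checking for any write that clobbers
    // a source of the def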
    INST_LIST_ITER forwardIter = iitr;
    forwardIter++;
    while (forwardIter != eitr && *forwardIter != lastUse) {
        if ((*forwardIter)->isWARdep(inst)) {
            return true;
        }
        forwardIter++;
    }
    MUST_BE_TRUE(forwardIter != eitr, "hit end of block without finding use");
    return false;
}

// Returns true if there is a lifetime.end for any of the sources of the
// defining inst between it and the given use, e.g.:
//
//   add T, A, B
//   ...
//   lifetime.end A   << cannot propagate A past this
//   ...
//   op ..., T, ...
//
static bool hasLifetimeEndBetweenLastUse(
    INST_LIST_ITER iitr,
    INST_LIST_ITER eitr,
    const G4_INST *useInst,
    const G4_Operand *defSrc0,
    const G4_Operand *defSrc1 = nullptr,
    const G4_Operand *defSrc2 = nullptr)
{
    auto findDecl = [&](const G4_Operand *defSrc) {
        const G4_SrcRegRegion *defSrcRegRgn =
            defSrc && defSrc->isSrcRegRegion() ? defSrc->asSrcRegRegion() : nullptr;
        const G4_Declare* currInstDclSrc =
            defSrcRegRgn ? defSrcRegRgn->getBaseRegVarRootDeclare() : nullptr;
        return currInstDclSrc;
    };
    const G4_Declare* currInstDclSrc0 = findDecl(defSrc0);
    const G4_Declare* currInstDclSrc1 = findDecl(defSrc1);
    const G4_Declare* currInstDclSrc2 = findDecl(defSrc2);

    INST_LIST_ITER cpIter = iitr;
    cpIter++;
    // scan the instructions between the def and the use; checking
    // cpIter != eitr first avoids dereferencing the end iterator when the
    // use is not in this block
    while (cpIter != eitr && *cpIter != useInst)
    {
        if ((*cpIter)->isLifeTimeEnd()) {
            // Check whether the lifetime end is for the same operand
            const G4_Declare* lifetimeEndTopDcl = GetTopDclFromRegRegion((*cpIter)->getSrc(0));
            if (lifetimeEndTopDcl == currInstDclSrc0 ||
                lifetimeEndTopDcl == currInstDclSrc1 ||
                lifetimeEndTopDcl == currInstDclSrc2)
            {
                return true;
            }
        }
        cpIter++;
    }

    return false;
}


// integer folds:
//   add T, s0, s1; add *, *, T ==> add3 *, s0, s1, T
//   add T, s0, s1; add *, T, * ==> add3 *, T, s0, s1
//
//   add T, s0, immA; add *, T, immB ==> add *, s0, (immA+immB)
//   (we could do this via add3 and reduction again)
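//
// e.g. (illustrative) with the add3 fold:
//   add  (8) T    s0  s1
//   add  (8) dst  T   s2
// ==>
//   add3 (8) dst  s0  s1  s2
// (the defining add is erased in run() once all of its uses fold)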
//
// TODO:
//   mul T, s0, s1 -> add *, T, * ==> mad *, *, s0, s1
//   mul T, s0, s1 -> add *, *, T ==> mad *, s0, s1, *
//
//   shl T, s0 << N -> add *, X, T ==> mad X, s0, 2^N
//
//   logic -> logic ==> bfn
bool InstCombiner::tryInstPropagate(
    INST_LIST_ITER iitr, INST_LIST_ITER eitr)
{
    G4_INST *defInst = *iitr;
    if (!defInst->canPropagateBinaryToTernary()) {
        return false;
    } else if (defInst->use_size() == 0) {
        return false; // probably unreachable, but keep for sanity's sake
    } else if (hasAntiDependenceBetweenLastUse(iitr, eitr)) {
        return false; // someone clobbers one of the def() sources before the use()
    }

    bool canFoldAdd3 = getGenxPlatform() >= XeHP_SDV;

    // defer folding until we can prove all uses can be done
    std::vector<std::function<void()>> applyUses;
    //
    std::unordered_set<G4_Declare*> grfForcedAlignments;
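    // 'add' uses already rewritten into an add3 by this call; a second
    // source slot folding into the same use would clobber the first fold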
    std::unordered_set<G4_INST *> usedAddsTargetedToAdd3;

    G4_Operand *defSrc0 = defInst->getSrc(0);
    G4_Operand *defSrc1 = defInst->getSrc(1);
    bool defIsSimd1WrEn = defInst->getExecSize() == 1 && defInst->isWriteEnableInst();
    // OKAY to propagate:
    //     def (E)
    //     use (E)
    //
    // (W) def (1)
    //     use (E)
    //
    // (W) def (E)
    // (W) use (E)
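    //
    // other combinations (e.g. a NoMask def of non-unit exec size feeding a
    // masked use, or mismatched lane masks) are conservatively rejected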
    auto execInfoCanPropagate = [&](const G4_INST *useInst) {
        return defIsSimd1WrEn ||
            (defInst->isWriteEnableInst() == useInst->isWriteEnableInst() &&
             defInst->getExecLaneMask() == useInst->getExecLaneMask());
    };

    // copy def[fromDefSrcIx] to toUseInst[toUseSrcIx]
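    // (Gen4_Operand_Number counts the dst as operand 0, so the -1 below
    //  converts an operand number to getSrc()'s zero-based source index)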
    auto copyOperand = [&](
        Gen4_Operand_Number fromDefSrcIx,
        G4_INST *toUseInst,
        Gen4_Operand_Number toUseSrcIx)
    {
        G4_Operand *oldUseSrc = toUseInst->getSrc(toUseSrcIx - 1);
        if (oldUseSrc) {
            toUseInst->removeDefUse(toUseSrcIx);
        }
        G4_Operand *defSrc = defInst->getSrc(fromDefSrcIx - 1);
        G4_Operand *newUseSrc = builder.duplicateOperand(defSrc);
        toUseInst->setSrc(newUseSrc, toUseSrcIx - 1);
        // for all defs of defInst targeting fromDefSrcIx, copy those defs over
        defInst->copyDef(toUseInst, fromDefSrcIx, toUseSrcIx);
    };


    // check whether each use can be combined
    for (USE_EDGE_LIST_ITER uitr = defInst->use_begin();
        uitr != defInst->use_end();
        uitr++)
    {
        G4_INST *useInst = uitr->first;
        // copies def's src0/src1 into the given use source slots
        // (captures useInst by value: this helper runs deferred via applyUses)
        auto copyOperandsToUseSrcs = [&, useInst](
            Gen4_Operand_Number toUseSrcA,
            Gen4_Operand_Number toUseSrcB)
        {
            copyOperand(Opnd_src0, useInst, toUseSrcA);
            copyOperand(Opnd_src1, useInst, toUseSrcB);
        };

        if (hasLifetimeEndBetweenLastUse(iitr, eitr, useInst, defSrc0, defSrc1)) {
            return false;
        }

        auto opsAre = [&](G4_opcode defOp, G4_opcode useOp) {
            return defInst->opcode() == defOp && useInst->opcode() == useOp;
        };
        G4_Operand *useSrcOpnd = nullptr, *useOtherSrcOpnd = nullptr;
        if (uitr->second == Opnd_src0) {
            useSrcOpnd = useInst->getSrc(0);
            useOtherSrcOpnd = useInst->getSrc(1);
        } else if (uitr->second == Opnd_src1) {
            useSrcOpnd = useInst->getSrc(1);
            useOtherSrcOpnd = useInst->getSrc(0);
        } else {
            return false;
        }

        // similar criteria to G4_INST::canPropagateTo
        if (useSrcOpnd->getType() != defInst->getDst()->getType()) {
            return false; // don't bother with type conversion
        } else if (useInst->isLifeTimeEnd()) {
            return false;
        } else if (useInst->getPredicate()) {
            return false; // punt on predication (could match predicates)
        } else if (useInst->getDst() == nullptr) {
            return false; // e.g. G4_pseudo_fcall
        } else if (!execInfoCanPropagate(useInst)) {
            return false; // e.g. oddball ExecSize/NoMask combinations
        }

        const bool ENABLE_ADD_FOLD = false; // TODO: incrementally enable
        if (ENABLE_ADD_FOLD && opsAre(G4_add, G4_add)) {
            // see if we can reassociate the source operands
            //   add T, s0, immA
            //   add dst, T, immB
            // =>
            //   add dst, s0, (immA + immB)
            // (there's an older reassociation pass, but we need to do it here
            // too since other folds in this pass can generate the pattern
            // and we don't want to ping-pong between the passes)
            const Gen4_Operand_Number useSrcIx = (*uitr).second;
            bool foldedChainedAdds = false;
            if ((defSrc0->isImm() || defSrc1->isImm()) && useOtherSrcOpnd->isImm())
            {
                int64_t imm = useOtherSrcOpnd->asImm()->getImm();
                Gen4_Operand_Number defVarSrcIx = Opnd_src0;
                if (defSrc0->isImm()) {
                    imm += defSrc0->asImm()->getImm();
                    defVarSrcIx = Opnd_src1;
                } else {
                    imm += defSrc1->asImm()->getImm();
                    defVarSrcIx = Opnd_src0;
                }
                if (imm >= std::numeric_limits<int32_t>::min() &&
                    imm <= std::numeric_limits<int32_t>::max())
                {
                    foldedChainedAdds = true;
                    // capture loop-local state by value: this runs after the loop
                    applyUses.emplace_back(
                        [&, useInst, useSrcIx, defVarSrcIx, imm]() {
                            copyOperand(defVarSrcIx, useInst, useSrcIx);
                            G4_Imm *foldedImm =
                                builder.createImmWithLowerType(imm, useInst->getExecType());
                            unsigned ix = (useSrcIx == Opnd_src0 ? Opnd_src1 : Opnd_src0) - 1;
                            useInst->setSrc(foldedImm, ix);
                        });
                }
            } // chained add

            // if that fails, but we have an add3 operation, then we can fold
            //   add T, s0, s1
            //   add dst, T, s2
            // =>
            //   add3 dst, s0, s1, s2
            if (!foldedChainedAdds && canFoldAdd3) {
                if (usedAddsTargetedToAdd3.find(useInst) != usedAddsTargetedToAdd3.end()) {
                    // FIXME: this needs to handle folding to the same target add
                    //   add D2 = D0 D1
                    //   add ... = D2 D2
                    // This will show up as dual uses (src0 and src1)
                    // and will want to expand around both slots.
                    // So it will become (assume the use in the src0 slot folds first):
                    //   add3 ... = D0 D2 D1
                    // then the next application clobbers this...
                    //
                    // TODO: apply a better identity here (turn it into a mad)
                    return false;
                }
                usedAddsTargetedToAdd3.insert(useInst);
                // capture loop-local state by value: this runs after the loop
                applyUses.emplace_back(
                    [&, useInst, useSrcIx, copyOperandsToUseSrcs]() {
                        // promote the second add to an add3;
                        // replace src2 and the transitive operand
                        // (which can be either src0 or src1)
                        useInst->setOpcode(G4_add3);
                        if (useSrcIx == Opnd_src1) {
                            copyOperandsToUseSrcs(Opnd_src1, Opnd_src2);
                            defInst->copyDef(useInst, Opnd_src1, Opnd_src1);
                        } else {
                            copyOperandsToUseSrcs(Opnd_src0, Opnd_src2);
                        }
                    });
            }
        } else {
            // unsupported pattern
            return false;
        }
    } // for uses

    // commit the changes
    for (auto &apply : applyUses) {
        apply();
    }

    // unlink our def and use pairs (defInst is being removed)
    defInst->removeDefUse(Opnd_src0);
    defInst->removeDefUse(Opnd_src1);
    forceAlignsLeft -= (int)grfForcedAlignments.size();

    return true;
}