1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "Rematerialization.h"
10 
11 namespace vISA
12 {
populateRefs()13     void Rematerialization::populateRefs()
14     {
15         unsigned int id = 0;
16         for (auto bb : kernel.fg)
17         {
18             for (auto inst : *bb)
19             {
20                 inst->setLexicalId(id++);
21 
22                 if (inst->isPseudoKill())
23                     continue;
24 
25                 auto dst = inst->getDst();
26 
27                 if (dst && !dst->isNullReg())
28                 {
29                     auto topdcl = dst->getTopDcl();
30 
31                     if (topdcl)
32                     {
33                         operations[topdcl].def.push_back(std::make_pair(inst, bb));
34                     }
35                 }
36 
37                 for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
38                 {
39                     auto srcOpnd = inst->getSrc(i);
40                     if (srcOpnd &&
41                         srcOpnd->isSrcRegRegion())
42                     {
43                         auto topdcl = srcOpnd->asSrcRegRegion()->getTopDcl();
44                         unsigned int startRow = srcOpnd->getLeftBound() / numEltPerGRF<Type_UB>();
45                         unsigned int endRow = srcOpnd->getRightBound() / numEltPerGRF<Type_UB>();
46                         if (topdcl)
47                         {
48                             auto dclIt = operations.find(topdcl);
49                             if (dclIt == operations.end())
50                             {
51                                 References r;
52                                 r.numUses = 1;
53                                 for (unsigned int k = startRow; k <= endRow; k++)
54                                 {
55                                     r.rowsUsed.insert(k);
56                                 }
57                                 //r.uses.push_back(std::make_pair(inst, bb));
58                                 r.lastUseLexId = inst->getLexicalId();
59                                 operations.insert(std::make_pair(topdcl, r));
60                             }
61                             else
62                             {
63                                 (*dclIt).second.numUses++;
64                                 for (unsigned int k = startRow; k <= endRow; k++)
65                                 {
66                                     (*dclIt).second.rowsUsed.insert(k);
67                                 }
68                                 (*dclIt).second.lastUseLexId = inst->getLexicalId();
69                                 //(*dclIt).second.uses.push_back(std::make_pair(inst, bb));
70                             }
71                         }
72                     }
73                 }
74             }
75 
76             // Update lastUseLexId based on BB live-out set
77             for (unsigned int i = 0; i < liveness.getNumSelectedVar(); i++)
78             {
79                 if (bb->size() > 0 && liveness.isLiveAtExit(bb, i))
80                 {
81                     auto lr = coloring.getLiveRanges()[i];
82                     auto dclIt = operations.find(lr->getDcl()->getRootDeclare());
83                     if (dclIt != operations.end())
84                     {
85                         (*dclIt).second.lastUseLexId = bb->back()->getLexicalId();
86                     }
87                 }
88             }
89         }
90 
91         for (auto& ref : operations)
92         {
93             auto dcl = ref.first;
94             if (dcl->getRegVar() &&
95                 dcl->getRegVar()->getPhyReg())
96                 preDefinedVars.push_back(dcl);
97         }
98     }
99 
populateSamplerHeaderMap()100     void Rematerialization::populateSamplerHeaderMap()
101     {
102         samplerHeaderMapPopulated = true;
103 
104         if (!samplerHeader)
105             return;
106 
107         for (auto bb : kernel.fg)
108         {
109             G4_INST* samplerHeaderMov = nullptr;
110             for (auto inst : *bb)
111             {
112                 if (inst->getDst() &&
113                     inst->getDst()->getTopDcl() == samplerHeader)
114                 {
115                     samplerHeaderMov = inst;
116                     continue;
117                 }
118 
119                 if (samplerHeaderMov &&
120                     inst->isSplitSend() &&
121                     inst->getMsgDesc()->isSampler() &&
122                     inst->getMsgDescRaw() &&
123                     inst->getMsgDescRaw()->isHeaderPresent())
124                 {
125                     MUST_BE_TRUE(samplerHeaderMov->getExecSize() == 1, "Unexpected sampler header");
126                     samplerHeaderMap.insert(std::make_pair(inst, samplerHeaderMov));
127                 }
128             }
129         }
130     }
131 
deLVNSamplers(G4_BB * bb)132     void Rematerialization::deLVNSamplers(G4_BB* bb)
133     {
134         // LVN pass removes redundant samplerHeader movs. This way
135         // several consecutive samplers can use same samplerHeader
136         // instruction. However, when remat is done, extra care
137         // needs to be taken so that all samplers still use same
138         // header as before. Consider this snippet:
139         //
140         // samplerHeader(0,2) = a
141         // send (16) ... samplerHeader ...
142         // = V1
143         // send (16) ... samplerHeader ...
144         //
145         // After remating V1:
146         //
147         // samplerHeader(0,2) = a
148         // send (16) ... samplerHeader ...
149         // samplerHeader(0,2) = b
150         // send (16) REMAT_V1 samplerHeader ...
151         // send (16) ... samplerHeader ... <-- Uses incorrect samplerHeader!
152         //
153         // This function deLVNs all samplerHeaders in the program and later
154         // we LVN them back after remating is done. This ensures correctness.
155         if (!samplerHeader)
156             return;
157 
158         for (auto instIt = bb->begin();
159             instIt != bb->end();
160             )
161         {
162             auto inst = (*instIt);
163 
164             if (inst->isSplitSend() &&
165                 inst->getMsgDesc()->isSampler())
166             {
167                 auto samplerHeaderInstIt = samplerHeaderMap.find(inst);
168 
169                 if (samplerHeaderInstIt != samplerHeaderMap.end())
170                 {
171                     auto samplerHeaderMov = (*samplerHeaderInstIt).second;
172 
173                     auto dupOp = samplerHeaderMov->cloneInst();
174 
175                     bb->insertBefore(instIt, dupOp);
176                 }
177             }
178 
179             instIt++;
180         }
181     }
182 
inSameSubroutine(G4_BB * use,G4_BB * def)183     bool Rematerialization::inSameSubroutine(G4_BB* use, G4_BB* def)
184     {
185         // Return true if both BBs belong to same sub
186         auto defBBIt = BBPerSubroutine.find(def);
187         auto useBBIt = BBPerSubroutine.find(use);
188 
189         // Neither BBs found in map means both are part of main kernel
190         if (defBBIt == BBPerSubroutine.end() &&
191             useBBIt == BBPerSubroutine.end())
192             return true;
193 
194         if (defBBIt != BBPerSubroutine.end() &&
195             useBBIt != BBPerSubroutine.end())
196         {
197             // Both BBs part of same subroutine
198             if ((*defBBIt).second == (*useBBIt).second)
199                 return true;
200         }
201 
202         // BBs not part of same subroutine
203         return false;
204     }
205 
206     // bb1 should block defining original computation and
207     // bb2 should be the block where remat is expected.
areInSameLoop(G4_BB * bb1,G4_BB * bb2,bool & bb1OutsideLoop)208     bool Rematerialization::areInSameLoop(G4_BB* bb1, G4_BB* bb2, bool& bb1OutsideLoop)
209     {
210         bool bb1InAnyLoop = false;
211         bb1OutsideLoop = false;
212 
213         // Check whether bb1 is in any loop at all. If not,
214         // then we can allow remat even if bb2 is in a loop.
215         // The case that is disallowed is where bb1 and bb2
216         // are both in loops, but in different ones.
217         for (auto&& be : kernel.fg.backEdges)
218         {
219             auto loopIt = kernel.fg.naturalLoops.find(be);
220 
221             if (loopIt != kernel.fg.naturalLoops.end())
222             {
223                 auto&& bbsInLoop = (*loopIt).second;
224 
225                 auto bb1InLoop = bbsInLoop.find(bb1);
226                 if (bb1InLoop != bbsInLoop.end())
227                 {
228                     bb1InAnyLoop = true;
229                     break;
230                 }
231             }
232         }
233 
234         if (!bb1InAnyLoop)
235             bb1OutsideLoop = true;
236 
237         for (auto&& be : kernel.fg.backEdges)
238         {
239             auto loopIt = kernel.fg.naturalLoops.find(be);
240 
241             if (loopIt != kernel.fg.naturalLoops.end())
242             {
243                 auto&& bbsInLoop = (*loopIt).second;
244 
245                 auto bb1InLoop = bbsInLoop.find(bb1);
246                 auto bb2InLoop = bbsInLoop.find(bb2);
247 
248                 // Both BBs must be present in all nested loops
249                 if ((bb1InLoop == bbsInLoop.end() && bb2InLoop != bbsInLoop.end()) ||
250                     (bb1InLoop != bbsInLoop.end() && bb2InLoop == bbsInLoop.end()))
251                 {
252                     return false;
253                 }
254             }
255         }
256 
257         return true;
258     }
259 
isRangeSpilled(G4_Declare * dcl)260     bool Rematerialization::isRangeSpilled(G4_Declare* dcl)
261     {
262         if (dcl)
263             return dcl->isSpilled();
264 
265         return false;
266     }
267 
areAllDefsInBB(G4_Declare * dcl,G4_BB * bb,unsigned int lexId)268     bool Rematerialization::areAllDefsInBB(G4_Declare* dcl, G4_BB* bb, unsigned int lexId)
269     {
270         auto defsIt = operations.find(dcl);
271         if (defsIt == operations.end())
272             return false;
273 
274         auto&& refs = (*defsIt).second;
275         // Each def must be in same BB as sampler header must appear lexically before sampler
276         for (auto&& d : refs.def)
277         {
278             if (d.second != bb)
279                 return false;
280 
281             if (d.first->getLexicalId() > lexId)
282                 return false;
283         }
284 
285 
286         return true;
287     }
288 
getLastUseLexId(G4_Declare * dcl)289     unsigned int Rematerialization::getLastUseLexId(G4_Declare* dcl)
290     {
291         unsigned int lastLexId = 0;
292         auto it = operations.find(dcl);
293         if (it != operations.end())
294             lastLexId = (*it).second.lastUseLexId;
295 
296         return lastLexId;
297     }
298 
cleanRedundantSamplerHeaders()299     void Rematerialization::cleanRedundantSamplerHeaders()
300     {
301         if (!samplerHeader)
302             return;
303 
304         for (auto bb : kernel.fg)
305         {
306             std::list<G4_INST*> lastMov;
307 
308             INST_LIST_ITER toErase = bb->end();
309 
310             if (deLVNedBBs.find(bb) == deLVNedBBs.end())
311                 continue;
312 
313             for (auto instIt = bb->begin(), instItEnd = bb->end();
314                 instIt != instItEnd;
315                 )
316             {
317                 auto inst = (*instIt);
318 
319                 if (toErase != bb->end())
320                 {
321                     for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
322                     {
323                         auto src = inst->getSrc(i);
324                         if (src && src->isSrcRegRegion())
325                         {
326                             auto topdcl = src->getTopDcl();
327                             if (topdcl == samplerHeader)
328                             {
329                                 // samplerHeader is used, so can't erase it
330                                 toErase = bb->end();
331                             }
332                         }
333                     }
334                 }
335 
336                 if (inst->isMov() && inst->getDst() && inst->getExecSize() == 1)
337                 {
338                     // mov (1|NM) samplerHeader(0,2)<1>:ud   imm
339                     auto dstTopDcl = inst->getDst()->getTopDcl();
340 
341                     if (dstTopDcl == samplerHeader)
342                     {
343                         if (toErase != bb->end())
344                         {
345                             lastMov.remove(*toErase);
346                             bb->erase(toErase);
347                             toErase = instIt;
348                         }
349 
350                         if (lastMov.size() > 0)
351                         {
352                             auto lastMovSrc0 = lastMov.back()->getSrc(0);
353                             auto instSrc0 = inst->getSrc(0);
354 
355                             if (inst->getDst()->getSubRegOff() == 2 &&
356                                 lastMovSrc0->isImm() == instSrc0->isImm() &&
357                                 lastMovSrc0->asImm()->getImm() == instSrc0->asImm()->getImm() &&
358                                 lastMovSrc0->getType() == instSrc0->getType())
359                             {
360                                 // Remove current instruction
361 #if 0
362                                 printf("Removing sampler header mov at $%d\n", inst->getCISAOff());
363 #endif
364                                 instIt = bb->erase(instIt);
365                                 toErase = bb->end();
366                                 continue;
367                             }
368                         }
369 
370                         toErase = instIt;
371 
372                         lastMov.push_back(inst);
373                     }
374                 }
375 
376                 instIt++;
377             }
378 
379             if (toErase != bb->end())
380                 bb->erase(toErase);
381         }
382     }
383 
checkLocalWAR(G4_INST * defInst,G4_BB * bb,INST_LIST_ITER useIter)384     bool Rematerialization::checkLocalWAR(G4_INST* defInst, G4_BB* bb, INST_LIST_ITER useIter)
385     {
386         INST_LIST_ITER currIter = useIter;
387         while (currIter != bb->begin())
388         {
389             currIter--;
390             auto currInst = *currIter;
391             if (currInst == defInst)
392                 break;
393 
394             auto currDst = currInst->getDst();
395             if (currDst && !currDst->isNullReg())
396             {
397                 auto dstDcl = currDst->getTopDcl();
398                 unsigned int curLb = currDst->getLeftBound();
399                 unsigned int curRb = currDst->getRightBound();
400 
401                 for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
402                 {
403                     auto srcOpnd = defInst->getSrc(i);
404                     if (srcOpnd &&
405                         !(srcOpnd->isNullReg()) &&
406                         srcOpnd->isSrcRegRegion())
407                     {
408                         G4_SrcRegRegion* srcRegion = srcOpnd->asSrcRegRegion();
409                         auto srcDcl = srcRegion->getTopDcl();
410                         unsigned int srcLb = srcRegion->getLeftBound(), srcRb = srcRegion->getRightBound();
411 
412                         if (dstDcl == srcDcl &&
413                             curRb >= srcLb &&
414                             curLb <= srcRb)
415                         {
416                             return false;
417                         }
418                     }
419                 }
420             }
421         }
422 
423         MUST_BE_TRUE(*currIter == defInst, "Cannot find defInst for Remat candidate!");
424 
425         return true;
426     }
427 
usesNoMaskWA(const Reference * uniqueDef)428     bool Rematerialization::usesNoMaskWA(const Reference* uniqueDef)
429     {
430         auto defInst = uniqueDef->first;
431 
432         // look for pattern like:
433         // (W&fx.y.anyh) inst
434         //
435         // where fx.y is:
436         // cmp.eq.fx.y (..)   null   rega   rega
437         if (!defInst->isWriteEnableInst())
438             return false;
439 
440         if (!defInst->getPredicate())
441             return false;
442 
443         auto predCtrl = defInst->getPredicate()->getControl();
444         if (predCtrl != PRED_ANY8H &&
445             predCtrl != PRED_ANY16H &&
446             predCtrl != PRED_ANY32H)
447             return false;
448 
449         return defInst->getPredicate()->isSameAsNoMask();
450     }
451 
isPartGRFBusyInput(G4_Declare * inputDcl,unsigned int atLexId)452     bool Rematerialization::isPartGRFBusyInput(G4_Declare* inputDcl, unsigned int atLexId)
453     {
454         // inputDcl is an input G4_Declare that has pre-defined assignment.
455         // Extending a pre-assigned assignment can be bad if its a scalar
456         // and no other part of that GRF is busy. OTOH, it may be beneficial
457         // to extend inputDcl if there is another pre-defined G4_Declare
458         // sharing physical register assignment (different sub-register)
459         // with inputDcl and is live beyond where we want to extend inputDcl.
460 
461         // This function checks whether there is any other G4_Declare that
462         // shares same GRF assignment as inputDcl. If there is then check
463         // whether last use of that assignment is beyond atLexId. If one
464         // if found then return true. Return false otherwise.
465 
466         if (!inputDcl->getRegVar()->getPhyReg() ||
467             !inputDcl->getRegVar()->getPhyReg()->isGreg())
468         {
469             return false;
470         }
471 
472         auto inputRegNum = inputDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
473 
474         for (auto dcl : preDefinedVars)
475         {
476             auto ref = operations.find(dcl);
477             if (ref == operations.end())
478                 continue;
479 
480             if (!dcl->getRegVar()->getPhyReg() ||
481                 !dcl->getRegVar()->getPhyReg()->isGreg())
482                 continue;
483 
484             auto regNum = dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
485             if (regNum == inputRegNum)
486             {
487                 if ((*ref).second.lastUseLexId >= atLexId)
488                     return true;
489             }
490         }
491 
492         return false;
493     }
494 
495 
canRematerialize(G4_SrcRegRegion * src,G4_BB * bb,const Reference * & ref,INST_LIST_ITER instIter)496     bool Rematerialization::canRematerialize(G4_SrcRegRegion* src, G4_BB* bb, const Reference*& ref, INST_LIST_ITER instIter)
497     {
498         // op1 (8) A   B   C
499         // ...
500         // op2 (8) D   A   X
501         //
502         // This function will check whether rematerialize an operand,
503         // eg A in op2 is possible.
504         //
505         auto topdcl = src->getTopDcl();
506         if (!topdcl)
507             return false;
508 
509         if (src->getInst()->isSplitIntrinsic())
510             return false;
511 
512         // ADDRESS/FLAG spilled declare
513         if (topdcl->getSpilledDeclare())
514             return false;
515 
516         if (topdcl->getAddressed())
517             return false;
518 
519         if (topdcl->getRegVar()->getPhyReg())
520             return false;
521 
522         // Src must belong to GRF file
523         if ((topdcl->getRegFile() &
524             (G4_RegFileKind::G4_GRF | G4_RegFileKind::G4_INPUT)) == 0x0)
525             return false;
526 
527         // Skip remat if src opnd uses special acc registers
528         if (src->getAccRegSel() != ACC_UNDEFINED)
529             return false;
530 
531         // Lookup defs of src in program
532         auto opIt = operations.find(topdcl);
533         if (opIt == operations.end())
534             return false;
535 
536         auto&& refs = (*opIt).second;
537         auto uniqueDef = findUniqueDef(refs, src);
538 
539         if (!uniqueDef)
540             return false;
541 
542         if (gra.isNoRemat(uniqueDef->first))
543             return false;
544 
545         // Def has a lot of uses so we will need lots of remat to make this profitable
546         if (refs.numUses > MAX_USES_REMAT)
547             return false;
548 
549         if (uniqueDef->first->getCondMod())
550             return false;
551 
552         if (uniqueDef->first->getPredicate() &&
553             !usesNoMaskWA(uniqueDef))
554             return false;
555 
556         // It is illegal to rematerialize intrinsic.split instruction as it
557         // is dependent on an earlier send.
558         if (uniqueDef->first->isSplitIntrinsic())
559             return false;
560 
561         ref = uniqueDef;
562 
563         // Check whether op1 can be recomputed
564         auto srcInst = src->getInst();
565         auto uniqueDefInst = uniqueDef->first;
566         auto uniqueDefBB = uniqueDef->second;
567 
568         if (!isRematCandidateOp(uniqueDefInst))
569             return false;
570 
571         unsigned int srcLexId = srcInst->getLexicalId();
572         unsigned int origOpLexId = uniqueDefInst->getLexicalId();
573 
574         if (origOpLexId > srcLexId)
575             return false;
576 
577         // Def-use must be far away
578         unsigned int minDefUseDist = MIN_DEF_USE_DISTANCE;
579 
580         // If def is a scalar and its def/use lie entirely in a BB,
581         // then increase min def use distance heuristic as remating
582         // closeby is unlikely to provide perf benefit.
583         if (uniqueDefInst->getExecSize() == 1)
584         {
585             if(uniqueDefBB->back()->getLexicalId() >= refs.lastUseLexId)
586                 minDefUseDist *= 2;
587         }
588 
589         if ((srcLexId - origOpLexId) < minDefUseDist)
590             return false;
591 
592         if (!inSameSubroutine(bb, uniqueDefBB))
593             return false;
594 
595         // If uniqueDefBB is not under SIMD CF, current BB is under SIMD CF
596         // and use has NoMask set, then we can remat only if def has NoMask
597         // option set.
598         if (!uniqueDefBB->isDivergent() &&
599             bb->isDivergent() &&
600             !uniqueDefInst->isWriteEnableInst() &&
601             srcInst->isWriteEnableInst())
602         {
603             return false;
604         }
605 
606         // Check whether they are in a loop. If yes, they should be in same loop.
607         bool uniqueDefOutsideLoop = false;
608         bool srcDclSpilled = isRangeSpilled(topdcl);
609         bool inSameLoop = areInSameLoop(uniqueDefBB, bb, uniqueDefOutsideLoop);
610         bool onlyUseInLoop = uniqueDefOutsideLoop && !inSameLoop;
611         bool doNumRematCheck = false;
612 
613         // Decide whether it is profitable to push def inside loop before each use
614         if (onlyUseInLoop && !srcDclSpilled)
615         {
616             // If topdcl does not interfere with other spilled
617             // range then skip remating this operation.
618             // Be less aggressive if this is SIMD8 since we run the
619             // chance of perf penalty with this.
620             if ((kernel.getSimdSize() == 8 && rpe.getRegisterPressure(srcInst) < (float)rematLoopRegPressure * 1.6f) ||
621                 rematCandidates[topdcl->getRegVar()->getId()] == false ||
622                 rpe.getRegisterPressure(srcInst) < rematLoopRegPressure)
623                 return false;
624 
625             if (getNumRematsInLoop() > 0)
626             {
627                 // Restrict non-SIMD1 remats to a low percent of loop instructions.
628                 float loopInstToTotalInstRatio = (float)getNumRematsInLoop() / (float)loopInstsBeforeRemat*100.0f;
629                 if (rpe.getMaxRP() < rematRegPressure * 1.4f)
630                 {
631                     // If max RPE is not very high, dont sink too many instructions in loop
632                     if(loopInstToTotalInstRatio > 1.75f)
633                         return false;
634                 }
635                 else if (loopInstToTotalInstRatio > 3.89f)
636                     return false;
637             }
638         }
639 
640         if (!inSameLoop)
641         {
642             if (!uniqueDefOutsideLoop)
643                 return false;
644             else
645             {
646                 // When op1 is outside loop and op2 is indside loop,
647                 // allow remat if op1 dst dcl is marked spilled.
648                 // Because that means a load will  be inserted in the
649                 // loop and remat might be more efficient here.
650                 if (!srcDclSpilled)
651                 {
652                     // If src dcl is not spilled, check whether all
653                     // src opnds of defInst have been remat'd atleast once.
654                     // This heuristic helps decide if remat will be worthwhile
655                     // in a loop.
656                     doNumRematCheck = true;
657                 }
658             }
659         }
660 
661         if (inSameLoop && !uniqueDefOutsideLoop)
662         {
663             // Remat is done in loop only if declare
664             // is marked as spill, so remat will
665             // benefit it. Otherwise, if var has a
666             // single use within the loop then remat
667             // can be done as it doesnt contribute to
668             // increase in inst count.
669             if (!srcDclSpilled && refs.numUses > 1)
670                 return false;
671         }
672 
673         // Check liveness of each src operand in original op
674         bool srcLive[G4_MAX_SRCS];
675         bool anySrcNotLive = false;
676         for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
677         {
678             srcLive[i] = true;
679             auto srcOpnd = uniqueDefInst->getSrc(i);
680             if (!srcOpnd || srcOpnd->isImm() || srcOpnd->isNullReg())
681                 continue;
682 
683             if (srcOpnd->isSrcRegRegion())
684             {
685                 // If src operand base is non-regvar (eg, architecture
686                 // register) then dont remat. Moving around such
687                 // registers could be dangerous.
688                 if (!srcOpnd->getBase()->isRegVar())
689                     return false;
690 
691                 // Check whether this src has a single unique def
692                 auto srcOpndRgn = srcOpnd->asSrcRegRegion();
693                 auto srcOpndTopDcl = srcOpndRgn->getTopDcl();
694 
695                 if (doNumRematCheck && getNumRemats(srcOpndTopDcl) == 0)
696                 {
697                     return false;
698                 }
699 
700                 const auto &pointsToSet = liveness.getPointsToAnalysis().getIndrUseVectorForBB(bb->getId());
701                 G4_RegVar* srcVar = srcOpndTopDcl->getRegVar();
702                 auto it = std::find_if(pointsToSet.begin(), pointsToSet.end(),
703                     [&srcVar](const pointInfo& element) {return element.var == srcVar && element.off == 0; });
704 
705                 if (srcOpndTopDcl->getAddressed() &&
706                     ((uniqueDefBB != bb) ||
707                       it != pointsToSet.end()))
708                 {
709                     // Indirectly addressed src opnd should not be extended
710                     return false;
711                 }
712 
713                 if ((srcOpndTopDcl->getRegFile() &
714                     (G4_RegFileKind::G4_GRF | G4_RegFileKind::G4_INPUT)) == 0x0)
715                     return false;
716 
717                 // If an instruction has physical registers allocated then
718                 // dont optimize it.
719                 if (srcOpndRgn->getBase()->asRegVar()->getPhyReg() &&
720                     !srcOpndTopDcl->isInput())
721                     return false;
722 
723                 if (srcOpndTopDcl->isInput())
724                 {
725                     auto opIt = operations.find(srcOpndTopDcl);
726                     if (opIt != operations.end())
727                     {
728                         // Check whether input variable has explicit def in function
729                         if ((*opIt).second.def.size() > 0)
730                             return false;
731                     }
732 
733                     if ((*opIt).second.lastUseLexId < srcLexId &&
734                         (!isPartGRFBusyInput((*opIt).first, srcLexId) ||
735                         !inSameLoop))
736                     {
737                         // Inputs are pre-assigned and extending such ranges
738                         // could lead to worse RA results, unless the input
739                         // already extends beyond where we intend to remat.
740                         return false;
741                     }
742                 }
743 
744                 // Run separate checks for sampler
745                 if (uniqueDefInst->isSplitSend() &&
746                     uniqueDefInst->getMsgDesc()->isSampler() &&
747                     uniqueDefInst->getSrc(2)->isImm() &&
748                     uniqueDefInst->getSrc(3)->isImm())
749                 {
750                     if (!kernel.getOptions()->getOption(vISA_cacheSamplerHeader))
751                         return false;
752 
753                     // Sampler definition to be rematerialized
754                     // sends (8) V54(0,0):f samplerHeader(0,0) V53(0,0) 0x42:ud 0x24a7002:ud{Align1, Q1}
755                     // resLen = 4, msgLen = 1, extMsgLen = 1
756                     // samplerHeader can be rematerialized as it is r0.0 with modified r0.2.
757                     // V53 above will simply be extended since it requires extra computation to rematerialize.
758                     // Above sampler inst has a header. Some sampler instructions may not have a header.
759                     // For such headerless samplers we need to check whether it is profitable to extend
760                     // both src operands.
761 
762                     // Ensure resLen > extMsgLen to make rematerialization profitable.
763                     unsigned len = uniqueDefInst->getMsgDesc()->getSrc1LenRegs();
764 
765                     // For Sanity, just verify V53 has defs before sampler send only.
766                     auto extMsgOpnd = uniqueDefInst->getSrc(1);
767                     MUST_BE_TRUE(extMsgOpnd->isSrcRegRegion() == true, "Unexpected src opnd for sampler");
768 
769                     // Dont remat if sampler def is outside loop and use inside loop
770                     if (onlyUseInLoop)
771                         return false;
772 
773                     if (!areAllDefsInBB(extMsgOpnd->asSrcRegRegion()->getTopDcl(), uniqueDefBB, uniqueDefInst->getLexicalId()))
774                         return false;
775 
776                     bool samplerHeaderNotUsed = uniqueDefInst->getSrc(0)->asSrcRegRegion()->getTopDcl() != kernel.fg.builder->getBuiltinSamplerHeader();
777 
778                     if (!uniqueDefInst->getMsgDescRaw() ||
779                         !uniqueDefInst->getMsgDescRaw()->isHeaderPresent() ||
780                         samplerHeaderNotUsed)
781                     {
782                         len += uniqueDefInst->getMsgDesc()->getSrc0LenRegs();
783 
784                         auto msgOpnd = uniqueDefInst->getSrc(0);
785                         if (!areAllDefsInBB(msgOpnd->asSrcRegRegion()->getTopDcl(), uniqueDefBB, uniqueDefInst->getLexicalId()))
786                             return false;
787 
788                         if (liveness.isLiveAtExit(bb, msgOpnd->getTopDcl()->getRegVar()->getId()) ||
789                             getLastUseLexId(msgOpnd->getTopDcl()) >= srcLexId)
790                             len -= uniqueDefInst->getMsgDesc()->getSrc0LenRegs();
791                     }
792 
793                     if (samplerHeaderNotUsed)
794                     {
795                         // Ensure header creation instructions are used only by sampler
796                         auto msgOpndTopDcl = uniqueDefInst->getSrc(0)->asSrcRegRegion()->getTopDcl();
797                         auto topDclOpsIt = operations.find(msgOpndTopDcl);
798                         if (topDclOpsIt == operations.end())
799                             return false;
800 
801                         if ((*topDclOpsIt).second.numUses > 1)
802                             return false;
803 
804                         for (auto& def : (*topDclOpsIt).second.def)
805                         {
806                             for (unsigned int i = 0; i != G4_MAX_SRCS; i++)
807                             {
808                                 auto src = def.first->getSrc(i);
809                                 if (!src)
810                                     continue;
811 
812                                 if (src->isImm())
813                                     continue;
814 
815                                 if (src->isSrcRegRegion() &&
816                                     (src->asSrcRegRegion()->getTopDcl() == kernel.fg.builder->getBuiltinSamplerHeader() ||
817                                         src->asSrcRegRegion()->getTopDcl() == kernel.fg.builder->getBuiltinR0()))
818                                     continue;
819 
820                                 // Using some other var in payload src requires extra checks to remat, so skip it
821                                 return false;
822                             }
823                         }
824                     }
825 
826                     if (liveness.isLiveAtExit(bb, extMsgOpnd->getTopDcl()->getRegVar()->getId()) ||
827                         getLastUseLexId(extMsgOpnd->getTopDcl()) >= srcLexId)
828                         len -= uniqueDefInst->getMsgDesc()->getSrc1LenRegs();
829 
830                     if (refs.rowsUsed.size() <= len)
831                         return false;
832 
833                     return true;
834                 }
835                 else
836                 {
837                     // Non-sampler definition to be rematerialized
838                     if (uniqueDefInst->isSend())
839                         return false;
840 
841                     auto opIt = operations.find(srcOpndTopDcl);
842                     if (opIt == operations.end())
843                         return false;
844 
845                     auto&& srcOpndRefs = (*opIt).second;
846                     auto srcOpndUniqueDef = findUniqueDef(srcOpndRefs, srcOpndRgn);
847 
848                     bool isSrcAvailble = false;
849                     if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM &&
850                         uniqueDefBB == bb)
851                     {
852                         isSrcAvailble = checkLocalWAR(uniqueDefInst, bb, instIter);
853                     }
854 
855                     if (!srcOpndUniqueDef &&
856                         !isSrcAvailble &&
857                         !srcOpndTopDcl->isInput())
858                         return false;
859 
860                     if (srcOpndUniqueDef &&
861                         !inSameSubroutine(bb, srcOpndUniqueDef->second))
862                         return false;
863 
864                     // Check if its live in/live out to/of current BB
865                     unsigned int id = srcOpndTopDcl->getRegVar()->getId();
866                     if (!liveness.isLiveAtExit(bb, id) &&
867                         // Even if a var is not live-out, its live-range
868                         // might extend till inst of interest.
869                         srcOpndRefs.lastUseLexId < srcInst->getLexicalId())
870                     {
871                         // Opnd may not be live, but it is still possible to
872                         // extend its live-range to remat it. For scalars, this
873                         // could be profitable too.
874                         srcLive[i] = false;
875                         anySrcNotLive = true;
876                     }
877                 }
878             }
879         }
880 
881         if (anySrcNotLive)
882         {
883             // Apply cost heuristic. It may be profitable to extend
884             // scalars sometimes.
885             for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
886             {
887                 if (!srcLive[i])
888                 {
889                     G4_SrcRegRegion* srcRgn = uniqueDefInst->getSrc(i)->asSrcRegRegion();
890 
891                     if (srcRgn->getTopDcl()->getNumElems() > 1 &&
892                         getNumUses(srcRgn->getTopDcl()) < 20)
893                     {
894                         // Extending non-scalar operands can be expensive
895                         return false;
896                     }
897                 }
898             }
899         }
900 
901         // Record remats in loop only for non-scalar operations. This is a heuristic used
902         // to not remat excessively in loops.
903         if (!inSameLoop &&
904             uniqueDefInst->getExecSize() > 1)
905             incNumRematsInLoop();
906 
907         if (cr0DefBB && IS_TYPE_FLOAT_ALL(uniqueDefInst->getExecType()))
908         {
909             return false;
910         }
911 
912         return true;
913     }
914 
rematerialize(G4_SrcRegRegion * src,G4_BB * bb,const Reference * uniqueDef,std::list<G4_INST * > & newInst,G4_INST * & cacheInst)915     G4_SrcRegRegion* Rematerialization::rematerialize(
916         G4_SrcRegRegion* src, G4_BB* bb, const Reference* uniqueDef,
917         std::list<G4_INST*>& newInst, G4_INST*& cacheInst)
918     {
919         // op1 (8) A   B   C
920         // ...
921         // op2 (8) D   A   E
922         //
923         // =>
924         // op1 (8) A   B   C
925         // ...
926         // op1_dup (8) A1   B   C
927         // op2 (8) D   A1   E
928 
929         G4_SrcRegRegion* rematSrc = nullptr;
930 
931         auto dstInst = uniqueDef->first;
932         auto dst = dstInst->getDst();
933         bool isSampler = dstInst->isSplitSend() && dstInst->getMsgDesc()->isSampler();
934 
935         for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
936         {
937             G4_Operand* src = dstInst->getSrc(i);
938             if (src &&
939                 src->isSrcRegRegion())
940             {
941                 incNumRemat(src->asSrcRegRegion()->getTopDcl());
942             }
943         }
944 
945         if (!isSampler)
946         {
947             unsigned int diffBound = dst->getRightBound() - (dst->getRegOff() * numEltPerGRF<Type_UB>());
948             unsigned numElems = (diffBound + 1) / dst->getTypeSize();
949             auto newTemp = kernel.fg.builder->createTempVar(numElems, dst->getType(), Any, "REMAT_");
950             newTemp->copyAlign(dst->getTopDcl());
951             gra.copyAlignment(newTemp, dst->getTopDcl());
952             G4_DstRegRegion* newDst = kernel.fg.builder->createDst(newTemp->getRegVar(), 0,
953                 (dst->getLeftBound() % numEltPerGRF<Type_UB>()) / dst->getTypeSize(),
954                 dst->getHorzStride(), dst->getType());
955             G4_INST* dupOp = dstInst->cloneInst();
956             dupOp->setDest(newDst);
957             dupOp->inheritDIFrom(dstInst);
958 
959             rematSrc = createSrcRgn(src, dst, newTemp);
960 
961             newInst.push_back(dupOp);
962 
963             cacheInst = newInst.back();
964         }
965         else
966         {
967             G4_Operand* src0 = nullptr;
968             // Look up samplerHeader(0,2) definition
969             auto sampleHeaderTopDcl = uniqueDef->first->getSrc(0)->asSrcRegRegion()->getTopDcl();
970             if (sampleHeaderTopDcl == kernel.fg.builder->getBuiltinSamplerHeader())
971             {
972                 samplerHeader = sampleHeaderTopDcl;
973                 if (!samplerHeaderMapPopulated)
974                 {
975                     populateSamplerHeaderMap();
976                 }
977 
978                 if (deLVNedBBs.find(bb) == deLVNedBBs.end())
979                 {
980                     // DeLVN one bb at a time when required
981                     deLVNSamplers(bb);
982                     deLVNedBBs.insert(bb);
983                 }
984 
985                 auto samplerDefIt = samplerHeaderMap.find(uniqueDef->first);
986                 auto prevHeaderMov = (*samplerDefIt).second;
987 
988                 src0 = dstInst->getSrc(0);
989 
990                 // Duplicate sampler header setup instruction
991                 auto dupOp = prevHeaderMov->cloneInst();
992                 newInst.push_back(dupOp);
993             }
994             else
995             {
996                 // Handle sampler when src0 is not builtin sampler header
997                 auto src0Rgn = uniqueDef->first->getSrc(0)->asSrcRegRegion();
998                 auto src0TopDcl = src0Rgn->getTopDcl();
999                 auto ops = operations.find(src0TopDcl);
1000                 MUST_BE_TRUE(ops != operations.end(), "Didnt find record in map");
1001                 MUST_BE_TRUE((*ops).second.numUses == 1, "Expecting src0 to be used only in sampler");
1002 
1003                 G4_Declare* newSrc0Dcl = nullptr;
1004                 if (src0TopDcl->getRegVar()->isPhyRegAssigned())
1005                 {
1006                     newSrc0Dcl = src0TopDcl;
1007                 }
1008                 else
1009                 {
1010                     newSrc0Dcl = kernel.fg.builder->createTempVar(src0TopDcl->getTotalElems(),
1011                         src0TopDcl->getElemType(), gra.getSubRegAlign(src0TopDcl));
1012 
1013                     // Clone all defining instructions for sampler's msg header
1014                     for (unsigned int i = 0; i != (*ops).second.def.size(); i++)
1015                     {
1016                         auto& headerDefInst = (*ops).second.def[i].first;
1017 
1018                         auto dupOp = headerDefInst->cloneInst();
1019                         auto headerDefDst = headerDefInst->getDst();
1020                         assert(!headerDefDst->isIndirect()); // we dont allow send header to be defined indirectly
1021                         dupOp->setDest(kernel.fg.builder->createDst(
1022                             newSrc0Dcl->getRegVar(), headerDefDst->getRegOff(), headerDefDst->getSubRegOff(),
1023                             headerDefDst->getHorzStride(), headerDefDst->getType()));
1024                         newInst.push_back(dupOp);
1025                     }
1026                 }
1027 
1028                 auto rd = kernel.fg.builder->createRegionDesc(src0Rgn->getRegion()->vertStride,
1029                     src0Rgn->getRegion()->width, src0Rgn->getRegion()->horzStride);
1030 
1031                 src0 = kernel.fg.builder->createSrc(
1032                     newSrc0Dcl->getRegVar(), src0Rgn->getRegOff(), src0Rgn->getSubRegOff(),
1033                     rd, src0Rgn->getType());
1034             }
1035 
1036             auto samplerDst = kernel.fg.builder->createTempVar(dst->getTopDcl()->getTotalElems(), dst->getTopDcl()->getElemType(),
1037                 gra.getSubRegAlign(dst->getTopDcl()), "REMAT_SAMPLER_");
1038             auto samplerDstRgn = kernel.fg.builder->createDst(samplerDst->getRegVar(), 0,
1039                 0, 1, samplerDst->getElemType());
1040 
1041             auto dstMsgDesc = dstInst->getMsgDescRaw();
1042             // TODO: this may not hold when we start using load/store descriptors
1043             MUST_BE_TRUE(dstMsgDesc, "expected raw descriptor");
1044 
1045             auto newMsgDesc = kernel.fg.builder->createGeneralMsgDesc(
1046                 dstMsgDesc->getDesc(),
1047                 dstMsgDesc->getExtendedDesc(), dstMsgDesc->getAccess(),
1048                 kernel.fg.builder->duplicateOperand(dstMsgDesc->getSurface()),
1049                 kernel.fg.builder->duplicateOperand(dstMsgDesc->getSti()));
1050 
1051             auto dupOp = kernel.fg.builder->createSplitSendInst(nullptr, dstInst->opcode(), dstInst->getExecSize(), samplerDstRgn,
1052                 kernel.fg.builder->duplicateOperand(src0)->asSrcRegRegion(),
1053                 kernel.fg.builder->duplicateOperand(dstInst->getSrc(1))->asSrcRegRegion(),
1054                 kernel.fg.builder->duplicateOperand(dstInst->asSendInst()->getMsgDescOperand()), dstInst->getOption(),
1055                 newMsgDesc, kernel.fg.builder->duplicateOperand(dstInst->getSrc(3)), true);
1056             dupOp->setCISAOff(dstInst->getCISAOff());
1057             dupOp->inheritDIFrom(dstInst);
1058 
1059             newInst.push_back(dupOp);
1060 
1061             rematSrc = createSrcRgn(src, dst, samplerDst);
1062 
1063             cacheInst = newInst.back();
1064         }
1065 
1066         // Fix for NoMaskWA
1067         for (auto inst : newInst)
1068             if (inst->getPredicate() && inst->getPredicate()->isSameAsNoMask())
1069                 inst->setPredicate(nullptr);
1070 
1071         return rematSrc;
1072     }
1073 
createSrcRgn(G4_SrcRegRegion * srcToRemat,G4_DstRegRegion * uniqueDef,G4_Declare * rematTemp)1074     G4_SrcRegRegion* Rematerialization::createSrcRgn(G4_SrcRegRegion* srcToRemat, G4_DstRegRegion* uniqueDef, G4_Declare* rematTemp)
1075     {
1076         G4_SrcRegRegion* rematSrc = nullptr;
1077 
1078         unsigned row = (srcToRemat->getLeftBound() / numEltPerGRF<Type_UB>()) - (uniqueDef->getLeftBound() / numEltPerGRF<Type_UB>());
1079         unsigned subReg = (srcToRemat->getLeftBound() % numEltPerGRF<Type_UB>()) / srcToRemat->getTypeSize();
1080 
1081         rematSrc = kernel.fg.builder->createSrcRegRegion(srcToRemat->getModifier(), Direct,
1082             rematTemp->getRegVar(), (short)row, (short)subReg, srcToRemat->getRegion(), srcToRemat->getType());
1083 
1084         return rematSrc;
1085     }
1086 
findUniqueDef(References & refs,G4_SrcRegRegion * src)1087     const Reference* Rematerialization::findUniqueDef(References & refs, G4_SrcRegRegion *src)
1088     {
1089         // This function looks up list of definitions for a topdcl (src->getTopDcl()) and
1090         // returns a single dst region that defines that src region. If more than 1 def
1091         // match lb/rb of src then nullptr is returned. If a partial unique def is found
1092         // even then nullptr is returned.
1093 
1094         Reference* uniqueDef = nullptr;
1095 
1096         unsigned int lb = src->getLeftBound(), rb = src->getRightBound();
1097         for (auto&& r : refs.def)
1098         {
1099             auto curdst = r.first->getDst();
1100             unsigned int curlb = curdst->getLeftBound();
1101             unsigned int currb = curdst->getRightBound();
1102 
1103             if (curlb <= lb && currb >= rb)
1104             {
1105                 if (uniqueDef)
1106                 {
1107                     uniqueDef = nullptr;
1108                     break;
1109                 }
1110                 else
1111                 {
1112                     uniqueDef = &r;
1113                 }
1114             }
1115             else if ((curlb <= lb && currb >= lb) ||
1116                 (curlb <= rb && currb >= lb))
1117             {
1118                 // Partial overlap
1119                 uniqueDef = nullptr;
1120                 break;
1121             }
1122         }
1123 
1124         if (uniqueDef)
1125         {
1126             G4_RegFileKind rf = refs.def.front().first->getDst()->getTopDcl()->getRegFile();
1127             if (rf == G4_RegFileKind::G4_INPUT)
1128             {
1129                 // Variable is an input as well as has a def
1130                 uniqueDef = nullptr;
1131             }
1132         }
1133 
1134         return uniqueDef;
1135     }
1136 
getNumSamplers(G4_Kernel & kernel)1137     unsigned int getNumSamplers(G4_Kernel& kernel)
1138     {
1139         unsigned int numSampler = 0;
1140 
1141         for (auto bb : kernel.fg)
1142         {
1143             for (auto inst : *bb)
1144             {
1145                 if (inst->isSplitSend() &&
1146                     inst->getMsgDesc()->isSampler())
1147                 {
1148                     numSampler++;
1149                 }
1150             }
1151         }
1152 
1153         return numSampler;
1154     }
1155 
run()1156     void Rematerialization::run()
1157     {
1158         populateRefs();
1159 
1160         auto firstProgInst = kernel.fg.getEntryBB()->getFirstInst();
1161 
1162         for (auto bb : kernel.fg)
1163         {
1164             if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISATarget::VISA_3D)
1165             {
1166                 // For Cm, assume cr0 def is live across BBs
1167                 // For IGC, assume cr0 is reset at each BB entry
1168                 cr0DefBB = false;
1169             }
1170             // Store cache of rematerialized operations so nearby instructions
1171             // can reuse them.
1172             // <Unique def, <Remat'd def, Lexical id of last ref>>
1173             std::map<const Reference*, std::pair<G4_INST*, unsigned int>> rematValues;
1174             for (auto instIt = bb->begin();
1175                 instIt != bb->end();
1176                 instIt++)
1177             {
1178                 auto inst = (*instIt);
1179                 auto dst = inst->getDst();
1180                 bool runRemat = false;
1181 
1182                 cr0DefBB |= dst &&
1183                     dst->isCrReg() && (inst != firstProgInst);
1184 
1185                 // Run remat if any src opnd is spilled
1186                 for (unsigned int opnd = 0; opnd < G4_MAX_SRCS; opnd++)
1187                 {
1188                     auto src = inst->getSrc(opnd);
1189 
1190                     if (src &&
1191                         src->isSrcRegRegion())
1192                     {
1193                         auto srcTopDcl = src->getTopDcl();
1194                         if (srcTopDcl && srcTopDcl->getRegVar()->isRegAllocPartaker() &&
1195                             (isRangeSpilled(srcTopDcl) ||
1196                             rematCandidates[srcTopDcl->getRegVar()->getId()] == true))
1197                         {
1198                             // Run remat for spilled src opnd even if
1199                             // register pressure is low.
1200                             runRemat = true;
1201                             break;
1202                         }
1203                     }
1204                 }
1205 
1206                 if (!runRemat)
1207                 {
1208                     auto regPressure = rpe.getRegisterPressure(inst);
1209 
1210                     if (regPressure < rematRegPressure)
1211                     {
1212                         continue;
1213                     }
1214                 }
1215 
1216                 // High register pressure found at current instruction so try to remat
1217                 for (unsigned int opnd = 0; opnd < G4_MAX_SRCS; opnd++)
1218                 {
1219                     auto src = inst->getSrc(opnd);
1220 
1221                     if (src &&
1222                         src->isSrcRegRegion())
1223                     {
1224                         const Reference* uniqueDef = nullptr;
1225                         G4_SrcRegRegion* rematSrc = nullptr;
1226 
1227                         bool canRemat = canRematerialize(src->asSrcRegRegion(), bb, uniqueDef, instIt);
1228                         if (canRemat)
1229                         {
1230                             bool reUseRemat = false;
1231                             auto prevRematIt = rematValues.find(uniqueDef);
1232                             if (prevRematIt != rematValues.end())
1233                             {
1234                                 if ((inst->getLexicalId() - (*prevRematIt).second.second) <=
1235                                     MAX_LOCAL_REMAT_REUSE_DISTANCE)
1236                                 {
1237                                     reUseRemat = true;
1238                                     rematSrc = createSrcRgn(src->asSrcRegRegion(), uniqueDef->first->getDst(),
1239                                         (*prevRematIt).second.first->getDst()->getTopDcl());
1240 
1241                                     reduceNumUses(src->getTopDcl());
1242 
1243 #if 0
1244                                     printf("Reusing rematerialized value %s in src%d of $%d from %s\n",
1245                                         src->getTopDcl()->getName(), opnd, inst->getCISAOff(),
1246                                         (*prevRematIt).second.first->getDst()->getTopDcl()->getName());
1247 #endif
1248                                 }
1249                                 (*prevRematIt).second.second = inst->getLexicalId();
1250                             }
1251 
1252                             if (!reUseRemat)
1253                             {
1254 #if 0
1255                                 printf("Will rematerialize %s in src%d of $%d. Source computation at $%d\n",
1256                                     src->getTopDcl()->getName(), opnd, inst->getCISAOff(), uniqueDef->first->getCISAOff());
1257 #endif
1258                                 std::list<G4_INST*> newInsts;
1259                                 G4_INST* cacheInst = nullptr;
1260                                 rematSrc = rematerialize(src->asSrcRegRegion(), bb, uniqueDef, newInsts, cacheInst);
1261                                 while (!newInsts.empty())
1262                                 {
1263                                     bb->insertBefore(instIt, newInsts.front());
1264                                     newInsts.pop_front();
1265                                 }
1266 
1267                                 rematValues.insert(std::make_pair(uniqueDef, std::make_pair(cacheInst, src->getInst()->getLexicalId())));
1268 
1269                                 reduceNumUses(src->getTopDcl());
1270 
1271                                 IRChanged = true;
1272                             }
1273 
1274                             inst->setSrc(rematSrc, opnd);
1275                         }
1276                     }
1277                 }
1278             }
1279         }
1280 
1281         cleanRedundantSamplerHeaders();
1282 
1283         kernel.dumpToFile("after.remat");
1284     }
1285 }
1286