1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2019-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "VarSplit.h"
10 #include "GraphColor.h"
11 
12 namespace vISA
13 {
14 
VarSplitPass(G4_Kernel & k)15 VarSplitPass::VarSplitPass(G4_Kernel& k) : kernel(k)
16 {
17 }
18 
buildPreVerify()19 void VarSplitPass::buildPreVerify()
20 {
21     for (auto bb : kernel.fg)
22     {
23         for (auto inst :*bb)
24         {
25             if (inst->getDst())
26             {
27                 splitVerify[inst].dst = inst->getDst();
28                 splitVerify[inst].dstLb = inst->getDst()->getLeftBound();
29                 splitVerify[inst].dstRb = inst->getDst()->getRightBound();
30             }
31             for (unsigned int i = 0; i != G4_MAX_SRCS; i++)
32             {
33                 if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion())
34                 {
35                     splitVerify[inst].src[i] = inst->getSrc(i);
36                     splitVerify[inst].srcLb[i] = inst->getSrc(i)->getLeftBound();
37                     splitVerify[inst].srcRb[i] = inst->getSrc(i)->getRightBound();
38                 }
39             }
40         }
41     }
42 }
43 
verify()44 void VarSplitPass::verify()
45 {
46     // verify
47     // parent, <child, <lb, rb>>
48     std::unordered_map<G4_Declare*, std::unordered_map<G4_Declare*, std::pair<unsigned int, unsigned int>>> parentSplit;
49     std::unordered_set<G4_Declare*> splitDcls;
50     unsigned int numSplitIntrinsics = 0;
51 
52     auto getChildData = [&](G4_Declare* child)
53     {
54         std::pair<unsigned int, unsigned int> childLbRb = { 0,0 };
55         bool found = false;
56         for (auto& item : parentSplit)
57         {
58             for (auto& itemCh : (item).second)
59             {
60                 if (itemCh.first == child)
61                 {
62                     MUST_BE_TRUE(!found, "Duplicate lb/rb entry found");
63                     childLbRb = itemCh.second;
64                     found = true;
65                 }
66             }
67         }
68 
69         MUST_BE_TRUE(found, "Didnt find child dcl");
70         return childLbRb;
71     };
72 
73     // create parent->child mapping
74     for (auto bb : kernel.fg)
75     {
76         for (auto inst :*bb)
77         {
78             if (inst->isSplitIntrinsic())
79             {
80                 // ensure this is split mov instruction
81                 MUST_BE_TRUE(inst->isSplitIntrinsic(), "Didnt expect new non-split intrinsic instruction");
82 
83                 // verify that split instruction's dst, src(0) is correct
84                 splitDcls.insert(inst->getDst()->getTopDcl());
85 
86                 MUST_BE_TRUE(!inst->getSrc(0)->getTopDcl()->getAddressed(), "Shouldnt split indirectly addressed variable");
87 
88                 auto origSrc0 = inst->getSrc(0)->asSrcRegRegion();
89                 auto origLb = origSrc0->getLeftBound();
90                 auto origRb = origSrc0->getRightBound();
91                 auto itemToInsert = std::make_pair(inst->getDst()->getTopDcl(), std::make_pair((unsigned int)origLb, (unsigned int)origRb));
92                 parentSplit[origSrc0->getTopDcl()].insert(itemToInsert);
93 
94                 numSplitIntrinsics++;
95             }
96         }
97     }
98 
99     // now check whether usage of child is correct
100     std::unordered_map<G4_Declare*, unsigned int> parentDefCount;
101     for (auto bb : kernel.fg)
102     {
103         for (auto inst :*bb)
104         {
105             auto dst = inst->getDst();
106 
107             if (dst && splitDcls.find(dst->getTopDcl()) != splitDcls.end())
108             {
109                 MUST_BE_TRUE(inst->isSplitIntrinsic(), "Found split dcl as dst in non-split intrinsic instruction");
110             }
111 
112             if (dst && parentSplit.find(dst->getTopDcl()) != parentSplit.find(dst->getTopDcl()))
113             {
114                 auto oldDefCount = parentDefCount[dst->getTopDcl()];
115                 parentDefCount[dst->getTopDcl()] = oldDefCount + 1;
116                 MUST_BE_TRUE(oldDefCount == 0, "Found second def of parent of split variable");
117             }
118 
119             for (unsigned int i = 0; i != G4_MAX_SRCS; i++)
120             {
121                 auto src = inst->getSrc(i);
122                 if (!src || !src->asSrcRegRegion())
123                     continue;
124 
125                 if (parentSplit.find(src->asSrcRegRegion()->getTopDcl()) != parentSplit.end())
126                 {
127                     MUST_BE_TRUE(inst->isSplitIntrinsic(), "Found src opnd using pre-split parent");
128                 }
129 
130                 if (splitDcls.find(src->asSrcRegRegion()->getTopDcl()) == splitDcls.end())
131                     continue; // not a split dcl
132 
133                 // src is a split dcl, verify its usage is consistent with pre-transformation data structure
134                 auto lb = src->getLeftBound();
135                 auto rb = src->getRightBound();
136 
137                 auto childData = getChildData(src->asSrcRegRegion()->getTopDcl());
138                 auto childLb = childData.first;
139                 //auto childRb = childData.second;
140 
141                 auto totalLb = childLb + lb;
142                 auto totalRb = childLb + rb;
143 
144                 auto origInstData = splitVerify[inst];
145                 auto origLb = origInstData.srcLb[i];
146                 auto origRb = origInstData.srcRb[i];
147 
148                 MUST_BE_TRUE(origLb == totalLb, "Mismatch in lb");
149                 MUST_BE_TRUE(origRb == totalRb, "Mismatch in rb");
150             }
151         }
152     }
153 
154     printf("Split verification passed successfully - %d split!\n", numSplitIntrinsics);
155 }
156 
verifyOverlap()157 void VarSplitPass::verifyOverlap()
158 {
159     // For defs, map GRF written with G4_Declare of dst rgn.
160     // Verify that src reg# used in intrinsic_split comes from
161     // parent dcl of the split dst.
162     std::unordered_map<unsigned int, G4_Declare*> regToDcl;
163     unsigned int numSplitLeft = 0;
164     for (auto bb : kernel.fg)
165     {
166         for (auto inst :*bb)
167         {
168             if (inst->isSplitIntrinsic())
169             {
170                 auto dst = inst->getDst();
171                 auto src = inst->getSrc(0);
172                 auto dstTopDcl = dst->getTopDcl();
173                 auto srcTopDcl = src->getTopDcl();
174                 MUST_BE_TRUE(dstTopDcl->getRegVar()->getPhyReg() &&
175                     dstTopDcl->getRegVar()->getPhyReg()->isGreg(), "Unexpected assignment condition on split dst");
176                 MUST_BE_TRUE(srcTopDcl->getRegVar()->getPhyReg() &&
177                     srcTopDcl->getRegVar()->getPhyReg()->isGreg(), "Unexpected assignment condition on split src");
178 
179                 auto dstLS = dst->getLinearizedStart();
180                 auto dstLE = dst->getLinearizedEnd();
181 
182                 auto srcLS = src->getLinearizedStart();
183                 auto srcLE = src->getLinearizedEnd();
184 
185                 if (dstLS == srcLS &&
186                     dstLE == srcLE)
187                 {
188                     // assignment is ok and will be coalesced away
189                     continue;
190                 }
191 
192                 numSplitLeft++;
193 
194                 auto srcDclRegNum = srcLS / numEltPerGRF<Type_UB>();
195                 auto srcDclNumRows = ((srcLE + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>()) - srcDclRegNum;
196 
197                 // check whether src GRF# is written by parent of dst split dcl
198                 for (unsigned int i = srcDclRegNum; i != (srcDclRegNum + srcDclNumRows); i++)
199                 {
200                     auto dcl = regToDcl[i];
201                     if (dcl != getParentDcl(dstTopDcl))
202                     {
203                         if (!dstTopDcl->getRegVar()->isRegVarTransient() &&
204                             !dstTopDcl->getRegVar()->isRegVarCoalesced())
205                         {
206                             MUST_BE_TRUE(false, "split src uses GRF value from non-parent");
207                         }
208                         else if (dstTopDcl->getRegVar()->isRegVarTransient())
209                         {
210                             // dst is spilled
211                             auto dstOrigDcl = dstTopDcl->getRegVar()->getNonTransientBaseRegVar()->getDeclare()->getRootDeclare();
212                             if (dcl != getParentDcl(dstOrigDcl))
213                             {
214                                 MUST_BE_TRUE(false, "split src uses GRF value from non-parent. dst spilled.");
215                             }
216                         }
217                         else
218                         {
219                             // do nothing for coalesced ranges
220                         }
221                     }
222                 }
223             }
224             else
225             {
226                 auto dstRgn = inst->getDst();
227                 if (dstRgn && dstRgn->getTopDcl() &&
228                     dstRgn->getTopDcl()->getRegVar()->getPhyReg() &&
229                     dstRgn->getTopDcl()->getRegVar()->getPhyReg()->isGreg())
230                 {
231                     auto grf = dstRgn->getTopDcl()->getRegVar()->getPhyReg()->asGreg()->getRegNum();
232                     auto numRows = (dstRgn->getLinearizedEnd() - dstRgn->getLinearizedStart() + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>();
233                     for (unsigned int i = grf; i != (grf + numRows); i++)
234                     {
235                         regToDcl[i] = dstRgn->getTopDcl();
236                     }
237                 }
238             }
239         }
240     }
241 
242     printf("Split assignment overlap passed successfully - %d splits left\n", numSplitLeft);
243 }
244 
run()245 void VarSplitPass::run()
246 {
247     if (kernel.getOption(vISA_VerifyExplicitSplit))
248         buildPreVerify();
249 
250     findSplitCandidates();
251 
252     split();
253 
254     if (kernel.getOption(vISA_VerifyExplicitSplit))
255         verify();
256 }
257 
findSplitCandidates()258 void VarSplitPass::findSplitCandidates()
259 {
260     auto canSplit = [](G4_INST* inst)
261     {
262         // Insert any new split candidates here
263         return (inst->isSend() &&
264             inst->getMsgDesc()->isSampler() &&
265             inst->getMsgDesc()->getDstLenRegs() > 2 &&
266             !inst->getDst()->getTopDcl()->getRegVar()->isRegVarTransient());
267     };
268 
269     // Find all dcls that can be split in to smaller chunks
270     for (auto bb : kernel.fg)
271     {
272         for (auto inst :*bb)
273         {
274             if (inst->getDst() && inst->getDst()->getTopDcl())
275             {
276                 auto dstDcl = inst->getDst()->getTopDcl();
277 
278                 if (dstDcl &&
279                     !dstDcl->getAddressed() &&
280                     dstDcl->getRegFile() == G4_RegFileKind::G4_GRF)
281                 {
282                     auto& prop = splitVars[dstDcl];
283                     prop.numDefs++;
284                     prop.def = std::make_pair(inst->getDst(), bb);
285                     if (canSplit(inst))
286                     {
287                         prop.candidateDef = true;
288                     }
289                 }
290             }
291 
292             for (unsigned int s = 0; s != G4_MAX_SRCS; s++)
293             {
294                 auto src = inst->getSrc(s);
295                 if (!src || !src->isSrcRegRegion() || !src->asSrcRegRegion()->getTopDcl())
296                     continue;
297 
298                 auto srcRgn = src->asSrcRegRegion();
299                 auto dcl = srcRgn->getTopDcl();
300 
301                 if (dcl->getNumRows() < 2)
302                     continue;
303 
304                 // It is possible that splitVars map doesnt have an entry for dcl
305                 // yet as in CFG, a use may appear lexically earlier than its def.
306                 auto& prop = splitVars[dcl];
307                 prop.srcs.push_back(std::make_pair(srcRgn,bb));
308             }
309         }
310     }
311 
312     // Now filter out real candidates
313     for (auto itemIt = splitVars.begin(); itemIt != splitVars.end(); itemIt++)
314     {
315         auto& item = (*itemIt);
316         if (item.second.numDefs != 1 || !item.second.candidateDef ||
317             !item.second.isDefUsesInSameBB())
318         {
319             item.second.legitCandidate = false;
320             continue;
321         }
322 
323         // Check whether each src operand is independent and compute size
324         for (auto& srcpair : item.second.srcs)
325         {
326             auto src = srcpair.first;
327             auto numRows = (src->getRightBound() - src->getLeftBound() + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>();
328             auto regOff = src->getRegOff();
329 
330             if (item.first->getByteSize() < src->getRightBound())
331             {
332                 item.second.legitCandidate = false;
333                 break;
334             }
335 
336             if (numRows == 1)
337             {
338                 if (item.second.ag == VarProperties::AccessGranularity::Unknown)
339                     item.second.ag = VarProperties::AccessGranularity::OneGrf;
340                 else if (item.second.ag == VarProperties::AccessGranularity::TwoGrf)
341                 {
342                     item.second.legitCandidate = false;
343                     break;
344                 }
345             }
346             else if (numRows == 2)
347             {
348                 if (item.second.ag == VarProperties::AccessGranularity::Unknown)
349                     item.second.ag = VarProperties::AccessGranularity::TwoGrf;
350                 else if (item.second.ag == VarProperties::AccessGranularity::OneGrf)
351                 {
352                     item.second.legitCandidate = false;
353                     break;
354                 }
355                 else if (regOff % 2 != 0)
356                 {
357                     item.second.legitCandidate = false;
358                     break;
359                 }
360             }
361             else if (numRows > 2)
362             {
363                 // use as send src
364                 item.second.legitCandidate = false;
365                 break;
366             }
367         }
368     }
369 
370     for (auto itemIt = splitVars.begin(); itemIt != splitVars.end();)
371     {
372         auto item = (*itemIt);
373         if (!item.second.legitCandidate ||
374             item.second.ag == VarProperties::AccessGranularity::Unknown ||
375             item.second.numDefs > 1)
376         {
377             itemIt = splitVars.erase(itemIt);
378             continue;
379         }
380         itemIt++;
381     }
382 
383     // Apply split cost heuristic
384     std::unordered_map<G4_INST*, unsigned int> instId;
385     for (auto bb : kernel.fg)
386     {
387         for (auto inst :*bb)
388         {
389             instId[inst] = instId.size();
390         }
391     }
392     for (auto itemIt = splitVars.begin(); itemIt != splitVars.end();)
393     {
394         auto item = (*itemIt);
395 
396         if (item.second.srcs.size() > 0)
397         {
398             // Dont emit split if all uses are closeby
399             unsigned int idx = instId[item.second.srcs.front().first->getInst()];
400             bool split = true;
401             if (item.second.srcs.size() > 1)
402             {
403                 for (auto src : item.second.srcs)
404                 {
405                     if ((instId[src.first->getInst()] - idx) < 2)
406                     {
407                         split = false;
408                     }
409                     else
410                     {
411                         split = true;
412                         break;
413                     }
414                     idx = instId[src.first->getInst()];
415                 }
416             }
417             else
418             {
419                 if ((idx - instId[item.second.def.first->getInst()]) < 2)
420                 {
421                     split = false;
422                 }
423             }
424 
425             // dont split if def-last first use distance <= 8
426             if (split &&
427                 (instId[item.second.srcs.back().first->getInst()] - instId[item.second.def.first->getInst()]) <= 8)
428                 split = false;
429 
430             if (!split)
431             {
432                 itemIt = splitVars.erase(itemIt);
433                 continue;
434             }
435         }
436         ++itemIt;
437     }
438 
439     // Each entry in splitVars map is a split candidate
440 }
441 
split()442 void VarSplitPass::split()
443 {
444     auto getIter = [](G4_INST* inst, G4_BB* bb)
445     {
446         for (auto iter = bb->begin(); iter != bb->end(); iter++)
447         {
448             if (*iter == inst)
449                 return iter;
450         }
451         return bb->end();
452     };
453 
454 #ifdef DEBUG_VERBOSE_ON
455     unsigned int numIntrinsicsInserted = 0;
456 #endif
457     // Do actual splitting
458     for(unsigned int i = 0; i != kernel.Declares.size(); ++i)
459     {
460         auto curDcl = kernel.Declares[i];
461         auto isCandidate = splitVars.find(curDcl);
462         if (isCandidate == splitVars.end())
463             continue;
464 
465         auto item = *isCandidate;
466 
467         MUST_BE_TRUE(item.second.legitCandidate, "Cannot split non-candidate");
468 
469         // Insert intrinsics
470         auto it = getIter(item.second.def.first->getInst(), item.second.def.second);
471         MUST_BE_TRUE(it != item.second.def.second->end(), "Instruction not found in BB");
472         it++;
473 
474         auto dstDcl = item.second.def.first->getTopDcl();
475         unsigned int numIntrinInsts = 0;
476         if (item.second.ag == VarProperties::AccessGranularity::OneGrf)
477             numIntrinInsts = dstDcl->getNumRows();
478         else if (item.second.ag == VarProperties::AccessGranularity::TwoGrf)
479             numIntrinInsts = dstDcl->getNumRows() / 2;
480         else
481             MUST_BE_TRUE(false, "unsupported access granularity");
482 
483         // lb, rb, split dcl
484         std::vector<std::tuple<unsigned int, unsigned int, G4_Declare*>> splitDcls;
485         for (unsigned int i = 0; i != numIntrinInsts; i++)
486         {
487             unsigned int numRows = item.second.ag == VarProperties::AccessGranularity::TwoGrf ? 2 : 1;
488             unsigned int lb = getGRFSize() * i*numRows;
489             unsigned int rb = lb + (getGRFSize()*numRows)-1;
490 
491             auto name = kernel.fg.builder->getNameString(kernel.fg.mem, 50, "%s_%d_%d_%d", dstDcl->getName(), i, lb, rb);
492             auto splitDcl = kernel.fg.builder->createDeclareNoLookup(name,
493                 G4_RegFileKind::G4_GRF, numEltPerGRF<Type_UD>(), numRows, Type_UD);
494             splitParentDcl.insert(std::make_pair(splitDcl, dstDcl));
495             splitChildren[dstDcl].push_back(splitDcl);
496 
497             // If this part of dcl is never used in code, then dont create split intrinsic inst for it
498             if (!item.second.isPartDclUsed(lb, rb))
499             {
500                 unusedDcls.insert(splitDcl);
501                 continue;
502             }
503 
504             auto dstRgn = kernel.fg.builder->createDstRegRegion(splitDcl, 1);
505             auto srcRgn = kernel.fg.builder->createSrc(dstDcl->getRegVar(),
506                 item.second.def.first->getRegOff() + (i * numRows), item.second.def.first->getSubRegOff(),
507                 kernel.fg.builder->getRegionStride1(), Type_UD);
508             G4_ExecSize execSize {(getGRFSize() / TypeSize(Type_UD)) * numRows};
509             auto intrin = kernel.fg.builder->createIntrinsicInst(
510                 nullptr, Intrinsic::Split, execSize, dstRgn, srcRgn, nullptr, nullptr,
511                 item.second.def.first->getInst()->getOption() | G4_InstOption::InstOpt_WriteEnable, true);
512             intrin->inheritDIFrom(item.second.def.first->getInst());
513             item.second.def.second->insertBefore(it, intrin);
514             splitDcls.push_back(std::make_tuple(lb, rb, splitDcl));
515             IRchanged = true;
516 #ifdef DEBUG_VERBOSE_ON
517             numIntrinsicsInserted++;
518 #endif
519         }
520 
521         auto getSplitDcl = [&splitDcls](unsigned int lb, unsigned int rb)
522         {
523             for (auto& item : splitDcls)
524             {
525                 if (std::get<0>(item) <= lb &&
526                     std::get<1>(item) >= rb)
527                     return std::tuple(std::get<0>(item),
528                         std::get<1>(item), std::get<2>(item));
529             }
530             return std::tuple(0u,0u,(G4_Declare*)nullptr);
531         };
532 
533         auto getOpndNum = [](G4_SrcRegRegion* src)
534         {
535             auto inst = src->getInst();
536             for (unsigned int i = 0; i != G4_MAX_SRCS; i++)
537             {
538                 if (inst->getSrc(i) == src)
539                     return i;
540             }
541             MUST_BE_TRUE(false, "src operand not found");
542             return 0xffffffff;
543         };
544 
545         // Replace all src operands
546         for (auto& s : item.second.srcs)
547         {
548             auto srcRgn = s.first;
549 
550             auto lb = srcRgn->getLeftBound();
551             auto rb = srcRgn->getRightBound();
552 
553             auto item = getSplitDcl(lb, rb);
554             auto item_lb = std::get<0>(item);
555             auto dcl = std::get<2>(item);
556             MUST_BE_TRUE(dcl, "Didnt find split dcl");
557 
558             unsigned int regNum = (lb - item_lb)/numEltPerGRF<Type_UB>();
559 
560             auto newSrc = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(), srcRgn->getRegAccess(), dcl->getRegVar(), regNum, srcRgn->getSubRegOff(), srcRgn->getRegion(), srcRgn->getType());
561             auto opndNum = getOpndNum(srcRgn);
562             auto tup = std::make_tuple(srcRgn->getInst(), srcRgn, opndNum);
563             preSplit.insert(std::make_pair(newSrc, tup));
564             srcRgn->getInst()->setSrc(newSrc, opndNum);
565         }
566     }
567 
568     DEBUG_VERBOSE("Number of split intrinsincs inserted " << numIntrinsicsInserted << std::endl);
569 }
570 
replaceIntrinsics()571 void VarSplitPass::replaceIntrinsics()
572 {
573 #ifdef DEBUG_VERBOSE_ON
574     unsigned int numSplitMovs = 0;
575 #endif
576 
577     if (kernel.getOption(vISA_VerifyExplicitSplit))
578     {
579         // Check whether intrinsic assignments overlap with other
580         // rows of original variable
581         verifyOverlap();
582     }
583 
584     // Replace intrinsic.split with mov
585     for (auto bb : kernel.fg)
586     {
587         for (auto inst :*bb)
588         {
589             if (inst->isSplitIntrinsic())
590             {
591                 inst->setOpcode(G4_mov);
592 #ifdef DEBUG_VERBOSE_ON
593                 if (inst->getDst()->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum() !=
594                     inst->getSrc(0)->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum())
595                     numSplitMovs++;
596 #endif
597             }
598         }
599     }
600 
601     DEBUG_VERBOSE("Number of split movs left behind " << numSplitMovs << std::endl);
602 }
603 
getParentDcl(G4_Declare * splitDcl)604 G4_Declare* VarSplitPass::getParentDcl(G4_Declare* splitDcl)
605 {
606     auto it = splitParentDcl.find(splitDcl);
607     if (it != splitParentDcl.end())
608         return (*it).second;
609     return nullptr;
610 }
611 
getChildren(G4_Declare * parent)612 std::vector<G4_Declare*>* VarSplitPass::getChildren(G4_Declare* parent)
613 {
614     auto it = splitChildren.find(parent);
615     if (it == splitChildren.end())
616         return nullptr;
617     return &(*it).second;
618 }
619 
getSiblings(G4_Declare * s)620 std::vector<G4_Declare*> VarSplitPass::getSiblings(G4_Declare* s)
621 {
622     // Return all siblings of s, excluding s
623     std::vector<G4_Declare*> siblings;
624 
625     // First lookup parent
626     auto parentDcl = getParentDcl(s);
627     if (!parentDcl)
628         return siblings;
629 
630     // Lookup all children of parent
631     auto children = getChildren(parentDcl);
632     if (!children)
633         return siblings;
634 
635     for (auto item : *children)
636     {
637         if (item != s)
638             siblings.push_back(item);
639     }
640 
641     return siblings;
642 }
643 
isSplitDcl(G4_Declare * d)644 bool VarSplitPass::isSplitDcl(G4_Declare* d)
645 {
646     auto it = splitChildren.find(d);
647     if (it == splitChildren.end())
648         return false;
649     return true;
650 }
651 
isPartialDcl(G4_Declare * d)652 bool VarSplitPass::isPartialDcl(G4_Declare* d)
653 {
654     auto it = splitParentDcl.find(d);
655     if (it == splitParentDcl.end())
656         return false;
657     return true;
658 }
659 
getSiblingNum(G4_Declare * d)660 unsigned int VarSplitPass::getSiblingNum(G4_Declare* d)
661 {
662     // This function must be invoked assuming d is a partial dcl.
663     // Otherwise behavior is unpredictable.
664     if (!isPartialDcl(d))
665         return 0;
666 
667     unsigned int siblingNum = 0;
668     // First lookup parent
669     auto parentDcl = getParentDcl(d);
670 
671     // Lookup all children of parent
672     auto children = getChildren(parentDcl);
673 
674     for (auto s : *children)
675     {
676         if (s == d)
677             break;
678         siblingNum++;
679     }
680     return siblingNum;
681 }
682 
getIdealAllocation(G4_Declare * dcl,LiveRange ** lrs)683 unsigned int VarSplitPass::getIdealAllocation(G4_Declare* dcl, LiveRange** lrs)
684 {
685     // This function is invoked when assigning GRFs to parent.
686     unsigned int idealGRF = 0;
687     if (isSplitDcl(dcl))
688     {
689         // <ideal reg num for parent, num children with this assignment for coalescing>
690         std::unordered_map<unsigned int, unsigned int> idealParentReg;
691         // This is parent dcl
692         auto children = getChildren(dcl);
693         // Try coalescing with first child that is allocated
694         unsigned int numRowsOffset = 0;
695         for (auto child : *children)
696         {
697             if (child->getRegVar()->isRegAllocPartaker())
698             {
699                 auto id = child->getRegVar()->getId();
700                 auto lr = lrs[id];
701                 auto phyReg = lr->getPhyReg();
702                 if (phyReg && !isChildDclUnused(child))
703                 {
704                     auto regNum = phyReg->asGreg()->getRegNum();
705                     if (regNum >= numRowsOffset)
706                     {
707                         idealGRF = regNum - numRowsOffset;
708                         idealParentReg[idealGRF] = idealParentReg[idealGRF] + 1;
709                     }
710                 }
711             }
712             numRowsOffset += child->getNumRows();
713         }
714         std::pair<unsigned int, unsigned int> state;
715         if (idealParentReg.size() > 0)
716         {
717             state.first = idealParentReg.begin()->first;
718             state.second = idealParentReg.begin()->second;
719         }
720 
721         for (auto arg : idealParentReg)
722         {
723             if (arg.second > state.second)
724             {
725                 state.first = arg.first;
726                 state.second = arg.second;
727             }
728         }
729         idealGRF = state.first;
730     }
731 
732     idealGRF = idealGRF >= kernel.getNumRegTotal() ? 0 : idealGRF;
733 
734     return idealGRF;
735 }
736 
isChildDclUnused(G4_Declare * dcl)737 bool VarSplitPass::isChildDclUnused(G4_Declare* dcl)
738 {
739     auto it = unusedDcls.find(dcl);
740     if (it == unusedDcls.end())
741         return false;
742     return true;
743 }
744 
writeHints(G4_Declare * dcl,LiveRange ** lrs)745 void VarSplitPass::writeHints(G4_Declare* dcl, LiveRange** lrs)
746 {
747     bool isParent = isSplitDcl(dcl);
748     bool isChild = isPartialDcl(dcl);
749 
750     // If parent, then set hints in all children
751     if (isParent)
752     {
753         auto regNum = lrs[dcl->getRegVar()->getId()]->getPhyReg()->asGreg()->getRegNum();
754         auto children = getChildren(dcl);
755         unsigned int i = 0;
756         for (auto child : *children)
757         {
758             if (child->getRegVar()->isRegAllocPartaker())
759             {
760                 auto id = child->getRegVar()->getId();
761                 lrs[id]->setAllocHint(regNum + (i * child->getNumRows()));
762             }
763             i++;
764         }
765     }
766     else if (isChild)
767     {
768         // Write hint to parent if its empty
769         if (auto parentDcl = getParentDcl(dcl))
770         {
771             auto idealParentGRF = getIdealAllocation(parentDcl, lrs);
772 
773             if (idealParentGRF != 0 && parentDcl->getRegVar()->isRegAllocPartaker())
774             {
775                 lrs[parentDcl->getRegVar()->getId()]->setAllocHint(idealParentGRF);
776                 auto children = getChildren(getParentDcl(dcl));
777                 unsigned int i = 0;
778                 for (auto child : *children)
779                 {
780                     if (child->getRegVar()->isRegAllocPartaker())
781                     {
782                         lrs[child->getRegVar()->getId()]->setAllocHint(idealParentGRF + (i * child->getNumRows()));
783                     }
784                     i++;
785                 }
786             }
787         }
788     }
789 }
790 
undo(G4_Declare * parentDcl)791 void VarSplitPass::undo(G4_Declare* parentDcl)
792 {
793     // Undo split for parentDcl
794 
795     if (!isSplitDcl(parentDcl))
796         return;
797 
798     auto children = getChildren(parentDcl);
799 
800     // Skip doing anything if any child was spilled
801     for (auto child : *children)
802     {
803         if (child->isSpilled())
804             return;
805     }
806 
807     for (auto& split : preSplit)
808     {
809         auto srcRgn = split.first;
810 
811         if (!srcRgn->getTopDcl() ||
812             getParentDcl(srcRgn->getTopDcl()) != parentDcl)
813             continue;
814 
815         // parentDcl is parent of split
816         auto inst = std::get<0>(split.second);
817         auto opnd = std::get<1>(split.second);
818         auto srcNum = std::get<2>(split.second);
819 
820         inst->setSrc(opnd, srcNum);
821     }
822 
823     // remove explicit variable split instructions
824     for (auto child : *children)
825     {
826         unusedDcls.erase(child);
827     }
828 
829     for (auto bb : kernel.fg)
830     {
831         for (auto instIt = bb->begin(); instIt != bb->end();)
832         {
833             auto inst = (*instIt);
834             if (inst->isSplitIntrinsic() &&
835                 inst->getSrc(0)->getTopDcl() == parentDcl)
836             {
837                 instIt = bb->erase(instIt);
838                 continue;
839             }
840             ++instIt;
841         }
842     }
843 
844     // fix other data structures
845     for (auto childIt = splitParentDcl.begin(); childIt != splitParentDcl.end();)
846     {
847         auto child = (*childIt);
848         if (child.second == parentDcl)
849         {
850             childIt = splitParentDcl.erase(childIt);
851             continue;
852         }
853         ++childIt;
854     }
855 
856     splitChildren.erase(parentDcl);
857 }
858 
reallocParent(G4_Declare * child,LiveRange ** lrs)859 bool VarSplitPass::reallocParent(G4_Declare* child, LiveRange** lrs)
860 {
861     // Given a child, lookup all siblings. If all children
862     // are assigned consecutive GRFs but parent isnt then
863     // return true.
864     bool ret = true;
865 
866     if (!child->getRegVar()->isRegAllocPartaker())
867         return false;
868 
869     auto parent = getParentDcl(child);
870 
871     if (parent == nullptr)
872         return false;
873 
874     auto children = getChildren(parent);
875 
876     auto baseRegNum = 0;
877     const unsigned int UnInit = 0xffffffff;
878     unsigned int nextRegNumExpected = UnInit;
879     for (auto child : *children)
880     {
881         if (!child->getRegVar()->isRegAllocPartaker() ||
882             isChildDclUnused(child))
883         {
884             if (nextRegNumExpected != UnInit)
885             {
886                 nextRegNumExpected += child->getNumRows();
887             }
888             continue;
889         }
890 
891         auto id = child->getRegVar()->getId();
892         auto phyReg = lrs[id]->getPhyReg();
893         if (phyReg)
894         {
895             if (nextRegNumExpected == UnInit)
896             {
897                 baseRegNum = phyReg->asGreg()->getRegNum();
898                 nextRegNumExpected = baseRegNum + (child->getByteSize() / numEltPerGRF<Type_UB>());
899                 continue;
900             }
901             else if (nextRegNumExpected != phyReg->asGreg()->getRegNum())
902             {
903                 return false;
904             }
905         }
906         else
907         {
908             return false;
909         }
910         nextRegNumExpected += child->getNumRows();
911     }
912 
913     G4_VarBase* parentPhyReg = nullptr;
914 
915     if (parent->getRegVar()->isRegAllocPartaker())
916         parentPhyReg = lrs[parent->getRegVar()->getId()]->getPhyReg();
917 
918     if (!parentPhyReg)
919         return false;
920 
921     if (parentPhyReg->asGreg()->getRegNum() == baseRegNum)
922         return false;
923 
924     return ret;
925 }
926 
isParentChildRelation(G4_Declare * parent,G4_Declare * child)927 bool VarSplitPass::isParentChildRelation(G4_Declare* parent, G4_Declare* child)
928 {
929     if (!isSplitDcl(parent))
930         return false;
931 
932     auto children = getChildren(parent);
933 
934     for (auto c : *children)
935     {
936         if (c == child)
937             return true;
938     }
939 
940     return false;
941 }
isSplitVarLocal(G4_Declare * dcl)942 bool VarSplitPass::isSplitVarLocal(G4_Declare* dcl)
943 {
944     auto it = splitVars.find(dcl);
945     MUST_BE_TRUE(it != splitVars.end(), "Not a split dcl");
946 
947     return (*it).second.isDefUsesInSameBB();
948 }
949 
LoopVarSplit(G4_Kernel & k,GraphColor * c,RPE * r)950 LoopVarSplit::LoopVarSplit(G4_Kernel& k, GraphColor* c, RPE* r) : kernel(k), coloring(c), rpe(r), references(k)
951 {
952     for (auto spill : coloring->getSpilledLiveRanges())
953     {
954         spilledDclSet.insert(spill->getDcl());
955     }
956     auto numVars = coloring->getNumVars();
957     auto lrs = coloring->getLiveRanges();
958     for (unsigned int i = 0; i != numVars; ++i)
959     {
960         auto rootDcl = lrs[i]->getDcl();
961         dclSpillCost[rootDcl] = lrs[i]->getSpillCost();
962     }
963 }
964 
run()965 void LoopVarSplit::run()
966 {
967     // 1. consider list of spilled variables sorted in
968     //       descending order of spill cost
969     // 2. iterate over each spilled variable from sorted list
970     //    a. run cost heuristic to decide if split makes sense
971     //    b. if decision is to split, then check which loop(s) to split around
972 
973     std::list<std::pair<G4_Declare*, float>> sortedDcls;
974     for (auto item : dclSpillCost)
975     {
976         if(spilledDclSet.find(item.first) !=spilledDclSet.end())
977             sortedDcls.push_back(std::pair(item.first, item.second));
978     }
979 
980     auto SpillCostDesc = [](const std::pair<G4_Declare*, float>& first,
981         const std::pair<G4_Declare*, float>& second)
982     {
983         return first.second > second.second;
984     };
985 
986     sortedDcls.sort(SpillCostDesc);
987 
988     // TODO: Iterating sequence based on spill cost may not be very accurate.
989     // Thats because long living variables typically have lower spill cost.
990     // But they may be referenced in some nested loop multiple times. In such
991     // cases, it is beneficial to split the variable around such loops. To
992     // accommodate such cases, we should be looking at reference count of
993     // variables in loops and iterate in an order based on that.
994     for (auto var : sortedDcls)
995     {
996         const auto& loops = getLoopsToSplitAround(var.first);
997         for (auto& loop : loops)
998         {
999             split(var.first, *loop);
1000         }
1001     }
1002 }
1003 
getReads(G4_Declare * dcl,Loop & loop)1004 std::vector<G4_SrcRegRegion*> LoopVarSplit::getReads(G4_Declare* dcl, Loop& loop)
1005 {
1006     std::vector<G4_SrcRegRegion*> reads;
1007 
1008     const auto& uses = references.getUses(dcl);
1009     for (auto use : *uses)
1010     {
1011         auto bb = std::get<1>(use);
1012         if (loop.contains(bb))
1013         {
1014             auto inst = std::get<0>(use);
1015             for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
1016             {
1017                 auto opnd = inst->getSrc(i);
1018                 if (!opnd || !opnd->isSrcRegRegion())
1019                     continue;
1020                 auto topdcl = opnd->asSrcRegRegion()->getTopDcl();
1021                 if (topdcl == dcl)
1022                     reads.push_back(opnd->asSrcRegRegion());
1023             }
1024         }
1025     }
1026 
1027     return reads;
1028 }
1029 
getWrites(G4_Declare * dcl,Loop & loop)1030 std::vector<G4_DstRegRegion*> LoopVarSplit::getWrites(G4_Declare* dcl, Loop& loop)
1031 {
1032     std::vector<G4_DstRegRegion*> writes;
1033 
1034     const auto& defs = references.getDefs(dcl);
1035     for (auto def : *defs)
1036     {
1037         auto bb = std::get<1>(def);
1038         if (loop.contains(bb))
1039         {
1040             auto dst = std::get<0>(def)->getDst();
1041             writes.push_back(dst);
1042         }
1043     }
1044 
1045     return writes;
1046 }
1047 
getMaxRegPressureInLoop(Loop & loop)1048 unsigned int LoopVarSplit::getMaxRegPressureInLoop(Loop& loop)
1049 {
1050     auto it = maxRegPressureCache.find(&loop);
1051     if (it != maxRegPressureCache.end())
1052         return it->second;
1053 
1054     unsigned int maxRPE = 0;
1055     auto& bbs = loop.getBBs();
1056 
1057     for (auto bb : bbs)
1058     {
1059         for (auto inst : bb->getInstList())
1060         {
1061             maxRPE = std::max(maxRPE, rpe->getRegisterPressure(inst));
1062         }
1063     }
1064 
1065     maxRegPressureCache[&loop] = maxRPE;
1066 
1067     return maxRPE;
1068 }
1069 
dump(std::ostream & of)1070 void LoopVarSplit::dump(std::ostream& of)
1071 {
1072     for (auto dcl : splitResults)
1073     {
1074         of << "Spilled dcl: " << dcl.first->getName() << std::endl;
1075         for (auto split : dcl.second)
1076         {
1077             of << "\tSplit dcl: " << split.first->getName() << ", for loop L" << split.second->id << std::endl;
1078         }
1079         of << std::endl;
1080     }
1081 }
1082 
removeSplitInsts(GlobalRA * gra,G4_Declare * spillDcl,G4_BB * bb)1083 void LoopVarSplit::removeSplitInsts(GlobalRA* gra, G4_Declare* spillDcl, G4_BB* bb)
1084 {
1085     auto it = gra->splitResults.find(spillDcl);
1086 
1087     if (it == gra->splitResults.end())
1088         return;
1089 
1090     auto& bbInstsToRemove = (*it).second.insts;
1091     for (auto entry : bbInstsToRemove)
1092     {
1093         auto currBB = entry.first;
1094 
1095         if (currBB == bb)
1096         {
1097             auto& instsToRemoveFromBB = entry.second;
1098             for (auto instIt = bb->begin(); instIt != bb->end();)
1099             {
1100                 auto inst = (*instIt);
1101                 if (instsToRemoveFromBB.find(inst) == instsToRemoveFromBB.end())
1102                 {
1103                     ++instIt;
1104                     continue;
1105                 }
1106 
1107                 instIt = bb->erase(instIt);
1108             }
1109         }
1110     }
1111 }
1112 
removeFromPreheader(GlobalRA * gra,G4_Declare * spillDcl,G4_BB * bb,INST_LIST_ITER filledInstIter)1113 bool LoopVarSplit::removeFromPreheader(GlobalRA* gra, G4_Declare* spillDcl, G4_BB* bb, INST_LIST_ITER filledInstIter)
1114 {
1115     auto it = gra->splitResults.find(spillDcl);
1116     if (it != gra->splitResults.end() &&
1117         (*it).second.insts.find(bb) != (*it).second.insts.end())
1118     {
1119         auto inst = *filledInstIter;
1120         if (inst->isRawMov())
1121         {
1122             auto dstTopDcl = inst->getDst()->getTopDcl();
1123             auto it = gra->splitResults.find(dstTopDcl);
1124             if (it == gra->splitResults.end())
1125                 return false;
1126 
1127             removeSplitInsts(gra, spillDcl, bb);
1128             return true;
1129         }
1130     }
1131     return false;
1132 }
1133 
removeFromLoopExit(GlobalRA * gra,G4_Declare * spillDcl,G4_BB * bb,INST_LIST_ITER filledInstIter)1134 bool LoopVarSplit::removeFromLoopExit(GlobalRA* gra, G4_Declare* spillDcl, G4_BB* bb, INST_LIST_ITER filledInstIter)
1135 {
1136     return removeFromPreheader(gra, spillDcl, bb, filledInstIter);
1137 }
1138 
getSplitInsts(GlobalRA * gra,G4_BB * bb)1139 const std::unordered_set<G4_INST*> LoopVarSplit::getSplitInsts(GlobalRA* gra, G4_BB* bb)
1140 {
1141     std::unordered_set<G4_INST*> ret;
1142 
1143     for (auto& splitVar : gra->splitResults)
1144         for (auto& insts : splitVar.second.insts)
1145             if (insts.first == bb)
1146             {
1147                 for (auto item : insts.second)
1148                     ret.insert(item);
1149             }
1150 
1151     return ret;
1152 }
1153 
split(G4_Declare * dcl,Loop & loop)1154 bool LoopVarSplit::split(G4_Declare* dcl, Loop& loop)
1155 {
1156     // Split dcl in given loop. Return true if split was successful.
1157     // It is assumed that dcl is spilled. This method inserts a
1158     // copy in preheader of loop. Dst of this copy is a new tmp and
1159     // source is dcl. All uses/defs of dcl in the loop are replaced
1160     // with the tmp. If dcl is ever written in the loop, a copy from
1161     // tmp to dcl is inserted in loop exit bb.
1162     //
1163     // This transformation requires a valid preheader be present for
1164     // loop. If dcl is written in the loop then a valid exit bb is also
1165     // needed.
1166     if (!loop.preHeader)
1167         return false;
1168 
1169     const auto& dsts = getWrites(dcl, loop);
1170     if (dsts.size() > 0)
1171     {
1172         // evaluate if it makes sense to insert the copy if loop has
1173         // multiple exits.
1174         if (loop.getLoopExits().size() != 1)
1175             return false;
1176         // TODO: Handle creation of loop preheader and loop exit in
1177         // presence of SIMD CF. this requires changing JIP, UIP
1178         // in source goto instructions and fix (if) any data structures
1179         // in VISA that rely on those JIP/UIP.
1180         if (!loop.preHeader->dominates(loop.getLoopExits().front()))
1181             return false;
1182     }
1183 
1184     const auto& srcs = getReads(dcl, loop);
1185 
1186     auto splitDcl = kernel.fg.builder->createTempVar(dcl->getTotalElems(),
1187         dcl->getElemType(), coloring->getGRA().getSubRegAlign(dcl),
1188         "LOOPSPLIT", true);
1189 
1190     auto& splitData = coloring->getGRA().splitResults[splitDcl];
1191     splitData.origDcl = dcl;
1192 
1193     // emit TMP = dcl in preheader
1194     copy(loop.preHeader, splitDcl, dcl, &splitData);
1195 
1196     // emit dcl = TMP in loop exit
1197     if (dsts.size() > 0)
1198     {
1199         copy(loop.getLoopExits().front(), dcl, splitDcl, &splitData, false);
1200     }
1201 
1202     // replace all occurences of dcl in loop with TMP
1203     for (auto src : srcs)
1204         replaceSrc(src, splitDcl);
1205 
1206     for (auto dst : dsts)
1207         replaceDst(dst, splitDcl);
1208 
1209     splitResults[dcl].push_back(std::make_pair(splitDcl, &loop));
1210 
1211     return true;
1212 }
1213 
copy(G4_BB * bb,G4_Declare * dst,G4_Declare * src,SplitResults * splitData,bool pushBack)1214 void LoopVarSplit::copy(G4_BB* bb, G4_Declare* dst, G4_Declare* src, SplitResults* splitData, bool pushBack)
1215 {
1216     // create mov instruction to copy dst->src
1217     // multiple mov instructions may be created depending on size of dcls
1218     // all mov instructions are appended to inst list of bb
1219     // when pushBack argument = true, append to BB (happens in pre-header)
1220     // when pushBack argument = false, insert in bb after label (happens at exit bb)
1221 
1222     dst = dst->getRootDeclare();
1223     src = src->getRootDeclare();
1224     unsigned int numRows = dst->getNumRows();
1225     unsigned int bytesRemaining = dst->getByteSize();
1226 
1227     auto insertCopy = [&](G4_INST* inst)
1228     {
1229         if (pushBack || bb->size() == 0)
1230         {
1231             bb->push_back(inst);
1232             splitData->insts[bb].insert(inst);
1233         }
1234         else
1235         {
1236             // insert immediately after label instruction, if one exists
1237             for (auto it = bb->begin(); it != bb->end(); ++it)
1238             {
1239                 auto inst = (*it);
1240                 if (inst->isLabel())
1241                     continue;
1242                 bb->insertAfter(it, inst);
1243                 splitData->insts[bb].insert(inst);
1244                 break;
1245             }
1246         }
1247     };
1248 
1249     // first copy full GRF rows
1250     if (numRows > 1 || (dst->getTotalElems() * dst->getElemSize() == getGRFSize()))
1251     {
1252         // dcls are GRF sized so emit max SIMD size possible and copy 2 rows at
1253         // a time
1254         for (unsigned int i = 0; i < numRows; i++)
1255         {
1256             const RegionDesc* rd = kernel.fg.builder->getRegionStride1();
1257             G4_ExecSize execSize{ numEltPerGRF<Type_UD>() };
1258 
1259             if ((i + 1) < numRows)
1260                 execSize = G4_ExecSize(numEltPerGRF<Type_UD>() * 2);
1261 
1262             auto dstRgn = kernel.fg.builder->createDst(dst->getRegVar(), (short)i, 0, 1, Type_F);
1263             auto srcRgn = kernel.fg.builder->createSrc(src->getRegVar(), (short)i, 0, rd, Type_F);
1264             auto inst = kernel.fg.builder->createMov(execSize, dstRgn, srcRgn, InstOpt_WriteEnable, false);
1265 
1266             insertCopy(inst);
1267             bytesRemaining -= (execSize.value * G4_Type_Table[Type_F].byteSize);
1268 
1269             if (bytesRemaining < numEltPerGRF<Type_UB>())
1270                 break;
1271         }
1272     }
1273 
1274     while (bytesRemaining > 0)
1275     {
1276         G4_Type type = Type_W;
1277         G4_ExecSize execSize = g4::SIMD16;
1278 
1279         if (bytesRemaining >= 16)
1280             execSize = g4::SIMD8;
1281         else if (bytesRemaining >= 8 && bytesRemaining < 16)
1282             execSize = g4::SIMD4;
1283         else if (bytesRemaining >= 4 && bytesRemaining < 8)
1284             execSize = g4::SIMD2;
1285         else if (bytesRemaining >= 2 && bytesRemaining < 4)
1286             execSize = g4::SIMD1;
1287         else if (bytesRemaining == 1)
1288         {
1289             // If a region has odd number of bytes, copy last byte in final iteration
1290             execSize = g4::SIMD1;
1291             type = Type_UB;
1292         }
1293         else
1294         {
1295             MUST_BE_TRUE(false, "Unexpected condition");
1296         }
1297 
1298         const RegionDesc* rd = kernel.fg.builder->getRegionStride1();
1299 
1300         unsigned int row = (dst->getByteSize() - bytesRemaining) / numEltPerGRF<Type_UB>();
1301         unsigned int col = (dst->getByteSize() - bytesRemaining) % numEltPerGRF<Type_UB>();
1302         if (G4_Type_Table[type].byteSize > 1)
1303         {
1304             MUST_BE_TRUE(col % 2 == 0, "Unexpected condition");
1305             col /= 2;
1306         }
1307 
1308         G4_DstRegRegion* dstRgn = kernel.fg.builder->createDst(dst->getRegVar(), row, col, 1, type);
1309         G4_SrcRegRegion* srcRgn = kernel.fg.builder->createSrc(src->getRegVar(), row, col, rd, type);
1310 
1311         auto inst = kernel.fg.builder->createMov(execSize, dstRgn, srcRgn, InstOpt_WriteEnable, false);
1312 
1313         insertCopy(inst);
1314 
1315         bytesRemaining -= (execSize.value * G4_Type_Table[type].byteSize);
1316     };
1317 }
1318 
replaceSrc(G4_SrcRegRegion * src,G4_Declare * dcl)1319 void LoopVarSplit::replaceSrc(G4_SrcRegRegion* src, G4_Declare* dcl)
1320 {
1321     auto srcDcl = src->getBase()->asRegVar()->getDeclare();
1322     dcl = getNewDcl(srcDcl, dcl);
1323 
1324     auto newSrcRgn = kernel.fg.builder->createSrc(dcl->getRegVar(), src->getRegOff(),
1325         src->getSubRegOff(), src->getRegion(), src->getType(), src->getAccRegSel());
1326 
1327     auto inst = src->getInst();
1328     for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
1329     {
1330         if (inst->getSrc(i) == src)
1331         {
1332             inst->setSrc(newSrcRgn, i);
1333             break;
1334         }
1335     }
1336 }
1337 
replaceDst(G4_DstRegRegion * dst,G4_Declare * dcl)1338 void LoopVarSplit::replaceDst(G4_DstRegRegion* dst, G4_Declare* dcl)
1339 {
1340     auto dstDcl = dst->getBase()->asRegVar()->getDeclare();
1341     dcl = getNewDcl(dstDcl, dcl);
1342 
1343     auto newDstRgn = kernel.fg.builder->createDst(dcl->getRegVar(), dst->getRegOff(),
1344         dst->getSubRegOff(), dst->getHorzStride(), dst->getType(), dst->getAccRegSel());
1345 
1346     auto inst = dst->getInst();
1347     inst->setDest(newDstRgn);
1348 }
1349 
getNewDcl(G4_Declare * dcl1,G4_Declare * dcl2)1350 G4_Declare* LoopVarSplit::getNewDcl(G4_Declare* dcl1, G4_Declare* dcl2)
1351 {
1352     // this method gets args dcl1, dcl2. this method is invoked
1353     // when the transformation replaces existing src/dst rgn with
1354     // equivalent one but using split variable.
1355     //
1356     // dcl1 is a dcl used to construct some src or dst rgn.
1357     // dcl2 is a new dcl that splits dcl1. dcl2 is always root dcl.
1358     // dcl1 may or may not be aliased of another dcl.
1359     // if dcl1 is also root dcl, then return dcl2.
1360     // if dcl1 is an alias dcl, then construct new dcl that aliases
1361     //   dcl2 at similar offset.
1362     // mapping from old dcl to new dcl is stored for future invocations.
1363 
1364     MUST_BE_TRUE(!dcl2->getAliasDeclare(), "Expecting to see root dcl for dcl2");
1365 
1366     auto it = oldNewDcl.find(dcl1);
1367     if (it != oldNewDcl.end())
1368         return (*it).second;
1369 
1370     if (!dcl1->getAliasDeclare())
1371     {
1372         oldNewDcl[dcl1] = dcl2;
1373         return dcl2;
1374     }
1375 
1376     auto newDcl = kernel.fg.builder->createTempVar(dcl1->getTotalElems(), dcl1->getElemType(),
1377         dcl1->getSubRegAlign());
1378     newDcl->setAliasDeclare(getNewDcl(dcl1->getRootDeclare(), dcl2), dcl1->getOffsetFromBase());
1379 
1380     oldNewDcl[dcl1] = newDcl;
1381 
1382     return newDcl;
1383 }
1384 
getLoopsToSplitAround(G4_Declare * dcl)1385 std::vector<Loop*> LoopVarSplit::getLoopsToSplitAround(G4_Declare* dcl)
1386 {
1387     // return a list of Loop* around which variable dcl should be split
1388     std::vector<Loop*> loopsToSplitAround;
1389 
1390     // first make list of all loops where dcl is ever referenced
1391     auto uses = references.getUses(dcl);
1392     auto defs = references.getDefs(dcl);
1393 
1394     auto StableOrder = [](const G4_BB* first, const G4_BB* second)
1395     {
1396         return first->getId() < second->getId();
1397     };
1398     std::set<G4_BB*, decltype(StableOrder)> bbsWithRefToDcl(StableOrder);
1399 
1400     for (auto& use : *uses)
1401     {
1402         auto bb = std::get<1>(use);
1403         bbsWithRefToDcl.insert(bb);
1404     }
1405 
1406     for (auto& def : *defs)
1407     {
1408         auto bb = std::get<1>(def);
1409         bbsWithRefToDcl.insert(bb);
1410     }
1411 
1412     auto OrderByRegPressure = [&](Loop* loop1, Loop* loop2)
1413     {
1414         return getMaxRegPressureInLoop(*loop1) >
1415             getMaxRegPressureInLoop(*loop2);
1416     };
1417 
1418     std::set<Loop*, decltype(OrderByRegPressure)> innerMostLoops(OrderByRegPressure);
1419 
1420     // now collect innermost loop for each referenced BB
1421     for (auto bb : bbsWithRefToDcl)
1422     {
1423         auto innerMost = kernel.fg.getLoops().getInnerMostLoop(bb);
1424         if (innerMost)
1425             innerMostLoops.insert(innerMost);
1426     }
1427 
1428     // prune list of loops
1429     // 1. loops are stored in descending order of max reg pressure
1430     // 2. apply cost heuristic to decide if variable should be split at a loop
1431     // 3. once split at a loop, dont split at any parent or nested loop
1432     //
1433     // Example:
1434     //
1435     // -----
1436     // |
1437     // | Loop A
1438     // | =X
1439     // -----
1440     //
1441     // -----
1442     // |
1443     // | Loop B
1444     // | =X
1445     // | -----
1446     // | | =X
1447     // | | Loop C
1448     // | -----
1449     // |
1450     // -----
1451     //
1452     // Assume variable X is spilled and is referenced in Loop A, Loop B, Loop C.
1453     // Loop C is nested in Loop B.
1454     // The algorithm below decides whether it is better to spill X around Loop C
1455     // or Loop B. X is spilled around only 1 of these 2 loops since they've a
1456     // parent-nested relationship. Independently, the algorithm can also decide to
1457     // split X around Loop A as it is not a parent or nested loop of other loops.
1458     //
1459 
1460 
1461     for (auto loop : innerMostLoops)
1462     {
1463         bool dontSplit = false;
1464         for (auto splitLoop : loopsToSplitAround)
1465         {
1466             if (loop->fullSubset(splitLoop) ||
1467                 loop->fullSuperset(splitLoop))
1468             {
1469                 // variable already split in nested or parent loop
1470                 dontSplit = true;
1471                 break;
1472             }
1473         }
1474         if (dontSplit)
1475             continue;
1476 
1477         // apply cost heuristic
1478         if (dcl->getNumRows() <= 2)
1479         {
1480             if (getMaxRegPressureInLoop(*loop) <= (unsigned int)(1.5f * (float)kernel.getNumRegTotal()))
1481                 loopsToSplitAround.push_back(loop);
1482         }
1483         else if (dcl->getNumRows() <= 4)
1484         {
1485             if (getMaxRegPressureInLoop(*loop) <= (unsigned int)(0.95f * (float)kernel.getNumRegTotal()))
1486                 loopsToSplitAround.push_back(loop);
1487         }
1488         else if (dcl->getNumRows() > 4)
1489         {
1490             // splitting dcls with > 4 rows should be very rare
1491             if (getMaxRegPressureInLoop(*loop) <= (unsigned int)(0.6f * (float)kernel.getNumRegTotal()))
1492                 loopsToSplitAround.push_back(loop);
1493         }
1494     }
1495 
1496     return loopsToSplitAround;
1497 }
1498 
1499 };