1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2019-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "VarSplit.h"
10 #include "GraphColor.h"
11
12 namespace vISA
13 {
14
VarSplitPass(G4_Kernel & k)15 VarSplitPass::VarSplitPass(G4_Kernel& k) : kernel(k)
16 {
17 }
18
buildPreVerify()19 void VarSplitPass::buildPreVerify()
20 {
21 for (auto bb : kernel.fg)
22 {
23 for (auto inst :*bb)
24 {
25 if (inst->getDst())
26 {
27 splitVerify[inst].dst = inst->getDst();
28 splitVerify[inst].dstLb = inst->getDst()->getLeftBound();
29 splitVerify[inst].dstRb = inst->getDst()->getRightBound();
30 }
31 for (unsigned int i = 0; i != G4_MAX_SRCS; i++)
32 {
33 if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion())
34 {
35 splitVerify[inst].src[i] = inst->getSrc(i);
36 splitVerify[inst].srcLb[i] = inst->getSrc(i)->getLeftBound();
37 splitVerify[inst].srcRb[i] = inst->getSrc(i)->getRightBound();
38 }
39 }
40 }
41 }
42 }
43
verify()44 void VarSplitPass::verify()
45 {
46 // verify
47 // parent, <child, <lb, rb>>
48 std::unordered_map<G4_Declare*, std::unordered_map<G4_Declare*, std::pair<unsigned int, unsigned int>>> parentSplit;
49 std::unordered_set<G4_Declare*> splitDcls;
50 unsigned int numSplitIntrinsics = 0;
51
52 auto getChildData = [&](G4_Declare* child)
53 {
54 std::pair<unsigned int, unsigned int> childLbRb = { 0,0 };
55 bool found = false;
56 for (auto& item : parentSplit)
57 {
58 for (auto& itemCh : (item).second)
59 {
60 if (itemCh.first == child)
61 {
62 MUST_BE_TRUE(!found, "Duplicate lb/rb entry found");
63 childLbRb = itemCh.second;
64 found = true;
65 }
66 }
67 }
68
69 MUST_BE_TRUE(found, "Didnt find child dcl");
70 return childLbRb;
71 };
72
73 // create parent->child mapping
74 for (auto bb : kernel.fg)
75 {
76 for (auto inst :*bb)
77 {
78 if (inst->isSplitIntrinsic())
79 {
80 // ensure this is split mov instruction
81 MUST_BE_TRUE(inst->isSplitIntrinsic(), "Didnt expect new non-split intrinsic instruction");
82
83 // verify that split instruction's dst, src(0) is correct
84 splitDcls.insert(inst->getDst()->getTopDcl());
85
86 MUST_BE_TRUE(!inst->getSrc(0)->getTopDcl()->getAddressed(), "Shouldnt split indirectly addressed variable");
87
88 auto origSrc0 = inst->getSrc(0)->asSrcRegRegion();
89 auto origLb = origSrc0->getLeftBound();
90 auto origRb = origSrc0->getRightBound();
91 auto itemToInsert = std::make_pair(inst->getDst()->getTopDcl(), std::make_pair((unsigned int)origLb, (unsigned int)origRb));
92 parentSplit[origSrc0->getTopDcl()].insert(itemToInsert);
93
94 numSplitIntrinsics++;
95 }
96 }
97 }
98
99 // now check whether usage of child is correct
100 std::unordered_map<G4_Declare*, unsigned int> parentDefCount;
101 for (auto bb : kernel.fg)
102 {
103 for (auto inst :*bb)
104 {
105 auto dst = inst->getDst();
106
107 if (dst && splitDcls.find(dst->getTopDcl()) != splitDcls.end())
108 {
109 MUST_BE_TRUE(inst->isSplitIntrinsic(), "Found split dcl as dst in non-split intrinsic instruction");
110 }
111
112 if (dst && parentSplit.find(dst->getTopDcl()) != parentSplit.find(dst->getTopDcl()))
113 {
114 auto oldDefCount = parentDefCount[dst->getTopDcl()];
115 parentDefCount[dst->getTopDcl()] = oldDefCount + 1;
116 MUST_BE_TRUE(oldDefCount == 0, "Found second def of parent of split variable");
117 }
118
119 for (unsigned int i = 0; i != G4_MAX_SRCS; i++)
120 {
121 auto src = inst->getSrc(i);
122 if (!src || !src->asSrcRegRegion())
123 continue;
124
125 if (parentSplit.find(src->asSrcRegRegion()->getTopDcl()) != parentSplit.end())
126 {
127 MUST_BE_TRUE(inst->isSplitIntrinsic(), "Found src opnd using pre-split parent");
128 }
129
130 if (splitDcls.find(src->asSrcRegRegion()->getTopDcl()) == splitDcls.end())
131 continue; // not a split dcl
132
133 // src is a split dcl, verify its usage is consistent with pre-transformation data structure
134 auto lb = src->getLeftBound();
135 auto rb = src->getRightBound();
136
137 auto childData = getChildData(src->asSrcRegRegion()->getTopDcl());
138 auto childLb = childData.first;
139 //auto childRb = childData.second;
140
141 auto totalLb = childLb + lb;
142 auto totalRb = childLb + rb;
143
144 auto origInstData = splitVerify[inst];
145 auto origLb = origInstData.srcLb[i];
146 auto origRb = origInstData.srcRb[i];
147
148 MUST_BE_TRUE(origLb == totalLb, "Mismatch in lb");
149 MUST_BE_TRUE(origRb == totalRb, "Mismatch in rb");
150 }
151 }
152 }
153
154 printf("Split verification passed successfully - %d split!\n", numSplitIntrinsics);
155 }
156
verifyOverlap()157 void VarSplitPass::verifyOverlap()
158 {
159 // For defs, map GRF written with G4_Declare of dst rgn.
160 // Verify that src reg# used in intrinsic_split comes from
161 // parent dcl of the split dst.
162 std::unordered_map<unsigned int, G4_Declare*> regToDcl;
163 unsigned int numSplitLeft = 0;
164 for (auto bb : kernel.fg)
165 {
166 for (auto inst :*bb)
167 {
168 if (inst->isSplitIntrinsic())
169 {
170 auto dst = inst->getDst();
171 auto src = inst->getSrc(0);
172 auto dstTopDcl = dst->getTopDcl();
173 auto srcTopDcl = src->getTopDcl();
174 MUST_BE_TRUE(dstTopDcl->getRegVar()->getPhyReg() &&
175 dstTopDcl->getRegVar()->getPhyReg()->isGreg(), "Unexpected assignment condition on split dst");
176 MUST_BE_TRUE(srcTopDcl->getRegVar()->getPhyReg() &&
177 srcTopDcl->getRegVar()->getPhyReg()->isGreg(), "Unexpected assignment condition on split src");
178
179 auto dstLS = dst->getLinearizedStart();
180 auto dstLE = dst->getLinearizedEnd();
181
182 auto srcLS = src->getLinearizedStart();
183 auto srcLE = src->getLinearizedEnd();
184
185 if (dstLS == srcLS &&
186 dstLE == srcLE)
187 {
188 // assignment is ok and will be coalesced away
189 continue;
190 }
191
192 numSplitLeft++;
193
194 auto srcDclRegNum = srcLS / numEltPerGRF<Type_UB>();
195 auto srcDclNumRows = ((srcLE + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>()) - srcDclRegNum;
196
197 // check whether src GRF# is written by parent of dst split dcl
198 for (unsigned int i = srcDclRegNum; i != (srcDclRegNum + srcDclNumRows); i++)
199 {
200 auto dcl = regToDcl[i];
201 if (dcl != getParentDcl(dstTopDcl))
202 {
203 if (!dstTopDcl->getRegVar()->isRegVarTransient() &&
204 !dstTopDcl->getRegVar()->isRegVarCoalesced())
205 {
206 MUST_BE_TRUE(false, "split src uses GRF value from non-parent");
207 }
208 else if (dstTopDcl->getRegVar()->isRegVarTransient())
209 {
210 // dst is spilled
211 auto dstOrigDcl = dstTopDcl->getRegVar()->getNonTransientBaseRegVar()->getDeclare()->getRootDeclare();
212 if (dcl != getParentDcl(dstOrigDcl))
213 {
214 MUST_BE_TRUE(false, "split src uses GRF value from non-parent. dst spilled.");
215 }
216 }
217 else
218 {
219 // do nothing for coalesced ranges
220 }
221 }
222 }
223 }
224 else
225 {
226 auto dstRgn = inst->getDst();
227 if (dstRgn && dstRgn->getTopDcl() &&
228 dstRgn->getTopDcl()->getRegVar()->getPhyReg() &&
229 dstRgn->getTopDcl()->getRegVar()->getPhyReg()->isGreg())
230 {
231 auto grf = dstRgn->getTopDcl()->getRegVar()->getPhyReg()->asGreg()->getRegNum();
232 auto numRows = (dstRgn->getLinearizedEnd() - dstRgn->getLinearizedStart() + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>();
233 for (unsigned int i = grf; i != (grf + numRows); i++)
234 {
235 regToDcl[i] = dstRgn->getTopDcl();
236 }
237 }
238 }
239 }
240 }
241
242 printf("Split assignment overlap passed successfully - %d splits left\n", numSplitLeft);
243 }
244
run()245 void VarSplitPass::run()
246 {
247 if (kernel.getOption(vISA_VerifyExplicitSplit))
248 buildPreVerify();
249
250 findSplitCandidates();
251
252 split();
253
254 if (kernel.getOption(vISA_VerifyExplicitSplit))
255 verify();
256 }
257
findSplitCandidates()258 void VarSplitPass::findSplitCandidates()
259 {
260 auto canSplit = [](G4_INST* inst)
261 {
262 // Insert any new split candidates here
263 return (inst->isSend() &&
264 inst->getMsgDesc()->isSampler() &&
265 inst->getMsgDesc()->getDstLenRegs() > 2 &&
266 !inst->getDst()->getTopDcl()->getRegVar()->isRegVarTransient());
267 };
268
269 // Find all dcls that can be split in to smaller chunks
270 for (auto bb : kernel.fg)
271 {
272 for (auto inst :*bb)
273 {
274 if (inst->getDst() && inst->getDst()->getTopDcl())
275 {
276 auto dstDcl = inst->getDst()->getTopDcl();
277
278 if (dstDcl &&
279 !dstDcl->getAddressed() &&
280 dstDcl->getRegFile() == G4_RegFileKind::G4_GRF)
281 {
282 auto& prop = splitVars[dstDcl];
283 prop.numDefs++;
284 prop.def = std::make_pair(inst->getDst(), bb);
285 if (canSplit(inst))
286 {
287 prop.candidateDef = true;
288 }
289 }
290 }
291
292 for (unsigned int s = 0; s != G4_MAX_SRCS; s++)
293 {
294 auto src = inst->getSrc(s);
295 if (!src || !src->isSrcRegRegion() || !src->asSrcRegRegion()->getTopDcl())
296 continue;
297
298 auto srcRgn = src->asSrcRegRegion();
299 auto dcl = srcRgn->getTopDcl();
300
301 if (dcl->getNumRows() < 2)
302 continue;
303
304 // It is possible that splitVars map doesnt have an entry for dcl
305 // yet as in CFG, a use may appear lexically earlier than its def.
306 auto& prop = splitVars[dcl];
307 prop.srcs.push_back(std::make_pair(srcRgn,bb));
308 }
309 }
310 }
311
312 // Now filter out real candidates
313 for (auto itemIt = splitVars.begin(); itemIt != splitVars.end(); itemIt++)
314 {
315 auto& item = (*itemIt);
316 if (item.second.numDefs != 1 || !item.second.candidateDef ||
317 !item.second.isDefUsesInSameBB())
318 {
319 item.second.legitCandidate = false;
320 continue;
321 }
322
323 // Check whether each src operand is independent and compute size
324 for (auto& srcpair : item.second.srcs)
325 {
326 auto src = srcpair.first;
327 auto numRows = (src->getRightBound() - src->getLeftBound() + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>();
328 auto regOff = src->getRegOff();
329
330 if (item.first->getByteSize() < src->getRightBound())
331 {
332 item.second.legitCandidate = false;
333 break;
334 }
335
336 if (numRows == 1)
337 {
338 if (item.second.ag == VarProperties::AccessGranularity::Unknown)
339 item.second.ag = VarProperties::AccessGranularity::OneGrf;
340 else if (item.second.ag == VarProperties::AccessGranularity::TwoGrf)
341 {
342 item.second.legitCandidate = false;
343 break;
344 }
345 }
346 else if (numRows == 2)
347 {
348 if (item.second.ag == VarProperties::AccessGranularity::Unknown)
349 item.second.ag = VarProperties::AccessGranularity::TwoGrf;
350 else if (item.second.ag == VarProperties::AccessGranularity::OneGrf)
351 {
352 item.second.legitCandidate = false;
353 break;
354 }
355 else if (regOff % 2 != 0)
356 {
357 item.second.legitCandidate = false;
358 break;
359 }
360 }
361 else if (numRows > 2)
362 {
363 // use as send src
364 item.second.legitCandidate = false;
365 break;
366 }
367 }
368 }
369
370 for (auto itemIt = splitVars.begin(); itemIt != splitVars.end();)
371 {
372 auto item = (*itemIt);
373 if (!item.second.legitCandidate ||
374 item.second.ag == VarProperties::AccessGranularity::Unknown ||
375 item.second.numDefs > 1)
376 {
377 itemIt = splitVars.erase(itemIt);
378 continue;
379 }
380 itemIt++;
381 }
382
383 // Apply split cost heuristic
384 std::unordered_map<G4_INST*, unsigned int> instId;
385 for (auto bb : kernel.fg)
386 {
387 for (auto inst :*bb)
388 {
389 instId[inst] = instId.size();
390 }
391 }
392 for (auto itemIt = splitVars.begin(); itemIt != splitVars.end();)
393 {
394 auto item = (*itemIt);
395
396 if (item.second.srcs.size() > 0)
397 {
398 // Dont emit split if all uses are closeby
399 unsigned int idx = instId[item.second.srcs.front().first->getInst()];
400 bool split = true;
401 if (item.second.srcs.size() > 1)
402 {
403 for (auto src : item.second.srcs)
404 {
405 if ((instId[src.first->getInst()] - idx) < 2)
406 {
407 split = false;
408 }
409 else
410 {
411 split = true;
412 break;
413 }
414 idx = instId[src.first->getInst()];
415 }
416 }
417 else
418 {
419 if ((idx - instId[item.second.def.first->getInst()]) < 2)
420 {
421 split = false;
422 }
423 }
424
425 // dont split if def-last first use distance <= 8
426 if (split &&
427 (instId[item.second.srcs.back().first->getInst()] - instId[item.second.def.first->getInst()]) <= 8)
428 split = false;
429
430 if (!split)
431 {
432 itemIt = splitVars.erase(itemIt);
433 continue;
434 }
435 }
436 ++itemIt;
437 }
438
439 // Each entry in splitVars map is a split candidate
440 }
441
split()442 void VarSplitPass::split()
443 {
444 auto getIter = [](G4_INST* inst, G4_BB* bb)
445 {
446 for (auto iter = bb->begin(); iter != bb->end(); iter++)
447 {
448 if (*iter == inst)
449 return iter;
450 }
451 return bb->end();
452 };
453
454 #ifdef DEBUG_VERBOSE_ON
455 unsigned int numIntrinsicsInserted = 0;
456 #endif
457 // Do actual splitting
458 for(unsigned int i = 0; i != kernel.Declares.size(); ++i)
459 {
460 auto curDcl = kernel.Declares[i];
461 auto isCandidate = splitVars.find(curDcl);
462 if (isCandidate == splitVars.end())
463 continue;
464
465 auto item = *isCandidate;
466
467 MUST_BE_TRUE(item.second.legitCandidate, "Cannot split non-candidate");
468
469 // Insert intrinsics
470 auto it = getIter(item.second.def.first->getInst(), item.second.def.second);
471 MUST_BE_TRUE(it != item.second.def.second->end(), "Instruction not found in BB");
472 it++;
473
474 auto dstDcl = item.second.def.first->getTopDcl();
475 unsigned int numIntrinInsts = 0;
476 if (item.second.ag == VarProperties::AccessGranularity::OneGrf)
477 numIntrinInsts = dstDcl->getNumRows();
478 else if (item.second.ag == VarProperties::AccessGranularity::TwoGrf)
479 numIntrinInsts = dstDcl->getNumRows() / 2;
480 else
481 MUST_BE_TRUE(false, "unsupported access granularity");
482
483 // lb, rb, split dcl
484 std::vector<std::tuple<unsigned int, unsigned int, G4_Declare*>> splitDcls;
485 for (unsigned int i = 0; i != numIntrinInsts; i++)
486 {
487 unsigned int numRows = item.second.ag == VarProperties::AccessGranularity::TwoGrf ? 2 : 1;
488 unsigned int lb = getGRFSize() * i*numRows;
489 unsigned int rb = lb + (getGRFSize()*numRows)-1;
490
491 auto name = kernel.fg.builder->getNameString(kernel.fg.mem, 50, "%s_%d_%d_%d", dstDcl->getName(), i, lb, rb);
492 auto splitDcl = kernel.fg.builder->createDeclareNoLookup(name,
493 G4_RegFileKind::G4_GRF, numEltPerGRF<Type_UD>(), numRows, Type_UD);
494 splitParentDcl.insert(std::make_pair(splitDcl, dstDcl));
495 splitChildren[dstDcl].push_back(splitDcl);
496
497 // If this part of dcl is never used in code, then dont create split intrinsic inst for it
498 if (!item.second.isPartDclUsed(lb, rb))
499 {
500 unusedDcls.insert(splitDcl);
501 continue;
502 }
503
504 auto dstRgn = kernel.fg.builder->createDstRegRegion(splitDcl, 1);
505 auto srcRgn = kernel.fg.builder->createSrc(dstDcl->getRegVar(),
506 item.second.def.first->getRegOff() + (i * numRows), item.second.def.first->getSubRegOff(),
507 kernel.fg.builder->getRegionStride1(), Type_UD);
508 G4_ExecSize execSize {(getGRFSize() / TypeSize(Type_UD)) * numRows};
509 auto intrin = kernel.fg.builder->createIntrinsicInst(
510 nullptr, Intrinsic::Split, execSize, dstRgn, srcRgn, nullptr, nullptr,
511 item.second.def.first->getInst()->getOption() | G4_InstOption::InstOpt_WriteEnable, true);
512 intrin->inheritDIFrom(item.second.def.first->getInst());
513 item.second.def.second->insertBefore(it, intrin);
514 splitDcls.push_back(std::make_tuple(lb, rb, splitDcl));
515 IRchanged = true;
516 #ifdef DEBUG_VERBOSE_ON
517 numIntrinsicsInserted++;
518 #endif
519 }
520
521 auto getSplitDcl = [&splitDcls](unsigned int lb, unsigned int rb)
522 {
523 for (auto& item : splitDcls)
524 {
525 if (std::get<0>(item) <= lb &&
526 std::get<1>(item) >= rb)
527 return std::tuple(std::get<0>(item),
528 std::get<1>(item), std::get<2>(item));
529 }
530 return std::tuple(0u,0u,(G4_Declare*)nullptr);
531 };
532
533 auto getOpndNum = [](G4_SrcRegRegion* src)
534 {
535 auto inst = src->getInst();
536 for (unsigned int i = 0; i != G4_MAX_SRCS; i++)
537 {
538 if (inst->getSrc(i) == src)
539 return i;
540 }
541 MUST_BE_TRUE(false, "src operand not found");
542 return 0xffffffff;
543 };
544
545 // Replace all src operands
546 for (auto& s : item.second.srcs)
547 {
548 auto srcRgn = s.first;
549
550 auto lb = srcRgn->getLeftBound();
551 auto rb = srcRgn->getRightBound();
552
553 auto item = getSplitDcl(lb, rb);
554 auto item_lb = std::get<0>(item);
555 auto dcl = std::get<2>(item);
556 MUST_BE_TRUE(dcl, "Didnt find split dcl");
557
558 unsigned int regNum = (lb - item_lb)/numEltPerGRF<Type_UB>();
559
560 auto newSrc = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(), srcRgn->getRegAccess(), dcl->getRegVar(), regNum, srcRgn->getSubRegOff(), srcRgn->getRegion(), srcRgn->getType());
561 auto opndNum = getOpndNum(srcRgn);
562 auto tup = std::make_tuple(srcRgn->getInst(), srcRgn, opndNum);
563 preSplit.insert(std::make_pair(newSrc, tup));
564 srcRgn->getInst()->setSrc(newSrc, opndNum);
565 }
566 }
567
568 DEBUG_VERBOSE("Number of split intrinsincs inserted " << numIntrinsicsInserted << std::endl);
569 }
570
replaceIntrinsics()571 void VarSplitPass::replaceIntrinsics()
572 {
573 #ifdef DEBUG_VERBOSE_ON
574 unsigned int numSplitMovs = 0;
575 #endif
576
577 if (kernel.getOption(vISA_VerifyExplicitSplit))
578 {
579 // Check whether intrinsic assignments overlap with other
580 // rows of original variable
581 verifyOverlap();
582 }
583
584 // Replace intrinsic.split with mov
585 for (auto bb : kernel.fg)
586 {
587 for (auto inst :*bb)
588 {
589 if (inst->isSplitIntrinsic())
590 {
591 inst->setOpcode(G4_mov);
592 #ifdef DEBUG_VERBOSE_ON
593 if (inst->getDst()->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum() !=
594 inst->getSrc(0)->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum())
595 numSplitMovs++;
596 #endif
597 }
598 }
599 }
600
601 DEBUG_VERBOSE("Number of split movs left behind " << numSplitMovs << std::endl);
602 }
603
getParentDcl(G4_Declare * splitDcl)604 G4_Declare* VarSplitPass::getParentDcl(G4_Declare* splitDcl)
605 {
606 auto it = splitParentDcl.find(splitDcl);
607 if (it != splitParentDcl.end())
608 return (*it).second;
609 return nullptr;
610 }
611
getChildren(G4_Declare * parent)612 std::vector<G4_Declare*>* VarSplitPass::getChildren(G4_Declare* parent)
613 {
614 auto it = splitChildren.find(parent);
615 if (it == splitChildren.end())
616 return nullptr;
617 return &(*it).second;
618 }
619
getSiblings(G4_Declare * s)620 std::vector<G4_Declare*> VarSplitPass::getSiblings(G4_Declare* s)
621 {
622 // Return all siblings of s, excluding s
623 std::vector<G4_Declare*> siblings;
624
625 // First lookup parent
626 auto parentDcl = getParentDcl(s);
627 if (!parentDcl)
628 return siblings;
629
630 // Lookup all children of parent
631 auto children = getChildren(parentDcl);
632 if (!children)
633 return siblings;
634
635 for (auto item : *children)
636 {
637 if (item != s)
638 siblings.push_back(item);
639 }
640
641 return siblings;
642 }
643
isSplitDcl(G4_Declare * d)644 bool VarSplitPass::isSplitDcl(G4_Declare* d)
645 {
646 auto it = splitChildren.find(d);
647 if (it == splitChildren.end())
648 return false;
649 return true;
650 }
651
isPartialDcl(G4_Declare * d)652 bool VarSplitPass::isPartialDcl(G4_Declare* d)
653 {
654 auto it = splitParentDcl.find(d);
655 if (it == splitParentDcl.end())
656 return false;
657 return true;
658 }
659
getSiblingNum(G4_Declare * d)660 unsigned int VarSplitPass::getSiblingNum(G4_Declare* d)
661 {
662 // This function must be invoked assuming d is a partial dcl.
663 // Otherwise behavior is unpredictable.
664 if (!isPartialDcl(d))
665 return 0;
666
667 unsigned int siblingNum = 0;
668 // First lookup parent
669 auto parentDcl = getParentDcl(d);
670
671 // Lookup all children of parent
672 auto children = getChildren(parentDcl);
673
674 for (auto s : *children)
675 {
676 if (s == d)
677 break;
678 siblingNum++;
679 }
680 return siblingNum;
681 }
682
getIdealAllocation(G4_Declare * dcl,LiveRange ** lrs)683 unsigned int VarSplitPass::getIdealAllocation(G4_Declare* dcl, LiveRange** lrs)
684 {
685 // This function is invoked when assigning GRFs to parent.
686 unsigned int idealGRF = 0;
687 if (isSplitDcl(dcl))
688 {
689 // <ideal reg num for parent, num children with this assignment for coalescing>
690 std::unordered_map<unsigned int, unsigned int> idealParentReg;
691 // This is parent dcl
692 auto children = getChildren(dcl);
693 // Try coalescing with first child that is allocated
694 unsigned int numRowsOffset = 0;
695 for (auto child : *children)
696 {
697 if (child->getRegVar()->isRegAllocPartaker())
698 {
699 auto id = child->getRegVar()->getId();
700 auto lr = lrs[id];
701 auto phyReg = lr->getPhyReg();
702 if (phyReg && !isChildDclUnused(child))
703 {
704 auto regNum = phyReg->asGreg()->getRegNum();
705 if (regNum >= numRowsOffset)
706 {
707 idealGRF = regNum - numRowsOffset;
708 idealParentReg[idealGRF] = idealParentReg[idealGRF] + 1;
709 }
710 }
711 }
712 numRowsOffset += child->getNumRows();
713 }
714 std::pair<unsigned int, unsigned int> state;
715 if (idealParentReg.size() > 0)
716 {
717 state.first = idealParentReg.begin()->first;
718 state.second = idealParentReg.begin()->second;
719 }
720
721 for (auto arg : idealParentReg)
722 {
723 if (arg.second > state.second)
724 {
725 state.first = arg.first;
726 state.second = arg.second;
727 }
728 }
729 idealGRF = state.first;
730 }
731
732 idealGRF = idealGRF >= kernel.getNumRegTotal() ? 0 : idealGRF;
733
734 return idealGRF;
735 }
736
isChildDclUnused(G4_Declare * dcl)737 bool VarSplitPass::isChildDclUnused(G4_Declare* dcl)
738 {
739 auto it = unusedDcls.find(dcl);
740 if (it == unusedDcls.end())
741 return false;
742 return true;
743 }
744
writeHints(G4_Declare * dcl,LiveRange ** lrs)745 void VarSplitPass::writeHints(G4_Declare* dcl, LiveRange** lrs)
746 {
747 bool isParent = isSplitDcl(dcl);
748 bool isChild = isPartialDcl(dcl);
749
750 // If parent, then set hints in all children
751 if (isParent)
752 {
753 auto regNum = lrs[dcl->getRegVar()->getId()]->getPhyReg()->asGreg()->getRegNum();
754 auto children = getChildren(dcl);
755 unsigned int i = 0;
756 for (auto child : *children)
757 {
758 if (child->getRegVar()->isRegAllocPartaker())
759 {
760 auto id = child->getRegVar()->getId();
761 lrs[id]->setAllocHint(regNum + (i * child->getNumRows()));
762 }
763 i++;
764 }
765 }
766 else if (isChild)
767 {
768 // Write hint to parent if its empty
769 if (auto parentDcl = getParentDcl(dcl))
770 {
771 auto idealParentGRF = getIdealAllocation(parentDcl, lrs);
772
773 if (idealParentGRF != 0 && parentDcl->getRegVar()->isRegAllocPartaker())
774 {
775 lrs[parentDcl->getRegVar()->getId()]->setAllocHint(idealParentGRF);
776 auto children = getChildren(getParentDcl(dcl));
777 unsigned int i = 0;
778 for (auto child : *children)
779 {
780 if (child->getRegVar()->isRegAllocPartaker())
781 {
782 lrs[child->getRegVar()->getId()]->setAllocHint(idealParentGRF + (i * child->getNumRows()));
783 }
784 i++;
785 }
786 }
787 }
788 }
789 }
790
undo(G4_Declare * parentDcl)791 void VarSplitPass::undo(G4_Declare* parentDcl)
792 {
793 // Undo split for parentDcl
794
795 if (!isSplitDcl(parentDcl))
796 return;
797
798 auto children = getChildren(parentDcl);
799
800 // Skip doing anything if any child was spilled
801 for (auto child : *children)
802 {
803 if (child->isSpilled())
804 return;
805 }
806
807 for (auto& split : preSplit)
808 {
809 auto srcRgn = split.first;
810
811 if (!srcRgn->getTopDcl() ||
812 getParentDcl(srcRgn->getTopDcl()) != parentDcl)
813 continue;
814
815 // parentDcl is parent of split
816 auto inst = std::get<0>(split.second);
817 auto opnd = std::get<1>(split.second);
818 auto srcNum = std::get<2>(split.second);
819
820 inst->setSrc(opnd, srcNum);
821 }
822
823 // remove explicit variable split instructions
824 for (auto child : *children)
825 {
826 unusedDcls.erase(child);
827 }
828
829 for (auto bb : kernel.fg)
830 {
831 for (auto instIt = bb->begin(); instIt != bb->end();)
832 {
833 auto inst = (*instIt);
834 if (inst->isSplitIntrinsic() &&
835 inst->getSrc(0)->getTopDcl() == parentDcl)
836 {
837 instIt = bb->erase(instIt);
838 continue;
839 }
840 ++instIt;
841 }
842 }
843
844 // fix other data structures
845 for (auto childIt = splitParentDcl.begin(); childIt != splitParentDcl.end();)
846 {
847 auto child = (*childIt);
848 if (child.second == parentDcl)
849 {
850 childIt = splitParentDcl.erase(childIt);
851 continue;
852 }
853 ++childIt;
854 }
855
856 splitChildren.erase(parentDcl);
857 }
858
reallocParent(G4_Declare * child,LiveRange ** lrs)859 bool VarSplitPass::reallocParent(G4_Declare* child, LiveRange** lrs)
860 {
861 // Given a child, lookup all siblings. If all children
862 // are assigned consecutive GRFs but parent isnt then
863 // return true.
864 bool ret = true;
865
866 if (!child->getRegVar()->isRegAllocPartaker())
867 return false;
868
869 auto parent = getParentDcl(child);
870
871 if (parent == nullptr)
872 return false;
873
874 auto children = getChildren(parent);
875
876 auto baseRegNum = 0;
877 const unsigned int UnInit = 0xffffffff;
878 unsigned int nextRegNumExpected = UnInit;
879 for (auto child : *children)
880 {
881 if (!child->getRegVar()->isRegAllocPartaker() ||
882 isChildDclUnused(child))
883 {
884 if (nextRegNumExpected != UnInit)
885 {
886 nextRegNumExpected += child->getNumRows();
887 }
888 continue;
889 }
890
891 auto id = child->getRegVar()->getId();
892 auto phyReg = lrs[id]->getPhyReg();
893 if (phyReg)
894 {
895 if (nextRegNumExpected == UnInit)
896 {
897 baseRegNum = phyReg->asGreg()->getRegNum();
898 nextRegNumExpected = baseRegNum + (child->getByteSize() / numEltPerGRF<Type_UB>());
899 continue;
900 }
901 else if (nextRegNumExpected != phyReg->asGreg()->getRegNum())
902 {
903 return false;
904 }
905 }
906 else
907 {
908 return false;
909 }
910 nextRegNumExpected += child->getNumRows();
911 }
912
913 G4_VarBase* parentPhyReg = nullptr;
914
915 if (parent->getRegVar()->isRegAllocPartaker())
916 parentPhyReg = lrs[parent->getRegVar()->getId()]->getPhyReg();
917
918 if (!parentPhyReg)
919 return false;
920
921 if (parentPhyReg->asGreg()->getRegNum() == baseRegNum)
922 return false;
923
924 return ret;
925 }
926
isParentChildRelation(G4_Declare * parent,G4_Declare * child)927 bool VarSplitPass::isParentChildRelation(G4_Declare* parent, G4_Declare* child)
928 {
929 if (!isSplitDcl(parent))
930 return false;
931
932 auto children = getChildren(parent);
933
934 for (auto c : *children)
935 {
936 if (c == child)
937 return true;
938 }
939
940 return false;
941 }
isSplitVarLocal(G4_Declare * dcl)942 bool VarSplitPass::isSplitVarLocal(G4_Declare* dcl)
943 {
944 auto it = splitVars.find(dcl);
945 MUST_BE_TRUE(it != splitVars.end(), "Not a split dcl");
946
947 return (*it).second.isDefUsesInSameBB();
948 }
949
LoopVarSplit(G4_Kernel & k,GraphColor * c,RPE * r)950 LoopVarSplit::LoopVarSplit(G4_Kernel& k, GraphColor* c, RPE* r) : kernel(k), coloring(c), rpe(r), references(k)
951 {
952 for (auto spill : coloring->getSpilledLiveRanges())
953 {
954 spilledDclSet.insert(spill->getDcl());
955 }
956 auto numVars = coloring->getNumVars();
957 auto lrs = coloring->getLiveRanges();
958 for (unsigned int i = 0; i != numVars; ++i)
959 {
960 auto rootDcl = lrs[i]->getDcl();
961 dclSpillCost[rootDcl] = lrs[i]->getSpillCost();
962 }
963 }
964
run()965 void LoopVarSplit::run()
966 {
967 // 1. consider list of spilled variables sorted in
968 // descending order of spill cost
969 // 2. iterate over each spilled variable from sorted list
970 // a. run cost heuristic to decide if split makes sense
971 // b. if decision is to split, then check which loop(s) to split around
972
973 std::list<std::pair<G4_Declare*, float>> sortedDcls;
974 for (auto item : dclSpillCost)
975 {
976 if(spilledDclSet.find(item.first) !=spilledDclSet.end())
977 sortedDcls.push_back(std::pair(item.first, item.second));
978 }
979
980 auto SpillCostDesc = [](const std::pair<G4_Declare*, float>& first,
981 const std::pair<G4_Declare*, float>& second)
982 {
983 return first.second > second.second;
984 };
985
986 sortedDcls.sort(SpillCostDesc);
987
988 // TODO: Iterating sequence based on spill cost may not be very accurate.
989 // Thats because long living variables typically have lower spill cost.
990 // But they may be referenced in some nested loop multiple times. In such
991 // cases, it is beneficial to split the variable around such loops. To
992 // accommodate such cases, we should be looking at reference count of
993 // variables in loops and iterate in an order based on that.
994 for (auto var : sortedDcls)
995 {
996 const auto& loops = getLoopsToSplitAround(var.first);
997 for (auto& loop : loops)
998 {
999 split(var.first, *loop);
1000 }
1001 }
1002 }
1003
getReads(G4_Declare * dcl,Loop & loop)1004 std::vector<G4_SrcRegRegion*> LoopVarSplit::getReads(G4_Declare* dcl, Loop& loop)
1005 {
1006 std::vector<G4_SrcRegRegion*> reads;
1007
1008 const auto& uses = references.getUses(dcl);
1009 for (auto use : *uses)
1010 {
1011 auto bb = std::get<1>(use);
1012 if (loop.contains(bb))
1013 {
1014 auto inst = std::get<0>(use);
1015 for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
1016 {
1017 auto opnd = inst->getSrc(i);
1018 if (!opnd || !opnd->isSrcRegRegion())
1019 continue;
1020 auto topdcl = opnd->asSrcRegRegion()->getTopDcl();
1021 if (topdcl == dcl)
1022 reads.push_back(opnd->asSrcRegRegion());
1023 }
1024 }
1025 }
1026
1027 return reads;
1028 }
1029
getWrites(G4_Declare * dcl,Loop & loop)1030 std::vector<G4_DstRegRegion*> LoopVarSplit::getWrites(G4_Declare* dcl, Loop& loop)
1031 {
1032 std::vector<G4_DstRegRegion*> writes;
1033
1034 const auto& defs = references.getDefs(dcl);
1035 for (auto def : *defs)
1036 {
1037 auto bb = std::get<1>(def);
1038 if (loop.contains(bb))
1039 {
1040 auto dst = std::get<0>(def)->getDst();
1041 writes.push_back(dst);
1042 }
1043 }
1044
1045 return writes;
1046 }
1047
getMaxRegPressureInLoop(Loop & loop)1048 unsigned int LoopVarSplit::getMaxRegPressureInLoop(Loop& loop)
1049 {
1050 auto it = maxRegPressureCache.find(&loop);
1051 if (it != maxRegPressureCache.end())
1052 return it->second;
1053
1054 unsigned int maxRPE = 0;
1055 auto& bbs = loop.getBBs();
1056
1057 for (auto bb : bbs)
1058 {
1059 for (auto inst : bb->getInstList())
1060 {
1061 maxRPE = std::max(maxRPE, rpe->getRegisterPressure(inst));
1062 }
1063 }
1064
1065 maxRegPressureCache[&loop] = maxRPE;
1066
1067 return maxRPE;
1068 }
1069
dump(std::ostream & of)1070 void LoopVarSplit::dump(std::ostream& of)
1071 {
1072 for (auto dcl : splitResults)
1073 {
1074 of << "Spilled dcl: " << dcl.first->getName() << std::endl;
1075 for (auto split : dcl.second)
1076 {
1077 of << "\tSplit dcl: " << split.first->getName() << ", for loop L" << split.second->id << std::endl;
1078 }
1079 of << std::endl;
1080 }
1081 }
1082
removeSplitInsts(GlobalRA * gra,G4_Declare * spillDcl,G4_BB * bb)1083 void LoopVarSplit::removeSplitInsts(GlobalRA* gra, G4_Declare* spillDcl, G4_BB* bb)
1084 {
1085 auto it = gra->splitResults.find(spillDcl);
1086
1087 if (it == gra->splitResults.end())
1088 return;
1089
1090 auto& bbInstsToRemove = (*it).second.insts;
1091 for (auto entry : bbInstsToRemove)
1092 {
1093 auto currBB = entry.first;
1094
1095 if (currBB == bb)
1096 {
1097 auto& instsToRemoveFromBB = entry.second;
1098 for (auto instIt = bb->begin(); instIt != bb->end();)
1099 {
1100 auto inst = (*instIt);
1101 if (instsToRemoveFromBB.find(inst) == instsToRemoveFromBB.end())
1102 {
1103 ++instIt;
1104 continue;
1105 }
1106
1107 instIt = bb->erase(instIt);
1108 }
1109 }
1110 }
1111 }
1112
removeFromPreheader(GlobalRA * gra,G4_Declare * spillDcl,G4_BB * bb,INST_LIST_ITER filledInstIter)1113 bool LoopVarSplit::removeFromPreheader(GlobalRA* gra, G4_Declare* spillDcl, G4_BB* bb, INST_LIST_ITER filledInstIter)
1114 {
1115 auto it = gra->splitResults.find(spillDcl);
1116 if (it != gra->splitResults.end() &&
1117 (*it).second.insts.find(bb) != (*it).second.insts.end())
1118 {
1119 auto inst = *filledInstIter;
1120 if (inst->isRawMov())
1121 {
1122 auto dstTopDcl = inst->getDst()->getTopDcl();
1123 auto it = gra->splitResults.find(dstTopDcl);
1124 if (it == gra->splitResults.end())
1125 return false;
1126
1127 removeSplitInsts(gra, spillDcl, bb);
1128 return true;
1129 }
1130 }
1131 return false;
1132 }
1133
removeFromLoopExit(GlobalRA * gra,G4_Declare * spillDcl,G4_BB * bb,INST_LIST_ITER filledInstIter)1134 bool LoopVarSplit::removeFromLoopExit(GlobalRA* gra, G4_Declare* spillDcl, G4_BB* bb, INST_LIST_ITER filledInstIter)
1135 {
1136 return removeFromPreheader(gra, spillDcl, bb, filledInstIter);
1137 }
1138
getSplitInsts(GlobalRA * gra,G4_BB * bb)1139 const std::unordered_set<G4_INST*> LoopVarSplit::getSplitInsts(GlobalRA* gra, G4_BB* bb)
1140 {
1141 std::unordered_set<G4_INST*> ret;
1142
1143 for (auto& splitVar : gra->splitResults)
1144 for (auto& insts : splitVar.second.insts)
1145 if (insts.first == bb)
1146 {
1147 for (auto item : insts.second)
1148 ret.insert(item);
1149 }
1150
1151 return ret;
1152 }
1153
split(G4_Declare * dcl,Loop & loop)1154 bool LoopVarSplit::split(G4_Declare* dcl, Loop& loop)
1155 {
1156 // Split dcl in given loop. Return true if split was successful.
1157 // It is assumed that dcl is spilled. This method inserts a
1158 // copy in preheader of loop. Dst of this copy is a new tmp and
1159 // source is dcl. All uses/defs of dcl in the loop are replaced
1160 // with the tmp. If dcl is ever written in the loop, a copy from
1161 // tmp to dcl is inserted in loop exit bb.
1162 //
1163 // This transformation requires a valid preheader be present for
1164 // loop. If dcl is written in the loop then a valid exit bb is also
1165 // needed.
1166 if (!loop.preHeader)
1167 return false;
1168
1169 const auto& dsts = getWrites(dcl, loop);
1170 if (dsts.size() > 0)
1171 {
1172 // evaluate if it makes sense to insert the copy if loop has
1173 // multiple exits.
1174 if (loop.getLoopExits().size() != 1)
1175 return false;
1176 // TODO: Handle creation of loop preheader and loop exit in
1177 // presence of SIMD CF. this requires changing JIP, UIP
1178 // in source goto instructions and fix (if) any data structures
1179 // in VISA that rely on those JIP/UIP.
1180 if (!loop.preHeader->dominates(loop.getLoopExits().front()))
1181 return false;
1182 }
1183
1184 const auto& srcs = getReads(dcl, loop);
1185
1186 auto splitDcl = kernel.fg.builder->createTempVar(dcl->getTotalElems(),
1187 dcl->getElemType(), coloring->getGRA().getSubRegAlign(dcl),
1188 "LOOPSPLIT", true);
1189
1190 auto& splitData = coloring->getGRA().splitResults[splitDcl];
1191 splitData.origDcl = dcl;
1192
1193 // emit TMP = dcl in preheader
1194 copy(loop.preHeader, splitDcl, dcl, &splitData);
1195
1196 // emit dcl = TMP in loop exit
1197 if (dsts.size() > 0)
1198 {
1199 copy(loop.getLoopExits().front(), dcl, splitDcl, &splitData, false);
1200 }
1201
1202 // replace all occurences of dcl in loop with TMP
1203 for (auto src : srcs)
1204 replaceSrc(src, splitDcl);
1205
1206 for (auto dst : dsts)
1207 replaceDst(dst, splitDcl);
1208
1209 splitResults[dcl].push_back(std::make_pair(splitDcl, &loop));
1210
1211 return true;
1212 }
1213
copy(G4_BB * bb,G4_Declare * dst,G4_Declare * src,SplitResults * splitData,bool pushBack)1214 void LoopVarSplit::copy(G4_BB* bb, G4_Declare* dst, G4_Declare* src, SplitResults* splitData, bool pushBack)
1215 {
1216 // create mov instruction to copy dst->src
1217 // multiple mov instructions may be created depending on size of dcls
1218 // all mov instructions are appended to inst list of bb
1219 // when pushBack argument = true, append to BB (happens in pre-header)
1220 // when pushBack argument = false, insert in bb after label (happens at exit bb)
1221
1222 dst = dst->getRootDeclare();
1223 src = src->getRootDeclare();
1224 unsigned int numRows = dst->getNumRows();
1225 unsigned int bytesRemaining = dst->getByteSize();
1226
1227 auto insertCopy = [&](G4_INST* inst)
1228 {
1229 if (pushBack || bb->size() == 0)
1230 {
1231 bb->push_back(inst);
1232 splitData->insts[bb].insert(inst);
1233 }
1234 else
1235 {
1236 // insert immediately after label instruction, if one exists
1237 for (auto it = bb->begin(); it != bb->end(); ++it)
1238 {
1239 auto inst = (*it);
1240 if (inst->isLabel())
1241 continue;
1242 bb->insertAfter(it, inst);
1243 splitData->insts[bb].insert(inst);
1244 break;
1245 }
1246 }
1247 };
1248
1249 // first copy full GRF rows
1250 if (numRows > 1 || (dst->getTotalElems() * dst->getElemSize() == getGRFSize()))
1251 {
1252 // dcls are GRF sized so emit max SIMD size possible and copy 2 rows at
1253 // a time
1254 for (unsigned int i = 0; i < numRows; i++)
1255 {
1256 const RegionDesc* rd = kernel.fg.builder->getRegionStride1();
1257 G4_ExecSize execSize{ numEltPerGRF<Type_UD>() };
1258
1259 if ((i + 1) < numRows)
1260 execSize = G4_ExecSize(numEltPerGRF<Type_UD>() * 2);
1261
1262 auto dstRgn = kernel.fg.builder->createDst(dst->getRegVar(), (short)i, 0, 1, Type_F);
1263 auto srcRgn = kernel.fg.builder->createSrc(src->getRegVar(), (short)i, 0, rd, Type_F);
1264 auto inst = kernel.fg.builder->createMov(execSize, dstRgn, srcRgn, InstOpt_WriteEnable, false);
1265
1266 insertCopy(inst);
1267 bytesRemaining -= (execSize.value * G4_Type_Table[Type_F].byteSize);
1268
1269 if (bytesRemaining < numEltPerGRF<Type_UB>())
1270 break;
1271 }
1272 }
1273
1274 while (bytesRemaining > 0)
1275 {
1276 G4_Type type = Type_W;
1277 G4_ExecSize execSize = g4::SIMD16;
1278
1279 if (bytesRemaining >= 16)
1280 execSize = g4::SIMD8;
1281 else if (bytesRemaining >= 8 && bytesRemaining < 16)
1282 execSize = g4::SIMD4;
1283 else if (bytesRemaining >= 4 && bytesRemaining < 8)
1284 execSize = g4::SIMD2;
1285 else if (bytesRemaining >= 2 && bytesRemaining < 4)
1286 execSize = g4::SIMD1;
1287 else if (bytesRemaining == 1)
1288 {
1289 // If a region has odd number of bytes, copy last byte in final iteration
1290 execSize = g4::SIMD1;
1291 type = Type_UB;
1292 }
1293 else
1294 {
1295 MUST_BE_TRUE(false, "Unexpected condition");
1296 }
1297
1298 const RegionDesc* rd = kernel.fg.builder->getRegionStride1();
1299
1300 unsigned int row = (dst->getByteSize() - bytesRemaining) / numEltPerGRF<Type_UB>();
1301 unsigned int col = (dst->getByteSize() - bytesRemaining) % numEltPerGRF<Type_UB>();
1302 if (G4_Type_Table[type].byteSize > 1)
1303 {
1304 MUST_BE_TRUE(col % 2 == 0, "Unexpected condition");
1305 col /= 2;
1306 }
1307
1308 G4_DstRegRegion* dstRgn = kernel.fg.builder->createDst(dst->getRegVar(), row, col, 1, type);
1309 G4_SrcRegRegion* srcRgn = kernel.fg.builder->createSrc(src->getRegVar(), row, col, rd, type);
1310
1311 auto inst = kernel.fg.builder->createMov(execSize, dstRgn, srcRgn, InstOpt_WriteEnable, false);
1312
1313 insertCopy(inst);
1314
1315 bytesRemaining -= (execSize.value * G4_Type_Table[type].byteSize);
1316 };
1317 }
1318
replaceSrc(G4_SrcRegRegion * src,G4_Declare * dcl)1319 void LoopVarSplit::replaceSrc(G4_SrcRegRegion* src, G4_Declare* dcl)
1320 {
1321 auto srcDcl = src->getBase()->asRegVar()->getDeclare();
1322 dcl = getNewDcl(srcDcl, dcl);
1323
1324 auto newSrcRgn = kernel.fg.builder->createSrc(dcl->getRegVar(), src->getRegOff(),
1325 src->getSubRegOff(), src->getRegion(), src->getType(), src->getAccRegSel());
1326
1327 auto inst = src->getInst();
1328 for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
1329 {
1330 if (inst->getSrc(i) == src)
1331 {
1332 inst->setSrc(newSrcRgn, i);
1333 break;
1334 }
1335 }
1336 }
1337
replaceDst(G4_DstRegRegion * dst,G4_Declare * dcl)1338 void LoopVarSplit::replaceDst(G4_DstRegRegion* dst, G4_Declare* dcl)
1339 {
1340 auto dstDcl = dst->getBase()->asRegVar()->getDeclare();
1341 dcl = getNewDcl(dstDcl, dcl);
1342
1343 auto newDstRgn = kernel.fg.builder->createDst(dcl->getRegVar(), dst->getRegOff(),
1344 dst->getSubRegOff(), dst->getHorzStride(), dst->getType(), dst->getAccRegSel());
1345
1346 auto inst = dst->getInst();
1347 inst->setDest(newDstRgn);
1348 }
1349
getNewDcl(G4_Declare * dcl1,G4_Declare * dcl2)1350 G4_Declare* LoopVarSplit::getNewDcl(G4_Declare* dcl1, G4_Declare* dcl2)
1351 {
1352 // this method gets args dcl1, dcl2. this method is invoked
1353 // when the transformation replaces existing src/dst rgn with
1354 // equivalent one but using split variable.
1355 //
1356 // dcl1 is a dcl used to construct some src or dst rgn.
1357 // dcl2 is a new dcl that splits dcl1. dcl2 is always root dcl.
1358 // dcl1 may or may not be aliased of another dcl.
1359 // if dcl1 is also root dcl, then return dcl2.
1360 // if dcl1 is an alias dcl, then construct new dcl that aliases
1361 // dcl2 at similar offset.
1362 // mapping from old dcl to new dcl is stored for future invocations.
1363
1364 MUST_BE_TRUE(!dcl2->getAliasDeclare(), "Expecting to see root dcl for dcl2");
1365
1366 auto it = oldNewDcl.find(dcl1);
1367 if (it != oldNewDcl.end())
1368 return (*it).second;
1369
1370 if (!dcl1->getAliasDeclare())
1371 {
1372 oldNewDcl[dcl1] = dcl2;
1373 return dcl2;
1374 }
1375
1376 auto newDcl = kernel.fg.builder->createTempVar(dcl1->getTotalElems(), dcl1->getElemType(),
1377 dcl1->getSubRegAlign());
1378 newDcl->setAliasDeclare(getNewDcl(dcl1->getRootDeclare(), dcl2), dcl1->getOffsetFromBase());
1379
1380 oldNewDcl[dcl1] = newDcl;
1381
1382 return newDcl;
1383 }
1384
getLoopsToSplitAround(G4_Declare * dcl)1385 std::vector<Loop*> LoopVarSplit::getLoopsToSplitAround(G4_Declare* dcl)
1386 {
1387 // return a list of Loop* around which variable dcl should be split
1388 std::vector<Loop*> loopsToSplitAround;
1389
1390 // first make list of all loops where dcl is ever referenced
1391 auto uses = references.getUses(dcl);
1392 auto defs = references.getDefs(dcl);
1393
1394 auto StableOrder = [](const G4_BB* first, const G4_BB* second)
1395 {
1396 return first->getId() < second->getId();
1397 };
1398 std::set<G4_BB*, decltype(StableOrder)> bbsWithRefToDcl(StableOrder);
1399
1400 for (auto& use : *uses)
1401 {
1402 auto bb = std::get<1>(use);
1403 bbsWithRefToDcl.insert(bb);
1404 }
1405
1406 for (auto& def : *defs)
1407 {
1408 auto bb = std::get<1>(def);
1409 bbsWithRefToDcl.insert(bb);
1410 }
1411
1412 auto OrderByRegPressure = [&](Loop* loop1, Loop* loop2)
1413 {
1414 return getMaxRegPressureInLoop(*loop1) >
1415 getMaxRegPressureInLoop(*loop2);
1416 };
1417
1418 std::set<Loop*, decltype(OrderByRegPressure)> innerMostLoops(OrderByRegPressure);
1419
1420 // now collect innermost loop for each referenced BB
1421 for (auto bb : bbsWithRefToDcl)
1422 {
1423 auto innerMost = kernel.fg.getLoops().getInnerMostLoop(bb);
1424 if (innerMost)
1425 innerMostLoops.insert(innerMost);
1426 }
1427
1428 // prune list of loops
1429 // 1. loops are stored in descending order of max reg pressure
1430 // 2. apply cost heuristic to decide if variable should be split at a loop
1431 // 3. once split at a loop, dont split at any parent or nested loop
1432 //
1433 // Example:
1434 //
1435 // -----
1436 // |
1437 // | Loop A
1438 // | =X
1439 // -----
1440 //
1441 // -----
1442 // |
1443 // | Loop B
1444 // | =X
1445 // | -----
1446 // | | =X
1447 // | | Loop C
1448 // | -----
1449 // |
1450 // -----
1451 //
1452 // Assume variable X is spilled and is referenced in Loop A, Loop B, Loop C.
1453 // Loop C is nested in Loop B.
1454 // The algorithm below decides whether it is better to spill X around Loop C
1455 // or Loop B. X is spilled around only 1 of these 2 loops since they've a
1456 // parent-nested relationship. Independently, the algorithm can also decide to
1457 // split X around Loop A as it is not a parent or nested loop of other loops.
1458 //
1459
1460
1461 for (auto loop : innerMostLoops)
1462 {
1463 bool dontSplit = false;
1464 for (auto splitLoop : loopsToSplitAround)
1465 {
1466 if (loop->fullSubset(splitLoop) ||
1467 loop->fullSuperset(splitLoop))
1468 {
1469 // variable already split in nested or parent loop
1470 dontSplit = true;
1471 break;
1472 }
1473 }
1474 if (dontSplit)
1475 continue;
1476
1477 // apply cost heuristic
1478 if (dcl->getNumRows() <= 2)
1479 {
1480 if (getMaxRegPressureInLoop(*loop) <= (unsigned int)(1.5f * (float)kernel.getNumRegTotal()))
1481 loopsToSplitAround.push_back(loop);
1482 }
1483 else if (dcl->getNumRows() <= 4)
1484 {
1485 if (getMaxRegPressureInLoop(*loop) <= (unsigned int)(0.95f * (float)kernel.getNumRegTotal()))
1486 loopsToSplitAround.push_back(loop);
1487 }
1488 else if (dcl->getNumRows() > 4)
1489 {
1490 // splitting dcls with > 4 rows should be very rare
1491 if (getMaxRegPressureInLoop(*loop) <= (unsigned int)(0.6f * (float)kernel.getNumRegTotal()))
1492 loopsToSplitAround.push_back(loop);
1493 }
1494 }
1495
1496 return loopsToSplitAround;
1497 }
1498
1499 };