1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "LocalRA.h"
10 #include "RegAlloc.h"
11 #include "DebugInfo.h"
12 #include "common.h"
13 
14 #include <fstream>
15 #include <tuple>
16 
17 using namespace vISA;
18 
19 //#define _DEBUG
20 //#define cout cerr
21 
22 #define NUM_PREGS_FOR_UNIQUE_ASSIGN 50
23 
24 #define SPLIT_REF_CNT_THRESHOLD 3
25 #define SPLIT_USE_CNT_THRESHOLD 2
26 #define SPLIT_USE_DISTANCE_THRESHOLD 100
27 
28 void getForbiddenGRFs(std::vector<unsigned int>& regNum, G4_Kernel& kernel, unsigned stackCallRegSize, unsigned reserveSpillSize, unsigned reservedRegNum);
29 void getCallerSaveGRF(std::vector<unsigned int>& regNum, G4_Kernel* kernel);
30 
LocalRA(BankConflictPass & b,GlobalRA & g)31 LocalRA::LocalRA(BankConflictPass& b, GlobalRA& g) :
32     kernel(g.kernel), builder(g.builder), mem(g.builder.mem), bc(b), gra(g)
33 {
34 }
35 
getBankAlignForUniqueAssign(G4_Declare * dcl)36 BankAlign LocalRA::getBankAlignForUniqueAssign(G4_Declare *dcl)
37 {
38     // FIXME: this code is rather suspicious (we return even alignment
39     // for first_half_odd???), should revisit
40     switch (gra.getBankConflict(dcl))
41     {
42     case BANK_CONFLICT_FIRST_HALF_EVEN:
43     case BANK_CONFLICT_FIRST_HALF_ODD:
44         return builder.oneGRFBankDivision() ? BankAlign::Even : BankAlign::Even2GRF;
45     case BANK_CONFLICT_SECOND_HALF_EVEN:
46     case BANK_CONFLICT_SECOND_HALF_ODD:
47         return builder.oneGRFBankDivision() ? BankAlign::Odd : BankAlign::Odd2GRF;
48     default:
49         return BankAlign::Either;
50     }
51 }
52 
53 // returns true if kernel contains a back edge
54 // call/return edge may also be considered as a back edge depending on layout.
hasBackEdge()55 bool LocalRA::hasBackEdge()
56 {
57     for (auto curBB : kernel.fg)
58     {
59 
60         MUST_BE_TRUE(curBB->size() > 0 || curBB->Succs.size() > 1,
61             "Error in detecting back-edge: Basic block found with no inst list and multiple successors.");
62 
63         if (curBB->size() > 0)
64         {
65             for (auto succ : curBB->Succs)
66             {
67                 MUST_BE_TRUE(succ->size() > 0,
68                     "Error in detecting back-edge: Destination basic block of a jmp has no instructions.");
69                 if (curBB->getId() >= succ->getId())
70                 {
71                     return true;
72                 }
73             }
74         }
75     }
76     return false;
77 }
78 
getRowInfo(int size,int & nrows,int & lastRowSize)79 void LocalRA::getRowInfo(int size, int& nrows, int& lastRowSize)
80 {
81     if (size <= (int)numEltPerGRF<Type_UW>())
82     {
83         nrows = 1;
84     }
85     else
86     {
87         // nrows is total number of rows, including last row even if it is partial
88         nrows = size / numEltPerGRF<Type_UW>();
89         // lastrowsize is number of words actually used in last row
90         lastRowSize = size%numEltPerGRF<Type_UW>();
91 
92         if (size%numEltPerGRF<Type_UW>() != 0)
93         {
94             nrows++;
95         }
96 
97         if (lastRowSize == 0)
98         {
99             lastRowSize = numEltPerGRF<Type_UW>();
100         }
101     }
102 
103     return;
104 }
105 
evenAlign()106 void LocalRA::evenAlign()
107 {
108     if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
109         kernel.fg.size() > 2)
110     {
111         if ((GlobalRA::useGenericAugAlign() && kernel.getSimdSize() >= numEltPerGRF<Type_UD>()) ||
112             (!GlobalRA::useGenericAugAlign() && kernel.getSimdSize() > numEltPerGRF<Type_UD>()))
113         {
114             // Set alignment of all GRF candidates
115             // to 2GRF except for NoMask variables
116 #ifdef DEBUG_VERBOSE_ON
117             DEBUG_VERBOSE("Updating alignment to Even for GRF candidates" << std::endl);
118 #endif
119             gra.evenAlign();
120         }
121         gra.updateSubRegAlignment(GRFALIGN);
122         // Since we are piggy backing on mask field of G4_Declare,
123         // we need to make sure we reset it before going further.
124         resetMasks();
125     }
126 
127     return;
128 }
129 
130 /*
131  *  Pre-localRA analsyis including:
132  *  1) Mark reference to create live ranges, 2) alignment handling, 3) Avialable register calcuating and marking
133  */
preLocalRAAnalysis()134 void LocalRA::preLocalRAAnalysis()
135 {
136     unsigned int   numRowsEOT = 0;
137     bool lifetimeOpFound = false;
138 
139     int numGRF = kernel.getNumRegTotal();
140 
141     // Clear LocalLiveRange* computed preRA
142     gra.clearStaleLiveRanges();
143 
144     // Mark references made to decls to sieve local from global ranges
145     markReferences(numRowsEOT, lifetimeOpFound);
146 
147     // Mark those physical registers unavailable that are declared with Output attribute
148     blockOutputPhyRegs();
149 
150     if (lifetimeOpFound == true)
151     {
152         // Check whether pseudo_kill/lifetime.end are only references
153         // for their respective variables. Remove them if so. Doing this
154         // helps reduce number of variables in symbol table increasing
155         // changes of skipping global RA.
156         removeUnrequiredLifetimeOps();
157     }
158 
159     unsigned int numRowsReserved = numRowsEOT;
160 
161     // Remove unreferenced dcls
162     gra.removeUnreferencedDcls();
163 
164     if (builder.getOption(vISA_HybridRAWithSpill) || builder.getOption(vISA_FastCompileRA))
165     {
166         unsigned reserveSpillSize = 0;
167         unsigned int spillRegSize = 0;
168         unsigned int indrSpillRegSize = 0;
169         gra.determineSpillRegSize(spillRegSize, indrSpillRegSize);
170 
171         reserveSpillSize = spillRegSize + indrSpillRegSize;
172         if (reserveSpillSize >= kernel.getNumCalleeSaveRegs())
173         {
174             const_cast<Options*>(builder.getOptions())->setOption(vISA_HybridRAWithSpill, false);
175             const_cast<Options*>(builder.getOptions())->setOption(vISA_FastCompileRA, false);
176             numRegLRA = numGRF - numRowsReserved;
177         }
178         else
179         {
180             numRegLRA = numGRF - numRowsReserved - reserveSpillSize - 3; // spill Header, scratch offset, a0.2
181         }
182     }
183     else
184     {
185         numRegLRA = numGRF - numRowsReserved;
186     }
187     bool isStackCall = (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc());
188     unsigned int stackCallRegSize = isStackCall ? kernel.numReservedABIGRF() : 0;
189     unsigned int reservedGRFNum = builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum);
190 
191     if (isStackCall || reservedGRFNum || builder.getOption(vISA_Debug))
192     {
193         std::vector<unsigned int> forbiddenRegs;
194         getForbiddenGRFs(forbiddenRegs, kernel, stackCallRegSize, 0, reservedGRFNum);
195         for (unsigned int i = 0, size = forbiddenRegs.size(); i < size; i++)
196         {
197             unsigned int regNum = forbiddenRegs[i];
198             pregs->setGRFUnavailable(regNum);
199         }
200 
201         // FIXME: this will break if # of GRF is not 128.
202         if (isStackCall)
203         {
204             // Set r60 to r99 as unavailable for local RA since these registers are callee saved
205             // Local RA will use only caller save registers for allocation.
206             std::vector<unsigned int> callerSaveRegs;
207             getCallerSaveGRF(callerSaveRegs, &kernel);
208             for (unsigned int i = 0, size = callerSaveRegs.size(); i < size; i++)
209             {
210                 unsigned int regNum = callerSaveRegs[i];
211                 pregs->setGRFUnavailable(regNum);
212             }
213 
214             numRegLRA = (kernel.getNumRegTotal() - 3) - numRowsReserved;
215         }
216 
217         if (builder.getOption(vISA_Debug))
218         {
219             // Since LocalRA is not undone when debug info generation is required,
220             // for keeping compile time low, we allow fewer physical registers
221             // as assignable candidates. Without this, we could run in to a
222             // situation where very few physical registers are available for GRA
223             // and it is unable to assign registers even with spilling.
224 #define USABLE_GRFS_WITH_DEBUG_INFO 80
225             int maxSendReg = 0;
226             for (auto bb : kernel.fg)
227             {
228                 for (auto inst : *bb)
229                 {
230                     if (inst->isSend() || inst->isSplitSend())
231                     {
232                         maxSendReg = ((int)inst->getMsgDesc()->getDstLenRegs() > maxSendReg) ?
233                             ((int)inst->getMsgDesc()->getDstLenRegs()) : maxSendReg;
234                         maxSendReg = ((int)inst->getMsgDesc()->getSrc0LenRegs() > maxSendReg) ?
235                             ((int)inst->getMsgDesc()->getSrc0LenRegs()) : maxSendReg;
236                         maxSendReg = ((int)inst->getMsgDesc()->getSrc1LenRegs() > maxSendReg) ?
237                             ((int)inst->getMsgDesc()->getSrc1LenRegs()) : maxSendReg;
238                     }
239                 }
240             }
241 
242             int maxRegsToUse = USABLE_GRFS_WITH_DEBUG_INFO;
243             if (maxSendReg > (numGRF - USABLE_GRFS_WITH_DEBUG_INFO))
244             {
245                 maxRegsToUse = (numGRF - maxSendReg) - 10;
246             }
247 
248             // Also check max size of addressed GRF
249             unsigned int maxAddressedRows = 0;
250             for (auto dcl : kernel.Declares)
251             {
252                 if (dcl->getAddressed() &&
253                     maxAddressedRows < dcl->getNumRows())
254                 {
255                     maxAddressedRows = dcl->getNumRows();
256                 }
257             }
258 
259             // Assume indirect operand of maxAddressedRows exists
260             // on dst, src0, src1. This is overly conservative but
261             // should work for general cases.
262             if ((numGRF - maxRegsToUse) / 3 < (int)maxAddressedRows)
263             {
264                 maxRegsToUse = (numGRF - (maxAddressedRows * 3));
265 
266                 if (maxRegsToUse < 0)
267                     maxRegsToUse = 0;
268             }
269 
270             for (int i = maxRegsToUse; i < numGRF; i++)
271             {
272                 pregs->setGRFUnavailable(i);
273             }
274         }
275     }
276     else
277     {
278         pregs->setSimpleGRFAvailable(true);
279         const Options *opt = builder.getOptions();
280         if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D ||
281             opt->getOption(vISA_enablePreemption) ||
282             opt->getOption(vISA_ReserveR0))
283         {
284             pregs->setR0Forbidden();
285         }
286 
287         if (opt->getOption(vISA_enablePreemption))
288         {
289             pregs->setR1Forbidden();
290         }
291     }
292 
293     return;
294 }
295 
trivialAssignRA(bool & needGlobalRA,bool threeSourceCandidate)296 void LocalRA::trivialAssignRA(bool& needGlobalRA, bool threeSourceCandidate)
297 {
298     if (builder.getOption(vISA_RATrace))
299     {
300         std::cout << "\t--trivial RA\n";
301     }
302 
303     needGlobalRA = assignUniqueRegisters(threeSourceCandidate, builder.lowHighBundle(), false);
304 
305     if (!needGlobalRA)
306     {
307         if (threeSourceCandidate)
308         {
309             kernel.setRAType(RA_Type::TRIVIAL_BC_RA);
310         }
311         else
312         {
313             kernel.setRAType(RA_Type::TRIVIAL_RA);
314         }
315     }
316 
317     return;
318 }
319 
320 // Entry point to local RA
localRAPass(bool doRoundRobin,bool doSplitLLR)321 bool LocalRA::localRAPass(bool doRoundRobin, bool doSplitLLR)
322 {
323     bool needGlobalRA = true;
324     PhyRegsLocalRA localPregs = *pregs;
325 
326     // Iterate over each BB and perform linear scan
327 #ifdef DEBUG_VERBOSE_ON
328     localPregs.printBusyRegs();
329     printAddressTakenDecls();
330     printLocalRACandidates();
331 #endif
332 
333     if (kernel.fg.funcInfoTable.empty() && !hasBackEdge())
334     {
335         calculateInputIntervals();
336     }
337 
338 #ifdef DEBUG_VERBOSE_ON
339     printInputLiveIntervals();
340 #endif
341 
342     int totalGRFNum = kernel.getNumRegTotal();
343     for (auto curBB : kernel.fg)
344     {
345         PhyRegsManager pregManager(localPregs, doBCR);
346         std::vector<LocalLiveRange*> liveIntervals;
347 
348         PhyRegSummary* summary = new (mem)PhyRegSummary(totalGRFNum);
349 
350         calculateLiveIntervals(curBB, liveIntervals);
351 
352 #ifdef DEBUG_VERBOSE_ON
353         printLocalLiveIntervals(curBB, liveIntervals);
354 #endif
355 
356         LinearScan ra(gra, liveIntervals, inputIntervals, pregManager,
357                       localPregs, mem, summary, numRegLRA, globalLRSize,
358                       doRoundRobin, doBCR,
359                       highInternalConflict, doSplitLLR, kernel.getSimdSize());
360         ra.run(curBB, builder, LLRUseMap);
361 
362 #ifdef DEBUG_VERBOSE_ON
363         COUT_ERROR << "BB" << curBB->getId() << std::endl;
364         summary->printBusyRegs();
365 #endif
366 
367         // Tag summary of physical registers used by local RA
368         kernel.fg.addBBLRASummary(curBB, summary);
369 
370         if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
371         {
372             updateDebugInfo(kernel, liveIntervals);
373         }
374     }
375 
376     if (unassignedRangeFound() == false)
377     {
378         needGlobalRA = false;
379 #ifdef DEBUG_VERBOSE_ON
380         DEBUG_VERBOSE("Skipping global RA because local RA allocated all live-ranges." << std::endl);
381 #endif
382     }
383 
384     if (builder.getOption(vISA_OptReport))
385     {
386         localRAOptReport();
387     }
388 
389     if (needGlobalRA == true &&
390         !(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()))
391     {
392         // Check whether unique physical registers can be assigned
393         // to global ranges.
394         bool twoBanksAssign = false;
395 
396         if (builder.oneGRFBankDivision())
397         {
398             twoBanksAssign = highInternalConflict || kernel.getSimdSize() == g4::SIMD16;
399         }
400         else
401         {
402             twoBanksAssign = highInternalConflict;
403         }
404 
405         needGlobalRA = assignUniqueRegisters(doBCR, twoBanksAssign, builder.getOption(vISA_HybridRAWithSpill) && !doRoundRobin);
406     }
407 
408     if (needGlobalRA && doRoundRobin)
409     {
410         undoLocalRAAssignments(true);
411     }
412 
413     if (needGlobalRA == false && builder.getOption(vISA_OptReport))
414     {
415         std::ofstream optreport;
416         getOptReportStream(optreport, builder.getOptions());
417         optreport << "Allocated 100% GRF ranges without graph coloring." << std::endl;
418         closeOptReportStream(optreport);
419     }
420 
421     return needGlobalRA;
422 }
423 
localRA()424 bool LocalRA::localRA()
425 {
426     if (builder.getOption(vISA_RATrace))
427     {
428         std::cout << "--local RA--\n";
429     }
430 
431     bool doRoundRobin = builder.getOption(vISA_LocalRARoundRobin);
432 
433     int numGRF = kernel.getNumRegTotal();
434     PhyRegsLocalRA phyRegs(&builder, numGRF);
435     pregs = &phyRegs;
436 
437     globalLRSize = 0;
438     bool reduceBCInTAandFF = false;
439     bool needGlobalRA = true;
440 
441     doSplitLLR = (builder.getOption(vISA_SpiltLLR) &&
442         kernel.fg.size() == 1 &&
443         kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D);
444 
445     preLocalRAAnalysis();
446 
447     bool reduceBCInRR = false;
448 
449     if (builder.getOption(vISA_LocalBankConflictReduction) && builder.hasBankCollision())
450     {
451         reduceBCInRR = bc.setupBankConflictsForKernel(doRoundRobin, reduceBCInTAandFF, numRegLRA, highInternalConflict);
452     }
453 
454     if (!kernel.fg.getHasStackCalls() && !kernel.fg.getIsStackCallFunc() &&
455         !hasSplitInsts)
456     {
457         trivialAssignRA(needGlobalRA, reduceBCInTAandFF);
458         if (!needGlobalRA)
459         {
460             return true;
461         }
462     }
463 
464     if (doRoundRobin)
465     {
466         doRoundRobin = countLiveIntervals();
467     }
468 
469     if ((!doRoundRobin && reduceBCInTAandFF) || (doRoundRobin && reduceBCInRR))  //To avoid the scheduling issue for send
470     {
471         doBCR = true;
472     }
473     // Local RA passes:
474     // RR+BC --> FF+BC -->FF-->Hybrids
475     // or
476     // RR --> FF --> Hybrid
477     if (doRoundRobin)
478     {
479         if (builder.getOption(vISA_RATrace))
480         {
481             std::cout << "\t--round-robin " << (doBCR ? "BCR " : "") << "RA\n";
482         }
483         needGlobalRA = localRAPass(true, false);
484         if (needGlobalRA == true)
485         {
486             doRoundRobin = false;
487         }
488     }
489 
490     if (!doRoundRobin)
491     {
492         if (kernel.getOption(vISA_forceBCR) && doBCR)
493         {
494             if (builder.getOption(vISA_RATrace))
495             {
496                 std::cout << "\t--first-fit " << "BCR " << "RA\n";
497             }
498             needGlobalRA = localRAPass(false, doSplitLLR);
499         }
500 
501         if (needGlobalRA)
502         {
503             if (builder.getOption(vISA_RATrace))
504             {
505                 std::cout << "\t--first-fit " << "RA\n";
506             }
507             globalLRSize = 0;
508             if (builder.getOption(vISA_HybridRAWithSpill))
509             {
510                 countLiveIntervals();
511             }
512             else
513             {
514                 globalLRSize = 0;
515             }
516             evenAlign();
517             needGlobalRA = localRAPass(false, doSplitLLR);
518         }
519     }
520 
521     if (needGlobalRA == false)
522     {
523         if (doRoundRobin)
524         {
525             kernel.setRAType(doBCR ? RA_Type::LOCAL_ROUND_ROBIN_BC_RA : RA_Type::LOCAL_ROUND_ROBIN_RA);
526         }
527         else
528         {
529             kernel.setRAType(doBCR ? RA_Type::LOCAL_FIRST_FIT_BC_RA : RA_Type::LOCAL_FIRST_FIT_RA);
530         }
531     }
532 
533     setLexicalID(true);
534 
535     return !needGlobalRA;
536 }
537 
resetMasks()538 void LocalRA::resetMasks()
539 {
540     auto& dcls = kernel.Declares;
541 
542     for (auto dcl : dcls)
543     {
544         gra.setMask(dcl, {});
545     }
546 }
547 
blockOutputPhyRegs()548 void LocalRA::blockOutputPhyRegs()
549 {
550     for (auto dcl : kernel.Declares)
551     {
552         if (dcl->isOutput() &&
553             dcl->isInput())
554         {
555             // The live-range may never be referenced in the CFG so we need
556             // this special pass to block physical registers corresponding
557             // to those declares. This may be true for FC.
558             pregs->markPhyRegs(dcl);
559         }
560     }
561 }
562 
563 class isLifetimeCandidateOpCandidateForRemoval
564 {
565 public:
isLifetimeCandidateOpCandidateForRemoval(GlobalRA & g)566     isLifetimeCandidateOpCandidateForRemoval(GlobalRA& g) : gra(g)
567     {
568     }
569 
570     GlobalRA& gra;
571 
operator ()(G4_INST * inst)572     bool operator()(G4_INST* inst)
573     {
574         if (inst->isPseudoKill() ||
575             inst->isLifeTimeEnd())
576         {
577             G4_Declare* topdcl;
578 
579             if (inst->isPseudoKill())
580             {
581                 topdcl = GetTopDclFromRegRegion(inst->getDst());
582             }
583             else
584             {
585                 topdcl = GetTopDclFromRegRegion(inst->getSrc(0));
586             }
587 
588             if (gra.getNumRefs(topdcl) == 0 &&
589                 (topdcl->getRegFile() == G4_GRF ||
590                     topdcl->getRegFile() == G4_INPUT))
591             {
592                 // Remove this lifetime op
593                 return true;
594             }
595         }
596 
597         return false;
598     }
599 };
600 
removeUnrequiredLifetimeOps()601 void LocalRA::removeUnrequiredLifetimeOps()
602 {
603     // Iterate over all instructions and inspect only
604     // pseudo_kills/lifetime.end instructions. Remove
605     // instructions that have no other useful instruction.
606 
607     for (BB_LIST_ITER bb_it = kernel.fg.begin();
608         bb_it != kernel.fg.end();
609         bb_it++)
610     {
611         G4_BB* bb = (*bb_it);
612         bb->erase(std::remove_if(bb->begin(), bb->end(), isLifetimeCandidateOpCandidateForRemoval(this->gra)),
613             bb->end());
614     }
615 }
616 
findRegisterCandiateWithAlignForward(int & i,BankAlign align,bool evenAlign)617 void PhyRegsLocalRA::findRegisterCandiateWithAlignForward(int &i, BankAlign align, bool evenAlign)
618 {
619     if ((align == BankAlign::Even) && (i % 2 != 0))
620     {
621         i++;
622     }
623     else if ((align == BankAlign::Odd) && (i % 2 == 0))
624     {
625         i++;
626     }
627     else if (align == BankAlign::Even2GRF)
628     {
629         while ((i % 4 >= 2) || (evenAlign && (i % 2 != 0)))
630         {
631             i++;
632         }
633     }
634     else if (align == BankAlign::Odd2GRF)
635     {
636         while ((i % 4 < 2) || (evenAlign && (i % 2 != 0)))
637         {
638             i++;
639         }
640     }
641 }
642 
get_bundle(unsigned int baseReg,int offset)643 unsigned int PhyRegsLocalRA::get_bundle(unsigned int baseReg, int offset)
644 {
645     if (builder->hasPartialInt64Support())
646     {
647         return (((baseReg + offset) % 32) / 2);
648     }
649     return (((baseReg + offset) % 64) / 4);
650 }
651 
findBundleConflictFreeRegister(int curReg,int endReg,unsigned short occupiedBundles,BankAlign align,bool evenAlign)652 int PhyRegsLocalRA::findBundleConflictFreeRegister(int curReg,
653                                             int endReg,
654                                             unsigned short occupiedBundles,
655                                             BankAlign align,
656                                             bool evenAlign)
657 {
658     int i = curReg;
659     while (occupiedBundles & (1 << get_bundle(i, 0)) ||
660            occupiedBundles & (1 << get_bundle(i, 1)))
661     {
662         i++;
663         findRegisterCandiateWithAlignForward(i, align, evenAlign);
664         if (i > endReg) //Out of bound, drop the bundle conflict considration
665         {
666             i = curReg;
667             break;
668         }
669     }
670 
671     return i;
672 }
673 
findRegisterCandiateWithAlignBackward(int & i,BankAlign align,bool evenAlign)674 void PhyRegsLocalRA::findRegisterCandiateWithAlignBackward(int &i, BankAlign align, bool evenAlign)
675 {
676     if ((align == BankAlign::Even) && (i % 2 != 0))
677     {
678         i--;
679     }
680     else if ((align == BankAlign::Odd) && (i % 2 == 0))
681     {
682         i--;
683     }
684     else if (align == BankAlign::Even2GRF)
685     {
686         while (i >= 0 && ((i % 4 >= 2) || (evenAlign && (i % 2 != 0))))
687         {
688             i--;
689         }
690     }
691     else if (align == BankAlign::Odd2GRF)
692     {
693         while (i >= 0 && ((i % 4 < 2) || (evenAlign && (i % 2 != 0))))
694         {
695             i--;
696         }
697     }
698 }
699 
getOccupiedBundle(IR_Builder & builder,GlobalRA & gra,G4_Declare * dcl,BankAlign & preferBank)700 inline static unsigned short getOccupiedBundle(IR_Builder& builder, GlobalRA& gra, G4_Declare* dcl, BankAlign &preferBank)
701 {
702     unsigned short occupiedBundles = 0;
703     unsigned bundleNum = 0;
704     unsigned int evenBankNum = 0;
705     unsigned int oddBankNum = 0;
706 
707     if (!builder.hasDPAS() || !builder.getOption(vISA_EnableDPASBundleConflictReduction))
708     {
709         return 0;
710     }
711 
712     for (const BundleConflict& conflict : gra.getBundleConflicts(dcl))
713     {
714         LocalLiveRange* lr = gra.getLocalLR(conflict.dcl);
715         int  subregnum;
716         G4_VarBase* preg = lr->getPhyReg(subregnum);
717         if (preg == NULL)
718         {
719             preg = lr->getTopDcl()->getRegVar()->getPhyReg();
720         }
721 
722         if (preg != NULL)
723         {
724             int offset = conflict.offset;
725             unsigned int reg = preg->asGreg()->getRegNum();
726             unsigned int bank = gra.get_bank(reg, offset);
727             if (bank)
728             {
729                 oddBankNum++;
730             }
731             else
732             {
733                 evenBankNum++;
734             }
735             unsigned int bundle = gra.get_bundle(reg, offset);
736             unsigned int bundle1 = gra.get_bundle(reg, offset + 1);
737             if (!(occupiedBundles & ((unsigned short)1 << bundle)))
738             {
739                 bundleNum++;
740             }
741             occupiedBundles |= (unsigned short)1 << bundle;
742             occupiedBundles |= (unsigned short)1 << bundle1;
743         }
744     }
745     if (bundleNum > 12)
746     {
747         occupiedBundles = 0;
748     }
749 
750     if (evenBankNum || oddBankNum)
751     {
752         preferBank = (evenBankNum < oddBankNum) ? BankAlign::Even : BankAlign::Odd;
753     }
754 
755     return occupiedBundles;
756 }
757 
assignUniqueRegisters(bool twoBanksRA,bool twoDirectionsAssign,bool hybridWithSpill)758 bool LocalRA::assignUniqueRegisters(bool twoBanksRA, bool twoDirectionsAssign, bool hybridWithSpill)
759 {
760     // Iterate over all dcls and calculate number of rows
761     // required if each unallocated dcl had its own physical
762     // register. Check number of registers used up by local RA.
763     // If unique assignments are possible, then assign and set
764     // needGlobalRA to false.
765     unsigned int numRows = 0;
766     std::vector<G4_Declare*> unallocatedRanges;
767     bool needGlobalRA = true;
768     uint32_t nextEOTGRF = numRegLRA;
769     std::unordered_set<unsigned int> emptyForbidden;
770     auto varSplitPass = gra.getVarSplitPass();
771 
772     if (nextEOTGRF < 112 && builder.hasEOTGRFBinding())
773     {
774         // we can't guarantee unique assignments for all the EOT sources
775         // punt to global RA
776         return true;
777     }
778 
779     // Identify global ranges not having an allocation
780     for (auto dcl : kernel.Declares)
781     {
782         if (dcl->getAliasDeclare() == NULL &&
783             dcl->getRegFile() == G4_GRF &&
784             dcl->getRegVar()->isPhyRegAssigned() == false)
785         {
786             auto dclLR = gra.getLocalLR(dcl);
787 
788             if (dclLR  && dclLR->isEOT() && builder.hasEOTGRFBinding())
789             {
790                 // For EOT use r112 - r127
791                 // for simplicity we assign each EOT source unique GRFs
792                 G4_Greg* phyReg = builder.phyregpool.getGreg(nextEOTGRF);
793                 dcl->getRegVar()->setPhyReg(phyReg, 0);
794                 dclLR->setPhyReg(phyReg, 0);
795                 dclLR->setAssigned(true);
796                 // we have to make sure src0 and src1 of split send get different GRFs
797                 nextEOTGRF += dcl->getNumRows();
798                 if (kernel.getOption(vISA_GenerateDebugInfo))
799                 {
800                     uint32_t start = 0, end = 0;
801                     for (auto rit = kernel.fg.rbegin();
802                         rit != kernel.fg.rend();
803                         rit++)
804                     {
805                         G4_BB* bb = (*rit);
806 
807                         if (bb->size() > 0)
808                         {
809                             end = bb->back()->getCISAOff();
810                             break;
811                         }
812                     }
813                     updateDebugInfo(kernel, dcl, start, end);
814                 }
815             }
816             else
817             {
818                 if (varSplitPass->isSplitDcl(dcl) ||
819                     varSplitPass->isPartialDcl(dcl))
820                 {
821                     return true;
822                 }
823                 numRows += dcl->getNumRows();
824                 if (hybridWithSpill)
825                 {
826                     if (dcl->isDoNotSpill())
827                     {
828                         unallocatedRanges.push_back(dcl);
829                     }
830                 }
831                 else
832                 {
833                     unallocatedRanges.push_back(dcl);
834                 }
835             }
836         }
837     }
838 
839     if (numRows < numRegLRA || hybridWithSpill)
840     {
841         // Get superset of registers used by local RA
842         // in all basic blocks.
843         PhyRegsManager phyRegMgr(*pregs, twoBanksRA);
844 
845         for (auto bb : kernel.fg)
846         {
847             PhyRegSummary* summary = kernel.fg.getBBLRASummary(bb);
848             if (summary != NULL)
849             {
850                 for (unsigned int i = 0; i < numRegLRA; i++)
851                 {
852                     if (summary->isGRFBusy(i))
853                     {
854                         phyRegMgr.getAvaialableRegs()->setGRFBusy(i, 1);
855                     }
856                 }
857             }
858         }
859 
860 
861         needGlobalRA = hybridWithSpill;
862         bool assignFromFront = true;
863 #ifdef DEBUG_VERBOSE_ON
864         COUT_ERROR << "Trival RA: " << std::endl;
865 #endif
866         for (auto dcl : unallocatedRanges)
867         {
868             int regNum = 0;
869             int subregNum = 0;
870             int sizeInWords = dcl->getWordSize();
871             int nrows = 0;
872             BankAlign bankAlign = BankAlign::Either;
873             if (twoBanksRA &&
874                 gra.getBankConflict(dcl) != BANK_CONFLICT_NONE)
875             {
876                 bankAlign = getBankAlignForUniqueAssign(dcl);
877 
878                 if (bankAlign == BankAlign::Even || bankAlign == BankAlign::Even2GRF)
879                 {
880                     assignFromFront = true;
881                 }
882                 if (twoDirectionsAssign && (bankAlign == BankAlign::Odd || bankAlign == BankAlign::Odd2GRF))
883                 {
884                     assignFromFront = false;
885                 }
886             }
887 
888             auto assignAlign = gra.isEvenAligned(dcl) ? BankAlign::Even : bankAlign;
889             G4_SubReg_Align subAlign = builder.GRFAlign() ? GRFALIGN : gra.getSubRegAlign(dcl);
890 
891             if (assignFromFront)
892             {
893                 BankAlign preferBank = BankAlign::Either;
894                 unsigned short occupiedBundles = getOccupiedBundle(builder, gra, dcl, preferBank);
895                 nrows = phyRegMgr.findFreeRegs(sizeInWords, assignAlign,
896                     subAlign, regNum, subregNum, 0, numRegLRA - 1, occupiedBundles, 0, false, emptyForbidden, false, 0);
897             }
898             else
899             {
900                 nrows = phyRegMgr.findFreeRegs(sizeInWords, assignAlign,
901                     subAlign, regNum, subregNum, numRegLRA - 1, 0, 0, 0, false, emptyForbidden, false, 0);
902             }
903 
904             if (nrows)
905             {
906                 G4_Greg* phyReg = builder.phyregpool.getGreg(regNum);
907                 // adjust subreg num to type size
908                 if (dcl->getElemSize() == 1)
909                 {
910                     subregNum *= 2;
911                 }
912                 else
913                 {
914                     subregNum /= (dcl->getElemSize() / 2);
915                 }
916                 dcl->getRegVar()->setPhyReg(phyReg, subregNum);
917             }
918             else
919             {
920                 needGlobalRA = true;
921                 break;
922             }
923             if (twoBanksRA && twoDirectionsAssign)
924             {
925                 assignFromFront = !assignFromFront;
926             }
927         }
928 
929         if (needGlobalRA)
930         {
931             for (auto dcl : unallocatedRanges)
932             {
933                 if (!hybridWithSpill)
934                 {
935                     dcl->getRegVar()->resetPhyReg();
936                 }
937                 else if (!dcl->isDoNotSpill())
938                 {
939                     dcl->getRegVar()->resetPhyReg();
940                 }
941             }
942         }
943         else
944         {
945             if (kernel.getOption(vISA_GenerateDebugInfo))
946             {
947                 uint32_t start = 0, end = UNMAPPABLE_VISA_INDEX;
948                 for (auto rit = kernel.fg.rbegin();
949                     rit != kernel.fg.rend();
950                     rit++)
951                 {
952                     G4_BB* bb = (*rit);
953 
954                     if (bb->size() > 0)
955                     {
956                         for (auto inst : *bb)
957                         {
958                             if (inst->getCISAOff() != UNMAPPABLE_VISA_INDEX)
959                             {
960                                 end = bb->back()->getCISAOff();
961                                 break;
962                             }
963                         }
964                         if (end != UNMAPPABLE_VISA_INDEX)
965                             break;
966                     }
967                 }
968 
969                 for (auto dcl : unallocatedRanges)
970                 {
971                     updateDebugInfo(kernel, dcl, start, end);
972                 }
973 
974                 // unallocatedRanges doesnt contain input dcls
975                 for (auto dcl : kernel.Declares)
976                 {
977                     if (dcl->isInput())
978                     {
979                         updateDebugInfo(kernel, dcl, start, end);
980                     }
981                 }
982             }
983         }
984 
985     }
986     else
987     {
988         needGlobalRA = true;
989     }
990 #ifdef DEBUG_VERBOSE_ON
991     COUT_ERROR << std::endl;
992 #endif
993 
994     return needGlobalRA;
995 }
996 
removeUnreferencedDcls()997 void GlobalRA::removeUnreferencedDcls()
998 {
999     // Iterate over all dcls and remove those with 0
1000     // ref count and not addressed. This is done only for
1001     // GRF dcls.
1002 
1003     // Propagate top dcl info to aliases
1004     for (auto dcl : kernel.Declares)
1005     {
1006         if (dcl->getAliasDeclare() != nullptr)
1007         {
1008             setNumRefs(dcl, getNumRefs(dcl->getAliasDeclare()));
1009         }
1010     }
1011 
1012     auto isUnrefDcl = [this](G4_Declare* dcl)
1013     {
1014         return (dcl->getRegFile() == G4_GRF || dcl->getRegFile() == G4_INPUT) &&
1015             getNumRefs(dcl) == 0 &&
1016             dcl->getRegVar()->isPhyRegAssigned() == false &&
1017             dcl != kernel.fg.builder->getBuiltinR0()
1018             && dcl != kernel.fg.builder->getSpillSurfaceOffset() &&
1019             (!kernel.fg.builder->getOptions()->getuInt32Option(vISA_CodePatch) ||
1020              dcl->getAliasDeclare() != kernel.fg.builder->getInputR1());
1021     };
1022 
1023     kernel.Declares.erase(
1024         std::remove_if(kernel.Declares.begin(), kernel.Declares.end(), isUnrefDcl),
1025         kernel.Declares.end());
1026 }
1027 
unassignedRangeFound()1028 bool LocalRA::unassignedRangeFound()
1029 {
1030     bool unassignedRangeFound = false;
1031     for (auto dcl : kernel.Declares)
1032     {
1033         if (dcl->getAliasDeclare() == NULL &&
1034             dcl->getRegFile() == G4_GRF &&
1035             dcl->getRegVar()->isPhyRegAssigned() == false)
1036         {
1037             unassignedRangeFound = true;
1038             break;
1039         }
1040     }
1041 
1042     return unassignedRangeFound;
1043 }
1044 
1045 // Update numRegsUsed to max physical register used based on summary
updateRegUsage(PhyRegSummary * summary,unsigned int & numRegsUsed)1046 void LocalRA::updateRegUsage(PhyRegSummary* summary, unsigned int& numRegsUsed)
1047 {
1048     for (uint32_t i = 0, numGRF = summary->getNumGRF(); i < numGRF; i++)
1049     {
1050         if (numRegsUsed < i && summary->isGRFBusy(i) == true)
1051         {
1052             numRegsUsed = i;
1053         }
1054     }
1055 }
1056 
1057 // Emit out localRA opt report
localRAOptReport()1058 void LocalRA::localRAOptReport()
1059 {
1060     unsigned int totalRanges = 0, localRanges = 0;
1061 
1062     for (auto dcl : kernel.Declares)
1063     {
1064         auto dcl_it_lr = gra.getLocalLR(dcl);
1065         if (dcl_it_lr &&
1066             dcl_it_lr->isLiveRangeLocal() &&
1067             dcl_it_lr->isGRFRegAssigned())
1068         {
1069             localRanges++;
1070         }
1071 
1072         if ((dcl->getRegFile() == G4_GRF ||
1073             dcl->getRegFile() == G4_INPUT) &&
1074             dcl->getAliasDeclare() == NULL)
1075         {
1076             totalRanges++;
1077         }
1078     }
1079 
1080     if (kernel.getOption(vISA_OptReport))
1081     {
1082         std::ofstream optreport;
1083         getOptReportStream(optreport, kernel.getOptions());
1084         optreport << std::endl;
1085         optreport << "Total GRF ranges: " << totalRanges << std::endl;
1086         optreport << "GRF ranges allocated by local RA: " << localRanges << std::endl;
1087         if (totalRanges)
1088         {
1089             optreport << (int)(localRanges * 100 / totalRanges) <<
1090                 "% allocated by local RA" << std::endl << std::endl;
1091         }
1092         closeOptReportStream(optreport);
1093     }
1094 }
1095 
1096 // Given a src/dst reg region in opnd, traverse to its base and
1097 // return the declaration. Resolve any aliases and return the
1098 // topmost declaration.
1099 // opnd->topdcl should give same result for most cases. But for
1100 // spill code, if an address register spills then opnd->topdcl does
1101 // not return expected dcl for the fill.
GetTopDclFromRegRegion(G4_Operand * opnd)1102 G4_Declare* GetTopDclFromRegRegion(G4_Operand* opnd)
1103 {
1104     G4_Declare* dcl = nullptr;
1105 
1106     MUST_BE_TRUE(opnd->isRegRegion(), "Operand is not a register region so cannot have a top dcl");
1107     G4_VarBase* base = opnd->getBase();
1108 
1109     if (base && base->isRegVar())
1110     {
1111         dcl = base->asRegVar()->getDeclare()->getRootDeclare();
1112     }
1113 
1114     return dcl;
1115 }
1116 
convertSubRegOffFromWords(G4_Declare * dcl,int subregnuminwords)1117 unsigned int LocalRA::convertSubRegOffFromWords(G4_Declare* dcl, int subregnuminwords)
1118 {
1119     // Return subreg offset in units of dcl's element size.
1120     // Input is subregnum in word units.
1121     unsigned int subregnum;
1122 
1123     subregnum = (subregnuminwords * 2) / dcl->getElemSize();
1124 
1125     return subregnum;
1126 }
1127 
convertSubRegOffToWords(G4_Declare * dcl,int subregnum)1128 unsigned int LocalRA::convertSubRegOffToWords(G4_Declare* dcl, int subregnum)
1129 {
1130     // Return subreg offset in units of words from dcl's element size
1131     unsigned int subregnuminwords;
1132 
1133     subregnuminwords = subregnum * dcl->getElemSize() / 2;
1134 
1135     return subregnuminwords;
1136 }
1137 
undoLocalRAAssignments(bool clearInterval)1138 void LocalRA::undoLocalRAAssignments(bool clearInterval)
1139 {
1140     if (kernel.getOption(vISA_OptReport))
1141     {
1142         std::ofstream optreport;
1143         getOptReportStream(optreport, kernel.getOptions());
1144         optreport << "Undoing local RA assignments" << std::endl;
1145         closeOptReportStream(optreport);
1146     }
1147 
1148     // Undo all assignments made by local RA
1149     for (auto dcl : kernel.Declares)
1150     {
1151         LocalLiveRange* lr = gra.getLocalLR(dcl);
1152         if (lr != NULL)
1153         {
1154             if (lr->getAssigned() == true)
1155             {
1156                 // Undo the assignment
1157                 lr->setAssigned(false);
1158                 lr->getTopDcl()->getRegVar()->resetPhyReg();
1159 
1160                 if (kernel.getOption(vISA_GenerateDebugInfo))
1161                 {
1162                     kernel.getKernelDebugInfo()->getLiveIntervalInfo(lr->getTopDcl(), false)->clearLiveIntervals();
1163                 }
1164             }
1165             if (clearInterval)
1166             {
1167                 if (lr->isLiveRangeLocal())
1168                 {
1169                     lr->setFirstRef(NULL, 0);
1170                 }
1171                 if (lr->getTopDcl()->getRegFile() == G4_INPUT)
1172                 {
1173                     lr->setLastRef(NULL, 0);
1174                 }
1175             }
1176         }
1177     }
1178 
1179     // Delete summary information stored with each basic block
1180     kernel.fg.clearBBLRASummaries();
1181 }
1182 
GetOrCreateLocalLiveRange(G4_Declare * topdcl)1183 LocalLiveRange* GlobalRA::GetOrCreateLocalLiveRange(G4_Declare* topdcl)
1184 {
1185     LocalLiveRange* lr = getLocalLR(topdcl);
1186 
1187     // Check topdcl of operand and setup a new live range if required
1188     if (!lr)
1189     {
1190         localLiveRanges.push_back(LocalLiveRange(builder));
1191         lr = &localLiveRanges.back();
1192         setLocalLR(topdcl, lr);
1193     }
1194 
1195     MUST_BE_TRUE(lr != NULL, "Local LR could not be created");
1196     return lr;
1197 }
1198 
markReferencesInOpnd(G4_Operand * opnd,bool isEOT,INST_LIST_ITER inst_it,unsigned int pos)1199 void LocalRA::markReferencesInOpnd(G4_Operand* opnd, bool isEOT, INST_LIST_ITER inst_it,
1200     unsigned int pos)
1201 {
1202     G4_Declare* topdcl = NULL;
1203 
1204     if ((opnd->isSrcRegRegion() || opnd->isDstRegRegion()))
1205     {
1206         topdcl = GetTopDclFromRegRegion(opnd);
1207 
1208         if (topdcl &&
1209             (topdcl->getRegFile() == G4_GRF || topdcl->getRegFile() == G4_INPUT))
1210         {
1211             // Handle GRF here
1212             MUST_BE_TRUE(topdcl->getAliasDeclare() == NULL, "Not topdcl");
1213             LocalLiveRange* lr = gra.GetOrCreateLocalLiveRange(topdcl);
1214             lr->recordRef(curBB);
1215 
1216             if (doSplitLLR &&
1217                 opnd->isSrcRegRegion())
1218             {
1219                 LLR_USE_MAP_ITER useMapIter;
1220                 useMapIter = LLRUseMap.find(lr);
1221                 if (useMapIter == LLRUseMap.end())
1222                 {
1223                     std::vector<std::pair<INST_LIST_ITER, unsigned int>> useList;
1224                     useList.push_back(make_pair(inst_it, pos));
1225                     LLRUseMap.insert(make_pair(lr, useList));
1226                 }
1227                 else
1228                 {
1229                     (*useMapIter).second.push_back(make_pair(inst_it, pos));
1230                 }
1231             }
1232 
1233             if (topdcl->getRegVar()
1234                 && topdcl->getRegVar()->isPhyRegAssigned()
1235                 && topdcl->getRegVar()->getPhyReg()->isGreg())
1236             {
1237                 pregs->markPhyRegs(topdcl);
1238             }
1239 
1240             if (isEOT)
1241             {
1242                 lr->markEOT();
1243             }
1244 
1245             gra.recordRef(topdcl);
1246 
1247             if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
1248                 opnd->isDstRegRegion() &&
1249                 !topdcl->getAddressed())
1250             {
1251                 if (opnd->getInst()->isWriteEnableInst() == false)
1252                 {
1253                 }
1254                 else
1255                 {
1256                     gra.setAugmentationMask(topdcl, AugmentationMasks::NonDefault);
1257                 }
1258             }
1259         }
1260         else {
1261             // Handle register other than GRF
1262         }
1263     }
1264     else if (opnd->isAddrExp())
1265     {
1266         G4_AddrExp* addrExp = opnd->asAddrExp();
1267 
1268         topdcl = addrExp->getRegVar()->getDeclare();
1269         while (topdcl->getAliasDeclare() != NULL)
1270             topdcl = topdcl->getAliasDeclare();
1271 
1272         MUST_BE_TRUE(topdcl != NULL, "Top dcl was null for addr exp opnd");
1273 
1274         LocalLiveRange* lr = gra.GetOrCreateLocalLiveRange(topdcl);
1275         lr->recordRef(curBB);
1276         lr->markIndirectRef();
1277 
1278         if (topdcl->getRegVar()
1279             && topdcl->getRegVar()->isPhyRegAssigned()
1280             && topdcl->getRegVar()->getPhyReg()->isGreg())
1281         {
1282             pregs->markPhyRegs(topdcl);
1283         }
1284 
1285         gra.recordRef(topdcl);
1286     }
1287 }
1288 
markReferencesInInst(INST_LIST_ITER inst_it)1289 void LocalRA::markReferencesInInst(INST_LIST_ITER inst_it)
1290 {
1291     auto inst = (*inst_it);
1292 
1293     if (inst->getNumDst() > 0)
1294     {
1295         // Scan dst
1296         G4_Operand* dst = inst->getDst();
1297 
1298         if (dst)
1299         {
1300             markReferencesInOpnd(dst, false, inst_it, 0);
1301         }
1302     }
1303 
1304     // Scan srcs
1305     for (int i = 0, nSrcs = inst->getNumSrc(); i < nSrcs; i++)
1306     {
1307         G4_Operand* src = inst->getSrc(i);
1308 
1309         if (src)
1310         {
1311             markReferencesInOpnd(src, inst->isEOT(), inst_it, i);
1312         }
1313     }
1314 }
1315 
setLexicalID(bool includePseudo)1316 void LocalRA::setLexicalID(bool includePseudo)
1317 {
1318     unsigned int id = 0;
1319     for (auto bb : kernel.fg)
1320     {
1321         for (auto curInst : *bb)
1322         {
1323             if ((!includePseudo) && (curInst->isPseudoKill() ||
1324                 curInst->isLifeTimeEnd()))
1325             {
1326                 curInst->setLexicalId(id);
1327             }
1328             else
1329             {
1330                 curInst->setLexicalId(id++);
1331             }
1332         }
1333     }
1334 }
1335 
markReferences(unsigned int & numRowsEOT,bool & lifetimeOpFound)1336 void LocalRA::markReferences(unsigned int& numRowsEOT,
1337     bool& lifetimeOpFound)
1338 {
1339     unsigned int id = 0;
1340     // Iterate over all BBs
1341     for (auto _curBB : kernel.fg)
1342     {
1343         curBB = _curBB;
1344         // Iterate over all insts
1345         for (INST_LIST_ITER inst_it = curBB->begin(), inst_end = curBB->end(); inst_it != inst_end; ++inst_it)
1346         {
1347             G4_INST* curInst = (*inst_it);
1348 
1349             if (curInst->isPseudoKill() ||
1350                 curInst->isLifeTimeEnd())
1351             {
1352                 curInst->setLexicalId(id);
1353                 lifetimeOpFound = true;
1354                 if (curInst->isLifeTimeEnd())
1355                 {
1356                     markReferencesInInst(inst_it);
1357                 }
1358                 continue;
1359             }
1360 
1361             if (curInst->isSplitIntrinsic())
1362                 hasSplitInsts = true;
1363 
1364             // set lexical ID
1365             curInst->setLexicalId(id++);
1366 
1367             if (curInst->isEOT() && kernel.fg.builder->hasEOTGRFBinding())
1368             {
1369                 numRowsEOT += curInst->getSrc(0)->getTopDcl()->getNumRows();
1370 
1371                 if (curInst->isSplitSend() && !curInst->getSrc(1)->isNullReg())
1372                 {
1373                     // both src0 and src1 have to be >=r112
1374                     numRowsEOT += curInst->getSrc(1)->getTopDcl()->getNumRows();
1375                 }
1376             }
1377 
1378             markReferencesInInst(inst_it);
1379         }
1380     }
1381 }
1382 
1383 // Function to calculate input register live intervals,
1384 // which are now tracked at word granularity because
1385 // there may be overlap between two different input variables.
calculateInputIntervals()1386 void LocalRA::calculateInputIntervals()
1387 {
1388     setLexicalID(false);
1389 
1390     int numGRF = kernel.getNumRegTotal();
1391     std::vector<uint32_t> inputRegLastRef(numGRF * numEltPerGRF<Type_UW>(), UINT_MAX);
1392 
1393     for (BB_LIST_RITER bb_it = kernel.fg.rbegin(), bb_rend = kernel.fg.rend();
1394         bb_it != bb_rend;
1395         bb_it++)
1396     {
1397         G4_BB* bb = (*bb_it);
1398 
1399         for (INST_LIST_RITER inst_it = bb->rbegin(), inst_rend = bb->rend();
1400             inst_it != inst_rend;
1401             inst_it++)
1402         {
1403             G4_INST* curInst = (*inst_it);
1404 
1405             G4_Declare* topdcl = NULL;
1406             LocalLiveRange* lr = NULL;
1407 
1408             // scan dst operand (may be unnecessary but added for safety)
1409             if (curInst->getDst() != NULL)
1410             {
1411                 // Scan dst
1412                 G4_DstRegRegion* dst = curInst->getDst();
1413 
1414                 topdcl = GetTopDclFromRegRegion(dst);
1415 
1416                 if (topdcl && (lr = gra.getLocalLR(topdcl)))
1417                 {
1418                     if (topdcl->getRegFile() == G4_INPUT &&
1419                         !(dst->isAreg()) &&
1420                         topdcl->isOutput() == false &&
1421                         lr->hasIndirectAccess() == false &&
1422                         !builder.isPreDefArg(topdcl))
1423                     {
1424                         unsigned int lastRef = 0;
1425 
1426                         MUST_BE_TRUE(lr->isGRFRegAssigned(), "Input variable has no pre-assigned physical register");
1427 
1428                         if (lr->getLastRef(lastRef) == NULL)
1429                         {
1430                             unsigned int curInstId = curInst->getLexicalId();
1431                             lr->setLastRef(curInst, curInstId);
1432 
1433                             G4_RegVar* var = topdcl->getRegVar();
1434                             unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
1435                             unsigned int regOff = var->getPhyRegOff();
1436                             unsigned int idx = regNum * numEltPerGRF<Type_UW>() +
1437                                 (regOff * topdcl->getElemSize()) / G4_WSIZE + topdcl->getWordSize() - 1;
1438 
1439                             unsigned int numWords = topdcl->getWordSize();
1440                             for (int i = numWords - 1; i >= 0; --i, --idx)
1441                             {
1442                                 if (inputRegLastRef[idx] == UINT_MAX &&
1443                                     // Check for registers that are marked as forbidden,
1444                                     // e.g., BuiltinR0 and those reserved for stack call
1445                                     pregs->isGRFAvailable(idx / numEltPerGRF<Type_UW>()))
1446                                 {
1447                                     inputRegLastRef[idx] = curInstId;
1448                                     inputIntervals.push_front(new (mem)InputLiveRange(idx, curInstId));
1449                                     if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
1450                                     {
1451                                         updateDebugInfo(kernel, topdcl, 0, curInst->getCISAOff());
1452                                     }
1453                                 }
1454                             }
1455                         }
1456                     }
1457                 }
1458             }
1459 
1460             // Scan src operands
1461             for (int i = 0, nSrcs = curInst->getNumSrc(); i < nSrcs; i++)
1462             {
1463                 G4_Operand* src = curInst->getSrc(i);
1464 
1465                 if (src && src->getTopDcl())
1466                 {
1467                     topdcl = GetTopDclFromRegRegion(src);
1468 
1469                     if (topdcl && (lr = gra.getLocalLR(topdcl)))
1470                     {
1471                         // Check whether it is input
1472                         if ((topdcl->getRegFile() == G4_INPUT || topdcl->isLiveIn()) &&
1473                             !(src->isAreg()) &&
1474                             topdcl->isOutput() == false &&
1475                             lr->hasIndirectAccess() == false &&
1476                             !builder.isPreDefArg(topdcl))
1477                         {
1478                             unsigned int lastRef = 0;
1479 
1480                             MUST_BE_TRUE(lr->isGRFRegAssigned(), "Input variable has no pre-assigned physical register");
1481 
1482                             if (lr->getLastRef(lastRef) == NULL)
1483                             {
1484                                 unsigned int curInstId = curInst->getLexicalId();
1485                                 lr->setLastRef(curInst, curInstId);
1486 
1487                                 G4_RegVar* var = topdcl->getRegVar();
1488                                 unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
1489                                 unsigned int regOff = var->getPhyRegOff();
1490                                 unsigned int idx = regNum * numEltPerGRF<Type_UW>() +
1491                                     (regOff * TypeSize(topdcl->getElemType())) / G4_WSIZE + topdcl->getWordSize() - 1;
1492 
1493                                 unsigned int numWords = topdcl->getWordSize();
1494                                 for (int i = numWords - 1; i >= 0; --i, --idx)
1495                                 {
1496                                     if (inputRegLastRef[idx] == UINT_MAX &&
1497                                         // Check for registers that are marked as forbidden,
1498                                         // e.g., BuiltinR0 and those reserved for stack call
1499                                         pregs->isGRFAvailable(idx / numEltPerGRF<Type_UW>()))
1500                                     {
1501                                         inputRegLastRef[idx] = curInstId;
1502                                         if (builder.avoidDstSrcOverlap() &&
1503                                             curInst->getDst() != NULL &&
1504                                             hasDstSrcOverlapPotential(curInst->getDst(), src->asSrcRegRegion()))
1505                                         {
1506                                             inputIntervals.push_front(new (mem)InputLiveRange(idx, curInstId + 1));
1507                                         }
1508                                         else
1509                                         {
1510                                             inputIntervals.push_front(new (mem)InputLiveRange(idx, curInstId));
1511                                         }
1512                                         if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
1513                                         {
1514                                             updateDebugInfo(kernel, topdcl, 0, curInst->getCISAOff());
1515                                         }
1516                                     }
1517                                 }
1518                             }
1519                         }
1520                     }
1521                 }
1522             }
1523         }
1524     }
1525 
1526 #ifdef _DEBUG
1527     for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
1528         dcl_it != kernel.Declares.end();
1529         dcl_it++)
1530     {
1531         G4_Declare* curDcl = (*dcl_it);
1532 
1533         if (curDcl->isOutput() &&
1534             gra.getLocalLR(curDcl) && gra.getLocalLR(curDcl)->isGRFRegAssigned())
1535         {
1536             G4_RegVar* var = curDcl->getRegVar();
1537             unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
1538             unsigned int regOff = var->getPhyRegOff();
1539             unsigned int idx = regNum * numEltPerGRF<Type_UW>() +
1540                 (regOff * TypeSize(curDcl->getElemType())) / G4_WSIZE;
1541 
1542             unsigned int numWords = curDcl->getWordSize();
1543             for (unsigned int i = 0; i < numWords; ++i, ++idx)
1544             {
1545                 if (inputRegLastRef[idx] != UINT_MAX)
1546                 {
1547                     MUST_BE_TRUE(false, "Invalid output variable with overlapping input variable!");
1548                 }
1549             }
1550         }
1551     }
1552 #endif
1553 }
1554 
hasDstSrcOverlapPotential(G4_DstRegRegion * dst,G4_SrcRegRegion * src)1555 bool LocalRA::hasDstSrcOverlapPotential(G4_DstRegRegion* dst, G4_SrcRegRegion* src)
1556 {
1557     int dstOpndNumRows = 0;
1558 
1559     if (dst->getBase()->isRegVar())
1560     {
1561         G4_Declare* dstDcl = dst->getBase()->asRegVar()->getDeclare();
1562         if (dstDcl != nullptr)
1563         {
1564             int dstOffset = (dstDcl->getOffsetFromBase() + dst->getLeftBound()) / numEltPerGRF<Type_UB>();
1565             G4_DstRegRegion* dstRgn = dst;
1566             dstOpndNumRows = dstRgn->getSubRegOff() + dstRgn->getLinearizedEnd() - dstRgn->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
1567 
1568             if (src != NULL &&
1569                 src->isSrcRegRegion() &&
1570                 src->asSrcRegRegion()->getBase()->isRegVar())
1571             {
1572                 G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
1573                 G4_Declare* srcDcl = src->getBase()->asRegVar()->getDeclare();
1574                 int srcOffset = (srcDcl->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
1575                 bool srcOpndNumRows = srcRgn->getSubRegOff() + srcRgn->getLinearizedEnd() - srcRgn->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
1576 
1577                 if (dstOpndNumRows || srcOpndNumRows)
1578                 {
1579                     if (!(gra.isEvenAligned(dstDcl) && gra.isEvenAligned(srcDcl) &&
1580                         srcOffset % 2 == dstOffset % 2 &&
1581                         dstOpndNumRows && srcOpndNumRows))
1582                     {
1583                         return true;
1584                     }
1585                 }
1586             }
1587         }
1588     }
1589 
1590     return false;
1591 }
1592 
1593 // Function to calculate local live intervals in ascending order of starting point
1594 // in basic block bb.
1595 //
1596 // FIXME: current implementation cannot handle partial use-before-define case as shown below:
1597 //
1598 // loop_start:
1599 // V1 (1) =
1600 // P1 =
1601 // (P1) (16) = V1
1602 // ...
1603 // V1 (16) =
1604 // ...
1605 // (P2) Jmp  loop_start
1606 //
1607 // One way to fix this is to check the emask and simd size of local def/use and make sure they
1608 // satisfy the followign rules:
1609 // 1) The SIMD size of a use should be equal to or smaller than earlier def seen for that variable.
1610 // 2) The Emask of use should be equal of narrower than def seen earlier.
1611 // 3) Each use should be fully defined in current BB (e.g., if a use is predicated,
1612 //    the def should be either non-predicated or have the same predicate).
1613 //
calculateLiveIntervals(G4_BB * bb,std::vector<LocalLiveRange * > & liveIntervals)1614 void LocalRA::calculateLiveIntervals(G4_BB* bb, std::vector<LocalLiveRange*>& liveIntervals)
1615 {
1616     int idx = 0;
1617     bool brk = false;
1618 
1619     for (INST_LIST_ITER inst_it = bb->begin(), bbend = bb->end();
1620         inst_it != bbend && !brk;
1621         inst_it++, idx += 2)
1622     {
1623         G4_INST* curInst = (*inst_it);
1624         G4_Declare* topdcl = NULL;
1625         LocalLiveRange* lr = NULL;
1626 
1627         if (curInst->isPseudoKill() ||
1628             curInst->isLifeTimeEnd())
1629         {
1630             continue;
1631         }
1632 
1633         // Scan srcs
1634         for (int i = 0, nSrcs = curInst->getNumSrc(); i < nSrcs; i++)
1635         {
1636             G4_Operand* src = curInst->getSrc(i);
1637 
1638             if (src && src->getTopDcl())
1639             {
1640                 // Scan all srcs
1641                 topdcl = GetTopDclFromRegRegion(src);
1642                 LocalLiveRange* topdclLR = nullptr;
1643 
1644                 if (topdcl && (topdclLR = gra.getLocalLR(topdcl)))
1645                 {
1646                     lr = topdclLR;
1647 
1648                     // Check whether local LR is a candidate
1649                     if (lr->isLiveRangeLocal() &&
1650                         lr->isGRFRegAssigned() == false)
1651                     {
1652                         unsigned int startIdx;
1653 
1654                         if (lr->getFirstRef(startIdx) == NULL)
1655                         {
1656                             // Skip this lr and update its ref count,
1657                             // So it will be treated as a global LR
1658                             lr->recordRef(NULL);
1659                             continue;
1660                         }
1661 
1662                         MUST_BE_TRUE(idx > 0, "Candidate use found in first inst of basic block");
1663 
1664                         if ((builder.WaDisableSendSrcDstOverlap() &&
1665                             ((curInst->isSend() && i == 0) ||
1666                             (curInst->isSplitSend() && i == 1)))
1667                             || (curInst->isDpas() && i == 1)  //For DPAS, as part of same instruction, src1 should not have overlap with dst. Src0 and src2 are okay to have overlap
1668                             || (builder.avoidDstSrcOverlap() &&  curInst->getDst() != NULL && hasDstSrcOverlapPotential(curInst->getDst(), src->asSrcRegRegion()))
1669                             )
1670                         {
1671 #ifdef DEBUG_VERBOSE_ON
1672                             if (curInst->getDst() != NULL && hasDstSrcOverlapPotential(curInst->getDst(), src->asSrcRegRegion()))
1673                             {
1674                                 curInst->dump();
1675                             }
1676 #endif
1677                             lr->setLastRef(curInst, idx + 1);
1678                         }
1679                         else
1680                         {
1681                             lr->setLastRef(curInst, idx);
1682                         }
1683                     }
1684                 }
1685             }
1686         }
1687 
1688         if (G4_Inst_Table[curInst->opcode()].n_dst == 1 ||
1689             curInst->isSplitIntrinsic())
1690         {
1691             // Scan dst
1692             G4_DstRegRegion* dst = curInst->getDst();
1693 
1694             if (dst)
1695             {
1696                 topdcl = GetTopDclFromRegRegion(dst);
1697 
1698                 if (topdcl && (lr = gra.getLocalLR(topdcl)))
1699                 {
1700                     // Check whether local LR is a candidate
1701                     if (lr->isLiveRangeLocal() &&
1702                         lr->isGRFRegAssigned() == false)
1703                     {
1704                         unsigned int startIdx;
1705 
1706                         if (lr->getFirstRef(startIdx) == NULL)
1707                         {
1708                             lr->setFirstRef(curInst, idx);
1709                             liveIntervals.push_back(lr);
1710                         }
1711 
1712                         lr->setLastRef(curInst, idx);
1713                     }
1714                 }
1715             }
1716         }
1717     }
1718 }
1719 
printAddressTakenDecls()1720 void LocalRA::printAddressTakenDecls()
1721 {
1722     DEBUG_VERBOSE("Address taken decls:" << std::endl);
1723 
1724     for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
1725         dcl_it != kernel.Declares.end();
1726         dcl_it++)
1727     {
1728         G4_Declare* curDcl = (*dcl_it);
1729 
1730         if (curDcl->getAliasDeclare() != NULL)
1731         {
1732             MUST_BE_TRUE(gra.getLocalLR(curDcl) == NULL, "Local LR found for alias declare");
1733             continue;
1734         }
1735 
1736         if (gra.getLocalLR(curDcl) &&
1737             gra.getLocalLR(curDcl)->hasIndirectAccess() == true)
1738         {
1739             DEBUG_VERBOSE(curDcl->getName() << ", ");
1740         }
1741     }
1742 
1743     DEBUG_VERBOSE(std::endl);
1744 }
1745 
printLocalRACandidates()1746 void LocalRA::printLocalRACandidates()
1747 {
1748     DEBUG_VERBOSE("Local RA candidates:" << std::endl);
1749 
1750     for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
1751         dcl_it != kernel.Declares.end();
1752         dcl_it++)
1753     {
1754         G4_Declare* curDcl = (*dcl_it);
1755 
1756         if (curDcl->getAliasDeclare() != NULL)
1757         {
1758             MUST_BE_TRUE(gra.getLocalLR(curDcl) == NULL, "Local LR found for alias declare");
1759             continue;
1760         }
1761 
1762         if (gra.getLocalLR(curDcl) &&
1763             gra.getLocalLR(curDcl)->isLiveRangeLocal() &&
1764             gra.getLocalLR(curDcl)->isGRFRegAssigned() == false)
1765         {
1766             DEBUG_VERBOSE(curDcl->getName() << ", ");
1767         }
1768     }
1769 
1770     DEBUG_VERBOSE(std::endl);
1771 }
1772 
printLocalLiveIntervals(G4_BB * bb,std::vector<LocalLiveRange * > & liveIntervals)1773 void LocalRA::printLocalLiveIntervals(G4_BB* bb, std::vector<LocalLiveRange*>& liveIntervals)
1774 {
1775     DEBUG_VERBOSE("Live intervals for bb " << bb->getId() << std::endl);
1776 
1777     for (auto lr : liveIntervals)
1778     {
1779         unsigned int start, end;
1780 
1781         lr->getFirstRef(start);
1782         lr->getLastRef(end);
1783 
1784         DEBUG_VERBOSE(lr->getTopDcl()->getName() << "(" << start << ", " << end << ", " << lr->getTopDcl()->getByteSize() << ")");
1785         DEBUG_VERBOSE(std::endl);
1786     }
1787 
1788     DEBUG_VERBOSE(std::endl);
1789 }
1790 
printInputLiveIntervals()1791 void LocalRA::printInputLiveIntervals()
1792 {
1793     DEBUG_VERBOSE(std::endl << "Input Live intervals " << std::endl);
1794 
1795     for (std::list<InputLiveRange*>::iterator it = inputIntervals.begin();
1796         it != inputIntervals.end();
1797         it++)
1798     {
1799         unsigned int regWordIdx, lrEndIdx, regNum, subRegInWord;
1800 
1801         InputLiveRange* lr = (*it);
1802 
1803         regWordIdx = lr->getRegWordIdx();
1804         regNum = regWordIdx / numEltPerGRF<Type_UW>();
1805         subRegInWord = regWordIdx % numEltPerGRF<Type_UW>();
1806         lrEndIdx = lr->getLrEndIdx();
1807 
1808         DEBUG_VERBOSE("r" << regNum << "." << subRegInWord << " " << lrEndIdx);
1809         DEBUG_VERBOSE(std::endl);
1810     }
1811 
1812     DEBUG_VERBOSE(std::endl);
1813 }
1814 
countLocalLiveIntervals(std::vector<LocalLiveRange * > & liveIntervals)1815 void LocalRA::countLocalLiveIntervals(std::vector<LocalLiveRange*>& liveIntervals)
1816 {
1817     unsigned int numScalars = 0, numHalfGRF = 0, numOneGRF = 0, numTwoOrMoreGRF = 0, numTotal = 0;
1818 
1819     for (auto lr : liveIntervals)
1820     {
1821         unsigned size, numrows;
1822         G4_Declare* dcl = lr->getTopDcl();
1823 
1824         numrows = dcl->getNumRows();
1825         size = dcl->getNumElems() * dcl->getElemSize();
1826         numTotal++;
1827 
1828         if (numrows > 1)
1829         {
1830             numTwoOrMoreGRF++;
1831         }
1832         else
1833         {
1834             if (dcl->getNumElems() > 1 && size > (getGRFSize() / 2u) && size <= (unsigned int)getGRFSize())
1835                 numOneGRF++;
1836             else if (dcl->getNumElems() > 1 && size <= (getGRFSize() / 2u))
1837                 numHalfGRF++;
1838             else if (dcl->getNumElems() == 1)
1839                 numScalars++;
1840             else
1841                 ASSERT_USER(false, "Unknown size");
1842         }
1843     }
1844 
1845     printLocalLiveIntervalDistribution(numScalars, numHalfGRF, numOneGRF, numTwoOrMoreGRF, numTotal);
1846 }
1847 
printLocalLiveIntervalDistribution(unsigned int numScalars,unsigned int numHalfGRF,unsigned int numOneGRF,unsigned int numTwoOrMoreGRF,unsigned int numTotal)1848 void LocalRA::printLocalLiveIntervalDistribution(unsigned int numScalars,
1849     unsigned int numHalfGRF, unsigned int numOneGRF, unsigned int numTwoOrMoreGRF,
1850     unsigned int numTotal)
1851 {
1852     DEBUG_VERBOSE("In units of num of live ranges:" << std::endl);
1853     DEBUG_VERBOSE("Total local live ranges: " << numTotal << std::endl);
1854     DEBUG_VERBOSE("2 GRFs or more: " << numTwoOrMoreGRF << std::endl);
1855     DEBUG_VERBOSE("1 GRF: " << numOneGRF << std::endl);
1856     DEBUG_VERBOSE("Half GRF: " << numHalfGRF << std::endl);
1857     DEBUG_VERBOSE("Scalar: " << numScalars << std::endl);
1858     DEBUG_VERBOSE(std::endl);
1859 }
1860 
1861 // return false if roundrobin RA should be disabled
countLiveIntervals()1862 bool LocalRA::countLiveIntervals()
1863 {
1864     int globalRows = 0;
1865     uint32_t localRows = 0;
1866     for (auto curDcl : kernel.Declares)
1867     {
1868         LocalLiveRange* curDclLR = nullptr;
1869 
1870         if (curDcl->getAliasDeclare() == NULL &&
1871             curDcl->getRegFile() == G4_GRF &&
1872             (curDclLR = gra.getLocalLR(curDcl)) &&
1873             curDclLR->isGRFRegAssigned() == false)
1874         {
1875             if (curDclLR->isLiveRangeGlobal())
1876             {
1877                 globalRows += curDcl->getNumRows();
1878             }
1879             else if (curDclLR->isLiveRangeLocal())
1880             {
1881                 localRows += curDcl->getNumRows();
1882             }
1883         }
1884     }
1885 
1886     if (globalRows <= NUM_PREGS_FOR_UNIQUE_ASSIGN)
1887     {
1888         globalLRSize = globalRows;
1889     }
1890     else
1891     {
1892         if (localRows < (numRegLRA - NUM_PREGS_FOR_UNIQUE_ASSIGN))
1893         {
1894             globalLRSize = NUM_PREGS_FOR_UNIQUE_ASSIGN;
1895         }
1896         else
1897         {
1898             return false;
1899         }
1900     }
1901 
1902     return true;
1903 }
1904 
1905 // ********* LocalLiveRange class implementation *********
recordRef(G4_BB * bb)1906 void LocalLiveRange::recordRef(G4_BB* bb)
1907 {
1908     if (bb != prevBBRef)
1909         numRefsInFG++;
1910 
1911     prevBBRef = bb;
1912 }
1913 
isLiveRangeLocal() const1914 bool LocalLiveRange::isLiveRangeLocal() const
1915 {
1916     if (isIndirectAccess == false && numRefsInFG == 1 &&
1917         eot == false &&
1918         !builder.isPreDefArg(topdcl) && !builder.isPreDefRet(topdcl) &&
1919         topdcl->isOutput() == false)
1920     {
1921         return true;
1922     }
1923 
1924     return false;
1925 }
1926 
isLiveRangeGlobal()1927 bool LocalLiveRange::isLiveRangeGlobal()
1928 {
1929     if (isIndirectAccess == true || (numRefsInFG > 1 && eot == false) ||
1930         topdcl->isOutput() == true)
1931     {
1932         return true;
1933     }
1934 
1935     return false;
1936 }
1937 
1938 
isGRFRegAssigned()1939 bool LocalLiveRange::isGRFRegAssigned()
1940 {
1941     MUST_BE_TRUE(topdcl != NULL, "Top dcl not set");
1942     G4_RegVar* rvar = topdcl->getRegVar();
1943     bool isPhyRegAssigned = false;
1944 
1945     if (rvar)
1946     {
1947         if (rvar->isPhyRegAssigned())
1948             isPhyRegAssigned = true;
1949     }
1950 
1951     return isPhyRegAssigned;
1952 }
1953 
getSizeInWords()1954 unsigned int LocalLiveRange::getSizeInWords()
1955 {
1956     int nrows = getTopDcl()->getNumRows();
1957     int elemsize = getTopDcl()->getElemSize();
1958     int nelems = getTopDcl()->getNumElems();
1959     int words = 0;
1960 
1961     if (nrows > 1)
1962     {
1963         // If sizeInWords is set, use it otherwise consider entire row reserved
1964         unsigned int sizeInWords = getTopDcl()->getWordSize();
1965 
1966         if (sizeInWords > 0)
1967             words = sizeInWords;
1968         else
1969             words = nrows * numEltPerGRF<Type_UW>();
1970     }
1971     else if (nrows == 1)
1972     {
1973         int nbytesperword = 2;
1974         words = (nelems * elemsize + 1) / nbytesperword;
1975     }
1976 
1977     return words;
1978 }
1979 
1980 // ********* PhyRegsLocalRA class implementation *********
setGRFBusy(int which)1981 void PhyRegsLocalRA::setGRFBusy(int which)
1982 {
1983     MUST_BE_TRUE(isGRFAvailable(which), "Invalid register");
1984 
1985     // all 1 word mask based on register size
1986     uint64_t wordMask = (1ULL << (getGRFSize() / 2)) - 1;
1987     regBusyVector[which] = (uint32_t) wordMask;
1988 
1989     if (twoBanksRA)
1990     {
1991         if (which < SECOND_HALF_BANK_START_GRF)
1992         {
1993             bank1AvailableRegNum--;
1994         }
1995         else
1996         {
1997             bank2AvailableRegNum--;
1998         }
1999     }
2000 }
2001 
2002 // ********* PhyRegsLocalRA class implementation *********
setGRFBusy(int which,int howmany)2003 void PhyRegsLocalRA::setGRFBusy(int which, int howmany)
2004 {
2005     for (int i = 0; i < howmany; i++)
2006     {
2007         setGRFBusy(which + i);
2008     }
2009 }
2010 
setWordBusy(int whichgrf,int word)2011 void PhyRegsLocalRA::setWordBusy(int whichgrf, int word)
2012 {
2013     MUST_BE_TRUE(isGRFAvailable(whichgrf), "Invalid register");
2014     MUST_BE_TRUE(word <= (int)numEltPerGRF<Type_UW>(), "Invalid word");
2015 
2016     if (twoBanksRA)
2017     {
2018         if (regBusyVector[whichgrf] == 0)
2019         {
2020             if (whichgrf < SECOND_HALF_BANK_START_GRF)
2021             {
2022                 bank1AvailableRegNum--;
2023             }
2024             else
2025             {
2026                 bank2AvailableRegNum--;
2027             }
2028         }
2029     }
2030 
2031     regBusyVector[whichgrf] |= (WORD_BUSY << word);
2032 }
2033 
setWordBusy(int whichgrf,int word,int howmany)2034 void PhyRegsLocalRA::setWordBusy(int whichgrf, int word, int howmany)
2035 {
2036     for (int i = 0; i < howmany; i++)
2037     {
2038         setWordBusy(whichgrf, word + i);
2039     }
2040 }
2041 
setGRFNotBusy(int which,int instID)2042 void PhyRegsLocalRA::setGRFNotBusy(int which, int instID)
2043 {
2044     MUST_BE_TRUE(isGRFAvailable(which), "Invalid register");
2045 
2046     regBusyVector[which] = 0;
2047 
2048     if (twoBanksRA)
2049     {
2050         if (which < SECOND_HALF_BANK_START_GRF)
2051         {
2052             lastUseSum1 -= regLastUse[which];
2053             lastUseSum1 += instID;
2054             bank1AvailableRegNum++;
2055         }
2056         else
2057         {
2058             lastUseSum2 -= regLastUse[which];
2059             lastUseSum2 += instID;
2060             bank2AvailableRegNum++;
2061         }
2062     }
2063     if (instID)
2064     {
2065         regLastUse[which] = instID;
2066     }
2067 }
2068 
setWordNotBusy(int whichgrf,int word,int instID)2069 void PhyRegsLocalRA::setWordNotBusy(int whichgrf, int word, int instID)
2070 {
2071     MUST_BE_TRUE(isGRFAvailable(whichgrf), "Invalid register");
2072     MUST_BE_TRUE(word <= (int)numEltPerGRF<Type_UW>(), "Invalid word");
2073 
2074     if (twoBanksRA)
2075     {
2076         if (whichgrf < SECOND_HALF_BANK_START_GRF)
2077         {
2078             lastUseSum1 -= regLastUse[whichgrf];
2079             lastUseSum1 += instID;
2080             if (regBusyVector[whichgrf] == 0)
2081             {
2082                 bank1AvailableRegNum++;
2083             }
2084         }
2085         else
2086         {
2087             lastUseSum2 -= regLastUse[whichgrf];
2088             lastUseSum2 += instID;
2089             if (regBusyVector[whichgrf] == 0)
2090             {
2091                 bank2AvailableRegNum++;
2092             }
2093         }
2094     }
2095     uint32_t mask = ~(1 << word);
2096     regBusyVector[whichgrf] &= mask;
2097     if (instID)
2098     {
2099         regLastUse[whichgrf] = instID;
2100     }
2101 }
2102 
isWordBusy(int whichgrf,int word)2103 inline bool PhyRegsLocalRA::isWordBusy(int whichgrf, int word)
2104 {
2105     MUST_BE_TRUE(isGRFAvailable(whichgrf), "Invalid register");
2106 
2107     MUST_BE_TRUE(word <= (int)numEltPerGRF<Type_UW>(), "Invalid word");
2108     bool isBusy = ((regBusyVector[whichgrf] & (WORD_BUSY << word)) != 0);
2109     return isBusy;
2110 }
2111 
isWordBusy(int whichgrf,int word,int howmany)2112 bool PhyRegsLocalRA::isWordBusy(int whichgrf, int word, int howmany)
2113 {
2114     bool retval = false;
2115 
2116     for (int i = 0; i < howmany && !retval; i++)
2117     {
2118         retval |= isWordBusy(whichgrf, word + i);
2119     }
2120 
2121     return retval;
2122 }
2123 
findFreeMultipleRegsForward(int regIdx,BankAlign align,int & regnum,int nrows,int lastRowSize,int endReg,unsigned short occupiedBundles,int instID,bool isHybridAlloc,std::unordered_set<unsigned int> & forbidden,bool hintSet)2124 bool PhyRegsLocalRA::findFreeMultipleRegsForward(int regIdx, BankAlign align, int &regnum, int nrows, int lastRowSize, int endReg, unsigned short occupiedBundles, int instID, bool isHybridAlloc, std::unordered_set<unsigned int>& forbidden, bool hintSet)
2125 {
2126     int foundItem = 0;
2127     int startReg = 0;
2128     int i = regIdx;
2129     int grfRows = 0;
2130     bool multiSteps = nrows > 1;
2131 
2132     if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2133     {
2134         grfRows = nrows;
2135     }
2136     else
2137     {
2138         grfRows = nrows - 1;
2139     }
2140 
2141     findRegisterCandiateWithAlignForward(i, align, multiSteps);
2142     i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align, multiSteps);
2143 
2144     startReg = i;
2145     while (i <= endReg + nrows - 1)
2146     {
2147         if (isGRFAvailable(i) && forbidden.find(i) == forbidden.end() &&
2148             regBusyVector[i] == 0 &&
2149             (!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) || (regLastUse[i] == 0)) || hintSet))
2150         {
2151             foundItem++;
2152         }
2153         else if (foundItem < grfRows)
2154         {
2155             foundItem = 0;
2156             i++;
2157             findRegisterCandiateWithAlignForward(i, align, multiSteps);
2158             i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align, multiSteps);
2159             startReg = i;
2160             continue;
2161         }
2162 
2163         if (foundItem == grfRows)
2164         {
2165             if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2166             {
2167                 regnum = startReg;
2168                 return true;
2169             }
2170             else
2171             {
2172                 if (i + 1 <= endReg + nrows - 1 &&
2173                     isGRFAvailable(i + 1) && forbidden.find(i+1) == forbidden.end() &&
2174                     (isWordBusy(i + 1, 0, lastRowSize) == false) &&
2175                     (!isHybridAlloc || (((instID - regLastUse[i + 1]) / 2 >= LraFFWindowSize) || (regLastUse[i + 1] == 0))))
2176                 {
2177                     regnum = startReg;
2178                     return true;
2179                 }
2180                 else
2181                 {
2182                     foundItem = 0;
2183                     i++;
2184                     findRegisterCandiateWithAlignForward(i, align, multiSteps);
2185                     i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align, multiSteps);
2186                     startReg = i;
2187                     continue;
2188                 }
2189             }
2190         }
2191 
2192         i++;
2193     }
2194 
2195     return false;
2196 }
2197 
findFreeMultipleRegsBackward(int regIdx,BankAlign align,int & regnum,int nrows,int lastRowSize,int endReg,int instID,bool isHybridAlloc,std::unordered_set<unsigned int> & forbidden)2198 bool PhyRegsLocalRA::findFreeMultipleRegsBackward(int regIdx, BankAlign align, int &regnum, int nrows, int lastRowSize, int endReg, int instID, bool isHybridAlloc, std::unordered_set<unsigned int>& forbidden)
2199 {
2200     int foundItem = 0;
2201     int startReg = 0;
2202     int grfRows = 0;
2203     int i = regIdx;
2204     bool multiSteps = nrows > 1;
2205 
2206     if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2207     {
2208         grfRows = nrows;
2209     }
2210     else
2211     {
2212         grfRows = nrows - 1;
2213     }
2214 
2215     findRegisterCandiateWithAlignBackward(i, align, multiSteps);
2216 
2217     startReg = i;
2218 
2219     while (i >= endReg && i >= 0)
2220     {
2221         if (isGRFAvailable(i) && forbidden.find(i) == forbidden.end() &&
2222             regBusyVector[i] == 0 &&
2223             (!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) || (regLastUse[i] == 0))))
2224         {
2225             foundItem++;
2226         }
2227         else if (foundItem < grfRows)
2228         {
2229             foundItem = 0;
2230             i -= nrows;
2231             findRegisterCandiateWithAlignBackward(i, align, multiSteps);
2232 
2233             startReg = i;
2234             continue;
2235         }
2236 
2237         if (foundItem == grfRows)
2238         {
2239             if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2240             {
2241                 regnum = startReg;
2242                 return true;
2243             }
2244             else
2245             {
2246                 if (i + 1 <= endReg &&
2247                     isGRFAvailable(i + 1) && forbidden.find(i+1) ==forbidden.end() &&
2248                     (isWordBusy(i + 1, 0, lastRowSize) == false) &&
2249                     (!isHybridAlloc || (((instID - regLastUse[i + 1]) / 2 >= LraFFWindowSize) || (regLastUse[i + 1] == 0))))
2250                 {
2251                     regnum = startReg;
2252                     return true;
2253                 }
2254                 else
2255                 {
2256                     foundItem = 0;
2257                     i -= nrows;
2258                     findRegisterCandiateWithAlignBackward(i, align, multiSteps);
2259 
2260                     startReg = i;
2261                     continue;
2262                 }
2263             }
2264         }
2265 
2266         i++;
2267     }
2268 
2269     return false;
2270 }
2271 
findFreeSingleReg(int regIdx,int size,BankAlign align,G4_SubReg_Align subalign,int & regnum,int & subregnum,int endReg,int instID,bool isHybridAlloc,bool forward,std::unordered_set<unsigned int> & forbidden)2272 bool PhyRegsLocalRA::findFreeSingleReg(int regIdx, int size, BankAlign align, G4_SubReg_Align subalign, int &regnum, int &subregnum, int endReg, int instID, bool isHybridAlloc, bool forward, std::unordered_set<unsigned int>& forbidden)
2273 {
2274     int i = regIdx;
2275     bool found = false;
2276 
2277     while (!found)
2278     {
2279         if (forward)
2280         {
2281             if (i > endReg) //<= works
2282                 break;
2283         }
2284         else
2285         {
2286             if (i < endReg) //>= works
2287                 break;
2288         }
2289 
2290         // Align GRF
2291         if ((align == BankAlign::Even) && (i % 2 != 0))
2292         {
2293             i += forward ? 1 : -1;
2294             continue;
2295         }
2296         else if ((align == BankAlign::Odd) && (i % 2 == 0))
2297         {
2298             i += forward ? 1 : -1;
2299             continue;
2300         }
2301         else if ((align == BankAlign::Even2GRF) && ((i % 4 >= 2)))
2302         {
2303             i += forward ? 1 : -1;
2304             continue;
2305         }
2306         else if ((align == BankAlign::Odd2GRF) && ((i % 4 < 2)))
2307         {
2308             i += forward ? 1 : -1;
2309             continue;
2310         }
2311 
2312         if (isGRFAvailable(i, 1) && forbidden.find(i) ==forbidden.end() &&
2313             (!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) || (regLastUse[i] == 0))))
2314         {
2315             found = findFreeSingleReg(i, subalign, regnum, subregnum, size);
2316             if (found)
2317             {
2318                 return true;
2319             }
2320         }
2321         i += forward ? 1 : -1;
2322     }
2323 
2324     return false;
2325 }
2326 
printBusyRegs()2327 void PhyRegsLocalRA::printBusyRegs()
2328 {
2329     for (int i = 0; i < (int)numRegs; i++)
2330     {
2331         if (isGRFAvailable(i) == false)
2332             continue;
2333 
2334         for (int j = 0; j < (int)numEltPerGRF<Type_UW>(); j++)
2335         {
2336             if (isWordBusy(i, j) == true)
2337             {
2338                 DEBUG_VERBOSE("r" << i << "." << j << ":w, " << std::endl);
2339             }
2340         }
2341     }
2342 }
2343 
2344 // Based on RegVar, mark available registers as busy
markPhyRegs(G4_Declare * topdcl)2345 void PhyRegsLocalRA::markPhyRegs(G4_Declare* topdcl)
2346 {
2347     G4_RegVar* rvar = topdcl->getRegVar();
2348     unsigned numrows = topdcl->getNumRows();
2349     unsigned numwords = 0;
2350     unsigned int regnum = 0;
2351 
2352     if (builder->hasFusedEUWA() && rvar->getPhyReg() == nullptr)
2353     {
2354         return;
2355     }
2356     // Calculate number of physical registers required by this dcl
2357     if (numrows == 1)
2358     {
2359         unsigned numbytes = topdcl->getElemSize() * topdcl->getNumElems();
2360         numwords = (numbytes + 1) / 2;
2361         unsigned int subReg = rvar->getPhyRegOff();
2362         unsigned int subRegInWord = subReg * rvar->getDeclare()->getElemSize() / 2;
2363 
2364         regnum = rvar->getPhyReg()->asGreg()->getRegNum();
2365         if (isGRFAvailable(regnum) == true) {
2366             for (unsigned int i = 0; i < numwords; i++)
2367             {
2368                 // Starting from first word, mark each consecutive word busy
2369                 setWordBusy(regnum, (subRegInWord + i));
2370             }
2371         }
2372     }
2373     else
2374     {
2375         regnum = rvar->getPhyReg()->asGreg()->getRegNum();
2376         for (unsigned int i = 0; i < topdcl->getNumRows(); i++)
2377         {
2378             if (isGRFAvailable(i + regnum) == true)
2379             {
2380                 // Set entire GRF busy
2381                 setGRFBusy(regnum + i);
2382             }
2383         }
2384     }
2385 }
2386 
2387 // ********* PhyRegsManager class implementation *********
findFreeSingleReg(int regIdx,G4_SubReg_Align subalign,int & regnum,int & subregnum,int size)2388 bool PhyRegsLocalRA::findFreeSingleReg(int regIdx, G4_SubReg_Align subalign, int &regnum, int &subregnum, int size)
2389 {
2390     bool found = false;
2391     if (subalign == GRFALIGN)
2392     {
2393         if (isWordBusy(regIdx, 0, size) == false)
2394         {
2395             subregnum = 0;
2396             found = true;
2397         }
2398     }
2399     else if (subalign == HALFGRFALIGN)
2400     {
2401         if (isWordBusy(regIdx, 0, size) == false)
2402         {
2403             subregnum = 0;
2404             found = true;
2405         }
2406         else if (size <= (int)numEltPerGRF<Type_UW>() / 2 && isWordBusy(regIdx, numEltPerGRF<Type_UW>() / 2, size) == false)
2407         {
2408             subregnum = numEltPerGRF<Type_UW>() / 2;
2409             found = true;
2410         }
2411     }
2412     else
2413     {
2414         // ToDo: check if dynamic step size has compile time impact
2415         int step = 1;
2416         int upBound = numEltPerGRF<Type_UW>() - size + 1;
2417         switch (subalign)
2418         {
2419             case Eight_Word: step = 8; break;
2420             case Four_Word: step = 4; break;
2421             case Even_Word: step = 2; break;
2422             case Any:
2423                 upBound = numEltPerGRF<Type_UW>() - size; //FIXME, why not the last word
2424                 step = 1; break;
2425             default:
2426                 assert("unexpected alignment");
2427         }
2428         for (int j = 0; j < upBound; j += step)
2429         {
2430             if (!isWordBusy(regIdx, j, size))
2431             {
2432                 subregnum = j;
2433                 found = true;
2434                 break;
2435             }
2436         }
2437     }
2438 
2439     if (found)
2440     {
2441         regnum = regIdx;
2442     }
2443 
2444     return found;
2445 }
2446 
findFreeRegs(int size,BankAlign align,G4_SubReg_Align subalign,int & regnum,int & subregnum,int startRegNum,int endRegNum,unsigned short occupiedBundles,unsigned int instID,bool isHybridAlloc,std::unordered_set<unsigned int> & forbidden,bool hintSet,unsigned int hintReg)2447 int PhyRegsManager::findFreeRegs(int size, BankAlign align, G4_SubReg_Align subalign, int& regnum, int& subregnum,
2448     int startRegNum, int endRegNum, unsigned short occupiedBundles, unsigned int instID, bool isHybridAlloc, std::unordered_set<unsigned int>& forbidden, bool hintSet, unsigned int hintReg)
2449 {
2450     int nrows = 0;
2451     int lastRowSize = 0;
2452     LocalRA::getRowInfo(size, nrows, lastRowSize);
2453 
2454     bool forward = hintSet ? true : (startRegNum <= endRegNum ? true : false);
2455     int startReg = forward ? startRegNum : startRegNum - nrows + 1;
2456     int endReg = forward ? endRegNum - nrows + 1 : endRegNum;
2457 
2458     bool found = false;
2459 
2460     if (size >= (int)numEltPerGRF<Type_UW>())
2461     {
2462         if (!hintSet)
2463         {
2464             if (forward)
2465             {
2466                 found = availableRegs.findFreeMultipleRegsForward(startReg, align, regnum, nrows, lastRowSize, endReg, occupiedBundles, instID, isHybridAlloc, forbidden, hintSet);
2467             }
2468             else
2469             {
2470                 found = availableRegs.findFreeMultipleRegsBackward(startReg, align, regnum, nrows, lastRowSize, endReg, instID, isHybridAlloc, forbidden);
2471             }
2472         }
2473         else
2474         {
2475             regnum = hintReg;
2476             found = true;
2477         }
2478         if (found)
2479         {
2480             subregnum = 0;
2481             if (size % numEltPerGRF<Type_UW>() == 0)
2482             {
2483                 availableRegs.setGRFBusy(regnum, nrows);
2484             }
2485             else
2486             {
2487                 MUST_BE_TRUE(!hintSet, "Not expecting split for variable with alignment != n*sizeof(GRF)");
2488                 availableRegs.setGRFBusy(regnum, nrows - 1);
2489                 availableRegs.setWordBusy(regnum + nrows - 1, 0, lastRowSize);
2490             }
2491         }
2492     }
2493     else
2494     {
2495         MUST_BE_TRUE(!hintSet, "Not expecting split with sub-GRF granularity");
2496         found = availableRegs.findFreeSingleReg(startReg, size, align, subalign, regnum, subregnum, endReg, instID, isHybridAlloc, forward, forbidden);
2497         if (found)
2498         {
2499             availableRegs.setWordBusy(regnum, subregnum, size);
2500         }
2501     }
2502 
2503     if (found)
2504     {
2505         return nrows;
2506     }
2507 
2508     return 0;
2509 }
2510 
2511 // subregnum parameter is expected to be in units of word
freeRegs(int regnum,int subregnum,int numwords,int instID)2512 void PhyRegsManager::freeRegs(int regnum, int subregnum, int numwords, int instID)
2513 {
2514     while (numwords >= (int)numEltPerGRF<Type_UW>())
2515     {
2516         availableRegs.setGRFNotBusy(regnum, instID);
2517         numwords -= numEltPerGRF<Type_UW>();
2518         regnum++;
2519     }
2520 
2521     while (numwords >= 1)
2522     {
2523         availableRegs.setWordNotBusy(regnum, subregnum, instID);
2524         subregnum++;
2525 
2526         if (subregnum >= (int)numEltPerGRF<Type_UW>())
2527         {
2528             subregnum = 0;
2529             regnum++;
2530         }
2531         numwords--;
2532     }
2533 }
2534 
2535 // ********* LinearScan class implementation *********
2536 
LinearScan(GlobalRA & g,std::vector<LocalLiveRange * > & localLiveIntervals,std::list<InputLiveRange *,std_arena_based_allocator<InputLiveRange * >> & inputLivelIntervals,PhyRegsManager & pregMgr,PhyRegsLocalRA & pregs,Mem_Manager & memmgr,PhyRegSummary * s,unsigned int numReg,unsigned int glrs,bool roundRobin,bool bankConflict,bool internalConflict,bool splitLLR,unsigned int simdS)2537 LinearScan::LinearScan(GlobalRA& g, std::vector<LocalLiveRange*>& localLiveIntervals,
2538     std::list<InputLiveRange*, std_arena_based_allocator<InputLiveRange*>>& inputLivelIntervals,
2539     PhyRegsManager& pregMgr, PhyRegsLocalRA& pregs, Mem_Manager& memmgr, PhyRegSummary* s,
2540     unsigned int numReg, unsigned int glrs, bool roundRobin, bool bankConflict,
2541     bool internalConflict, bool splitLLR, unsigned int simdS)
2542     : gra(g)
2543     , builder(g.builder)
2544     , mem(memmgr)
2545     , pregManager(pregMgr)
2546     , initPregs(pregs)
2547     , liveIntervals(localLiveIntervals)
2548     , inputIntervals(inputLivelIntervals)
2549     , summary(s)
2550     , pregs(g.kernel.getNumRegTotal()* numEltPerGRF<Type_UW>(), false)
2551     , simdSize(simdS)
2552     , globalLRSize(glrs)
2553     , numRegLRA(numReg)
2554     , useRoundRobin(roundRobin)
2555     , doBankConflict(bankConflict)
2556     , highInternalConflict(internalConflict)
2557     , doSplitLLR(splitLLR)
2558 {
2559 
2560     // FIXME: this entire class is a mess, whole thing needs a rewrite
2561     if (g.kernel.getNumRegTotal() < 128)
2562     {
2563         // code below can segfault if user specifies #GRF < 128. We just hack it so bank1 == bank2
2564         bank1StartGRFReg = bank1_start = 0;
2565         bank2StartGRFReg = bank2_start = 0;
2566         bank1_end = bank2_end = numRegLRA - 1;
2567         startGRFReg = &bank1StartGRFReg;
2568         return;
2569     }
2570 
2571     //register number boundaries
2572     bank1_start = 0;
2573     bank1_end = SECOND_HALF_BANK_START_GRF - globalLRSize / 2 - 1;
2574     if (useRoundRobin) { //From middle to back
2575         bank2_start = SECOND_HALF_BANK_START_GRF + (globalLRSize + 1) / 2;
2576         bank2_end = numRegLRA - 1;
2577     } else { //From back to middle
2578         bank2_start = numRegLRA - 1;
2579         bank2_end = SECOND_HALF_BANK_START_GRF + (globalLRSize + 1) / 2;
2580     }
2581 
2582     //register number pointers
2583     bank1StartGRFReg = bank1_start;
2584     bank2StartGRFReg = bank2_start;
2585 
2586     //register pointer
2587     startGRFReg = &bank1StartGRFReg;
2588 
2589     int bank1AvailableRegNum = 0;
2590     for (int i = 0; i < SECOND_HALF_BANK_START_GRF; i++) {
2591         if (pregManager.getAvaialableRegs()->isGRFAvailable(i) && !pregManager.getAvaialableRegs()->isGRFBusy(i)) {
2592             bank1AvailableRegNum++;
2593         }
2594     }
2595     pregManager.getAvaialableRegs()->setBank1AvailableRegNum(bank1AvailableRegNum);
2596 
2597     int bank2AvailableRegNum = 0;
2598     for (unsigned int i = SECOND_HALF_BANK_START_GRF; i < numRegLRA; i++) {
2599         if (pregManager.getAvaialableRegs()->isGRFAvailable(i) && !pregManager.getAvaialableRegs()->isGRFBusy(i)) {
2600             bank2AvailableRegNum++;
2601         }
2602     }
2603     pregManager.getAvaialableRegs()->setBank2AvailableRegNum(bank2AvailableRegNum);
2604 }
2605 
2606 // Linear scan implementation
run(G4_BB * bb,IR_Builder & builder,LLR_USE_MAP & LLRUseMap)2607 void LinearScan::run(G4_BB* bb, IR_Builder& builder, LLR_USE_MAP& LLRUseMap)
2608 {
2609     unsigned int idx = 0;
2610     bool allocateRegResult = false;
2611 
2612     unsigned int firstLocalIdx = 0, firstGlobalIdx = 0;
2613     if (liveIntervals.size() > 0)
2614     {
2615         LocalLiveRange* firstLr = (*liveIntervals.begin());
2616         G4_INST *firstInst = firstLr->getFirstRef(firstLocalIdx);
2617         firstGlobalIdx = firstInst->getLexicalId();
2618     }
2619 
2620     for (auto lr : liveIntervals)
2621     {
2622         G4_INST *currInst = lr->getFirstRef(idx);
2623 
2624         expireRanges(idx);
2625         expireInputRanges(currInst->getLexicalId(), idx, firstGlobalIdx);
2626 
2627         if (!lr->hasHint() &&
2628             doBankConflict && builder.lowHighBundle() && (highInternalConflict || (simdSize >= 16 && builder.oneGRFBankDivision())))
2629         {
2630             allocateRegResult = allocateRegsFromBanks(lr);
2631         }
2632         else
2633         {
2634             allocateRegResult = allocateRegs(lr, bb, builder, LLRUseMap);
2635         }
2636 
2637         if (allocateRegResult)
2638         {
2639             updateActiveList(lr);
2640 #ifdef _DEBUG
2641             int startregnum, endregnum, startsregnum, endsregnum;
2642             G4_VarBase* op;
2643             op = lr->getPhyReg(startsregnum);
2644 
2645             startregnum = endregnum = op->asGreg()->getRegNum();
2646             endsregnum = startsregnum + (lr->getTopDcl()->getNumElems() * lr->getTopDcl()->getElemSize() / 2) - 1;
2647 
2648             MUST_BE_TRUE(((unsigned int)startregnum + lr->getTopDcl()->getNumRows() - 1) < numRegLRA, "Linear scan allocated unavailable physical GRF");
2649 
2650             if (lr->getTopDcl()->getNumRows() > 1) {
2651                 endregnum = startregnum + lr->getTopDcl()->getNumRows() - 1;
2652 
2653                 if (lr->getTopDcl()->getWordSize() > 0)
2654                 {
2655                     endsregnum = lr->getTopDcl()->getWordSize() % numEltPerGRF<Type_UW>() - 1;
2656                     if (endsregnum < 0) endsregnum = 15;
2657                 }
2658                 else
2659                     endsregnum = 15; // last word in GRF
2660             }
2661 #ifdef DEBUG_VERBOSE_ON
2662             DEBUG_VERBOSE("Assigned physical register to " << lr->getTopDcl()->getName() <<
2663                 " (r" << startregnum << "." << startsregnum << ":w - " <<
2664                 "r" << endregnum << "." << endsregnum << ":w)" << std::endl);
2665 #endif
2666 #endif
2667         }
2668     }
2669 
2670     expireAllActive();
2671 }
2672 
updateActiveList(LocalLiveRange * lr)2673 void LinearScan::updateActiveList(LocalLiveRange* lr)
2674 {
2675     bool done = false;
2676     unsigned int newlr_end;
2677 
2678     lr->getLastRef(newlr_end);
2679 
2680     for (auto active_it = active.begin();
2681         active_it != active.end();
2682         active_it++)
2683     {
2684         unsigned int end_idx;
2685         LocalLiveRange* active_lr = (*active_it);
2686 
2687         active_lr->getLastRef(end_idx);
2688 
2689         if (end_idx > newlr_end)
2690         {
2691             active.insert(active_it, lr);
2692             done = true;
2693             break;
2694         }
2695 
2696     }
2697 
2698     if (done == false)
2699         active.push_back(lr);
2700 
2701 }
2702 
expireAllActive()2703 void LinearScan::expireAllActive()
2704 {
2705     if (active.size() > 0)
2706     {
2707         // Expire any remaining ranges
2708         LocalLiveRange* lastActive = active.back();
2709         unsigned int endIdx;
2710 
2711         lastActive->getLastRef(endIdx);
2712 
2713         expireRanges(endIdx);
2714     }
2715 }
2716 
2717 // idx is the current position being processed
2718 // The function will expire active ranges that end at or before idx
expireRanges(unsigned int idx)2719 void LinearScan::expireRanges(unsigned int idx)
2720 {
2721     //active list is sorted in ascending order of starting index
2722 
2723     while (active.size() > 0)
2724     {
2725         unsigned int endIdx;
2726         LocalLiveRange* lr = active.front();
2727 
2728         lr->getLastRef(endIdx);
2729 
2730         if (endIdx <= idx)
2731         {
2732             G4_VarBase* preg;
2733             int subregnumword, subregnum;
2734 
2735             preg = lr->getPhyReg(subregnumword);
2736 
2737             if (preg)
2738             {
2739                 subregnum = LocalRA::convertSubRegOffFromWords(lr->getTopDcl(), subregnumword);
2740 
2741                 // Mark the RegVar object of dcl as assigned to physical register
2742                 lr->getTopDcl()->getRegVar()->setPhyReg(preg, subregnum);
2743                 lr->setAssigned(true);
2744 
2745                 if (summary)
2746                 {
2747                     // mark physical register as used atleast once
2748                     summary->markPhyRegs(preg, lr->getSizeInWords());
2749                 }
2750             }
2751 
2752             // Free physical regs marked for this range
2753             freeAllocedRegs(lr, true);
2754 
2755 #ifdef DEBUG_VERBOSE_ON
2756             DEBUG_VERBOSE("Expiring range " << lr->getTopDcl()->getName() << std::endl);
2757 #endif
2758 
2759             // Remove range from active list
2760             active.pop_front();
2761         }
2762         else
2763         {
2764             // As soon as we find first range that ends after ids break loop
2765             break;
2766         }
2767     }
2768 }
2769 
expireInputRanges(unsigned int global_idx,unsigned int local_idx,unsigned int first_idx)2770 void LinearScan::expireInputRanges(unsigned int global_idx, unsigned int local_idx, unsigned int first_idx)
2771 {
2772     while (inputIntervals.size() > 0)
2773     {
2774         InputLiveRange* lr = inputIntervals.front();
2775         unsigned int endIdx = lr->getLrEndIdx();
2776 
2777         if (endIdx <= global_idx)
2778         {
2779             unsigned int regnum = lr->getRegWordIdx() / numEltPerGRF<Type_UW>();
2780             unsigned int subRegInWord = lr->getRegWordIdx() % numEltPerGRF<Type_UW>();
2781             int inputIdx = (endIdx < first_idx) ? 0 : (local_idx - (global_idx - endIdx) * 2);
2782 
2783             // Free physical regs marked for this range
2784             pregManager.freeRegs(regnum, subRegInWord, 1, inputIdx);
2785             initPregs.setWordNotBusy(regnum, subRegInWord, 0);
2786 
2787 #ifdef DEBUG_VERBOSE_ON
2788             DEBUG_VERBOSE("Expiring input r" << regnum << "." << subRegInWord << std::endl);
2789 #endif
2790 
2791             // Remove range from inputIntervals list
2792             inputIntervals.pop_front();
2793         }
2794         else
2795         {
2796             // As soon as we find first range that ends after ids break loop
2797             break;
2798         }
2799     }
2800 }
2801 
2802 // Allocate registers to live range. It makes a decision whether to spill
2803 // a currently active range or the range passed as parameter. The range
2804 // that has larger size and is longer is the spill candidate.
2805 // Return true if free registers found, false if range is to be spilled
allocateRegs(LocalLiveRange * lr,G4_BB * bb,IR_Builder & builder,LLR_USE_MAP & LLRUseMap)2806 bool LinearScan::allocateRegs(LocalLiveRange* lr, G4_BB* bb, IR_Builder& builder, LLR_USE_MAP& LLRUseMap)
2807 {
2808     int regnum, subregnum;
2809     unsigned int localRABound = 0;
2810     unsigned int instID;
2811 
2812     G4_INST* currInst = lr->getFirstRef(instID);
2813 
2814     // Let local RA allocate only those ranges that need < 10 GRFs
2815     // Larger ranges are not many and are best left to global RA
2816     // as it can make a better judgment by considering the
2817     // spill cost.
2818     int nrows = 0;
2819     int size = lr->getSizeInWords();
2820     G4_Declare *dcl = lr->getTopDcl();
2821     G4_SubReg_Align subalign = gra.getSubRegAlign(dcl);
2822     BankAlign preBank = BankAlign::Either;
2823     unsigned short occupiedBundles = getOccupiedBundle(builder, gra, dcl, preBank);
2824     localRABound = numRegLRA - globalLRSize - 1;  //-1, localRABound will be counted in findFreeRegs()
2825     BankAlign bankAlign = gra.isEvenAligned(dcl) ? BankAlign::Even : BankAlign::Either;
2826     if (bankAlign == BankAlign::Either && doBankConflict)
2827     {
2828         bankAlign = gra.getBankAlign(dcl);
2829     }
2830     else
2831     {
2832         if (bankAlign == BankAlign::Either)
2833         {
2834             bankAlign = preBank;
2835         }
2836     }
2837 
2838     *startGRFReg = lr->hasHint() ? lr->getHint() : *startGRFReg;
2839     if (lr->hasHint() && gra.getVarSplitPass()->isPartialDcl(lr->getTopDcl()))
2840     {
2841         // Special alignment is not needed for var split intrinsic
2842         bankAlign = BankAlign::Either;
2843     }
2844 
2845     if (useRoundRobin)
2846     {
2847         nrows = pregManager.findFreeRegs(size,
2848             bankAlign,
2849             subalign,
2850             regnum,
2851             subregnum,
2852             *startGRFReg,
2853             localRABound,
2854             occupiedBundles,
2855             instID,
2856             false,
2857             lr->getForbidden(),
2858             lr->hasHint(),
2859             lr->getHint());
2860     }
2861     else
2862     {
2863         nrows = pregManager.findFreeRegs(size,
2864             bankAlign,
2865             subalign,
2866             regnum,
2867             subregnum,
2868             *startGRFReg,
2869             localRABound,
2870             occupiedBundles,
2871             instID,
2872             true,
2873             lr->getForbidden(),
2874             lr->hasHint(),
2875             lr->getHint());
2876 
2877         if (!nrows)
2878         {
2879             nrows = pregManager.findFreeRegs(size,
2880                 bankAlign,
2881                 subalign,
2882                 regnum,
2883                 subregnum,
2884                 *startGRFReg,
2885                 localRABound,
2886                 occupiedBundles,
2887                 instID,
2888                 false,
2889                 lr->getForbidden(),
2890                 lr->hasHint(),
2891                 lr->getHint());
2892         }
2893     }
2894 
2895     if (useRoundRobin)
2896     {
2897         if (nrows)
2898         {
2899             *startGRFReg = (regnum + nrows) % (localRABound);
2900         }
2901         else
2902         {
2903             unsigned int endGRFReg = *startGRFReg + nrows;
2904             endGRFReg = (endGRFReg > localRABound) ? localRABound : endGRFReg;
2905 
2906             nrows = pregManager.findFreeRegs(size,
2907                 bankAlign,
2908                 subalign,
2909                 regnum,
2910                 subregnum,
2911                 0,
2912                 endGRFReg,
2913                 occupiedBundles,
2914                 instID,
2915                 false,
2916                 lr->getForbidden(),
2917                 lr->hasHint(),
2918                 lr->getHint());
2919 
2920             if (nrows)
2921             {
2922                 *startGRFReg = (regnum + nrows) % (localRABound);
2923             }
2924         }
2925     }
2926 
2927     // If allocations fails, return false
2928     if (!nrows)
2929     {
2930         // Check if any range in active is longer and wider than lr.
2931         // Spill it if it is and call self again.
2932         for (auto active_it = active.rbegin();
2933             active_it != active.rend();
2934             active_it++)
2935         {
2936             LocalLiveRange* activeLR = (*active_it);
2937 
2938             if (activeLR->getSizeInWords() >= lr->getSizeInWords() &&
2939                 (!doSplitLLR ||
2940                     gra.getNumRefs(activeLR->getTopDcl()) > SPLIT_REF_CNT_THRESHOLD))
2941             {
2942                 // Another active range is larger than lr
2943                 unsigned int active_lr_end, lr_end;
2944 
2945                 activeLR->getLastRef(active_lr_end);
2946                 lr->getLastRef(lr_end);
2947 
2948                 if (active_lr_end > lr_end)
2949                 {
2950                     // Range in active is longer, so spill it
2951 #ifdef DEBUG_VERBOSE_ON
2952                     DEBUG_VERBOSE("Spilling range " << activeLR->getTopDcl()->getName() << std::endl);
2953 #endif
2954 
2955                     freeAllocedRegs(activeLR, false);
2956                     active.erase(--(active_it.base()));
2957 
2958                     if (doSplitLLR)
2959                     {
2960                         G4_Declare* oldDcl = activeLR->getTopDcl();
2961                         unsigned short totalElems = oldDcl->getTotalElems();
2962 
2963                         std::vector<std::pair<INST_LIST_ITER, unsigned int>>* useList = nullptr;
2964                         auto Iter = LLRUseMap.find(activeLR);
2965                         if (Iter != LLRUseMap.end())
2966                             useList = &(Iter->second);
2967 
2968                         if (useList &&
2969                             useList->size() > 2 &&
2970                             gra.getNumRefs(oldDcl) - useList->size() == 1 &&
2971                             (totalElems == 8 || totalElems == 16 || totalElems == 32))
2972                         {
2973                             bool split = false;
2974                             unsigned int useCnt = 0;
2975                             INST_LIST_ITER lastUseIt;
2976                             G4_Declare* newDcl = NULL;
2977                             for (auto usePoint : *useList)
2978                             {
2979                                 INST_LIST_ITER useIt = usePoint.first;
2980                                 G4_INST* useInst = *useIt;
2981                                 useCnt++;
2982 
2983                                 unsigned int currId = currInst->getLexicalId();
2984                                 if (useInst->getLexicalId() < currId ||
2985                                     useCnt < SPLIT_USE_CNT_THRESHOLD)
2986                                 {
2987                                     lastUseIt = useIt;
2988                                 }
2989                                 else if (split == false &&
2990                                          (useInst->getLexicalId() - (*lastUseIt)->getLexicalId()) > SPLIT_USE_DISTANCE_THRESHOLD &&
2991                                          !(oldDcl->getElemSize() >= 8 && oldDcl->getTotalElems() >= 16))
2992                                 {
2993                                     G4_Declare* splitDcl = NULL;
2994                                     const char* splitDclName = builder.getNameString(builder.mem, 16, "split_%s", oldDcl->getName());
2995                                     splitDcl = builder.createDeclareNoLookup(splitDclName, G4_GRF, oldDcl->getNumElems(), oldDcl->getNumRows(), oldDcl->getElemType());
2996                                     splitDcl->copyAlign(oldDcl);
2997 
2998                                     LocalLiveRange* splitLR = gra.GetOrCreateLocalLiveRange(splitDcl);
2999                                     splitLR->markSplit();
3000 
3001                                     INST_LIST_ITER iter = lastUseIt;
3002                                     iter++;
3003 
3004                                     // FIXME: The following code does not handle QWord variable spilling completely.
3005                                     // See the last condition of this if-stmt.
3006                                     G4_DstRegRegion* dst = builder.createDstRegRegion(splitDcl, 1);
3007                                     G4_SrcRegRegion* src = builder.createSrcRegRegion(oldDcl, builder.getRegionStride1());
3008                                     G4_ExecSize splitInstExecSize (oldDcl->getTotalElems() > 16 ? 16 : oldDcl->getTotalElems());
3009                                     G4_INST* splitInst = builder.createMov(
3010                                         splitInstExecSize,
3011                                         dst, src, InstOpt_WriteEnable, false);
3012                                     bb->insertBefore(iter, splitInst);
3013 
3014                                     unsigned int idx = 0;
3015                                     gra.getLocalLR(oldDcl)->setLastRef(splitInst, idx);
3016                                     gra.setNumRefs(oldDcl, useCnt);
3017                                     splitLR->setFirstRef(splitInst, idx);
3018                                     gra.setNumRefs(splitDcl, 2);
3019 
3020                                     if (totalElems == 32)
3021                                     {
3022                                         G4_DstRegRegion* dst = builder.createDst(splitDcl->getRegVar(), 2, 0, 1, oldDcl->getElemType());
3023                                         auto src = builder.createSrc(oldDcl->getRegVar(), 2, 0, builder.getRegionStride1(), oldDcl->getElemType());
3024                                         G4_INST* splitInst2 = builder.createMov(g4::SIMD16, dst, src, InstOpt_WriteEnable, false);
3025                                         bb->insertBefore(iter, splitInst2);
3026                                     }
3027 
3028                                     const char* newDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldDcl->getName());
3029                                     newDcl = builder.createDeclareNoLookup(newDclName, G4_GRF, oldDcl->getNumElems(), oldDcl->getNumRows(), oldDcl->getElemType());
3030                                     newDcl->copyAlign(oldDcl);
3031 
3032                                     unsigned int oldRefs = gra.getNumRefs(oldDcl);
3033                                     LocalLiveRange* newLR = gra.GetOrCreateLocalLiveRange(newDcl);
3034 
3035                                     iter = useIt;
3036 
3037                                     dst = builder.createDstRegRegion(newDcl, 1);
3038                                     src = builder.createSrcRegRegion(splitDcl, builder.getRegionStride1());
3039                                     G4_INST* movInst = builder.createMov(
3040                                         G4_ExecSize(splitDcl->getTotalElems() > 16 ? 16 : splitDcl->getTotalElems()),
3041                                         dst, src, InstOpt_WriteEnable, false);
3042                                     bb->insertBefore(iter, movInst);
3043 
3044                                     splitLR->setLastRef(movInst, idx);
3045                                     G4_INST* old_last_use = activeLR->getLastRef(idx);
3046                                     newLR->setFirstRef(splitInst, idx);
3047                                     newLR->setLastRef(old_last_use, idx);
3048                                     gra.setNumRefs(newDcl, oldRefs - useCnt + 1);
3049 
3050                                     if (totalElems == 32)
3051                                     {
3052                                         G4_DstRegRegion* dst = builder.createDst(newDcl->getRegVar(), 2, 0, 1, splitDcl->getElemType());
3053                                         auto src = builder.createSrc(splitDcl->getRegVar(), 2, 0, builder.getRegionStride1(), splitDcl->getElemType());
3054                                         G4_INST* movInst2 = builder.createMov(g4::SIMD16, dst, src, InstOpt_WriteEnable, false);
3055                                         bb->insertBefore(iter, movInst2);
3056                                     }
3057 
3058                                     unsigned int pos = usePoint.second;
3059                                     G4_SrcRegRegion* oldSrc = useInst->getSrc(pos)->asSrcRegRegion();
3060                                     G4_Declare *oldSrcDcl = oldSrc->getBase()->asRegVar()->getDeclare();
3061                                     G4_Declare* newSrcDcl = newDcl;
3062                                     G4_Declare *aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3063                                     if (aliasOldSrcDcl != NULL)
3064                                     {
3065                                         const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3066                                         newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3067                                         newSrcDcl->copyAlign(oldSrcDcl);
3068                                         newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3069                                     }
3070                                     auto newSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct, newSrcDcl->getRegVar(),
3071                                         oldSrc->getRegOff(), oldSrc->getSubRegOff(), oldSrc->getRegion(), oldSrc->getType());
3072                                     useInst->setSrc(newSrc, pos);
3073                                     while (aliasOldSrcDcl && aliasOldSrcDcl != oldDcl)
3074                                     {
3075                                         oldSrcDcl = aliasOldSrcDcl;
3076                                         const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3077                                         newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3078                                         newSrcDcl->copyAlign(oldSrcDcl);
3079                                         aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3080                                         MUST_BE_TRUE(aliasOldSrcDcl != NULL, "Invalid alias decl");
3081                                         newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3082                                     }
3083 
3084                                     split = true;
3085                                 }
3086                                 else if (split)
3087                                 {
3088                                     unsigned int pos = usePoint.second;
3089                                     G4_SrcRegRegion* oldSrc = useInst->getSrc(pos)->asSrcRegRegion();
3090                                     G4_Declare *oldSrcDcl = oldSrc->getBase()->asRegVar()->getDeclare();
3091                                     G4_Declare* newSrcDcl = newDcl;
3092                                     G4_Declare *aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3093                                     if (aliasOldSrcDcl != NULL)
3094                                     {
3095                                         const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3096                                         newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3097                                         newSrcDcl->copyAlign(oldSrcDcl);
3098                                         newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3099                                     }
3100                                     auto newSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct, newSrcDcl->getRegVar(),
3101                                         oldSrc->getRegOff(), oldSrc->getSubRegOff(), oldSrc->getRegion(), oldSrc->getType());;
3102                                     useInst->setSrc(newSrc, pos);
3103                                     while (aliasOldSrcDcl && aliasOldSrcDcl != oldDcl)
3104                                     {
3105                                         oldSrcDcl = aliasOldSrcDcl;
3106                                         const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3107                                         newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3108                                         newSrcDcl->copyAlign(oldSrcDcl);
3109                                         aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3110                                         MUST_BE_TRUE(aliasOldSrcDcl != NULL, "Invalid alias decl");
3111                                         newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3112                                     }
3113                                 }
3114                                 else
3115                                 {
3116                                     lastUseIt = useIt;
3117                                 }
3118                             }
3119                         }
3120                     }
3121 
3122                     // Try again
3123                     return allocateRegs(lr, bb, builder, LLRUseMap);
3124                 }
3125             }
3126         }
3127 
3128 #ifdef DEBUG_VERBOSE_ON
3129         DEBUG_VERBOSE("Spilling range " << lr->getTopDcl()->getName() << std::endl);
3130 #endif
3131 
3132         // Even after spilling everything could not find an allocation
3133         return false;
3134     }
3135 #ifdef DEBUG_VERBOSE_ON
3136     COUT_ERROR << lr->getTopDcl()->getName() << ":r" << regnum << "  BANK: " << gra.getBankConflict(lr->getTopDcl()) << std::endl;
3137 #endif
3138 
3139     lr->setPhyReg(builder.phyregpool.getGreg(regnum), subregnum);
3140 
3141     if (gra.getVarSplitPass()->isSplitDcl(lr->getTopDcl()))
3142     {
3143         // Update hint for all children of lr to coalesce
3144         coalesceSplit(lr);
3145     }
3146 
3147 #ifdef _DEBUG
3148     if (gra.getVarSplitPass()->isPartialDcl(lr->getTopDcl()))
3149     {
3150         int s;
3151         MUST_BE_TRUE(lr->getPhyReg(s)->asGreg()->getRegNum() == lr->getHint() ||
3152             !lr->hasHint(), "hint not honored for child dcl");
3153     }
3154 #endif
3155 
3156     return true;
3157 }
3158 
coalesceSplit(LocalLiveRange * lr)3159 void LinearScan::coalesceSplit(LocalLiveRange* lr)
3160 {
3161     // lr is split parent. set hint on all children.
3162     // immediately free unused rows of split variable.
3163     int subreg;
3164     auto regnum = lr->getPhyReg(subreg)->asGreg()->getRegNum();
3165     auto varSplit = gra.getVarSplitPass();
3166     std::unordered_set<unsigned int> unrefGRFs;
3167     for (unsigned int i = regnum; i != (regnum + lr->getTopDcl()->getNumRows()); i++)
3168     {
3169         unrefGRFs.insert(i);
3170     }
3171     for (auto l : liveIntervals)
3172     {
3173         if (!varSplit->isParentChildRelation(lr->getTopDcl(), l->getTopDcl()))
3174             continue;
3175 
3176         auto siblingNum = varSplit->getSiblingNum(l->getTopDcl());
3177         unsigned int sizePerChild = l->getTopDcl()->getNumRows();
3178 
3179         // this hint is binding on children to guarantee coalescing of split intrinsic
3180         l->setHint(regnum + (siblingNum * sizePerChild));
3181 #ifdef _DEBUG
3182         int s;
3183         if (l->getAssigned())
3184             MUST_BE_TRUE(!l->getPhyReg(s), "split child already allocated");
3185 #endif
3186         for (unsigned int i = regnum + (siblingNum * sizePerChild); i != regnum + ((siblingNum + 1) * sizePerChild); i++)
3187         {
3188             unrefGRFs.erase(i);
3189         }
3190     }
3191 
3192     // now free phy regs of split that dont have an intrinsic split emitted, ie unused
3193     // rows of parent dcl.
3194     unsigned int idx;
3195     lr->getFirstRef(idx);
3196     for (auto f : unrefGRFs)
3197     {
3198         pregManager.freeRegs(f, 0, numEltPerGRF<Type_UW>(), idx);
3199     }
3200 }
3201 
allocateRegsFromBanks(LocalLiveRange * lr)3202 bool LinearScan::allocateRegsFromBanks(LocalLiveRange* lr)
3203 {
3204     int regnum, subregnum;
3205     unsigned int tmpLocalRABound = 0;
3206     int nrows = 0;
3207     int lastUseSum1 = pregManager.getAvaialableRegs()->getLastUseSum1();
3208     int lastUseSum2 = pregManager.getAvaialableRegs()->getLastUseSum2();
3209     int bank1AvailableRegNum = pregManager.getAvaialableRegs()->getBank1AvailableRegNum();
3210     int bank2AvailableRegNum = pregManager.getAvaialableRegs()->getBank2AvailableRegNum();
3211     int size = lr->getSizeInWords();
3212     BankAlign align = gra.isEvenAligned(lr->getTopDcl()) ? BankAlign::Even : BankAlign::Either;
3213     G4_SubReg_Align subalign = gra.getSubRegAlign(lr->getTopDcl());
3214     unsigned int instID;
3215     lr->getFirstRef(instID);
3216     auto lrBC = gra.getBankConflict(lr->getTopDcl());
3217 
3218     // Reverse the order for FF two banks RA.
3219     // For FF, second bank register allocated from back to front
3220     if (align == BankAlign::Either && lrBC != BANK_CONFLICT_NONE)
3221     {
3222         switch (lrBC)
3223         {
3224         case BANK_CONFLICT_FIRST_HALF_EVEN:
3225         case BANK_CONFLICT_FIRST_HALF_ODD:
3226             align = BankAlign::Even;
3227             startGRFReg = &bank1StartGRFReg;
3228             tmpLocalRABound = bank1_end;
3229             break;
3230         case BANK_CONFLICT_SECOND_HALF_EVEN:
3231         case BANK_CONFLICT_SECOND_HALF_ODD:
3232             if (useRoundRobin)
3233             {
3234                 align = BankAlign::Odd;
3235             }
3236             startGRFReg = &bank2StartGRFReg;
3237             tmpLocalRABound = bank2_end;
3238             break;
3239         default: break;
3240         }
3241     }
3242     else
3243     {
3244         if (useRoundRobin)
3245         {
3246             if (bank1AvailableRegNum == 0 && bank2AvailableRegNum == 0)
3247             {
3248                 return false;
3249             }
3250 
3251             float bank1Average = bank1AvailableRegNum == 0 ? (float)0x7fffffff : (float)lastUseSum1 / bank1AvailableRegNum;
3252             float bank2Average = bank2AvailableRegNum == 0 ? (float)0x7fffffff : (float)lastUseSum2 / bank2AvailableRegNum;
3253             if (bank1Average > bank2Average)
3254             {
3255                 startGRFReg = &bank2StartGRFReg;
3256                 tmpLocalRABound = bank2_end;
3257             }
3258             else
3259             {
3260                 startGRFReg = &bank1StartGRFReg;
3261                 tmpLocalRABound = bank1_end;
3262             }
3263         }
3264         else
3265         {
3266             if (bank1AvailableRegNum < bank2AvailableRegNum)
3267             {
3268                 startGRFReg = &bank2StartGRFReg;
3269                 tmpLocalRABound = bank2_end;
3270             }
3271             else
3272             {
3273                 startGRFReg = &bank1StartGRFReg;
3274                 tmpLocalRABound = bank1_end;
3275             }
3276         }
3277     }
3278 
3279     if (useRoundRobin)
3280     {
3281         nrows = pregManager.findFreeRegs(size,
3282             align,
3283             subalign,
3284             regnum,
3285             subregnum,
3286             *startGRFReg,
3287             tmpLocalRABound,
3288             0,
3289             instID,
3290             false,
3291             lr->getForbidden(),
3292             false, 0);
3293 
3294         if (nrows)
3295         {
3296             //Wrapper handling
3297             if (tmpLocalRABound)
3298             {
3299                 *startGRFReg = (regnum + nrows) % (tmpLocalRABound);
3300             }
3301 
3302             if (startGRFReg == &bank2StartGRFReg)
3303             {
3304                 if (*startGRFReg < SECOND_HALF_BANK_START_GRF)
3305                 {
3306                     *startGRFReg += bank2_start;
3307                     if (*startGRFReg >= numRegLRA)
3308                     {
3309                         *startGRFReg = bank2_start;
3310                         return false;
3311                     }
3312                 }
3313             }
3314         }
3315         else
3316         {
3317             //record the original endReg in case failed to allocate in bank switch
3318             unsigned int endReg = *startGRFReg + nrows;
3319 
3320             //If failed, try another bank first.
3321             if (startGRFReg == &bank2StartGRFReg)
3322             {
3323                 startGRFReg = &bank1StartGRFReg;
3324                 tmpLocalRABound = bank1_end;
3325             }
3326             else
3327             {
3328                 startGRFReg = &bank2StartGRFReg;
3329                 tmpLocalRABound = bank2_end;
3330             }
3331 
3332             nrows = pregManager.findFreeRegs(size,
3333                 align,
3334                 subalign,
3335                 regnum,
3336                 subregnum,
3337                 *startGRFReg,
3338                 tmpLocalRABound,
3339                 0,
3340                 instID,
3341                 false,
3342                 lr->getForbidden(),
3343                 false, 0);
3344 
3345             if (!nrows)
3346             {
3347                 //Still fail, wrapper round in original bank
3348                 if (startGRFReg == &bank1StartGRFReg) //switch back to before
3349                 {
3350                     startGRFReg = &bank2StartGRFReg;
3351                     *startGRFReg = bank2_start;
3352                     tmpLocalRABound = ((endReg) > (bank2_end)) ? bank2_end : endReg;
3353                 }
3354                 else
3355                 {
3356                     startGRFReg = &bank1StartGRFReg;
3357                     *startGRFReg = bank1_start;
3358                     tmpLocalRABound = (endReg > bank1_end) ? bank1_end : endReg;
3359                 }
3360                 nrows = pregManager.findFreeRegs(size,
3361                     align,
3362                     subalign,
3363                     regnum,
3364                     subregnum,
3365                     *startGRFReg,
3366                     tmpLocalRABound,
3367                     0,
3368                     instID,
3369                     false,
3370                     lr->getForbidden(),
3371                     false, 0);
3372             }
3373         }
3374     }
3375     else
3376     {
3377         nrows = pregManager.findFreeRegs(size,
3378             align,
3379             subalign,
3380             regnum,
3381             subregnum,
3382             *startGRFReg,
3383             tmpLocalRABound,
3384             0,
3385             instID,
3386             true,
3387             lr->getForbidden(),
3388             false, 0);
3389 
3390         if (!nrows)
3391         {   //Try without window, no even/odd alignment for bank, but still low and high(keep in same bank)
3392             nrows = pregManager.findFreeRegs(size,
3393                 gra.isEvenAligned(lr->getTopDcl()) ? BankAlign::Even : BankAlign::Either,
3394                 subalign,
3395                 regnum,
3396                 subregnum,
3397                 *startGRFReg,
3398                 tmpLocalRABound,
3399                 0,
3400                 instID,
3401                 false,
3402                 lr->getForbidden(),
3403                 false, 0);
3404         }
3405 
3406         if (!nrows)
3407         {
3408             //Try another bank, no even/odd alignment for bank, also no low and high
3409             if (startGRFReg == &bank2StartGRFReg)
3410             {
3411                 startGRFReg = &bank1StartGRFReg;
3412                 tmpLocalRABound = bank1_end;
3413             }
3414             else
3415             {
3416                 startGRFReg = &bank2StartGRFReg;
3417                 tmpLocalRABound = bank2_end;
3418             }
3419 
3420             nrows = pregManager.findFreeRegs(size,
3421                 gra.isEvenAligned(lr->getTopDcl()) ? BankAlign::Even : BankAlign::Either,
3422                 subalign,
3423                 regnum,
3424                 subregnum,
3425                 *startGRFReg,
3426                 tmpLocalRABound,
3427                 0,
3428                 instID,
3429                 false,
3430                 lr->getForbidden(),
3431                 false, 0);
3432         }
3433     }
3434 
3435     // If allocations fails, return false
3436     if (!nrows)
3437     {
3438         return false;
3439     }
3440 
3441     lr->setPhyReg(builder.phyregpool.getGreg(regnum), subregnum);
3442 
3443     if (gra.getVarSplitPass()->isSplitDcl(lr->getTopDcl()))
3444     {
3445         // Update hint for all children of lr to coalesce
3446         coalesceSplit(lr);
3447     }
3448 
3449     MUST_BE_TRUE(!gra.getVarSplitPass()->isPartialDcl(lr->getTopDcl())
3450         || !lr->hasHint(), "not expecting to use this allocate method for split dcl");
3451 
3452 #ifdef DEBUG_VERBOSE_ON
3453     COUT_ERROR << lr->getTopDcl()->getName() << ":r" << regnum << "  BANK: " << gra.getBankConflict(lr->getTopDcl()) << std::endl;
3454 #endif
3455     return true;
3456 }
3457 
3458 // Mark physical register allocated to range lr as available
freeAllocedRegs(LocalLiveRange * lr,bool setInstID)3459 void LinearScan::freeAllocedRegs(LocalLiveRange* lr, bool setInstID)
3460 {
3461     int sregnum;
3462 
3463     G4_VarBase* preg = lr->getPhyReg(sregnum);
3464 
3465     MUST_BE_TRUE(preg != NULL,
3466         "Physical register not assigned to live range. Cannot free regs.");
3467 
3468     unsigned int idx = 0;
3469     if (setInstID)
3470     {
3471         lr->getLastRef(idx);
3472     }
3473 
3474     auto varSplitPass = gra.getVarSplitPass();
3475     if (!varSplitPass->isSplitDcl(lr->getTopDcl()))
3476     {
3477         pregManager.freeRegs(preg->asGreg()->getRegNum(),
3478             sregnum,
3479             lr->getSizeInWords(),
3480             idx);
3481     }
3482 }
3483 
3484 // ********* PhyRegSummary class implementation *********
3485 
3486 // Mark GRFs as used
markPhyRegs(G4_VarBase * pr,unsigned int size)3487 void PhyRegSummary::markPhyRegs(G4_VarBase* pr, unsigned int size)
3488 {
3489     // Assume that pr is aligned to GRF start if it cannot fit in a single GRF
3490     MUST_BE_TRUE(pr->isGreg(), "Expecting GRF as operand");
3491 
3492     int numGRFs = size / numEltPerGRF<Type_UW>();
3493     unsigned int regnum = pr->asGreg()->getRegNum();
3494 
3495     if (size%numEltPerGRF<Type_UW>() != 0)
3496         numGRFs++;
3497 
3498     for (int i = 0; i < numGRFs; i++)
3499     {
3500         GRFUsage[regnum + i] = true;
3501     }
3502 }
3503 
printBusyRegs()3504 void PhyRegSummary::printBusyRegs()
3505 {
3506     DEBUG_VERBOSE("GRFs used: ");
3507 
3508     for (int i = 0; i < (int)totalNumGRF; i++)
3509     {
3510         if (isGRFBusy(i) == true)
3511         {
3512             DEBUG_VERBOSE("r" << i << ", ");
3513         }
3514     }
3515 
3516     DEBUG_VERBOSE(std::endl << std::endl);
3517 }
3518