1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "LocalRA.h"
10 #include "RegAlloc.h"
11 #include "DebugInfo.h"
12 #include "common.h"
13
14 #include <fstream>
15 #include <tuple>
16
17 using namespace vISA;
18
19 //#define _DEBUG
20 //#define cout cerr
21
22 #define NUM_PREGS_FOR_UNIQUE_ASSIGN 50
23
24 #define SPLIT_REF_CNT_THRESHOLD 3
25 #define SPLIT_USE_CNT_THRESHOLD 2
26 #define SPLIT_USE_DISTANCE_THRESHOLD 100
27
28 void getForbiddenGRFs(std::vector<unsigned int>& regNum, G4_Kernel& kernel, unsigned stackCallRegSize, unsigned reserveSpillSize, unsigned reservedRegNum);
29 void getCallerSaveGRF(std::vector<unsigned int>& regNum, G4_Kernel* kernel);
30
LocalRA(BankConflictPass & b,GlobalRA & g)31 LocalRA::LocalRA(BankConflictPass& b, GlobalRA& g) :
32 kernel(g.kernel), builder(g.builder), mem(g.builder.mem), bc(b), gra(g)
33 {
34 }
35
getBankAlignForUniqueAssign(G4_Declare * dcl)36 BankAlign LocalRA::getBankAlignForUniqueAssign(G4_Declare *dcl)
37 {
38 // FIXME: this code is rather suspicious (we return even alignment
39 // for first_half_odd???), should revisit
40 switch (gra.getBankConflict(dcl))
41 {
42 case BANK_CONFLICT_FIRST_HALF_EVEN:
43 case BANK_CONFLICT_FIRST_HALF_ODD:
44 return builder.oneGRFBankDivision() ? BankAlign::Even : BankAlign::Even2GRF;
45 case BANK_CONFLICT_SECOND_HALF_EVEN:
46 case BANK_CONFLICT_SECOND_HALF_ODD:
47 return builder.oneGRFBankDivision() ? BankAlign::Odd : BankAlign::Odd2GRF;
48 default:
49 return BankAlign::Either;
50 }
51 }
52
53 // returns true if kernel contains a back edge
54 // call/return edge may also be considered as a back edge depending on layout.
hasBackEdge()55 bool LocalRA::hasBackEdge()
56 {
57 for (auto curBB : kernel.fg)
58 {
59
60 MUST_BE_TRUE(curBB->size() > 0 || curBB->Succs.size() > 1,
61 "Error in detecting back-edge: Basic block found with no inst list and multiple successors.");
62
63 if (curBB->size() > 0)
64 {
65 for (auto succ : curBB->Succs)
66 {
67 MUST_BE_TRUE(succ->size() > 0,
68 "Error in detecting back-edge: Destination basic block of a jmp has no instructions.");
69 if (curBB->getId() >= succ->getId())
70 {
71 return true;
72 }
73 }
74 }
75 }
76 return false;
77 }
78
getRowInfo(int size,int & nrows,int & lastRowSize)79 void LocalRA::getRowInfo(int size, int& nrows, int& lastRowSize)
80 {
81 if (size <= (int)numEltPerGRF<Type_UW>())
82 {
83 nrows = 1;
84 }
85 else
86 {
87 // nrows is total number of rows, including last row even if it is partial
88 nrows = size / numEltPerGRF<Type_UW>();
89 // lastrowsize is number of words actually used in last row
90 lastRowSize = size%numEltPerGRF<Type_UW>();
91
92 if (size%numEltPerGRF<Type_UW>() != 0)
93 {
94 nrows++;
95 }
96
97 if (lastRowSize == 0)
98 {
99 lastRowSize = numEltPerGRF<Type_UW>();
100 }
101 }
102
103 return;
104 }
105
evenAlign()106 void LocalRA::evenAlign()
107 {
108 if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
109 kernel.fg.size() > 2)
110 {
111 if ((GlobalRA::useGenericAugAlign() && kernel.getSimdSize() >= numEltPerGRF<Type_UD>()) ||
112 (!GlobalRA::useGenericAugAlign() && kernel.getSimdSize() > numEltPerGRF<Type_UD>()))
113 {
114 // Set alignment of all GRF candidates
115 // to 2GRF except for NoMask variables
116 #ifdef DEBUG_VERBOSE_ON
117 DEBUG_VERBOSE("Updating alignment to Even for GRF candidates" << std::endl);
118 #endif
119 gra.evenAlign();
120 }
121 gra.updateSubRegAlignment(GRFALIGN);
122 // Since we are piggy backing on mask field of G4_Declare,
123 // we need to make sure we reset it before going further.
124 resetMasks();
125 }
126
127 return;
128 }
129
130 /*
131 * Pre-localRA analsyis including:
132 * 1) Mark reference to create live ranges, 2) alignment handling, 3) Avialable register calcuating and marking
133 */
preLocalRAAnalysis()134 void LocalRA::preLocalRAAnalysis()
135 {
136 unsigned int numRowsEOT = 0;
137 bool lifetimeOpFound = false;
138
139 int numGRF = kernel.getNumRegTotal();
140
141 // Clear LocalLiveRange* computed preRA
142 gra.clearStaleLiveRanges();
143
144 // Mark references made to decls to sieve local from global ranges
145 markReferences(numRowsEOT, lifetimeOpFound);
146
147 // Mark those physical registers unavailable that are declared with Output attribute
148 blockOutputPhyRegs();
149
150 if (lifetimeOpFound == true)
151 {
152 // Check whether pseudo_kill/lifetime.end are only references
153 // for their respective variables. Remove them if so. Doing this
154 // helps reduce number of variables in symbol table increasing
155 // changes of skipping global RA.
156 removeUnrequiredLifetimeOps();
157 }
158
159 unsigned int numRowsReserved = numRowsEOT;
160
161 // Remove unreferenced dcls
162 gra.removeUnreferencedDcls();
163
164 if (builder.getOption(vISA_HybridRAWithSpill) || builder.getOption(vISA_FastCompileRA))
165 {
166 unsigned reserveSpillSize = 0;
167 unsigned int spillRegSize = 0;
168 unsigned int indrSpillRegSize = 0;
169 gra.determineSpillRegSize(spillRegSize, indrSpillRegSize);
170
171 reserveSpillSize = spillRegSize + indrSpillRegSize;
172 if (reserveSpillSize >= kernel.getNumCalleeSaveRegs())
173 {
174 const_cast<Options*>(builder.getOptions())->setOption(vISA_HybridRAWithSpill, false);
175 const_cast<Options*>(builder.getOptions())->setOption(vISA_FastCompileRA, false);
176 numRegLRA = numGRF - numRowsReserved;
177 }
178 else
179 {
180 numRegLRA = numGRF - numRowsReserved - reserveSpillSize - 3; // spill Header, scratch offset, a0.2
181 }
182 }
183 else
184 {
185 numRegLRA = numGRF - numRowsReserved;
186 }
187 bool isStackCall = (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc());
188 unsigned int stackCallRegSize = isStackCall ? kernel.numReservedABIGRF() : 0;
189 unsigned int reservedGRFNum = builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum);
190
191 if (isStackCall || reservedGRFNum || builder.getOption(vISA_Debug))
192 {
193 std::vector<unsigned int> forbiddenRegs;
194 getForbiddenGRFs(forbiddenRegs, kernel, stackCallRegSize, 0, reservedGRFNum);
195 for (unsigned int i = 0, size = forbiddenRegs.size(); i < size; i++)
196 {
197 unsigned int regNum = forbiddenRegs[i];
198 pregs->setGRFUnavailable(regNum);
199 }
200
201 // FIXME: this will break if # of GRF is not 128.
202 if (isStackCall)
203 {
204 // Set r60 to r99 as unavailable for local RA since these registers are callee saved
205 // Local RA will use only caller save registers for allocation.
206 std::vector<unsigned int> callerSaveRegs;
207 getCallerSaveGRF(callerSaveRegs, &kernel);
208 for (unsigned int i = 0, size = callerSaveRegs.size(); i < size; i++)
209 {
210 unsigned int regNum = callerSaveRegs[i];
211 pregs->setGRFUnavailable(regNum);
212 }
213
214 numRegLRA = (kernel.getNumRegTotal() - 3) - numRowsReserved;
215 }
216
217 if (builder.getOption(vISA_Debug))
218 {
219 // Since LocalRA is not undone when debug info generation is required,
220 // for keeping compile time low, we allow fewer physical registers
221 // as assignable candidates. Without this, we could run in to a
222 // situation where very few physical registers are available for GRA
223 // and it is unable to assign registers even with spilling.
224 #define USABLE_GRFS_WITH_DEBUG_INFO 80
225 int maxSendReg = 0;
226 for (auto bb : kernel.fg)
227 {
228 for (auto inst : *bb)
229 {
230 if (inst->isSend() || inst->isSplitSend())
231 {
232 maxSendReg = ((int)inst->getMsgDesc()->getDstLenRegs() > maxSendReg) ?
233 ((int)inst->getMsgDesc()->getDstLenRegs()) : maxSendReg;
234 maxSendReg = ((int)inst->getMsgDesc()->getSrc0LenRegs() > maxSendReg) ?
235 ((int)inst->getMsgDesc()->getSrc0LenRegs()) : maxSendReg;
236 maxSendReg = ((int)inst->getMsgDesc()->getSrc1LenRegs() > maxSendReg) ?
237 ((int)inst->getMsgDesc()->getSrc1LenRegs()) : maxSendReg;
238 }
239 }
240 }
241
242 int maxRegsToUse = USABLE_GRFS_WITH_DEBUG_INFO;
243 if (maxSendReg > (numGRF - USABLE_GRFS_WITH_DEBUG_INFO))
244 {
245 maxRegsToUse = (numGRF - maxSendReg) - 10;
246 }
247
248 // Also check max size of addressed GRF
249 unsigned int maxAddressedRows = 0;
250 for (auto dcl : kernel.Declares)
251 {
252 if (dcl->getAddressed() &&
253 maxAddressedRows < dcl->getNumRows())
254 {
255 maxAddressedRows = dcl->getNumRows();
256 }
257 }
258
259 // Assume indirect operand of maxAddressedRows exists
260 // on dst, src0, src1. This is overly conservative but
261 // should work for general cases.
262 if ((numGRF - maxRegsToUse) / 3 < (int)maxAddressedRows)
263 {
264 maxRegsToUse = (numGRF - (maxAddressedRows * 3));
265
266 if (maxRegsToUse < 0)
267 maxRegsToUse = 0;
268 }
269
270 for (int i = maxRegsToUse; i < numGRF; i++)
271 {
272 pregs->setGRFUnavailable(i);
273 }
274 }
275 }
276 else
277 {
278 pregs->setSimpleGRFAvailable(true);
279 const Options *opt = builder.getOptions();
280 if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D ||
281 opt->getOption(vISA_enablePreemption) ||
282 opt->getOption(vISA_ReserveR0))
283 {
284 pregs->setR0Forbidden();
285 }
286
287 if (opt->getOption(vISA_enablePreemption))
288 {
289 pregs->setR1Forbidden();
290 }
291 }
292
293 return;
294 }
295
trivialAssignRA(bool & needGlobalRA,bool threeSourceCandidate)296 void LocalRA::trivialAssignRA(bool& needGlobalRA, bool threeSourceCandidate)
297 {
298 if (builder.getOption(vISA_RATrace))
299 {
300 std::cout << "\t--trivial RA\n";
301 }
302
303 needGlobalRA = assignUniqueRegisters(threeSourceCandidate, builder.lowHighBundle(), false);
304
305 if (!needGlobalRA)
306 {
307 if (threeSourceCandidate)
308 {
309 kernel.setRAType(RA_Type::TRIVIAL_BC_RA);
310 }
311 else
312 {
313 kernel.setRAType(RA_Type::TRIVIAL_RA);
314 }
315 }
316
317 return;
318 }
319
320 // Entry point to local RA
localRAPass(bool doRoundRobin,bool doSplitLLR)321 bool LocalRA::localRAPass(bool doRoundRobin, bool doSplitLLR)
322 {
323 bool needGlobalRA = true;
324 PhyRegsLocalRA localPregs = *pregs;
325
326 // Iterate over each BB and perform linear scan
327 #ifdef DEBUG_VERBOSE_ON
328 localPregs.printBusyRegs();
329 printAddressTakenDecls();
330 printLocalRACandidates();
331 #endif
332
333 if (kernel.fg.funcInfoTable.empty() && !hasBackEdge())
334 {
335 calculateInputIntervals();
336 }
337
338 #ifdef DEBUG_VERBOSE_ON
339 printInputLiveIntervals();
340 #endif
341
342 int totalGRFNum = kernel.getNumRegTotal();
343 for (auto curBB : kernel.fg)
344 {
345 PhyRegsManager pregManager(localPregs, doBCR);
346 std::vector<LocalLiveRange*> liveIntervals;
347
348 PhyRegSummary* summary = new (mem)PhyRegSummary(totalGRFNum);
349
350 calculateLiveIntervals(curBB, liveIntervals);
351
352 #ifdef DEBUG_VERBOSE_ON
353 printLocalLiveIntervals(curBB, liveIntervals);
354 #endif
355
356 LinearScan ra(gra, liveIntervals, inputIntervals, pregManager,
357 localPregs, mem, summary, numRegLRA, globalLRSize,
358 doRoundRobin, doBCR,
359 highInternalConflict, doSplitLLR, kernel.getSimdSize());
360 ra.run(curBB, builder, LLRUseMap);
361
362 #ifdef DEBUG_VERBOSE_ON
363 COUT_ERROR << "BB" << curBB->getId() << std::endl;
364 summary->printBusyRegs();
365 #endif
366
367 // Tag summary of physical registers used by local RA
368 kernel.fg.addBBLRASummary(curBB, summary);
369
370 if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
371 {
372 updateDebugInfo(kernel, liveIntervals);
373 }
374 }
375
376 if (unassignedRangeFound() == false)
377 {
378 needGlobalRA = false;
379 #ifdef DEBUG_VERBOSE_ON
380 DEBUG_VERBOSE("Skipping global RA because local RA allocated all live-ranges." << std::endl);
381 #endif
382 }
383
384 if (builder.getOption(vISA_OptReport))
385 {
386 localRAOptReport();
387 }
388
389 if (needGlobalRA == true &&
390 !(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()))
391 {
392 // Check whether unique physical registers can be assigned
393 // to global ranges.
394 bool twoBanksAssign = false;
395
396 if (builder.oneGRFBankDivision())
397 {
398 twoBanksAssign = highInternalConflict || kernel.getSimdSize() == g4::SIMD16;
399 }
400 else
401 {
402 twoBanksAssign = highInternalConflict;
403 }
404
405 needGlobalRA = assignUniqueRegisters(doBCR, twoBanksAssign, builder.getOption(vISA_HybridRAWithSpill) && !doRoundRobin);
406 }
407
408 if (needGlobalRA && doRoundRobin)
409 {
410 undoLocalRAAssignments(true);
411 }
412
413 if (needGlobalRA == false && builder.getOption(vISA_OptReport))
414 {
415 std::ofstream optreport;
416 getOptReportStream(optreport, builder.getOptions());
417 optreport << "Allocated 100% GRF ranges without graph coloring." << std::endl;
418 closeOptReportStream(optreport);
419 }
420
421 return needGlobalRA;
422 }
423
localRA()424 bool LocalRA::localRA()
425 {
426 if (builder.getOption(vISA_RATrace))
427 {
428 std::cout << "--local RA--\n";
429 }
430
431 bool doRoundRobin = builder.getOption(vISA_LocalRARoundRobin);
432
433 int numGRF = kernel.getNumRegTotal();
434 PhyRegsLocalRA phyRegs(&builder, numGRF);
435 pregs = &phyRegs;
436
437 globalLRSize = 0;
438 bool reduceBCInTAandFF = false;
439 bool needGlobalRA = true;
440
441 doSplitLLR = (builder.getOption(vISA_SpiltLLR) &&
442 kernel.fg.size() == 1 &&
443 kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D);
444
445 preLocalRAAnalysis();
446
447 bool reduceBCInRR = false;
448
449 if (builder.getOption(vISA_LocalBankConflictReduction) && builder.hasBankCollision())
450 {
451 reduceBCInRR = bc.setupBankConflictsForKernel(doRoundRobin, reduceBCInTAandFF, numRegLRA, highInternalConflict);
452 }
453
454 if (!kernel.fg.getHasStackCalls() && !kernel.fg.getIsStackCallFunc() &&
455 !hasSplitInsts)
456 {
457 trivialAssignRA(needGlobalRA, reduceBCInTAandFF);
458 if (!needGlobalRA)
459 {
460 return true;
461 }
462 }
463
464 if (doRoundRobin)
465 {
466 doRoundRobin = countLiveIntervals();
467 }
468
469 if ((!doRoundRobin && reduceBCInTAandFF) || (doRoundRobin && reduceBCInRR)) //To avoid the scheduling issue for send
470 {
471 doBCR = true;
472 }
473 // Local RA passes:
474 // RR+BC --> FF+BC -->FF-->Hybrids
475 // or
476 // RR --> FF --> Hybrid
477 if (doRoundRobin)
478 {
479 if (builder.getOption(vISA_RATrace))
480 {
481 std::cout << "\t--round-robin " << (doBCR ? "BCR " : "") << "RA\n";
482 }
483 needGlobalRA = localRAPass(true, false);
484 if (needGlobalRA == true)
485 {
486 doRoundRobin = false;
487 }
488 }
489
490 if (!doRoundRobin)
491 {
492 if (kernel.getOption(vISA_forceBCR) && doBCR)
493 {
494 if (builder.getOption(vISA_RATrace))
495 {
496 std::cout << "\t--first-fit " << "BCR " << "RA\n";
497 }
498 needGlobalRA = localRAPass(false, doSplitLLR);
499 }
500
501 if (needGlobalRA)
502 {
503 if (builder.getOption(vISA_RATrace))
504 {
505 std::cout << "\t--first-fit " << "RA\n";
506 }
507 globalLRSize = 0;
508 if (builder.getOption(vISA_HybridRAWithSpill))
509 {
510 countLiveIntervals();
511 }
512 else
513 {
514 globalLRSize = 0;
515 }
516 evenAlign();
517 needGlobalRA = localRAPass(false, doSplitLLR);
518 }
519 }
520
521 if (needGlobalRA == false)
522 {
523 if (doRoundRobin)
524 {
525 kernel.setRAType(doBCR ? RA_Type::LOCAL_ROUND_ROBIN_BC_RA : RA_Type::LOCAL_ROUND_ROBIN_RA);
526 }
527 else
528 {
529 kernel.setRAType(doBCR ? RA_Type::LOCAL_FIRST_FIT_BC_RA : RA_Type::LOCAL_FIRST_FIT_RA);
530 }
531 }
532
533 setLexicalID(true);
534
535 return !needGlobalRA;
536 }
537
resetMasks()538 void LocalRA::resetMasks()
539 {
540 auto& dcls = kernel.Declares;
541
542 for (auto dcl : dcls)
543 {
544 gra.setMask(dcl, {});
545 }
546 }
547
blockOutputPhyRegs()548 void LocalRA::blockOutputPhyRegs()
549 {
550 for (auto dcl : kernel.Declares)
551 {
552 if (dcl->isOutput() &&
553 dcl->isInput())
554 {
555 // The live-range may never be referenced in the CFG so we need
556 // this special pass to block physical registers corresponding
557 // to those declares. This may be true for FC.
558 pregs->markPhyRegs(dcl);
559 }
560 }
561 }
562
563 class isLifetimeCandidateOpCandidateForRemoval
564 {
565 public:
isLifetimeCandidateOpCandidateForRemoval(GlobalRA & g)566 isLifetimeCandidateOpCandidateForRemoval(GlobalRA& g) : gra(g)
567 {
568 }
569
570 GlobalRA& gra;
571
operator ()(G4_INST * inst)572 bool operator()(G4_INST* inst)
573 {
574 if (inst->isPseudoKill() ||
575 inst->isLifeTimeEnd())
576 {
577 G4_Declare* topdcl;
578
579 if (inst->isPseudoKill())
580 {
581 topdcl = GetTopDclFromRegRegion(inst->getDst());
582 }
583 else
584 {
585 topdcl = GetTopDclFromRegRegion(inst->getSrc(0));
586 }
587
588 if (gra.getNumRefs(topdcl) == 0 &&
589 (topdcl->getRegFile() == G4_GRF ||
590 topdcl->getRegFile() == G4_INPUT))
591 {
592 // Remove this lifetime op
593 return true;
594 }
595 }
596
597 return false;
598 }
599 };
600
removeUnrequiredLifetimeOps()601 void LocalRA::removeUnrequiredLifetimeOps()
602 {
603 // Iterate over all instructions and inspect only
604 // pseudo_kills/lifetime.end instructions. Remove
605 // instructions that have no other useful instruction.
606
607 for (BB_LIST_ITER bb_it = kernel.fg.begin();
608 bb_it != kernel.fg.end();
609 bb_it++)
610 {
611 G4_BB* bb = (*bb_it);
612 bb->erase(std::remove_if(bb->begin(), bb->end(), isLifetimeCandidateOpCandidateForRemoval(this->gra)),
613 bb->end());
614 }
615 }
616
findRegisterCandiateWithAlignForward(int & i,BankAlign align,bool evenAlign)617 void PhyRegsLocalRA::findRegisterCandiateWithAlignForward(int &i, BankAlign align, bool evenAlign)
618 {
619 if ((align == BankAlign::Even) && (i % 2 != 0))
620 {
621 i++;
622 }
623 else if ((align == BankAlign::Odd) && (i % 2 == 0))
624 {
625 i++;
626 }
627 else if (align == BankAlign::Even2GRF)
628 {
629 while ((i % 4 >= 2) || (evenAlign && (i % 2 != 0)))
630 {
631 i++;
632 }
633 }
634 else if (align == BankAlign::Odd2GRF)
635 {
636 while ((i % 4 < 2) || (evenAlign && (i % 2 != 0)))
637 {
638 i++;
639 }
640 }
641 }
642
get_bundle(unsigned int baseReg,int offset)643 unsigned int PhyRegsLocalRA::get_bundle(unsigned int baseReg, int offset)
644 {
645 if (builder->hasPartialInt64Support())
646 {
647 return (((baseReg + offset) % 32) / 2);
648 }
649 return (((baseReg + offset) % 64) / 4);
650 }
651
findBundleConflictFreeRegister(int curReg,int endReg,unsigned short occupiedBundles,BankAlign align,bool evenAlign)652 int PhyRegsLocalRA::findBundleConflictFreeRegister(int curReg,
653 int endReg,
654 unsigned short occupiedBundles,
655 BankAlign align,
656 bool evenAlign)
657 {
658 int i = curReg;
659 while (occupiedBundles & (1 << get_bundle(i, 0)) ||
660 occupiedBundles & (1 << get_bundle(i, 1)))
661 {
662 i++;
663 findRegisterCandiateWithAlignForward(i, align, evenAlign);
664 if (i > endReg) //Out of bound, drop the bundle conflict considration
665 {
666 i = curReg;
667 break;
668 }
669 }
670
671 return i;
672 }
673
findRegisterCandiateWithAlignBackward(int & i,BankAlign align,bool evenAlign)674 void PhyRegsLocalRA::findRegisterCandiateWithAlignBackward(int &i, BankAlign align, bool evenAlign)
675 {
676 if ((align == BankAlign::Even) && (i % 2 != 0))
677 {
678 i--;
679 }
680 else if ((align == BankAlign::Odd) && (i % 2 == 0))
681 {
682 i--;
683 }
684 else if (align == BankAlign::Even2GRF)
685 {
686 while (i >= 0 && ((i % 4 >= 2) || (evenAlign && (i % 2 != 0))))
687 {
688 i--;
689 }
690 }
691 else if (align == BankAlign::Odd2GRF)
692 {
693 while (i >= 0 && ((i % 4 < 2) || (evenAlign && (i % 2 != 0))))
694 {
695 i--;
696 }
697 }
698 }
699
getOccupiedBundle(IR_Builder & builder,GlobalRA & gra,G4_Declare * dcl,BankAlign & preferBank)700 inline static unsigned short getOccupiedBundle(IR_Builder& builder, GlobalRA& gra, G4_Declare* dcl, BankAlign &preferBank)
701 {
702 unsigned short occupiedBundles = 0;
703 unsigned bundleNum = 0;
704 unsigned int evenBankNum = 0;
705 unsigned int oddBankNum = 0;
706
707 if (!builder.hasDPAS() || !builder.getOption(vISA_EnableDPASBundleConflictReduction))
708 {
709 return 0;
710 }
711
712 for (const BundleConflict& conflict : gra.getBundleConflicts(dcl))
713 {
714 LocalLiveRange* lr = gra.getLocalLR(conflict.dcl);
715 int subregnum;
716 G4_VarBase* preg = lr->getPhyReg(subregnum);
717 if (preg == NULL)
718 {
719 preg = lr->getTopDcl()->getRegVar()->getPhyReg();
720 }
721
722 if (preg != NULL)
723 {
724 int offset = conflict.offset;
725 unsigned int reg = preg->asGreg()->getRegNum();
726 unsigned int bank = gra.get_bank(reg, offset);
727 if (bank)
728 {
729 oddBankNum++;
730 }
731 else
732 {
733 evenBankNum++;
734 }
735 unsigned int bundle = gra.get_bundle(reg, offset);
736 unsigned int bundle1 = gra.get_bundle(reg, offset + 1);
737 if (!(occupiedBundles & ((unsigned short)1 << bundle)))
738 {
739 bundleNum++;
740 }
741 occupiedBundles |= (unsigned short)1 << bundle;
742 occupiedBundles |= (unsigned short)1 << bundle1;
743 }
744 }
745 if (bundleNum > 12)
746 {
747 occupiedBundles = 0;
748 }
749
750 if (evenBankNum || oddBankNum)
751 {
752 preferBank = (evenBankNum < oddBankNum) ? BankAlign::Even : BankAlign::Odd;
753 }
754
755 return occupiedBundles;
756 }
757
assignUniqueRegisters(bool twoBanksRA,bool twoDirectionsAssign,bool hybridWithSpill)758 bool LocalRA::assignUniqueRegisters(bool twoBanksRA, bool twoDirectionsAssign, bool hybridWithSpill)
759 {
760 // Iterate over all dcls and calculate number of rows
761 // required if each unallocated dcl had its own physical
762 // register. Check number of registers used up by local RA.
763 // If unique assignments are possible, then assign and set
764 // needGlobalRA to false.
765 unsigned int numRows = 0;
766 std::vector<G4_Declare*> unallocatedRanges;
767 bool needGlobalRA = true;
768 uint32_t nextEOTGRF = numRegLRA;
769 std::unordered_set<unsigned int> emptyForbidden;
770 auto varSplitPass = gra.getVarSplitPass();
771
772 if (nextEOTGRF < 112 && builder.hasEOTGRFBinding())
773 {
774 // we can't guarantee unique assignments for all the EOT sources
775 // punt to global RA
776 return true;
777 }
778
779 // Identify global ranges not having an allocation
780 for (auto dcl : kernel.Declares)
781 {
782 if (dcl->getAliasDeclare() == NULL &&
783 dcl->getRegFile() == G4_GRF &&
784 dcl->getRegVar()->isPhyRegAssigned() == false)
785 {
786 auto dclLR = gra.getLocalLR(dcl);
787
788 if (dclLR && dclLR->isEOT() && builder.hasEOTGRFBinding())
789 {
790 // For EOT use r112 - r127
791 // for simplicity we assign each EOT source unique GRFs
792 G4_Greg* phyReg = builder.phyregpool.getGreg(nextEOTGRF);
793 dcl->getRegVar()->setPhyReg(phyReg, 0);
794 dclLR->setPhyReg(phyReg, 0);
795 dclLR->setAssigned(true);
796 // we have to make sure src0 and src1 of split send get different GRFs
797 nextEOTGRF += dcl->getNumRows();
798 if (kernel.getOption(vISA_GenerateDebugInfo))
799 {
800 uint32_t start = 0, end = 0;
801 for (auto rit = kernel.fg.rbegin();
802 rit != kernel.fg.rend();
803 rit++)
804 {
805 G4_BB* bb = (*rit);
806
807 if (bb->size() > 0)
808 {
809 end = bb->back()->getCISAOff();
810 break;
811 }
812 }
813 updateDebugInfo(kernel, dcl, start, end);
814 }
815 }
816 else
817 {
818 if (varSplitPass->isSplitDcl(dcl) ||
819 varSplitPass->isPartialDcl(dcl))
820 {
821 return true;
822 }
823 numRows += dcl->getNumRows();
824 if (hybridWithSpill)
825 {
826 if (dcl->isDoNotSpill())
827 {
828 unallocatedRanges.push_back(dcl);
829 }
830 }
831 else
832 {
833 unallocatedRanges.push_back(dcl);
834 }
835 }
836 }
837 }
838
839 if (numRows < numRegLRA || hybridWithSpill)
840 {
841 // Get superset of registers used by local RA
842 // in all basic blocks.
843 PhyRegsManager phyRegMgr(*pregs, twoBanksRA);
844
845 for (auto bb : kernel.fg)
846 {
847 PhyRegSummary* summary = kernel.fg.getBBLRASummary(bb);
848 if (summary != NULL)
849 {
850 for (unsigned int i = 0; i < numRegLRA; i++)
851 {
852 if (summary->isGRFBusy(i))
853 {
854 phyRegMgr.getAvaialableRegs()->setGRFBusy(i, 1);
855 }
856 }
857 }
858 }
859
860
861 needGlobalRA = hybridWithSpill;
862 bool assignFromFront = true;
863 #ifdef DEBUG_VERBOSE_ON
864 COUT_ERROR << "Trival RA: " << std::endl;
865 #endif
866 for (auto dcl : unallocatedRanges)
867 {
868 int regNum = 0;
869 int subregNum = 0;
870 int sizeInWords = dcl->getWordSize();
871 int nrows = 0;
872 BankAlign bankAlign = BankAlign::Either;
873 if (twoBanksRA &&
874 gra.getBankConflict(dcl) != BANK_CONFLICT_NONE)
875 {
876 bankAlign = getBankAlignForUniqueAssign(dcl);
877
878 if (bankAlign == BankAlign::Even || bankAlign == BankAlign::Even2GRF)
879 {
880 assignFromFront = true;
881 }
882 if (twoDirectionsAssign && (bankAlign == BankAlign::Odd || bankAlign == BankAlign::Odd2GRF))
883 {
884 assignFromFront = false;
885 }
886 }
887
888 auto assignAlign = gra.isEvenAligned(dcl) ? BankAlign::Even : bankAlign;
889 G4_SubReg_Align subAlign = builder.GRFAlign() ? GRFALIGN : gra.getSubRegAlign(dcl);
890
891 if (assignFromFront)
892 {
893 BankAlign preferBank = BankAlign::Either;
894 unsigned short occupiedBundles = getOccupiedBundle(builder, gra, dcl, preferBank);
895 nrows = phyRegMgr.findFreeRegs(sizeInWords, assignAlign,
896 subAlign, regNum, subregNum, 0, numRegLRA - 1, occupiedBundles, 0, false, emptyForbidden, false, 0);
897 }
898 else
899 {
900 nrows = phyRegMgr.findFreeRegs(sizeInWords, assignAlign,
901 subAlign, regNum, subregNum, numRegLRA - 1, 0, 0, 0, false, emptyForbidden, false, 0);
902 }
903
904 if (nrows)
905 {
906 G4_Greg* phyReg = builder.phyregpool.getGreg(regNum);
907 // adjust subreg num to type size
908 if (dcl->getElemSize() == 1)
909 {
910 subregNum *= 2;
911 }
912 else
913 {
914 subregNum /= (dcl->getElemSize() / 2);
915 }
916 dcl->getRegVar()->setPhyReg(phyReg, subregNum);
917 }
918 else
919 {
920 needGlobalRA = true;
921 break;
922 }
923 if (twoBanksRA && twoDirectionsAssign)
924 {
925 assignFromFront = !assignFromFront;
926 }
927 }
928
929 if (needGlobalRA)
930 {
931 for (auto dcl : unallocatedRanges)
932 {
933 if (!hybridWithSpill)
934 {
935 dcl->getRegVar()->resetPhyReg();
936 }
937 else if (!dcl->isDoNotSpill())
938 {
939 dcl->getRegVar()->resetPhyReg();
940 }
941 }
942 }
943 else
944 {
945 if (kernel.getOption(vISA_GenerateDebugInfo))
946 {
947 uint32_t start = 0, end = UNMAPPABLE_VISA_INDEX;
948 for (auto rit = kernel.fg.rbegin();
949 rit != kernel.fg.rend();
950 rit++)
951 {
952 G4_BB* bb = (*rit);
953
954 if (bb->size() > 0)
955 {
956 for (auto inst : *bb)
957 {
958 if (inst->getCISAOff() != UNMAPPABLE_VISA_INDEX)
959 {
960 end = bb->back()->getCISAOff();
961 break;
962 }
963 }
964 if (end != UNMAPPABLE_VISA_INDEX)
965 break;
966 }
967 }
968
969 for (auto dcl : unallocatedRanges)
970 {
971 updateDebugInfo(kernel, dcl, start, end);
972 }
973
974 // unallocatedRanges doesnt contain input dcls
975 for (auto dcl : kernel.Declares)
976 {
977 if (dcl->isInput())
978 {
979 updateDebugInfo(kernel, dcl, start, end);
980 }
981 }
982 }
983 }
984
985 }
986 else
987 {
988 needGlobalRA = true;
989 }
990 #ifdef DEBUG_VERBOSE_ON
991 COUT_ERROR << std::endl;
992 #endif
993
994 return needGlobalRA;
995 }
996
removeUnreferencedDcls()997 void GlobalRA::removeUnreferencedDcls()
998 {
999 // Iterate over all dcls and remove those with 0
1000 // ref count and not addressed. This is done only for
1001 // GRF dcls.
1002
1003 // Propagate top dcl info to aliases
1004 for (auto dcl : kernel.Declares)
1005 {
1006 if (dcl->getAliasDeclare() != nullptr)
1007 {
1008 setNumRefs(dcl, getNumRefs(dcl->getAliasDeclare()));
1009 }
1010 }
1011
1012 auto isUnrefDcl = [this](G4_Declare* dcl)
1013 {
1014 return (dcl->getRegFile() == G4_GRF || dcl->getRegFile() == G4_INPUT) &&
1015 getNumRefs(dcl) == 0 &&
1016 dcl->getRegVar()->isPhyRegAssigned() == false &&
1017 dcl != kernel.fg.builder->getBuiltinR0()
1018 && dcl != kernel.fg.builder->getSpillSurfaceOffset() &&
1019 (!kernel.fg.builder->getOptions()->getuInt32Option(vISA_CodePatch) ||
1020 dcl->getAliasDeclare() != kernel.fg.builder->getInputR1());
1021 };
1022
1023 kernel.Declares.erase(
1024 std::remove_if(kernel.Declares.begin(), kernel.Declares.end(), isUnrefDcl),
1025 kernel.Declares.end());
1026 }
1027
unassignedRangeFound()1028 bool LocalRA::unassignedRangeFound()
1029 {
1030 bool unassignedRangeFound = false;
1031 for (auto dcl : kernel.Declares)
1032 {
1033 if (dcl->getAliasDeclare() == NULL &&
1034 dcl->getRegFile() == G4_GRF &&
1035 dcl->getRegVar()->isPhyRegAssigned() == false)
1036 {
1037 unassignedRangeFound = true;
1038 break;
1039 }
1040 }
1041
1042 return unassignedRangeFound;
1043 }
1044
1045 // Update numRegsUsed to max physical register used based on summary
updateRegUsage(PhyRegSummary * summary,unsigned int & numRegsUsed)1046 void LocalRA::updateRegUsage(PhyRegSummary* summary, unsigned int& numRegsUsed)
1047 {
1048 for (uint32_t i = 0, numGRF = summary->getNumGRF(); i < numGRF; i++)
1049 {
1050 if (numRegsUsed < i && summary->isGRFBusy(i) == true)
1051 {
1052 numRegsUsed = i;
1053 }
1054 }
1055 }
1056
1057 // Emit out localRA opt report
localRAOptReport()1058 void LocalRA::localRAOptReport()
1059 {
1060 unsigned int totalRanges = 0, localRanges = 0;
1061
1062 for (auto dcl : kernel.Declares)
1063 {
1064 auto dcl_it_lr = gra.getLocalLR(dcl);
1065 if (dcl_it_lr &&
1066 dcl_it_lr->isLiveRangeLocal() &&
1067 dcl_it_lr->isGRFRegAssigned())
1068 {
1069 localRanges++;
1070 }
1071
1072 if ((dcl->getRegFile() == G4_GRF ||
1073 dcl->getRegFile() == G4_INPUT) &&
1074 dcl->getAliasDeclare() == NULL)
1075 {
1076 totalRanges++;
1077 }
1078 }
1079
1080 if (kernel.getOption(vISA_OptReport))
1081 {
1082 std::ofstream optreport;
1083 getOptReportStream(optreport, kernel.getOptions());
1084 optreport << std::endl;
1085 optreport << "Total GRF ranges: " << totalRanges << std::endl;
1086 optreport << "GRF ranges allocated by local RA: " << localRanges << std::endl;
1087 if (totalRanges)
1088 {
1089 optreport << (int)(localRanges * 100 / totalRanges) <<
1090 "% allocated by local RA" << std::endl << std::endl;
1091 }
1092 closeOptReportStream(optreport);
1093 }
1094 }
1095
1096 // Given a src/dst reg region in opnd, traverse to its base and
1097 // return the declaration. Resolve any aliases and return the
1098 // topmost declaration.
1099 // opnd->topdcl should give same result for most cases. But for
1100 // spill code, if an address register spills then opnd->topdcl does
1101 // not return expected dcl for the fill.
GetTopDclFromRegRegion(G4_Operand * opnd)1102 G4_Declare* GetTopDclFromRegRegion(G4_Operand* opnd)
1103 {
1104 G4_Declare* dcl = nullptr;
1105
1106 MUST_BE_TRUE(opnd->isRegRegion(), "Operand is not a register region so cannot have a top dcl");
1107 G4_VarBase* base = opnd->getBase();
1108
1109 if (base && base->isRegVar())
1110 {
1111 dcl = base->asRegVar()->getDeclare()->getRootDeclare();
1112 }
1113
1114 return dcl;
1115 }
1116
convertSubRegOffFromWords(G4_Declare * dcl,int subregnuminwords)1117 unsigned int LocalRA::convertSubRegOffFromWords(G4_Declare* dcl, int subregnuminwords)
1118 {
1119 // Return subreg offset in units of dcl's element size.
1120 // Input is subregnum in word units.
1121 unsigned int subregnum;
1122
1123 subregnum = (subregnuminwords * 2) / dcl->getElemSize();
1124
1125 return subregnum;
1126 }
1127
convertSubRegOffToWords(G4_Declare * dcl,int subregnum)1128 unsigned int LocalRA::convertSubRegOffToWords(G4_Declare* dcl, int subregnum)
1129 {
1130 // Return subreg offset in units of words from dcl's element size
1131 unsigned int subregnuminwords;
1132
1133 subregnuminwords = subregnum * dcl->getElemSize() / 2;
1134
1135 return subregnuminwords;
1136 }
1137
undoLocalRAAssignments(bool clearInterval)1138 void LocalRA::undoLocalRAAssignments(bool clearInterval)
1139 {
1140 if (kernel.getOption(vISA_OptReport))
1141 {
1142 std::ofstream optreport;
1143 getOptReportStream(optreport, kernel.getOptions());
1144 optreport << "Undoing local RA assignments" << std::endl;
1145 closeOptReportStream(optreport);
1146 }
1147
1148 // Undo all assignments made by local RA
1149 for (auto dcl : kernel.Declares)
1150 {
1151 LocalLiveRange* lr = gra.getLocalLR(dcl);
1152 if (lr != NULL)
1153 {
1154 if (lr->getAssigned() == true)
1155 {
1156 // Undo the assignment
1157 lr->setAssigned(false);
1158 lr->getTopDcl()->getRegVar()->resetPhyReg();
1159
1160 if (kernel.getOption(vISA_GenerateDebugInfo))
1161 {
1162 kernel.getKernelDebugInfo()->getLiveIntervalInfo(lr->getTopDcl(), false)->clearLiveIntervals();
1163 }
1164 }
1165 if (clearInterval)
1166 {
1167 if (lr->isLiveRangeLocal())
1168 {
1169 lr->setFirstRef(NULL, 0);
1170 }
1171 if (lr->getTopDcl()->getRegFile() == G4_INPUT)
1172 {
1173 lr->setLastRef(NULL, 0);
1174 }
1175 }
1176 }
1177 }
1178
1179 // Delete summary information stored with each basic block
1180 kernel.fg.clearBBLRASummaries();
1181 }
1182
GetOrCreateLocalLiveRange(G4_Declare * topdcl)1183 LocalLiveRange* GlobalRA::GetOrCreateLocalLiveRange(G4_Declare* topdcl)
1184 {
1185 LocalLiveRange* lr = getLocalLR(topdcl);
1186
1187 // Check topdcl of operand and setup a new live range if required
1188 if (!lr)
1189 {
1190 localLiveRanges.push_back(LocalLiveRange(builder));
1191 lr = &localLiveRanges.back();
1192 setLocalLR(topdcl, lr);
1193 }
1194
1195 MUST_BE_TRUE(lr != NULL, "Local LR could not be created");
1196 return lr;
1197 }
1198
markReferencesInOpnd(G4_Operand * opnd,bool isEOT,INST_LIST_ITER inst_it,unsigned int pos)1199 void LocalRA::markReferencesInOpnd(G4_Operand* opnd, bool isEOT, INST_LIST_ITER inst_it,
1200 unsigned int pos)
1201 {
1202 G4_Declare* topdcl = NULL;
1203
1204 if ((opnd->isSrcRegRegion() || opnd->isDstRegRegion()))
1205 {
1206 topdcl = GetTopDclFromRegRegion(opnd);
1207
1208 if (topdcl &&
1209 (topdcl->getRegFile() == G4_GRF || topdcl->getRegFile() == G4_INPUT))
1210 {
1211 // Handle GRF here
1212 MUST_BE_TRUE(topdcl->getAliasDeclare() == NULL, "Not topdcl");
1213 LocalLiveRange* lr = gra.GetOrCreateLocalLiveRange(topdcl);
1214 lr->recordRef(curBB);
1215
1216 if (doSplitLLR &&
1217 opnd->isSrcRegRegion())
1218 {
1219 LLR_USE_MAP_ITER useMapIter;
1220 useMapIter = LLRUseMap.find(lr);
1221 if (useMapIter == LLRUseMap.end())
1222 {
1223 std::vector<std::pair<INST_LIST_ITER, unsigned int>> useList;
1224 useList.push_back(make_pair(inst_it, pos));
1225 LLRUseMap.insert(make_pair(lr, useList));
1226 }
1227 else
1228 {
1229 (*useMapIter).second.push_back(make_pair(inst_it, pos));
1230 }
1231 }
1232
1233 if (topdcl->getRegVar()
1234 && topdcl->getRegVar()->isPhyRegAssigned()
1235 && topdcl->getRegVar()->getPhyReg()->isGreg())
1236 {
1237 pregs->markPhyRegs(topdcl);
1238 }
1239
1240 if (isEOT)
1241 {
1242 lr->markEOT();
1243 }
1244
1245 gra.recordRef(topdcl);
1246
1247 if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
1248 opnd->isDstRegRegion() &&
1249 !topdcl->getAddressed())
1250 {
1251 if (opnd->getInst()->isWriteEnableInst() == false)
1252 {
1253 }
1254 else
1255 {
1256 gra.setAugmentationMask(topdcl, AugmentationMasks::NonDefault);
1257 }
1258 }
1259 }
1260 else {
1261 // Handle register other than GRF
1262 }
1263 }
1264 else if (opnd->isAddrExp())
1265 {
1266 G4_AddrExp* addrExp = opnd->asAddrExp();
1267
1268 topdcl = addrExp->getRegVar()->getDeclare();
1269 while (topdcl->getAliasDeclare() != NULL)
1270 topdcl = topdcl->getAliasDeclare();
1271
1272 MUST_BE_TRUE(topdcl != NULL, "Top dcl was null for addr exp opnd");
1273
1274 LocalLiveRange* lr = gra.GetOrCreateLocalLiveRange(topdcl);
1275 lr->recordRef(curBB);
1276 lr->markIndirectRef();
1277
1278 if (topdcl->getRegVar()
1279 && topdcl->getRegVar()->isPhyRegAssigned()
1280 && topdcl->getRegVar()->getPhyReg()->isGreg())
1281 {
1282 pregs->markPhyRegs(topdcl);
1283 }
1284
1285 gra.recordRef(topdcl);
1286 }
1287 }
1288
markReferencesInInst(INST_LIST_ITER inst_it)1289 void LocalRA::markReferencesInInst(INST_LIST_ITER inst_it)
1290 {
1291 auto inst = (*inst_it);
1292
1293 if (inst->getNumDst() > 0)
1294 {
1295 // Scan dst
1296 G4_Operand* dst = inst->getDst();
1297
1298 if (dst)
1299 {
1300 markReferencesInOpnd(dst, false, inst_it, 0);
1301 }
1302 }
1303
1304 // Scan srcs
1305 for (int i = 0, nSrcs = inst->getNumSrc(); i < nSrcs; i++)
1306 {
1307 G4_Operand* src = inst->getSrc(i);
1308
1309 if (src)
1310 {
1311 markReferencesInOpnd(src, inst->isEOT(), inst_it, i);
1312 }
1313 }
1314 }
1315
setLexicalID(bool includePseudo)1316 void LocalRA::setLexicalID(bool includePseudo)
1317 {
1318 unsigned int id = 0;
1319 for (auto bb : kernel.fg)
1320 {
1321 for (auto curInst : *bb)
1322 {
1323 if ((!includePseudo) && (curInst->isPseudoKill() ||
1324 curInst->isLifeTimeEnd()))
1325 {
1326 curInst->setLexicalId(id);
1327 }
1328 else
1329 {
1330 curInst->setLexicalId(id++);
1331 }
1332 }
1333 }
1334 }
1335
markReferences(unsigned int & numRowsEOT,bool & lifetimeOpFound)1336 void LocalRA::markReferences(unsigned int& numRowsEOT,
1337 bool& lifetimeOpFound)
1338 {
1339 unsigned int id = 0;
1340 // Iterate over all BBs
1341 for (auto _curBB : kernel.fg)
1342 {
1343 curBB = _curBB;
1344 // Iterate over all insts
1345 for (INST_LIST_ITER inst_it = curBB->begin(), inst_end = curBB->end(); inst_it != inst_end; ++inst_it)
1346 {
1347 G4_INST* curInst = (*inst_it);
1348
1349 if (curInst->isPseudoKill() ||
1350 curInst->isLifeTimeEnd())
1351 {
1352 curInst->setLexicalId(id);
1353 lifetimeOpFound = true;
1354 if (curInst->isLifeTimeEnd())
1355 {
1356 markReferencesInInst(inst_it);
1357 }
1358 continue;
1359 }
1360
1361 if (curInst->isSplitIntrinsic())
1362 hasSplitInsts = true;
1363
1364 // set lexical ID
1365 curInst->setLexicalId(id++);
1366
1367 if (curInst->isEOT() && kernel.fg.builder->hasEOTGRFBinding())
1368 {
1369 numRowsEOT += curInst->getSrc(0)->getTopDcl()->getNumRows();
1370
1371 if (curInst->isSplitSend() && !curInst->getSrc(1)->isNullReg())
1372 {
1373 // both src0 and src1 have to be >=r112
1374 numRowsEOT += curInst->getSrc(1)->getTopDcl()->getNumRows();
1375 }
1376 }
1377
1378 markReferencesInInst(inst_it);
1379 }
1380 }
1381 }
1382
1383 // Function to calculate input register live intervals,
1384 // which are now tracked at word granularity because
1385 // there may be overlap between two different input variables.
calculateInputIntervals()1386 void LocalRA::calculateInputIntervals()
1387 {
1388 setLexicalID(false);
1389
1390 int numGRF = kernel.getNumRegTotal();
1391 std::vector<uint32_t> inputRegLastRef(numGRF * numEltPerGRF<Type_UW>(), UINT_MAX);
1392
1393 for (BB_LIST_RITER bb_it = kernel.fg.rbegin(), bb_rend = kernel.fg.rend();
1394 bb_it != bb_rend;
1395 bb_it++)
1396 {
1397 G4_BB* bb = (*bb_it);
1398
1399 for (INST_LIST_RITER inst_it = bb->rbegin(), inst_rend = bb->rend();
1400 inst_it != inst_rend;
1401 inst_it++)
1402 {
1403 G4_INST* curInst = (*inst_it);
1404
1405 G4_Declare* topdcl = NULL;
1406 LocalLiveRange* lr = NULL;
1407
1408 // scan dst operand (may be unnecessary but added for safety)
1409 if (curInst->getDst() != NULL)
1410 {
1411 // Scan dst
1412 G4_DstRegRegion* dst = curInst->getDst();
1413
1414 topdcl = GetTopDclFromRegRegion(dst);
1415
1416 if (topdcl && (lr = gra.getLocalLR(topdcl)))
1417 {
1418 if (topdcl->getRegFile() == G4_INPUT &&
1419 !(dst->isAreg()) &&
1420 topdcl->isOutput() == false &&
1421 lr->hasIndirectAccess() == false &&
1422 !builder.isPreDefArg(topdcl))
1423 {
1424 unsigned int lastRef = 0;
1425
1426 MUST_BE_TRUE(lr->isGRFRegAssigned(), "Input variable has no pre-assigned physical register");
1427
1428 if (lr->getLastRef(lastRef) == NULL)
1429 {
1430 unsigned int curInstId = curInst->getLexicalId();
1431 lr->setLastRef(curInst, curInstId);
1432
1433 G4_RegVar* var = topdcl->getRegVar();
1434 unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
1435 unsigned int regOff = var->getPhyRegOff();
1436 unsigned int idx = regNum * numEltPerGRF<Type_UW>() +
1437 (regOff * topdcl->getElemSize()) / G4_WSIZE + topdcl->getWordSize() - 1;
1438
1439 unsigned int numWords = topdcl->getWordSize();
1440 for (int i = numWords - 1; i >= 0; --i, --idx)
1441 {
1442 if (inputRegLastRef[idx] == UINT_MAX &&
1443 // Check for registers that are marked as forbidden,
1444 // e.g., BuiltinR0 and those reserved for stack call
1445 pregs->isGRFAvailable(idx / numEltPerGRF<Type_UW>()))
1446 {
1447 inputRegLastRef[idx] = curInstId;
1448 inputIntervals.push_front(new (mem)InputLiveRange(idx, curInstId));
1449 if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
1450 {
1451 updateDebugInfo(kernel, topdcl, 0, curInst->getCISAOff());
1452 }
1453 }
1454 }
1455 }
1456 }
1457 }
1458 }
1459
1460 // Scan src operands
1461 for (int i = 0, nSrcs = curInst->getNumSrc(); i < nSrcs; i++)
1462 {
1463 G4_Operand* src = curInst->getSrc(i);
1464
1465 if (src && src->getTopDcl())
1466 {
1467 topdcl = GetTopDclFromRegRegion(src);
1468
1469 if (topdcl && (lr = gra.getLocalLR(topdcl)))
1470 {
1471 // Check whether it is input
1472 if ((topdcl->getRegFile() == G4_INPUT || topdcl->isLiveIn()) &&
1473 !(src->isAreg()) &&
1474 topdcl->isOutput() == false &&
1475 lr->hasIndirectAccess() == false &&
1476 !builder.isPreDefArg(topdcl))
1477 {
1478 unsigned int lastRef = 0;
1479
1480 MUST_BE_TRUE(lr->isGRFRegAssigned(), "Input variable has no pre-assigned physical register");
1481
1482 if (lr->getLastRef(lastRef) == NULL)
1483 {
1484 unsigned int curInstId = curInst->getLexicalId();
1485 lr->setLastRef(curInst, curInstId);
1486
1487 G4_RegVar* var = topdcl->getRegVar();
1488 unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
1489 unsigned int regOff = var->getPhyRegOff();
1490 unsigned int idx = regNum * numEltPerGRF<Type_UW>() +
1491 (regOff * TypeSize(topdcl->getElemType())) / G4_WSIZE + topdcl->getWordSize() - 1;
1492
1493 unsigned int numWords = topdcl->getWordSize();
1494 for (int i = numWords - 1; i >= 0; --i, --idx)
1495 {
1496 if (inputRegLastRef[idx] == UINT_MAX &&
1497 // Check for registers that are marked as forbidden,
1498 // e.g., BuiltinR0 and those reserved for stack call
1499 pregs->isGRFAvailable(idx / numEltPerGRF<Type_UW>()))
1500 {
1501 inputRegLastRef[idx] = curInstId;
1502 if (builder.avoidDstSrcOverlap() &&
1503 curInst->getDst() != NULL &&
1504 hasDstSrcOverlapPotential(curInst->getDst(), src->asSrcRegRegion()))
1505 {
1506 inputIntervals.push_front(new (mem)InputLiveRange(idx, curInstId + 1));
1507 }
1508 else
1509 {
1510 inputIntervals.push_front(new (mem)InputLiveRange(idx, curInstId));
1511 }
1512 if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
1513 {
1514 updateDebugInfo(kernel, topdcl, 0, curInst->getCISAOff());
1515 }
1516 }
1517 }
1518 }
1519 }
1520 }
1521 }
1522 }
1523 }
1524 }
1525
1526 #ifdef _DEBUG
1527 for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
1528 dcl_it != kernel.Declares.end();
1529 dcl_it++)
1530 {
1531 G4_Declare* curDcl = (*dcl_it);
1532
1533 if (curDcl->isOutput() &&
1534 gra.getLocalLR(curDcl) && gra.getLocalLR(curDcl)->isGRFRegAssigned())
1535 {
1536 G4_RegVar* var = curDcl->getRegVar();
1537 unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
1538 unsigned int regOff = var->getPhyRegOff();
1539 unsigned int idx = regNum * numEltPerGRF<Type_UW>() +
1540 (regOff * TypeSize(curDcl->getElemType())) / G4_WSIZE;
1541
1542 unsigned int numWords = curDcl->getWordSize();
1543 for (unsigned int i = 0; i < numWords; ++i, ++idx)
1544 {
1545 if (inputRegLastRef[idx] != UINT_MAX)
1546 {
1547 MUST_BE_TRUE(false, "Invalid output variable with overlapping input variable!");
1548 }
1549 }
1550 }
1551 }
1552 #endif
1553 }
1554
hasDstSrcOverlapPotential(G4_DstRegRegion * dst,G4_SrcRegRegion * src)1555 bool LocalRA::hasDstSrcOverlapPotential(G4_DstRegRegion* dst, G4_SrcRegRegion* src)
1556 {
1557 int dstOpndNumRows = 0;
1558
1559 if (dst->getBase()->isRegVar())
1560 {
1561 G4_Declare* dstDcl = dst->getBase()->asRegVar()->getDeclare();
1562 if (dstDcl != nullptr)
1563 {
1564 int dstOffset = (dstDcl->getOffsetFromBase() + dst->getLeftBound()) / numEltPerGRF<Type_UB>();
1565 G4_DstRegRegion* dstRgn = dst;
1566 dstOpndNumRows = dstRgn->getSubRegOff() + dstRgn->getLinearizedEnd() - dstRgn->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
1567
1568 if (src != NULL &&
1569 src->isSrcRegRegion() &&
1570 src->asSrcRegRegion()->getBase()->isRegVar())
1571 {
1572 G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
1573 G4_Declare* srcDcl = src->getBase()->asRegVar()->getDeclare();
1574 int srcOffset = (srcDcl->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
1575 bool srcOpndNumRows = srcRgn->getSubRegOff() + srcRgn->getLinearizedEnd() - srcRgn->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
1576
1577 if (dstOpndNumRows || srcOpndNumRows)
1578 {
1579 if (!(gra.isEvenAligned(dstDcl) && gra.isEvenAligned(srcDcl) &&
1580 srcOffset % 2 == dstOffset % 2 &&
1581 dstOpndNumRows && srcOpndNumRows))
1582 {
1583 return true;
1584 }
1585 }
1586 }
1587 }
1588 }
1589
1590 return false;
1591 }
1592
1593 // Function to calculate local live intervals in ascending order of starting point
1594 // in basic block bb.
1595 //
1596 // FIXME: current implementation cannot handle partial use-before-define case as shown below:
1597 //
1598 // loop_start:
1599 // V1 (1) =
1600 // P1 =
1601 // (P1) (16) = V1
1602 // ...
1603 // V1 (16) =
1604 // ...
1605 // (P2) Jmp loop_start
1606 //
1607 // One way to fix this is to check the emask and simd size of local def/use and make sure they
1608 // satisfy the followign rules:
1609 // 1) The SIMD size of a use should be equal to or smaller than earlier def seen for that variable.
1610 // 2) The Emask of use should be equal of narrower than def seen earlier.
1611 // 3) Each use should be fully defined in current BB (e.g., if a use is predicated,
1612 // the def should be either non-predicated or have the same predicate).
1613 //
calculateLiveIntervals(G4_BB * bb,std::vector<LocalLiveRange * > & liveIntervals)1614 void LocalRA::calculateLiveIntervals(G4_BB* bb, std::vector<LocalLiveRange*>& liveIntervals)
1615 {
1616 int idx = 0;
1617 bool brk = false;
1618
1619 for (INST_LIST_ITER inst_it = bb->begin(), bbend = bb->end();
1620 inst_it != bbend && !brk;
1621 inst_it++, idx += 2)
1622 {
1623 G4_INST* curInst = (*inst_it);
1624 G4_Declare* topdcl = NULL;
1625 LocalLiveRange* lr = NULL;
1626
1627 if (curInst->isPseudoKill() ||
1628 curInst->isLifeTimeEnd())
1629 {
1630 continue;
1631 }
1632
1633 // Scan srcs
1634 for (int i = 0, nSrcs = curInst->getNumSrc(); i < nSrcs; i++)
1635 {
1636 G4_Operand* src = curInst->getSrc(i);
1637
1638 if (src && src->getTopDcl())
1639 {
1640 // Scan all srcs
1641 topdcl = GetTopDclFromRegRegion(src);
1642 LocalLiveRange* topdclLR = nullptr;
1643
1644 if (topdcl && (topdclLR = gra.getLocalLR(topdcl)))
1645 {
1646 lr = topdclLR;
1647
1648 // Check whether local LR is a candidate
1649 if (lr->isLiveRangeLocal() &&
1650 lr->isGRFRegAssigned() == false)
1651 {
1652 unsigned int startIdx;
1653
1654 if (lr->getFirstRef(startIdx) == NULL)
1655 {
1656 // Skip this lr and update its ref count,
1657 // So it will be treated as a global LR
1658 lr->recordRef(NULL);
1659 continue;
1660 }
1661
1662 MUST_BE_TRUE(idx > 0, "Candidate use found in first inst of basic block");
1663
1664 if ((builder.WaDisableSendSrcDstOverlap() &&
1665 ((curInst->isSend() && i == 0) ||
1666 (curInst->isSplitSend() && i == 1)))
1667 || (curInst->isDpas() && i == 1) //For DPAS, as part of same instruction, src1 should not have overlap with dst. Src0 and src2 are okay to have overlap
1668 || (builder.avoidDstSrcOverlap() && curInst->getDst() != NULL && hasDstSrcOverlapPotential(curInst->getDst(), src->asSrcRegRegion()))
1669 )
1670 {
1671 #ifdef DEBUG_VERBOSE_ON
1672 if (curInst->getDst() != NULL && hasDstSrcOverlapPotential(curInst->getDst(), src->asSrcRegRegion()))
1673 {
1674 curInst->dump();
1675 }
1676 #endif
1677 lr->setLastRef(curInst, idx + 1);
1678 }
1679 else
1680 {
1681 lr->setLastRef(curInst, idx);
1682 }
1683 }
1684 }
1685 }
1686 }
1687
1688 if (G4_Inst_Table[curInst->opcode()].n_dst == 1 ||
1689 curInst->isSplitIntrinsic())
1690 {
1691 // Scan dst
1692 G4_DstRegRegion* dst = curInst->getDst();
1693
1694 if (dst)
1695 {
1696 topdcl = GetTopDclFromRegRegion(dst);
1697
1698 if (topdcl && (lr = gra.getLocalLR(topdcl)))
1699 {
1700 // Check whether local LR is a candidate
1701 if (lr->isLiveRangeLocal() &&
1702 lr->isGRFRegAssigned() == false)
1703 {
1704 unsigned int startIdx;
1705
1706 if (lr->getFirstRef(startIdx) == NULL)
1707 {
1708 lr->setFirstRef(curInst, idx);
1709 liveIntervals.push_back(lr);
1710 }
1711
1712 lr->setLastRef(curInst, idx);
1713 }
1714 }
1715 }
1716 }
1717 }
1718 }
1719
printAddressTakenDecls()1720 void LocalRA::printAddressTakenDecls()
1721 {
1722 DEBUG_VERBOSE("Address taken decls:" << std::endl);
1723
1724 for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
1725 dcl_it != kernel.Declares.end();
1726 dcl_it++)
1727 {
1728 G4_Declare* curDcl = (*dcl_it);
1729
1730 if (curDcl->getAliasDeclare() != NULL)
1731 {
1732 MUST_BE_TRUE(gra.getLocalLR(curDcl) == NULL, "Local LR found for alias declare");
1733 continue;
1734 }
1735
1736 if (gra.getLocalLR(curDcl) &&
1737 gra.getLocalLR(curDcl)->hasIndirectAccess() == true)
1738 {
1739 DEBUG_VERBOSE(curDcl->getName() << ", ");
1740 }
1741 }
1742
1743 DEBUG_VERBOSE(std::endl);
1744 }
1745
printLocalRACandidates()1746 void LocalRA::printLocalRACandidates()
1747 {
1748 DEBUG_VERBOSE("Local RA candidates:" << std::endl);
1749
1750 for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
1751 dcl_it != kernel.Declares.end();
1752 dcl_it++)
1753 {
1754 G4_Declare* curDcl = (*dcl_it);
1755
1756 if (curDcl->getAliasDeclare() != NULL)
1757 {
1758 MUST_BE_TRUE(gra.getLocalLR(curDcl) == NULL, "Local LR found for alias declare");
1759 continue;
1760 }
1761
1762 if (gra.getLocalLR(curDcl) &&
1763 gra.getLocalLR(curDcl)->isLiveRangeLocal() &&
1764 gra.getLocalLR(curDcl)->isGRFRegAssigned() == false)
1765 {
1766 DEBUG_VERBOSE(curDcl->getName() << ", ");
1767 }
1768 }
1769
1770 DEBUG_VERBOSE(std::endl);
1771 }
1772
printLocalLiveIntervals(G4_BB * bb,std::vector<LocalLiveRange * > & liveIntervals)1773 void LocalRA::printLocalLiveIntervals(G4_BB* bb, std::vector<LocalLiveRange*>& liveIntervals)
1774 {
1775 DEBUG_VERBOSE("Live intervals for bb " << bb->getId() << std::endl);
1776
1777 for (auto lr : liveIntervals)
1778 {
1779 unsigned int start, end;
1780
1781 lr->getFirstRef(start);
1782 lr->getLastRef(end);
1783
1784 DEBUG_VERBOSE(lr->getTopDcl()->getName() << "(" << start << ", " << end << ", " << lr->getTopDcl()->getByteSize() << ")");
1785 DEBUG_VERBOSE(std::endl);
1786 }
1787
1788 DEBUG_VERBOSE(std::endl);
1789 }
1790
printInputLiveIntervals()1791 void LocalRA::printInputLiveIntervals()
1792 {
1793 DEBUG_VERBOSE(std::endl << "Input Live intervals " << std::endl);
1794
1795 for (std::list<InputLiveRange*>::iterator it = inputIntervals.begin();
1796 it != inputIntervals.end();
1797 it++)
1798 {
1799 unsigned int regWordIdx, lrEndIdx, regNum, subRegInWord;
1800
1801 InputLiveRange* lr = (*it);
1802
1803 regWordIdx = lr->getRegWordIdx();
1804 regNum = regWordIdx / numEltPerGRF<Type_UW>();
1805 subRegInWord = regWordIdx % numEltPerGRF<Type_UW>();
1806 lrEndIdx = lr->getLrEndIdx();
1807
1808 DEBUG_VERBOSE("r" << regNum << "." << subRegInWord << " " << lrEndIdx);
1809 DEBUG_VERBOSE(std::endl);
1810 }
1811
1812 DEBUG_VERBOSE(std::endl);
1813 }
1814
countLocalLiveIntervals(std::vector<LocalLiveRange * > & liveIntervals)1815 void LocalRA::countLocalLiveIntervals(std::vector<LocalLiveRange*>& liveIntervals)
1816 {
1817 unsigned int numScalars = 0, numHalfGRF = 0, numOneGRF = 0, numTwoOrMoreGRF = 0, numTotal = 0;
1818
1819 for (auto lr : liveIntervals)
1820 {
1821 unsigned size, numrows;
1822 G4_Declare* dcl = lr->getTopDcl();
1823
1824 numrows = dcl->getNumRows();
1825 size = dcl->getNumElems() * dcl->getElemSize();
1826 numTotal++;
1827
1828 if (numrows > 1)
1829 {
1830 numTwoOrMoreGRF++;
1831 }
1832 else
1833 {
1834 if (dcl->getNumElems() > 1 && size > (getGRFSize() / 2u) && size <= (unsigned int)getGRFSize())
1835 numOneGRF++;
1836 else if (dcl->getNumElems() > 1 && size <= (getGRFSize() / 2u))
1837 numHalfGRF++;
1838 else if (dcl->getNumElems() == 1)
1839 numScalars++;
1840 else
1841 ASSERT_USER(false, "Unknown size");
1842 }
1843 }
1844
1845 printLocalLiveIntervalDistribution(numScalars, numHalfGRF, numOneGRF, numTwoOrMoreGRF, numTotal);
1846 }
1847
printLocalLiveIntervalDistribution(unsigned int numScalars,unsigned int numHalfGRF,unsigned int numOneGRF,unsigned int numTwoOrMoreGRF,unsigned int numTotal)1848 void LocalRA::printLocalLiveIntervalDistribution(unsigned int numScalars,
1849 unsigned int numHalfGRF, unsigned int numOneGRF, unsigned int numTwoOrMoreGRF,
1850 unsigned int numTotal)
1851 {
1852 DEBUG_VERBOSE("In units of num of live ranges:" << std::endl);
1853 DEBUG_VERBOSE("Total local live ranges: " << numTotal << std::endl);
1854 DEBUG_VERBOSE("2 GRFs or more: " << numTwoOrMoreGRF << std::endl);
1855 DEBUG_VERBOSE("1 GRF: " << numOneGRF << std::endl);
1856 DEBUG_VERBOSE("Half GRF: " << numHalfGRF << std::endl);
1857 DEBUG_VERBOSE("Scalar: " << numScalars << std::endl);
1858 DEBUG_VERBOSE(std::endl);
1859 }
1860
1861 // return false if roundrobin RA should be disabled
countLiveIntervals()1862 bool LocalRA::countLiveIntervals()
1863 {
1864 int globalRows = 0;
1865 uint32_t localRows = 0;
1866 for (auto curDcl : kernel.Declares)
1867 {
1868 LocalLiveRange* curDclLR = nullptr;
1869
1870 if (curDcl->getAliasDeclare() == NULL &&
1871 curDcl->getRegFile() == G4_GRF &&
1872 (curDclLR = gra.getLocalLR(curDcl)) &&
1873 curDclLR->isGRFRegAssigned() == false)
1874 {
1875 if (curDclLR->isLiveRangeGlobal())
1876 {
1877 globalRows += curDcl->getNumRows();
1878 }
1879 else if (curDclLR->isLiveRangeLocal())
1880 {
1881 localRows += curDcl->getNumRows();
1882 }
1883 }
1884 }
1885
1886 if (globalRows <= NUM_PREGS_FOR_UNIQUE_ASSIGN)
1887 {
1888 globalLRSize = globalRows;
1889 }
1890 else
1891 {
1892 if (localRows < (numRegLRA - NUM_PREGS_FOR_UNIQUE_ASSIGN))
1893 {
1894 globalLRSize = NUM_PREGS_FOR_UNIQUE_ASSIGN;
1895 }
1896 else
1897 {
1898 return false;
1899 }
1900 }
1901
1902 return true;
1903 }
1904
1905 // ********* LocalLiveRange class implementation *********
recordRef(G4_BB * bb)1906 void LocalLiveRange::recordRef(G4_BB* bb)
1907 {
1908 if (bb != prevBBRef)
1909 numRefsInFG++;
1910
1911 prevBBRef = bb;
1912 }
1913
isLiveRangeLocal() const1914 bool LocalLiveRange::isLiveRangeLocal() const
1915 {
1916 if (isIndirectAccess == false && numRefsInFG == 1 &&
1917 eot == false &&
1918 !builder.isPreDefArg(topdcl) && !builder.isPreDefRet(topdcl) &&
1919 topdcl->isOutput() == false)
1920 {
1921 return true;
1922 }
1923
1924 return false;
1925 }
1926
isLiveRangeGlobal()1927 bool LocalLiveRange::isLiveRangeGlobal()
1928 {
1929 if (isIndirectAccess == true || (numRefsInFG > 1 && eot == false) ||
1930 topdcl->isOutput() == true)
1931 {
1932 return true;
1933 }
1934
1935 return false;
1936 }
1937
1938
isGRFRegAssigned()1939 bool LocalLiveRange::isGRFRegAssigned()
1940 {
1941 MUST_BE_TRUE(topdcl != NULL, "Top dcl not set");
1942 G4_RegVar* rvar = topdcl->getRegVar();
1943 bool isPhyRegAssigned = false;
1944
1945 if (rvar)
1946 {
1947 if (rvar->isPhyRegAssigned())
1948 isPhyRegAssigned = true;
1949 }
1950
1951 return isPhyRegAssigned;
1952 }
1953
getSizeInWords()1954 unsigned int LocalLiveRange::getSizeInWords()
1955 {
1956 int nrows = getTopDcl()->getNumRows();
1957 int elemsize = getTopDcl()->getElemSize();
1958 int nelems = getTopDcl()->getNumElems();
1959 int words = 0;
1960
1961 if (nrows > 1)
1962 {
1963 // If sizeInWords is set, use it otherwise consider entire row reserved
1964 unsigned int sizeInWords = getTopDcl()->getWordSize();
1965
1966 if (sizeInWords > 0)
1967 words = sizeInWords;
1968 else
1969 words = nrows * numEltPerGRF<Type_UW>();
1970 }
1971 else if (nrows == 1)
1972 {
1973 int nbytesperword = 2;
1974 words = (nelems * elemsize + 1) / nbytesperword;
1975 }
1976
1977 return words;
1978 }
1979
1980 // ********* PhyRegsLocalRA class implementation *********
setGRFBusy(int which)1981 void PhyRegsLocalRA::setGRFBusy(int which)
1982 {
1983 MUST_BE_TRUE(isGRFAvailable(which), "Invalid register");
1984
1985 // all 1 word mask based on register size
1986 uint64_t wordMask = (1ULL << (getGRFSize() / 2)) - 1;
1987 regBusyVector[which] = (uint32_t) wordMask;
1988
1989 if (twoBanksRA)
1990 {
1991 if (which < SECOND_HALF_BANK_START_GRF)
1992 {
1993 bank1AvailableRegNum--;
1994 }
1995 else
1996 {
1997 bank2AvailableRegNum--;
1998 }
1999 }
2000 }
2001
2002 // ********* PhyRegsLocalRA class implementation *********
setGRFBusy(int which,int howmany)2003 void PhyRegsLocalRA::setGRFBusy(int which, int howmany)
2004 {
2005 for (int i = 0; i < howmany; i++)
2006 {
2007 setGRFBusy(which + i);
2008 }
2009 }
2010
setWordBusy(int whichgrf,int word)2011 void PhyRegsLocalRA::setWordBusy(int whichgrf, int word)
2012 {
2013 MUST_BE_TRUE(isGRFAvailable(whichgrf), "Invalid register");
2014 MUST_BE_TRUE(word <= (int)numEltPerGRF<Type_UW>(), "Invalid word");
2015
2016 if (twoBanksRA)
2017 {
2018 if (regBusyVector[whichgrf] == 0)
2019 {
2020 if (whichgrf < SECOND_HALF_BANK_START_GRF)
2021 {
2022 bank1AvailableRegNum--;
2023 }
2024 else
2025 {
2026 bank2AvailableRegNum--;
2027 }
2028 }
2029 }
2030
2031 regBusyVector[whichgrf] |= (WORD_BUSY << word);
2032 }
2033
setWordBusy(int whichgrf,int word,int howmany)2034 void PhyRegsLocalRA::setWordBusy(int whichgrf, int word, int howmany)
2035 {
2036 for (int i = 0; i < howmany; i++)
2037 {
2038 setWordBusy(whichgrf, word + i);
2039 }
2040 }
2041
setGRFNotBusy(int which,int instID)2042 void PhyRegsLocalRA::setGRFNotBusy(int which, int instID)
2043 {
2044 MUST_BE_TRUE(isGRFAvailable(which), "Invalid register");
2045
2046 regBusyVector[which] = 0;
2047
2048 if (twoBanksRA)
2049 {
2050 if (which < SECOND_HALF_BANK_START_GRF)
2051 {
2052 lastUseSum1 -= regLastUse[which];
2053 lastUseSum1 += instID;
2054 bank1AvailableRegNum++;
2055 }
2056 else
2057 {
2058 lastUseSum2 -= regLastUse[which];
2059 lastUseSum2 += instID;
2060 bank2AvailableRegNum++;
2061 }
2062 }
2063 if (instID)
2064 {
2065 regLastUse[which] = instID;
2066 }
2067 }
2068
setWordNotBusy(int whichgrf,int word,int instID)2069 void PhyRegsLocalRA::setWordNotBusy(int whichgrf, int word, int instID)
2070 {
2071 MUST_BE_TRUE(isGRFAvailable(whichgrf), "Invalid register");
2072 MUST_BE_TRUE(word <= (int)numEltPerGRF<Type_UW>(), "Invalid word");
2073
2074 if (twoBanksRA)
2075 {
2076 if (whichgrf < SECOND_HALF_BANK_START_GRF)
2077 {
2078 lastUseSum1 -= regLastUse[whichgrf];
2079 lastUseSum1 += instID;
2080 if (regBusyVector[whichgrf] == 0)
2081 {
2082 bank1AvailableRegNum++;
2083 }
2084 }
2085 else
2086 {
2087 lastUseSum2 -= regLastUse[whichgrf];
2088 lastUseSum2 += instID;
2089 if (regBusyVector[whichgrf] == 0)
2090 {
2091 bank2AvailableRegNum++;
2092 }
2093 }
2094 }
2095 uint32_t mask = ~(1 << word);
2096 regBusyVector[whichgrf] &= mask;
2097 if (instID)
2098 {
2099 regLastUse[whichgrf] = instID;
2100 }
2101 }
2102
isWordBusy(int whichgrf,int word)2103 inline bool PhyRegsLocalRA::isWordBusy(int whichgrf, int word)
2104 {
2105 MUST_BE_TRUE(isGRFAvailable(whichgrf), "Invalid register");
2106
2107 MUST_BE_TRUE(word <= (int)numEltPerGRF<Type_UW>(), "Invalid word");
2108 bool isBusy = ((regBusyVector[whichgrf] & (WORD_BUSY << word)) != 0);
2109 return isBusy;
2110 }
2111
isWordBusy(int whichgrf,int word,int howmany)2112 bool PhyRegsLocalRA::isWordBusy(int whichgrf, int word, int howmany)
2113 {
2114 bool retval = false;
2115
2116 for (int i = 0; i < howmany && !retval; i++)
2117 {
2118 retval |= isWordBusy(whichgrf, word + i);
2119 }
2120
2121 return retval;
2122 }
2123
findFreeMultipleRegsForward(int regIdx,BankAlign align,int & regnum,int nrows,int lastRowSize,int endReg,unsigned short occupiedBundles,int instID,bool isHybridAlloc,std::unordered_set<unsigned int> & forbidden,bool hintSet)2124 bool PhyRegsLocalRA::findFreeMultipleRegsForward(int regIdx, BankAlign align, int ®num, int nrows, int lastRowSize, int endReg, unsigned short occupiedBundles, int instID, bool isHybridAlloc, std::unordered_set<unsigned int>& forbidden, bool hintSet)
2125 {
2126 int foundItem = 0;
2127 int startReg = 0;
2128 int i = regIdx;
2129 int grfRows = 0;
2130 bool multiSteps = nrows > 1;
2131
2132 if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2133 {
2134 grfRows = nrows;
2135 }
2136 else
2137 {
2138 grfRows = nrows - 1;
2139 }
2140
2141 findRegisterCandiateWithAlignForward(i, align, multiSteps);
2142 i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align, multiSteps);
2143
2144 startReg = i;
2145 while (i <= endReg + nrows - 1)
2146 {
2147 if (isGRFAvailable(i) && forbidden.find(i) == forbidden.end() &&
2148 regBusyVector[i] == 0 &&
2149 (!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) || (regLastUse[i] == 0)) || hintSet))
2150 {
2151 foundItem++;
2152 }
2153 else if (foundItem < grfRows)
2154 {
2155 foundItem = 0;
2156 i++;
2157 findRegisterCandiateWithAlignForward(i, align, multiSteps);
2158 i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align, multiSteps);
2159 startReg = i;
2160 continue;
2161 }
2162
2163 if (foundItem == grfRows)
2164 {
2165 if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2166 {
2167 regnum = startReg;
2168 return true;
2169 }
2170 else
2171 {
2172 if (i + 1 <= endReg + nrows - 1 &&
2173 isGRFAvailable(i + 1) && forbidden.find(i+1) == forbidden.end() &&
2174 (isWordBusy(i + 1, 0, lastRowSize) == false) &&
2175 (!isHybridAlloc || (((instID - regLastUse[i + 1]) / 2 >= LraFFWindowSize) || (regLastUse[i + 1] == 0))))
2176 {
2177 regnum = startReg;
2178 return true;
2179 }
2180 else
2181 {
2182 foundItem = 0;
2183 i++;
2184 findRegisterCandiateWithAlignForward(i, align, multiSteps);
2185 i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align, multiSteps);
2186 startReg = i;
2187 continue;
2188 }
2189 }
2190 }
2191
2192 i++;
2193 }
2194
2195 return false;
2196 }
2197
findFreeMultipleRegsBackward(int regIdx,BankAlign align,int & regnum,int nrows,int lastRowSize,int endReg,int instID,bool isHybridAlloc,std::unordered_set<unsigned int> & forbidden)2198 bool PhyRegsLocalRA::findFreeMultipleRegsBackward(int regIdx, BankAlign align, int ®num, int nrows, int lastRowSize, int endReg, int instID, bool isHybridAlloc, std::unordered_set<unsigned int>& forbidden)
2199 {
2200 int foundItem = 0;
2201 int startReg = 0;
2202 int grfRows = 0;
2203 int i = regIdx;
2204 bool multiSteps = nrows > 1;
2205
2206 if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2207 {
2208 grfRows = nrows;
2209 }
2210 else
2211 {
2212 grfRows = nrows - 1;
2213 }
2214
2215 findRegisterCandiateWithAlignBackward(i, align, multiSteps);
2216
2217 startReg = i;
2218
2219 while (i >= endReg && i >= 0)
2220 {
2221 if (isGRFAvailable(i) && forbidden.find(i) == forbidden.end() &&
2222 regBusyVector[i] == 0 &&
2223 (!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) || (regLastUse[i] == 0))))
2224 {
2225 foundItem++;
2226 }
2227 else if (foundItem < grfRows)
2228 {
2229 foundItem = 0;
2230 i -= nrows;
2231 findRegisterCandiateWithAlignBackward(i, align, multiSteps);
2232
2233 startReg = i;
2234 continue;
2235 }
2236
2237 if (foundItem == grfRows)
2238 {
2239 if (lastRowSize % numEltPerGRF<Type_UW>() == 0)
2240 {
2241 regnum = startReg;
2242 return true;
2243 }
2244 else
2245 {
2246 if (i + 1 <= endReg &&
2247 isGRFAvailable(i + 1) && forbidden.find(i+1) ==forbidden.end() &&
2248 (isWordBusy(i + 1, 0, lastRowSize) == false) &&
2249 (!isHybridAlloc || (((instID - regLastUse[i + 1]) / 2 >= LraFFWindowSize) || (regLastUse[i + 1] == 0))))
2250 {
2251 regnum = startReg;
2252 return true;
2253 }
2254 else
2255 {
2256 foundItem = 0;
2257 i -= nrows;
2258 findRegisterCandiateWithAlignBackward(i, align, multiSteps);
2259
2260 startReg = i;
2261 continue;
2262 }
2263 }
2264 }
2265
2266 i++;
2267 }
2268
2269 return false;
2270 }
2271
findFreeSingleReg(int regIdx,int size,BankAlign align,G4_SubReg_Align subalign,int & regnum,int & subregnum,int endReg,int instID,bool isHybridAlloc,bool forward,std::unordered_set<unsigned int> & forbidden)2272 bool PhyRegsLocalRA::findFreeSingleReg(int regIdx, int size, BankAlign align, G4_SubReg_Align subalign, int ®num, int &subregnum, int endReg, int instID, bool isHybridAlloc, bool forward, std::unordered_set<unsigned int>& forbidden)
2273 {
2274 int i = regIdx;
2275 bool found = false;
2276
2277 while (!found)
2278 {
2279 if (forward)
2280 {
2281 if (i > endReg) //<= works
2282 break;
2283 }
2284 else
2285 {
2286 if (i < endReg) //>= works
2287 break;
2288 }
2289
2290 // Align GRF
2291 if ((align == BankAlign::Even) && (i % 2 != 0))
2292 {
2293 i += forward ? 1 : -1;
2294 continue;
2295 }
2296 else if ((align == BankAlign::Odd) && (i % 2 == 0))
2297 {
2298 i += forward ? 1 : -1;
2299 continue;
2300 }
2301 else if ((align == BankAlign::Even2GRF) && ((i % 4 >= 2)))
2302 {
2303 i += forward ? 1 : -1;
2304 continue;
2305 }
2306 else if ((align == BankAlign::Odd2GRF) && ((i % 4 < 2)))
2307 {
2308 i += forward ? 1 : -1;
2309 continue;
2310 }
2311
2312 if (isGRFAvailable(i, 1) && forbidden.find(i) ==forbidden.end() &&
2313 (!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) || (regLastUse[i] == 0))))
2314 {
2315 found = findFreeSingleReg(i, subalign, regnum, subregnum, size);
2316 if (found)
2317 {
2318 return true;
2319 }
2320 }
2321 i += forward ? 1 : -1;
2322 }
2323
2324 return false;
2325 }
2326
printBusyRegs()2327 void PhyRegsLocalRA::printBusyRegs()
2328 {
2329 for (int i = 0; i < (int)numRegs; i++)
2330 {
2331 if (isGRFAvailable(i) == false)
2332 continue;
2333
2334 for (int j = 0; j < (int)numEltPerGRF<Type_UW>(); j++)
2335 {
2336 if (isWordBusy(i, j) == true)
2337 {
2338 DEBUG_VERBOSE("r" << i << "." << j << ":w, " << std::endl);
2339 }
2340 }
2341 }
2342 }
2343
2344 // Based on RegVar, mark available registers as busy
markPhyRegs(G4_Declare * topdcl)2345 void PhyRegsLocalRA::markPhyRegs(G4_Declare* topdcl)
2346 {
2347 G4_RegVar* rvar = topdcl->getRegVar();
2348 unsigned numrows = topdcl->getNumRows();
2349 unsigned numwords = 0;
2350 unsigned int regnum = 0;
2351
2352 if (builder->hasFusedEUWA() && rvar->getPhyReg() == nullptr)
2353 {
2354 return;
2355 }
2356 // Calculate number of physical registers required by this dcl
2357 if (numrows == 1)
2358 {
2359 unsigned numbytes = topdcl->getElemSize() * topdcl->getNumElems();
2360 numwords = (numbytes + 1) / 2;
2361 unsigned int subReg = rvar->getPhyRegOff();
2362 unsigned int subRegInWord = subReg * rvar->getDeclare()->getElemSize() / 2;
2363
2364 regnum = rvar->getPhyReg()->asGreg()->getRegNum();
2365 if (isGRFAvailable(regnum) == true) {
2366 for (unsigned int i = 0; i < numwords; i++)
2367 {
2368 // Starting from first word, mark each consecutive word busy
2369 setWordBusy(regnum, (subRegInWord + i));
2370 }
2371 }
2372 }
2373 else
2374 {
2375 regnum = rvar->getPhyReg()->asGreg()->getRegNum();
2376 for (unsigned int i = 0; i < topdcl->getNumRows(); i++)
2377 {
2378 if (isGRFAvailable(i + regnum) == true)
2379 {
2380 // Set entire GRF busy
2381 setGRFBusy(regnum + i);
2382 }
2383 }
2384 }
2385 }
2386
2387 // ********* PhyRegsManager class implementation *********
findFreeSingleReg(int regIdx,G4_SubReg_Align subalign,int & regnum,int & subregnum,int size)2388 bool PhyRegsLocalRA::findFreeSingleReg(int regIdx, G4_SubReg_Align subalign, int ®num, int &subregnum, int size)
2389 {
2390 bool found = false;
2391 if (subalign == GRFALIGN)
2392 {
2393 if (isWordBusy(regIdx, 0, size) == false)
2394 {
2395 subregnum = 0;
2396 found = true;
2397 }
2398 }
2399 else if (subalign == HALFGRFALIGN)
2400 {
2401 if (isWordBusy(regIdx, 0, size) == false)
2402 {
2403 subregnum = 0;
2404 found = true;
2405 }
2406 else if (size <= (int)numEltPerGRF<Type_UW>() / 2 && isWordBusy(regIdx, numEltPerGRF<Type_UW>() / 2, size) == false)
2407 {
2408 subregnum = numEltPerGRF<Type_UW>() / 2;
2409 found = true;
2410 }
2411 }
2412 else
2413 {
2414 // ToDo: check if dynamic step size has compile time impact
2415 int step = 1;
2416 int upBound = numEltPerGRF<Type_UW>() - size + 1;
2417 switch (subalign)
2418 {
2419 case Eight_Word: step = 8; break;
2420 case Four_Word: step = 4; break;
2421 case Even_Word: step = 2; break;
2422 case Any:
2423 upBound = numEltPerGRF<Type_UW>() - size; //FIXME, why not the last word
2424 step = 1; break;
2425 default:
2426 assert("unexpected alignment");
2427 }
2428 for (int j = 0; j < upBound; j += step)
2429 {
2430 if (!isWordBusy(regIdx, j, size))
2431 {
2432 subregnum = j;
2433 found = true;
2434 break;
2435 }
2436 }
2437 }
2438
2439 if (found)
2440 {
2441 regnum = regIdx;
2442 }
2443
2444 return found;
2445 }
2446
findFreeRegs(int size,BankAlign align,G4_SubReg_Align subalign,int & regnum,int & subregnum,int startRegNum,int endRegNum,unsigned short occupiedBundles,unsigned int instID,bool isHybridAlloc,std::unordered_set<unsigned int> & forbidden,bool hintSet,unsigned int hintReg)2447 int PhyRegsManager::findFreeRegs(int size, BankAlign align, G4_SubReg_Align subalign, int& regnum, int& subregnum,
2448 int startRegNum, int endRegNum, unsigned short occupiedBundles, unsigned int instID, bool isHybridAlloc, std::unordered_set<unsigned int>& forbidden, bool hintSet, unsigned int hintReg)
2449 {
2450 int nrows = 0;
2451 int lastRowSize = 0;
2452 LocalRA::getRowInfo(size, nrows, lastRowSize);
2453
2454 bool forward = hintSet ? true : (startRegNum <= endRegNum ? true : false);
2455 int startReg = forward ? startRegNum : startRegNum - nrows + 1;
2456 int endReg = forward ? endRegNum - nrows + 1 : endRegNum;
2457
2458 bool found = false;
2459
2460 if (size >= (int)numEltPerGRF<Type_UW>())
2461 {
2462 if (!hintSet)
2463 {
2464 if (forward)
2465 {
2466 found = availableRegs.findFreeMultipleRegsForward(startReg, align, regnum, nrows, lastRowSize, endReg, occupiedBundles, instID, isHybridAlloc, forbidden, hintSet);
2467 }
2468 else
2469 {
2470 found = availableRegs.findFreeMultipleRegsBackward(startReg, align, regnum, nrows, lastRowSize, endReg, instID, isHybridAlloc, forbidden);
2471 }
2472 }
2473 else
2474 {
2475 regnum = hintReg;
2476 found = true;
2477 }
2478 if (found)
2479 {
2480 subregnum = 0;
2481 if (size % numEltPerGRF<Type_UW>() == 0)
2482 {
2483 availableRegs.setGRFBusy(regnum, nrows);
2484 }
2485 else
2486 {
2487 MUST_BE_TRUE(!hintSet, "Not expecting split for variable with alignment != n*sizeof(GRF)");
2488 availableRegs.setGRFBusy(regnum, nrows - 1);
2489 availableRegs.setWordBusy(regnum + nrows - 1, 0, lastRowSize);
2490 }
2491 }
2492 }
2493 else
2494 {
2495 MUST_BE_TRUE(!hintSet, "Not expecting split with sub-GRF granularity");
2496 found = availableRegs.findFreeSingleReg(startReg, size, align, subalign, regnum, subregnum, endReg, instID, isHybridAlloc, forward, forbidden);
2497 if (found)
2498 {
2499 availableRegs.setWordBusy(regnum, subregnum, size);
2500 }
2501 }
2502
2503 if (found)
2504 {
2505 return nrows;
2506 }
2507
2508 return 0;
2509 }
2510
2511 // subregnum parameter is expected to be in units of word
freeRegs(int regnum,int subregnum,int numwords,int instID)2512 void PhyRegsManager::freeRegs(int regnum, int subregnum, int numwords, int instID)
2513 {
2514 while (numwords >= (int)numEltPerGRF<Type_UW>())
2515 {
2516 availableRegs.setGRFNotBusy(regnum, instID);
2517 numwords -= numEltPerGRF<Type_UW>();
2518 regnum++;
2519 }
2520
2521 while (numwords >= 1)
2522 {
2523 availableRegs.setWordNotBusy(regnum, subregnum, instID);
2524 subregnum++;
2525
2526 if (subregnum >= (int)numEltPerGRF<Type_UW>())
2527 {
2528 subregnum = 0;
2529 regnum++;
2530 }
2531 numwords--;
2532 }
2533 }
2534
2535 // ********* LinearScan class implementation *********
2536
LinearScan(GlobalRA & g,std::vector<LocalLiveRange * > & localLiveIntervals,std::list<InputLiveRange *,std_arena_based_allocator<InputLiveRange * >> & inputLivelIntervals,PhyRegsManager & pregMgr,PhyRegsLocalRA & pregs,Mem_Manager & memmgr,PhyRegSummary * s,unsigned int numReg,unsigned int glrs,bool roundRobin,bool bankConflict,bool internalConflict,bool splitLLR,unsigned int simdS)2537 LinearScan::LinearScan(GlobalRA& g, std::vector<LocalLiveRange*>& localLiveIntervals,
2538 std::list<InputLiveRange*, std_arena_based_allocator<InputLiveRange*>>& inputLivelIntervals,
2539 PhyRegsManager& pregMgr, PhyRegsLocalRA& pregs, Mem_Manager& memmgr, PhyRegSummary* s,
2540 unsigned int numReg, unsigned int glrs, bool roundRobin, bool bankConflict,
2541 bool internalConflict, bool splitLLR, unsigned int simdS)
2542 : gra(g)
2543 , builder(g.builder)
2544 , mem(memmgr)
2545 , pregManager(pregMgr)
2546 , initPregs(pregs)
2547 , liveIntervals(localLiveIntervals)
2548 , inputIntervals(inputLivelIntervals)
2549 , summary(s)
2550 , pregs(g.kernel.getNumRegTotal()* numEltPerGRF<Type_UW>(), false)
2551 , simdSize(simdS)
2552 , globalLRSize(glrs)
2553 , numRegLRA(numReg)
2554 , useRoundRobin(roundRobin)
2555 , doBankConflict(bankConflict)
2556 , highInternalConflict(internalConflict)
2557 , doSplitLLR(splitLLR)
2558 {
2559
2560 // FIXME: this entire class is a mess, whole thing needs a rewrite
2561 if (g.kernel.getNumRegTotal() < 128)
2562 {
2563 // code below can segfault if user specifies #GRF < 128. We just hack it so bank1 == bank2
2564 bank1StartGRFReg = bank1_start = 0;
2565 bank2StartGRFReg = bank2_start = 0;
2566 bank1_end = bank2_end = numRegLRA - 1;
2567 startGRFReg = &bank1StartGRFReg;
2568 return;
2569 }
2570
2571 //register number boundaries
2572 bank1_start = 0;
2573 bank1_end = SECOND_HALF_BANK_START_GRF - globalLRSize / 2 - 1;
2574 if (useRoundRobin) { //From middle to back
2575 bank2_start = SECOND_HALF_BANK_START_GRF + (globalLRSize + 1) / 2;
2576 bank2_end = numRegLRA - 1;
2577 } else { //From back to middle
2578 bank2_start = numRegLRA - 1;
2579 bank2_end = SECOND_HALF_BANK_START_GRF + (globalLRSize + 1) / 2;
2580 }
2581
2582 //register number pointers
2583 bank1StartGRFReg = bank1_start;
2584 bank2StartGRFReg = bank2_start;
2585
2586 //register pointer
2587 startGRFReg = &bank1StartGRFReg;
2588
2589 int bank1AvailableRegNum = 0;
2590 for (int i = 0; i < SECOND_HALF_BANK_START_GRF; i++) {
2591 if (pregManager.getAvaialableRegs()->isGRFAvailable(i) && !pregManager.getAvaialableRegs()->isGRFBusy(i)) {
2592 bank1AvailableRegNum++;
2593 }
2594 }
2595 pregManager.getAvaialableRegs()->setBank1AvailableRegNum(bank1AvailableRegNum);
2596
2597 int bank2AvailableRegNum = 0;
2598 for (unsigned int i = SECOND_HALF_BANK_START_GRF; i < numRegLRA; i++) {
2599 if (pregManager.getAvaialableRegs()->isGRFAvailable(i) && !pregManager.getAvaialableRegs()->isGRFBusy(i)) {
2600 bank2AvailableRegNum++;
2601 }
2602 }
2603 pregManager.getAvaialableRegs()->setBank2AvailableRegNum(bank2AvailableRegNum);
2604 }
2605
2606 // Linear scan implementation
run(G4_BB * bb,IR_Builder & builder,LLR_USE_MAP & LLRUseMap)2607 void LinearScan::run(G4_BB* bb, IR_Builder& builder, LLR_USE_MAP& LLRUseMap)
2608 {
2609 unsigned int idx = 0;
2610 bool allocateRegResult = false;
2611
2612 unsigned int firstLocalIdx = 0, firstGlobalIdx = 0;
2613 if (liveIntervals.size() > 0)
2614 {
2615 LocalLiveRange* firstLr = (*liveIntervals.begin());
2616 G4_INST *firstInst = firstLr->getFirstRef(firstLocalIdx);
2617 firstGlobalIdx = firstInst->getLexicalId();
2618 }
2619
2620 for (auto lr : liveIntervals)
2621 {
2622 G4_INST *currInst = lr->getFirstRef(idx);
2623
2624 expireRanges(idx);
2625 expireInputRanges(currInst->getLexicalId(), idx, firstGlobalIdx);
2626
2627 if (!lr->hasHint() &&
2628 doBankConflict && builder.lowHighBundle() && (highInternalConflict || (simdSize >= 16 && builder.oneGRFBankDivision())))
2629 {
2630 allocateRegResult = allocateRegsFromBanks(lr);
2631 }
2632 else
2633 {
2634 allocateRegResult = allocateRegs(lr, bb, builder, LLRUseMap);
2635 }
2636
2637 if (allocateRegResult)
2638 {
2639 updateActiveList(lr);
2640 #ifdef _DEBUG
2641 int startregnum, endregnum, startsregnum, endsregnum;
2642 G4_VarBase* op;
2643 op = lr->getPhyReg(startsregnum);
2644
2645 startregnum = endregnum = op->asGreg()->getRegNum();
2646 endsregnum = startsregnum + (lr->getTopDcl()->getNumElems() * lr->getTopDcl()->getElemSize() / 2) - 1;
2647
2648 MUST_BE_TRUE(((unsigned int)startregnum + lr->getTopDcl()->getNumRows() - 1) < numRegLRA, "Linear scan allocated unavailable physical GRF");
2649
2650 if (lr->getTopDcl()->getNumRows() > 1) {
2651 endregnum = startregnum + lr->getTopDcl()->getNumRows() - 1;
2652
2653 if (lr->getTopDcl()->getWordSize() > 0)
2654 {
2655 endsregnum = lr->getTopDcl()->getWordSize() % numEltPerGRF<Type_UW>() - 1;
2656 if (endsregnum < 0) endsregnum = 15;
2657 }
2658 else
2659 endsregnum = 15; // last word in GRF
2660 }
2661 #ifdef DEBUG_VERBOSE_ON
2662 DEBUG_VERBOSE("Assigned physical register to " << lr->getTopDcl()->getName() <<
2663 " (r" << startregnum << "." << startsregnum << ":w - " <<
2664 "r" << endregnum << "." << endsregnum << ":w)" << std::endl);
2665 #endif
2666 #endif
2667 }
2668 }
2669
2670 expireAllActive();
2671 }
2672
updateActiveList(LocalLiveRange * lr)2673 void LinearScan::updateActiveList(LocalLiveRange* lr)
2674 {
2675 bool done = false;
2676 unsigned int newlr_end;
2677
2678 lr->getLastRef(newlr_end);
2679
2680 for (auto active_it = active.begin();
2681 active_it != active.end();
2682 active_it++)
2683 {
2684 unsigned int end_idx;
2685 LocalLiveRange* active_lr = (*active_it);
2686
2687 active_lr->getLastRef(end_idx);
2688
2689 if (end_idx > newlr_end)
2690 {
2691 active.insert(active_it, lr);
2692 done = true;
2693 break;
2694 }
2695
2696 }
2697
2698 if (done == false)
2699 active.push_back(lr);
2700
2701 }
2702
expireAllActive()2703 void LinearScan::expireAllActive()
2704 {
2705 if (active.size() > 0)
2706 {
2707 // Expire any remaining ranges
2708 LocalLiveRange* lastActive = active.back();
2709 unsigned int endIdx;
2710
2711 lastActive->getLastRef(endIdx);
2712
2713 expireRanges(endIdx);
2714 }
2715 }
2716
2717 // idx is the current position being processed
2718 // The function will expire active ranges that end at or before idx
expireRanges(unsigned int idx)2719 void LinearScan::expireRanges(unsigned int idx)
2720 {
2721 //active list is sorted in ascending order of starting index
2722
2723 while (active.size() > 0)
2724 {
2725 unsigned int endIdx;
2726 LocalLiveRange* lr = active.front();
2727
2728 lr->getLastRef(endIdx);
2729
2730 if (endIdx <= idx)
2731 {
2732 G4_VarBase* preg;
2733 int subregnumword, subregnum;
2734
2735 preg = lr->getPhyReg(subregnumword);
2736
2737 if (preg)
2738 {
2739 subregnum = LocalRA::convertSubRegOffFromWords(lr->getTopDcl(), subregnumword);
2740
2741 // Mark the RegVar object of dcl as assigned to physical register
2742 lr->getTopDcl()->getRegVar()->setPhyReg(preg, subregnum);
2743 lr->setAssigned(true);
2744
2745 if (summary)
2746 {
2747 // mark physical register as used atleast once
2748 summary->markPhyRegs(preg, lr->getSizeInWords());
2749 }
2750 }
2751
2752 // Free physical regs marked for this range
2753 freeAllocedRegs(lr, true);
2754
2755 #ifdef DEBUG_VERBOSE_ON
2756 DEBUG_VERBOSE("Expiring range " << lr->getTopDcl()->getName() << std::endl);
2757 #endif
2758
2759 // Remove range from active list
2760 active.pop_front();
2761 }
2762 else
2763 {
2764 // As soon as we find first range that ends after ids break loop
2765 break;
2766 }
2767 }
2768 }
2769
expireInputRanges(unsigned int global_idx,unsigned int local_idx,unsigned int first_idx)2770 void LinearScan::expireInputRanges(unsigned int global_idx, unsigned int local_idx, unsigned int first_idx)
2771 {
2772 while (inputIntervals.size() > 0)
2773 {
2774 InputLiveRange* lr = inputIntervals.front();
2775 unsigned int endIdx = lr->getLrEndIdx();
2776
2777 if (endIdx <= global_idx)
2778 {
2779 unsigned int regnum = lr->getRegWordIdx() / numEltPerGRF<Type_UW>();
2780 unsigned int subRegInWord = lr->getRegWordIdx() % numEltPerGRF<Type_UW>();
2781 int inputIdx = (endIdx < first_idx) ? 0 : (local_idx - (global_idx - endIdx) * 2);
2782
2783 // Free physical regs marked for this range
2784 pregManager.freeRegs(regnum, subRegInWord, 1, inputIdx);
2785 initPregs.setWordNotBusy(regnum, subRegInWord, 0);
2786
2787 #ifdef DEBUG_VERBOSE_ON
2788 DEBUG_VERBOSE("Expiring input r" << regnum << "." << subRegInWord << std::endl);
2789 #endif
2790
2791 // Remove range from inputIntervals list
2792 inputIntervals.pop_front();
2793 }
2794 else
2795 {
2796 // As soon as we find first range that ends after ids break loop
2797 break;
2798 }
2799 }
2800 }
2801
2802 // Allocate registers to live range. It makes a decision whether to spill
2803 // a currently active range or the range passed as parameter. The range
2804 // that has larger size and is longer is the spill candidate.
2805 // Return true if free registers found, false if range is to be spilled
allocateRegs(LocalLiveRange * lr,G4_BB * bb,IR_Builder & builder,LLR_USE_MAP & LLRUseMap)2806 bool LinearScan::allocateRegs(LocalLiveRange* lr, G4_BB* bb, IR_Builder& builder, LLR_USE_MAP& LLRUseMap)
2807 {
2808 int regnum, subregnum;
2809 unsigned int localRABound = 0;
2810 unsigned int instID;
2811
2812 G4_INST* currInst = lr->getFirstRef(instID);
2813
2814 // Let local RA allocate only those ranges that need < 10 GRFs
2815 // Larger ranges are not many and are best left to global RA
2816 // as it can make a better judgment by considering the
2817 // spill cost.
2818 int nrows = 0;
2819 int size = lr->getSizeInWords();
2820 G4_Declare *dcl = lr->getTopDcl();
2821 G4_SubReg_Align subalign = gra.getSubRegAlign(dcl);
2822 BankAlign preBank = BankAlign::Either;
2823 unsigned short occupiedBundles = getOccupiedBundle(builder, gra, dcl, preBank);
2824 localRABound = numRegLRA - globalLRSize - 1; //-1, localRABound will be counted in findFreeRegs()
2825 BankAlign bankAlign = gra.isEvenAligned(dcl) ? BankAlign::Even : BankAlign::Either;
2826 if (bankAlign == BankAlign::Either && doBankConflict)
2827 {
2828 bankAlign = gra.getBankAlign(dcl);
2829 }
2830 else
2831 {
2832 if (bankAlign == BankAlign::Either)
2833 {
2834 bankAlign = preBank;
2835 }
2836 }
2837
2838 *startGRFReg = lr->hasHint() ? lr->getHint() : *startGRFReg;
2839 if (lr->hasHint() && gra.getVarSplitPass()->isPartialDcl(lr->getTopDcl()))
2840 {
2841 // Special alignment is not needed for var split intrinsic
2842 bankAlign = BankAlign::Either;
2843 }
2844
2845 if (useRoundRobin)
2846 {
2847 nrows = pregManager.findFreeRegs(size,
2848 bankAlign,
2849 subalign,
2850 regnum,
2851 subregnum,
2852 *startGRFReg,
2853 localRABound,
2854 occupiedBundles,
2855 instID,
2856 false,
2857 lr->getForbidden(),
2858 lr->hasHint(),
2859 lr->getHint());
2860 }
2861 else
2862 {
2863 nrows = pregManager.findFreeRegs(size,
2864 bankAlign,
2865 subalign,
2866 regnum,
2867 subregnum,
2868 *startGRFReg,
2869 localRABound,
2870 occupiedBundles,
2871 instID,
2872 true,
2873 lr->getForbidden(),
2874 lr->hasHint(),
2875 lr->getHint());
2876
2877 if (!nrows)
2878 {
2879 nrows = pregManager.findFreeRegs(size,
2880 bankAlign,
2881 subalign,
2882 regnum,
2883 subregnum,
2884 *startGRFReg,
2885 localRABound,
2886 occupiedBundles,
2887 instID,
2888 false,
2889 lr->getForbidden(),
2890 lr->hasHint(),
2891 lr->getHint());
2892 }
2893 }
2894
2895 if (useRoundRobin)
2896 {
2897 if (nrows)
2898 {
2899 *startGRFReg = (regnum + nrows) % (localRABound);
2900 }
2901 else
2902 {
2903 unsigned int endGRFReg = *startGRFReg + nrows;
2904 endGRFReg = (endGRFReg > localRABound) ? localRABound : endGRFReg;
2905
2906 nrows = pregManager.findFreeRegs(size,
2907 bankAlign,
2908 subalign,
2909 regnum,
2910 subregnum,
2911 0,
2912 endGRFReg,
2913 occupiedBundles,
2914 instID,
2915 false,
2916 lr->getForbidden(),
2917 lr->hasHint(),
2918 lr->getHint());
2919
2920 if (nrows)
2921 {
2922 *startGRFReg = (regnum + nrows) % (localRABound);
2923 }
2924 }
2925 }
2926
2927 // If allocations fails, return false
2928 if (!nrows)
2929 {
2930 // Check if any range in active is longer and wider than lr.
2931 // Spill it if it is and call self again.
2932 for (auto active_it = active.rbegin();
2933 active_it != active.rend();
2934 active_it++)
2935 {
2936 LocalLiveRange* activeLR = (*active_it);
2937
2938 if (activeLR->getSizeInWords() >= lr->getSizeInWords() &&
2939 (!doSplitLLR ||
2940 gra.getNumRefs(activeLR->getTopDcl()) > SPLIT_REF_CNT_THRESHOLD))
2941 {
2942 // Another active range is larger than lr
2943 unsigned int active_lr_end, lr_end;
2944
2945 activeLR->getLastRef(active_lr_end);
2946 lr->getLastRef(lr_end);
2947
2948 if (active_lr_end > lr_end)
2949 {
2950 // Range in active is longer, so spill it
2951 #ifdef DEBUG_VERBOSE_ON
2952 DEBUG_VERBOSE("Spilling range " << activeLR->getTopDcl()->getName() << std::endl);
2953 #endif
2954
2955 freeAllocedRegs(activeLR, false);
2956 active.erase(--(active_it.base()));
2957
2958 if (doSplitLLR)
2959 {
2960 G4_Declare* oldDcl = activeLR->getTopDcl();
2961 unsigned short totalElems = oldDcl->getTotalElems();
2962
2963 std::vector<std::pair<INST_LIST_ITER, unsigned int>>* useList = nullptr;
2964 auto Iter = LLRUseMap.find(activeLR);
2965 if (Iter != LLRUseMap.end())
2966 useList = &(Iter->second);
2967
2968 if (useList &&
2969 useList->size() > 2 &&
2970 gra.getNumRefs(oldDcl) - useList->size() == 1 &&
2971 (totalElems == 8 || totalElems == 16 || totalElems == 32))
2972 {
2973 bool split = false;
2974 unsigned int useCnt = 0;
2975 INST_LIST_ITER lastUseIt;
2976 G4_Declare* newDcl = NULL;
2977 for (auto usePoint : *useList)
2978 {
2979 INST_LIST_ITER useIt = usePoint.first;
2980 G4_INST* useInst = *useIt;
2981 useCnt++;
2982
2983 unsigned int currId = currInst->getLexicalId();
2984 if (useInst->getLexicalId() < currId ||
2985 useCnt < SPLIT_USE_CNT_THRESHOLD)
2986 {
2987 lastUseIt = useIt;
2988 }
2989 else if (split == false &&
2990 (useInst->getLexicalId() - (*lastUseIt)->getLexicalId()) > SPLIT_USE_DISTANCE_THRESHOLD &&
2991 !(oldDcl->getElemSize() >= 8 && oldDcl->getTotalElems() >= 16))
2992 {
2993 G4_Declare* splitDcl = NULL;
2994 const char* splitDclName = builder.getNameString(builder.mem, 16, "split_%s", oldDcl->getName());
2995 splitDcl = builder.createDeclareNoLookup(splitDclName, G4_GRF, oldDcl->getNumElems(), oldDcl->getNumRows(), oldDcl->getElemType());
2996 splitDcl->copyAlign(oldDcl);
2997
2998 LocalLiveRange* splitLR = gra.GetOrCreateLocalLiveRange(splitDcl);
2999 splitLR->markSplit();
3000
3001 INST_LIST_ITER iter = lastUseIt;
3002 iter++;
3003
3004 // FIXME: The following code does not handle QWord variable spilling completely.
3005 // See the last condition of this if-stmt.
3006 G4_DstRegRegion* dst = builder.createDstRegRegion(splitDcl, 1);
3007 G4_SrcRegRegion* src = builder.createSrcRegRegion(oldDcl, builder.getRegionStride1());
3008 G4_ExecSize splitInstExecSize (oldDcl->getTotalElems() > 16 ? 16 : oldDcl->getTotalElems());
3009 G4_INST* splitInst = builder.createMov(
3010 splitInstExecSize,
3011 dst, src, InstOpt_WriteEnable, false);
3012 bb->insertBefore(iter, splitInst);
3013
3014 unsigned int idx = 0;
3015 gra.getLocalLR(oldDcl)->setLastRef(splitInst, idx);
3016 gra.setNumRefs(oldDcl, useCnt);
3017 splitLR->setFirstRef(splitInst, idx);
3018 gra.setNumRefs(splitDcl, 2);
3019
3020 if (totalElems == 32)
3021 {
3022 G4_DstRegRegion* dst = builder.createDst(splitDcl->getRegVar(), 2, 0, 1, oldDcl->getElemType());
3023 auto src = builder.createSrc(oldDcl->getRegVar(), 2, 0, builder.getRegionStride1(), oldDcl->getElemType());
3024 G4_INST* splitInst2 = builder.createMov(g4::SIMD16, dst, src, InstOpt_WriteEnable, false);
3025 bb->insertBefore(iter, splitInst2);
3026 }
3027
3028 const char* newDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldDcl->getName());
3029 newDcl = builder.createDeclareNoLookup(newDclName, G4_GRF, oldDcl->getNumElems(), oldDcl->getNumRows(), oldDcl->getElemType());
3030 newDcl->copyAlign(oldDcl);
3031
3032 unsigned int oldRefs = gra.getNumRefs(oldDcl);
3033 LocalLiveRange* newLR = gra.GetOrCreateLocalLiveRange(newDcl);
3034
3035 iter = useIt;
3036
3037 dst = builder.createDstRegRegion(newDcl, 1);
3038 src = builder.createSrcRegRegion(splitDcl, builder.getRegionStride1());
3039 G4_INST* movInst = builder.createMov(
3040 G4_ExecSize(splitDcl->getTotalElems() > 16 ? 16 : splitDcl->getTotalElems()),
3041 dst, src, InstOpt_WriteEnable, false);
3042 bb->insertBefore(iter, movInst);
3043
3044 splitLR->setLastRef(movInst, idx);
3045 G4_INST* old_last_use = activeLR->getLastRef(idx);
3046 newLR->setFirstRef(splitInst, idx);
3047 newLR->setLastRef(old_last_use, idx);
3048 gra.setNumRefs(newDcl, oldRefs - useCnt + 1);
3049
3050 if (totalElems == 32)
3051 {
3052 G4_DstRegRegion* dst = builder.createDst(newDcl->getRegVar(), 2, 0, 1, splitDcl->getElemType());
3053 auto src = builder.createSrc(splitDcl->getRegVar(), 2, 0, builder.getRegionStride1(), splitDcl->getElemType());
3054 G4_INST* movInst2 = builder.createMov(g4::SIMD16, dst, src, InstOpt_WriteEnable, false);
3055 bb->insertBefore(iter, movInst2);
3056 }
3057
3058 unsigned int pos = usePoint.second;
3059 G4_SrcRegRegion* oldSrc = useInst->getSrc(pos)->asSrcRegRegion();
3060 G4_Declare *oldSrcDcl = oldSrc->getBase()->asRegVar()->getDeclare();
3061 G4_Declare* newSrcDcl = newDcl;
3062 G4_Declare *aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3063 if (aliasOldSrcDcl != NULL)
3064 {
3065 const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3066 newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3067 newSrcDcl->copyAlign(oldSrcDcl);
3068 newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3069 }
3070 auto newSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct, newSrcDcl->getRegVar(),
3071 oldSrc->getRegOff(), oldSrc->getSubRegOff(), oldSrc->getRegion(), oldSrc->getType());
3072 useInst->setSrc(newSrc, pos);
3073 while (aliasOldSrcDcl && aliasOldSrcDcl != oldDcl)
3074 {
3075 oldSrcDcl = aliasOldSrcDcl;
3076 const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3077 newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3078 newSrcDcl->copyAlign(oldSrcDcl);
3079 aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3080 MUST_BE_TRUE(aliasOldSrcDcl != NULL, "Invalid alias decl");
3081 newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3082 }
3083
3084 split = true;
3085 }
3086 else if (split)
3087 {
3088 unsigned int pos = usePoint.second;
3089 G4_SrcRegRegion* oldSrc = useInst->getSrc(pos)->asSrcRegRegion();
3090 G4_Declare *oldSrcDcl = oldSrc->getBase()->asRegVar()->getDeclare();
3091 G4_Declare* newSrcDcl = newDcl;
3092 G4_Declare *aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3093 if (aliasOldSrcDcl != NULL)
3094 {
3095 const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3096 newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3097 newSrcDcl->copyAlign(oldSrcDcl);
3098 newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3099 }
3100 auto newSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct, newSrcDcl->getRegVar(),
3101 oldSrc->getRegOff(), oldSrc->getSubRegOff(), oldSrc->getRegion(), oldSrc->getType());;
3102 useInst->setSrc(newSrc, pos);
3103 while (aliasOldSrcDcl && aliasOldSrcDcl != oldDcl)
3104 {
3105 oldSrcDcl = aliasOldSrcDcl;
3106 const char* newSrcDclName = builder.getNameString(builder.mem, 16, "copy_%s", oldSrcDcl->getName());
3107 newSrcDcl = builder.createDeclareNoLookup(newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(), oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
3108 newSrcDcl->copyAlign(oldSrcDcl);
3109 aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
3110 MUST_BE_TRUE(aliasOldSrcDcl != NULL, "Invalid alias decl");
3111 newSrcDcl->setAliasDeclare(aliasOldSrcDcl, oldSrcDcl->getAliasOffset());
3112 }
3113 }
3114 else
3115 {
3116 lastUseIt = useIt;
3117 }
3118 }
3119 }
3120 }
3121
3122 // Try again
3123 return allocateRegs(lr, bb, builder, LLRUseMap);
3124 }
3125 }
3126 }
3127
3128 #ifdef DEBUG_VERBOSE_ON
3129 DEBUG_VERBOSE("Spilling range " << lr->getTopDcl()->getName() << std::endl);
3130 #endif
3131
3132 // Even after spilling everything could not find an allocation
3133 return false;
3134 }
3135 #ifdef DEBUG_VERBOSE_ON
3136 COUT_ERROR << lr->getTopDcl()->getName() << ":r" << regnum << " BANK: " << gra.getBankConflict(lr->getTopDcl()) << std::endl;
3137 #endif
3138
3139 lr->setPhyReg(builder.phyregpool.getGreg(regnum), subregnum);
3140
3141 if (gra.getVarSplitPass()->isSplitDcl(lr->getTopDcl()))
3142 {
3143 // Update hint for all children of lr to coalesce
3144 coalesceSplit(lr);
3145 }
3146
3147 #ifdef _DEBUG
3148 if (gra.getVarSplitPass()->isPartialDcl(lr->getTopDcl()))
3149 {
3150 int s;
3151 MUST_BE_TRUE(lr->getPhyReg(s)->asGreg()->getRegNum() == lr->getHint() ||
3152 !lr->hasHint(), "hint not honored for child dcl");
3153 }
3154 #endif
3155
3156 return true;
3157 }
3158
coalesceSplit(LocalLiveRange * lr)3159 void LinearScan::coalesceSplit(LocalLiveRange* lr)
3160 {
3161 // lr is split parent. set hint on all children.
3162 // immediately free unused rows of split variable.
3163 int subreg;
3164 auto regnum = lr->getPhyReg(subreg)->asGreg()->getRegNum();
3165 auto varSplit = gra.getVarSplitPass();
3166 std::unordered_set<unsigned int> unrefGRFs;
3167 for (unsigned int i = regnum; i != (regnum + lr->getTopDcl()->getNumRows()); i++)
3168 {
3169 unrefGRFs.insert(i);
3170 }
3171 for (auto l : liveIntervals)
3172 {
3173 if (!varSplit->isParentChildRelation(lr->getTopDcl(), l->getTopDcl()))
3174 continue;
3175
3176 auto siblingNum = varSplit->getSiblingNum(l->getTopDcl());
3177 unsigned int sizePerChild = l->getTopDcl()->getNumRows();
3178
3179 // this hint is binding on children to guarantee coalescing of split intrinsic
3180 l->setHint(regnum + (siblingNum * sizePerChild));
3181 #ifdef _DEBUG
3182 int s;
3183 if (l->getAssigned())
3184 MUST_BE_TRUE(!l->getPhyReg(s), "split child already allocated");
3185 #endif
3186 for (unsigned int i = regnum + (siblingNum * sizePerChild); i != regnum + ((siblingNum + 1) * sizePerChild); i++)
3187 {
3188 unrefGRFs.erase(i);
3189 }
3190 }
3191
3192 // now free phy regs of split that dont have an intrinsic split emitted, ie unused
3193 // rows of parent dcl.
3194 unsigned int idx;
3195 lr->getFirstRef(idx);
3196 for (auto f : unrefGRFs)
3197 {
3198 pregManager.freeRegs(f, 0, numEltPerGRF<Type_UW>(), idx);
3199 }
3200 }
3201
allocateRegsFromBanks(LocalLiveRange * lr)3202 bool LinearScan::allocateRegsFromBanks(LocalLiveRange* lr)
3203 {
3204 int regnum, subregnum;
3205 unsigned int tmpLocalRABound = 0;
3206 int nrows = 0;
3207 int lastUseSum1 = pregManager.getAvaialableRegs()->getLastUseSum1();
3208 int lastUseSum2 = pregManager.getAvaialableRegs()->getLastUseSum2();
3209 int bank1AvailableRegNum = pregManager.getAvaialableRegs()->getBank1AvailableRegNum();
3210 int bank2AvailableRegNum = pregManager.getAvaialableRegs()->getBank2AvailableRegNum();
3211 int size = lr->getSizeInWords();
3212 BankAlign align = gra.isEvenAligned(lr->getTopDcl()) ? BankAlign::Even : BankAlign::Either;
3213 G4_SubReg_Align subalign = gra.getSubRegAlign(lr->getTopDcl());
3214 unsigned int instID;
3215 lr->getFirstRef(instID);
3216 auto lrBC = gra.getBankConflict(lr->getTopDcl());
3217
3218 // Reverse the order for FF two banks RA.
3219 // For FF, second bank register allocated from back to front
3220 if (align == BankAlign::Either && lrBC != BANK_CONFLICT_NONE)
3221 {
3222 switch (lrBC)
3223 {
3224 case BANK_CONFLICT_FIRST_HALF_EVEN:
3225 case BANK_CONFLICT_FIRST_HALF_ODD:
3226 align = BankAlign::Even;
3227 startGRFReg = &bank1StartGRFReg;
3228 tmpLocalRABound = bank1_end;
3229 break;
3230 case BANK_CONFLICT_SECOND_HALF_EVEN:
3231 case BANK_CONFLICT_SECOND_HALF_ODD:
3232 if (useRoundRobin)
3233 {
3234 align = BankAlign::Odd;
3235 }
3236 startGRFReg = &bank2StartGRFReg;
3237 tmpLocalRABound = bank2_end;
3238 break;
3239 default: break;
3240 }
3241 }
3242 else
3243 {
3244 if (useRoundRobin)
3245 {
3246 if (bank1AvailableRegNum == 0 && bank2AvailableRegNum == 0)
3247 {
3248 return false;
3249 }
3250
3251 float bank1Average = bank1AvailableRegNum == 0 ? (float)0x7fffffff : (float)lastUseSum1 / bank1AvailableRegNum;
3252 float bank2Average = bank2AvailableRegNum == 0 ? (float)0x7fffffff : (float)lastUseSum2 / bank2AvailableRegNum;
3253 if (bank1Average > bank2Average)
3254 {
3255 startGRFReg = &bank2StartGRFReg;
3256 tmpLocalRABound = bank2_end;
3257 }
3258 else
3259 {
3260 startGRFReg = &bank1StartGRFReg;
3261 tmpLocalRABound = bank1_end;
3262 }
3263 }
3264 else
3265 {
3266 if (bank1AvailableRegNum < bank2AvailableRegNum)
3267 {
3268 startGRFReg = &bank2StartGRFReg;
3269 tmpLocalRABound = bank2_end;
3270 }
3271 else
3272 {
3273 startGRFReg = &bank1StartGRFReg;
3274 tmpLocalRABound = bank1_end;
3275 }
3276 }
3277 }
3278
3279 if (useRoundRobin)
3280 {
3281 nrows = pregManager.findFreeRegs(size,
3282 align,
3283 subalign,
3284 regnum,
3285 subregnum,
3286 *startGRFReg,
3287 tmpLocalRABound,
3288 0,
3289 instID,
3290 false,
3291 lr->getForbidden(),
3292 false, 0);
3293
3294 if (nrows)
3295 {
3296 //Wrapper handling
3297 if (tmpLocalRABound)
3298 {
3299 *startGRFReg = (regnum + nrows) % (tmpLocalRABound);
3300 }
3301
3302 if (startGRFReg == &bank2StartGRFReg)
3303 {
3304 if (*startGRFReg < SECOND_HALF_BANK_START_GRF)
3305 {
3306 *startGRFReg += bank2_start;
3307 if (*startGRFReg >= numRegLRA)
3308 {
3309 *startGRFReg = bank2_start;
3310 return false;
3311 }
3312 }
3313 }
3314 }
3315 else
3316 {
3317 //record the original endReg in case failed to allocate in bank switch
3318 unsigned int endReg = *startGRFReg + nrows;
3319
3320 //If failed, try another bank first.
3321 if (startGRFReg == &bank2StartGRFReg)
3322 {
3323 startGRFReg = &bank1StartGRFReg;
3324 tmpLocalRABound = bank1_end;
3325 }
3326 else
3327 {
3328 startGRFReg = &bank2StartGRFReg;
3329 tmpLocalRABound = bank2_end;
3330 }
3331
3332 nrows = pregManager.findFreeRegs(size,
3333 align,
3334 subalign,
3335 regnum,
3336 subregnum,
3337 *startGRFReg,
3338 tmpLocalRABound,
3339 0,
3340 instID,
3341 false,
3342 lr->getForbidden(),
3343 false, 0);
3344
3345 if (!nrows)
3346 {
3347 //Still fail, wrapper round in original bank
3348 if (startGRFReg == &bank1StartGRFReg) //switch back to before
3349 {
3350 startGRFReg = &bank2StartGRFReg;
3351 *startGRFReg = bank2_start;
3352 tmpLocalRABound = ((endReg) > (bank2_end)) ? bank2_end : endReg;
3353 }
3354 else
3355 {
3356 startGRFReg = &bank1StartGRFReg;
3357 *startGRFReg = bank1_start;
3358 tmpLocalRABound = (endReg > bank1_end) ? bank1_end : endReg;
3359 }
3360 nrows = pregManager.findFreeRegs(size,
3361 align,
3362 subalign,
3363 regnum,
3364 subregnum,
3365 *startGRFReg,
3366 tmpLocalRABound,
3367 0,
3368 instID,
3369 false,
3370 lr->getForbidden(),
3371 false, 0);
3372 }
3373 }
3374 }
3375 else
3376 {
3377 nrows = pregManager.findFreeRegs(size,
3378 align,
3379 subalign,
3380 regnum,
3381 subregnum,
3382 *startGRFReg,
3383 tmpLocalRABound,
3384 0,
3385 instID,
3386 true,
3387 lr->getForbidden(),
3388 false, 0);
3389
3390 if (!nrows)
3391 { //Try without window, no even/odd alignment for bank, but still low and high(keep in same bank)
3392 nrows = pregManager.findFreeRegs(size,
3393 gra.isEvenAligned(lr->getTopDcl()) ? BankAlign::Even : BankAlign::Either,
3394 subalign,
3395 regnum,
3396 subregnum,
3397 *startGRFReg,
3398 tmpLocalRABound,
3399 0,
3400 instID,
3401 false,
3402 lr->getForbidden(),
3403 false, 0);
3404 }
3405
3406 if (!nrows)
3407 {
3408 //Try another bank, no even/odd alignment for bank, also no low and high
3409 if (startGRFReg == &bank2StartGRFReg)
3410 {
3411 startGRFReg = &bank1StartGRFReg;
3412 tmpLocalRABound = bank1_end;
3413 }
3414 else
3415 {
3416 startGRFReg = &bank2StartGRFReg;
3417 tmpLocalRABound = bank2_end;
3418 }
3419
3420 nrows = pregManager.findFreeRegs(size,
3421 gra.isEvenAligned(lr->getTopDcl()) ? BankAlign::Even : BankAlign::Either,
3422 subalign,
3423 regnum,
3424 subregnum,
3425 *startGRFReg,
3426 tmpLocalRABound,
3427 0,
3428 instID,
3429 false,
3430 lr->getForbidden(),
3431 false, 0);
3432 }
3433 }
3434
3435 // If allocations fails, return false
3436 if (!nrows)
3437 {
3438 return false;
3439 }
3440
3441 lr->setPhyReg(builder.phyregpool.getGreg(regnum), subregnum);
3442
3443 if (gra.getVarSplitPass()->isSplitDcl(lr->getTopDcl()))
3444 {
3445 // Update hint for all children of lr to coalesce
3446 coalesceSplit(lr);
3447 }
3448
3449 MUST_BE_TRUE(!gra.getVarSplitPass()->isPartialDcl(lr->getTopDcl())
3450 || !lr->hasHint(), "not expecting to use this allocate method for split dcl");
3451
3452 #ifdef DEBUG_VERBOSE_ON
3453 COUT_ERROR << lr->getTopDcl()->getName() << ":r" << regnum << " BANK: " << gra.getBankConflict(lr->getTopDcl()) << std::endl;
3454 #endif
3455 return true;
3456 }
3457
3458 // Mark physical register allocated to range lr as available
freeAllocedRegs(LocalLiveRange * lr,bool setInstID)3459 void LinearScan::freeAllocedRegs(LocalLiveRange* lr, bool setInstID)
3460 {
3461 int sregnum;
3462
3463 G4_VarBase* preg = lr->getPhyReg(sregnum);
3464
3465 MUST_BE_TRUE(preg != NULL,
3466 "Physical register not assigned to live range. Cannot free regs.");
3467
3468 unsigned int idx = 0;
3469 if (setInstID)
3470 {
3471 lr->getLastRef(idx);
3472 }
3473
3474 auto varSplitPass = gra.getVarSplitPass();
3475 if (!varSplitPass->isSplitDcl(lr->getTopDcl()))
3476 {
3477 pregManager.freeRegs(preg->asGreg()->getRegNum(),
3478 sregnum,
3479 lr->getSizeInWords(),
3480 idx);
3481 }
3482 }
3483
3484 // ********* PhyRegSummary class implementation *********
3485
3486 // Mark GRFs as used
markPhyRegs(G4_VarBase * pr,unsigned int size)3487 void PhyRegSummary::markPhyRegs(G4_VarBase* pr, unsigned int size)
3488 {
3489 // Assume that pr is aligned to GRF start if it cannot fit in a single GRF
3490 MUST_BE_TRUE(pr->isGreg(), "Expecting GRF as operand");
3491
3492 int numGRFs = size / numEltPerGRF<Type_UW>();
3493 unsigned int regnum = pr->asGreg()->getRegNum();
3494
3495 if (size%numEltPerGRF<Type_UW>() != 0)
3496 numGRFs++;
3497
3498 for (int i = 0; i < numGRFs; i++)
3499 {
3500 GRFUsage[regnum + i] = true;
3501 }
3502 }
3503
printBusyRegs()3504 void PhyRegSummary::printBusyRegs()
3505 {
3506 DEBUG_VERBOSE("GRFs used: ");
3507
3508 for (int i = 0; i < (int)totalNumGRF; i++)
3509 {
3510 if (isGRFBusy(i) == true)
3511 {
3512 DEBUG_VERBOSE("r" << i << ", ");
3513 }
3514 }
3515
3516 DEBUG_VERBOSE(std::endl << std::endl);
3517 }
3518