1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "Mem_Manager.h"
10 #include "FlowGraph.h"
11 #include "RegAlloc.h"
12 #include "GraphColor.h"
13 #include "Timer.h"
14 #include "DebugInfo.h"
15 #include "VarSplit.h"
16
17 #include <bitset>
18 #include <climits>
19 #include <cmath>
20 #include <fstream>
21 #include <optional>
22 #include <vector>
23
24 using namespace vISA;
25
26 #define GRAPH_COLOR
27
PointsToAnalysis(const DECLARE_LIST & declares,unsigned int numBB)28 PointsToAnalysis::PointsToAnalysis(const DECLARE_LIST &declares, unsigned int numBB) :
29 numBBs(numBB), numAddrs(0), indirectUses(std::make_unique<REGVAR_VECTOR[]>(numBB))
30 {
31 for (auto decl : declares)
32 {
33 //add alias check, For Alias Dcl
34 if ((decl->getRegFile() == G4_ADDRESS || decl->getRegFile() == G4_SCALAR) &&
35 decl->getAliasDeclare() == NULL) // It is a base declaration, not alias
36 {
37 // participate liveness analysis
38 decl->getRegVar()->setId(numAddrs++);
39 }
40 else
41 {
42 decl->getRegVar()->setId(UNDEFINED_VAL);
43 }
44 }
45
46 // assign all addr aliases the same ID as its root
47 for (auto decl : declares)
48 {
49 if ((decl->getRegFile() == G4_ADDRESS || decl->getRegFile() == G4_SCALAR) &&
50 decl->getAliasDeclare() != NULL)
51 {
52 // participate liveness analysis
53 decl->getRegVar()->setId(decl->getRegVar()->getId());
54 }
55 }
56
57 if (numAddrs > 0)
58 {
59 for (unsigned int i = 0; i < numAddrs; i++)
60 regVars.push_back(NULL);
61
62 for (auto decl : declares)
63 {
64 if ((decl->getRegFile() == G4_ADDRESS || decl->getRegFile() == G4_SCALAR) &&
65 decl->getAliasDeclare() == NULL &&
66 decl->getRegVar()->getId() != UNDEFINED_VAL)
67 {
68 regVars[decl->getRegVar()->getId()] = decl->getRegVar();
69 }
70 }
71
72 pointsToSets.resize(numAddrs);
73 addrPointsToSetIndex.resize(numAddrs);
74 // initially each address variable has its own points-to set
75 for (unsigned i = 0; i < numAddrs; i++)
76 {
77 addrPointsToSetIndex[i] = i;
78 }
79 }
80 }
81
82
83 //
84 // A flow-insensitive algroithm to compute the register usage for indirect accesses.
85 // The algorithm is divided into two parts:
86 // 1. We go through every basic block computing the points-to set for each adddress
87 // variable. This happens when we see an instruction like
88 // mov (8) A0 &R0
89 //
90 // 2. We go through each basic block again, and for each r[A0] expression
91 // we mark variables in A0's points-to set as used in the block
92 //
93 // The algorithm is conservative but should work well for our inputs since
94 // the front end pretty much always uses a fresh address variable when taking
95 // the address of a GRF variable, wtih the exception of call-by-reference parameters
96 // It's performed only once at the beginning of RA, at the point where all variables
97 // are virtual and no spill code (either for address or GRF) has been inserted.
98 //
void PointsToAnalysis::doPointsToAnalysis(FlowGraph& fg)
{
    // Nothing to do if the kernel has no address/scalar variables.
    if (numAddrs == 0)
    {
        return;
    }

    // keep a list of address taken variables
    std::vector<G4_RegVar*> addrTakenDsts;
    std::map<G4_RegVar*, std::vector<std::pair<G4_RegVar*, unsigned char>> > addrTakenMapping;
    std::vector< std::pair<G4_RegVar*, unsigned char>> addrTakenVariables;

    // Pass 0: collect every address-taken (&GRF) source operand together with
    // the destination it flows into. addrTakenVariables becomes the
    // conservative "may point to anything address-taken" fallback set used by
    // the cases below where the source of an address write cannot be traced.
    for (G4_BB* bb : fg)
    {
        for (const G4_INST* inst : *bb)
        {
            G4_DstRegRegion* dst = inst->getDst();
            if (dst != NULL && dst->getRegAccess() == Direct && dst->getType() != Type_UD)
            {
                G4_VarBase* ptr = dst->getBase();

                for (int i = 0; i < G4_MAX_SRCS; i++)
                {
                    G4_Operand* src = inst->getSrc(i);
                    if (src != NULL && src->isAddrExp())
                    {
                        int offset = 0;
                        // For scalar-file destinations the address expression's
                        // offset is tracked as well (sub-register precision).
                        if (dst && !dst->isNullReg() && dst->getBase()->asRegVar()->getDeclare()->getRegFile() == G4_SCALAR)
                        {
                            offset = src->asAddrExp()->getOffset();
                        }
                        addrTakenMapping[ptr->asRegVar()].push_back(std::make_pair(src->asAddrExp()->getRegVar(), offset));
                        addrTakenDsts.push_back(ptr->asRegVar());
                        addrTakenVariables.push_back(std::make_pair(src->asAddrExp()->getRegVar(), offset));
                    }
                }
            }
        }
    }

    // first compute the points-to set for each address variable
    for (G4_BB* bb : fg)
    {
        for (G4_INST* inst : *bb)
        {
            if (inst->isPseudoKill() || inst->isLifeTimeEnd())
            {
                // No need to consider these lifetime placeholders for points2analysis
                continue;
            }

            G4_DstRegRegion* dst = inst->getDst();
            if (dst != NULL && dst->getRegAccess() == Direct && dst->getType() != Type_UD)
            {
                G4_VarBase* ptr = dst->getBase();
                //Dst is address variable
                if (ptr->isRegVar() && (ptr->asRegVar()->getDeclare()->getRegFile() == G4_ADDRESS || ptr->asRegVar()->getDeclare()->getRegFile() == G4_SCALAR) &&
                    !ptr->asRegVar()->getDeclare()->isMsgDesc())
                {

                    // dst is an address variable. ExDesc A0 may be ignored since they are never used in indirect access
                    if (inst->isMov() || inst->isPseudoAddrMovIntrinsic())
                    {
                        for (int i = 0; i < inst->getNumSrc(); i++)
                        {
                            G4_Operand* src = inst->getSrc(i);
                            if (!src || src->isNullReg())
                            {
                                continue;
                            }
                            if (src->isAddrExp())
                            {
                                // case 1: mov A0 &GRF
                                G4_RegVar* addrTaken = src->asAddrExp()->getRegVar();

                                if (addrTaken != NULL)
                                {
                                    unsigned char offset = 0;
                                    if (ptr->asRegVar()->getDeclare()->getRegFile() == G4_SCALAR)
                                    {
                                        offset = src->asAddrExp()->getOffset();
                                    }
                                    addToPointsToSet(ptr->asRegVar(), addrTaken, offset);
                                }
                            }
                            else
                            {
                                //G4_Operand* srcPtr = src->isSrcRegRegion() ? src->asSrcRegRegion()->getBase() : src;
                                G4_VarBase* srcPtr = src->isSrcRegRegion() ? src->asSrcRegRegion()->getBase() : nullptr;

                                if (srcPtr && srcPtr->isRegVar() && (srcPtr->asRegVar()->getDeclare()->getRegFile() == G4_ADDRESS))
                                {
                                    // case 2: mov A0 A1
                                    // merge the two addr's points-to set together
                                    if (ptr->asRegVar()->getId() != srcPtr->asRegVar()->getId())
                                    {
                                        mergePointsToSet(srcPtr->asRegVar(), ptr->asRegVar());
                                    }
                                }
                                else
                                {
                                    // case ?: mov v1 A0
                                    // case ?: mov A0 v1
                                    // Copy through a GRF temporary: replay every
                                    // address-taken value recorded for the source.
                                    if (srcPtr &&
                                        srcPtr->isRegVar() &&
                                        addrTakenMapping[srcPtr->asRegVar()].size() != 0)
                                    {
                                        for (int i = 0; i < (int)addrTakenMapping[srcPtr->asRegVar()].size(); i++)
                                        {
                                            addToPointsToSet(ptr->asRegVar(), addrTakenMapping[srcPtr->asRegVar()][i].first, addrTakenMapping[srcPtr->asRegVar()][i].second);
                                        }
                                    }
                                    else
                                    {
                                        // case 3: mov A0 0
                                        // Initial of address register, ignore the point to analysis
                                        // FIXME: currently, vISA don't expect mov imm value to the address register. So, 0 is treated as initialization.
                                        // If support mov A0 imm in future, 0 may be R0.
                                        if (!(src->isImm() && (src->asImm()->getImm() == 0)))
                                        {
                                            // case 4: mov A0 V2
                                            // conservatively assume address can point to anything
                                            DEBUG_MSG("unexpected addr move for pointer analysis:\n");
                                            DEBUG_EMIT(inst);
                                            DEBUG_MSG("\n");
                                            for (int i = 0, size = (int)addrTakenVariables.size(); i < size; i++)
                                            {
                                                addToPointsToSet(ptr->asRegVar(), addrTakenVariables[i].first, addrTakenVariables[i].second);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    else if (inst->isArithmetic())
                    {
                        // Address arithmetic (e.g. add A0 &V off): determine
                        // which of the two sources carries the address.
                        G4_Operand* src0 = inst->getSrc(0);
                        G4_Operand* src1 = inst->getSrc(1);
                        bool src0addr = false;
                        if (src0->isAddrExp())
                        {
                            src0addr = true;
                        }
                        else if (src0->isSrcRegRegion() && src0->getRegAccess() == Direct)
                        {
                            if (src0->isAddress())
                            {
                                src0addr = true;
                            }
                        }

                        bool src1addr = false;
                        if (src1->isAddrExp())
                        {
                            src1addr = true;
                        }
                        else if (src1->isSrcRegRegion() && src1->getRegAccess() == Direct)
                        {
                            if (src1->isAddress())
                            {
                                src1addr = true;
                            }
                        }

                        // Exactly one address-like source: trace it.
                        if (src0addr ^ src1addr)
                        {
                            G4_Operand* src = src0addr ? src0 : src1;

                            if (src->isAddrExp())
                            {
                                // case 5: add/mul A0 &GRF src1
                                G4_RegVar* addrTaken = src->asAddrExp()->getRegVar();
                                addToPointsToSet(ptr->asRegVar(), addrTaken, 0);
                            }
                            else
                            {
                                G4_VarBase* srcPtr = src->isSrcRegRegion() ? src->asSrcRegRegion()->getBase() : nullptr;
                                // case 6: add/mul A0 A1 src1
                                // merge the two addr's points-to set together
                                if (srcPtr && (ptr->asRegVar()->getId() != srcPtr->asRegVar()->getId()))
                                {
                                    mergePointsToSet(srcPtr->asRegVar(), ptr->asRegVar());
                                }
                            }
                        }
                        else if (ptr->isRegVar() && ptr->asRegVar()->isPhyRegAssigned())
                        {
                            // OK, using builtin a0 or a0.2 directly.
                        }
                        else
                        {
                            // case 7: add/mul A0 V1 V2
                            // Untraceable arithmetic: fall back to the full
                            // address-taken set (conservative).
                            DEBUG_MSG("unexpected addr add/mul for pointer analysis:\n");
                            DEBUG_EMIT(inst);
                            DEBUG_MSG("\n");
                            for (int i = 0; i < (int)addrTakenVariables.size(); i++)
                            {
                                addToPointsToSet(ptr->asRegVar(), addrTakenVariables[i].first, 0);
                            }
                        }
                    }
                    else
                    {
                        // case 8: A0 = ???
                        // Unknown producer of an address: conservative fallback.
                        DEBUG_MSG("unexpected instruction with address destination:\n");
                        DEBUG_EMIT(inst);
                        DEBUG_MSG("\n");
                        for (int i = 0; i < (int)addrTakenVariables.size(); i++)
                        {
                            addToPointsToSet(ptr->asRegVar(), addrTakenVariables[i].first, addrTakenVariables[i].second);
                        }
                    }
                }
                else if (ptr->isRegVar() && !ptr->asRegVar()->getDeclare()->isMsgDesc())
                {
                    // Destination is a plain GRF: propagate address-taken info
                    // through GRF-to-GRF copies so later moves back into an
                    // address register (case "mov A0 v1" above) can resolve.
                    for (int i = 0; i < G4_MAX_SRCS; i++)
                    {
                        G4_Operand* src = inst->getSrc(i);
                        G4_VarBase* srcPtr = (src && src->isSrcRegRegion()) ? src->asSrcRegRegion()->getBase() : nullptr;
                        //We don't support using "r[a0.0]" as address expression.
                        //For instructions like following, it's not point-to propagation for simdShuffle and add64_i_i_i_i.
                        //(W) mov (1)              simdShuffle(0,0)<1>:d  r[A0(0,0), 0]<0;1,0>:d
                        //    pseudo_mad (16)      add64_i_i_i_i(0,0)<1>:d  0x6:w  simdShuffle(0,0)<0;0>:d  rem_i_i_i_i(0,0)<1;0>:d // $470:
                        //    shl (16)             add64_i_i_i_i(0,0)<1>:d  add64_i_i_i_i(0,0)<1;1,0>:d  0x2:w // $472:
                        if (srcPtr != nullptr && srcPtr->isRegVar() && ptr != srcPtr && src->getRegAccess() != IndirGRF)
                        {
                            std::vector<G4_RegVar*>::iterator addrDst = std::find(addrTakenDsts.begin(), addrTakenDsts.end(), srcPtr->asRegVar());
                            if (addrDst != addrTakenDsts.end())
                            {
                                addrTakenDsts.push_back(ptr->asRegVar());
                                for (int i = 0; i < (int)addrTakenMapping[srcPtr->asRegVar()].size(); i++)
                                {
                                    addrTakenMapping[ptr->asRegVar()].push_back(addrTakenMapping[srcPtr->asRegVar()][i]);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

#ifdef DEBUG_VERBOSE_ON
    DEBUG_VERBOSE("Results of points-to analysis:\n");
    for (unsigned int i = 0; i < numAddrs; i++)
    {
        DEBUG_VERBOSE("Addr " << i);
        for (G4_RegVar *grf : pointsToSets[addrPointsToSetIndex[i]])
        {
            DEBUG_EMIT(grf);
            DEBUG_VERBOSE("\t");
        }
        DEBUG_VERBOSE("\n");
    }
#endif

    // mark GRF that may be used indirect access as live in the block
    // This includes all GRFs in the address's points-to set
    for (auto bb : fg)
    {
        for (G4_INST* inst : *bb)
        {
            G4_DstRegRegion* dst = inst->getDst();

            if (dst != NULL &&
                dst->getRegAccess() == IndirGRF)
            {
                G4_VarBase* dstptr = dst->getBase();
                MUST_BE_TRUE(dstptr->isRegVar() && (dstptr->asRegVar()->getDeclare()->getRegFile() == G4_ADDRESS || dstptr->asRegVar()->getDeclare()->getRegFile() == G4_SCALAR),
                    "base must be address");
                addPointsToSetToBB(bb->getId(), dstptr->asRegVar());
            }

            for (unsigned j = 0; j < G4_MAX_SRCS; j++)
            {
                //
                // look for indirect reg access r[ptr] which refers addrTaken reg var
                //
                if (inst->getSrc(j) == NULL || !inst->getSrc(j)->isSrcRegRegion()) {
                    continue;
                }

                G4_SrcRegRegion* src = inst->getSrc(j)->asSrcRegRegion();

                if (src->getRegAccess() == IndirGRF)
                {
                    G4_VarBase* srcptr = src->getBase();
                    MUST_BE_TRUE(srcptr->isRegVar() && (srcptr->asRegVar()->getDeclare()->getRegFile() == G4_ADDRESS || srcptr->asRegVar()->getDeclare()->getRegFile() == G4_SCALAR),
                        "base must be address");
                    addPointsToSetToBB(bb->getId(), srcptr->asRegVar());
                }
            }
        }
    }

#ifndef NDEBUG
    // Sanity check: no indirected variable may exceed the GRF file size.
    for (unsigned i = 0; i < numAddrs; i++)
    {
        REGVAR_VECTOR& vec = pointsToSets[addrPointsToSetIndex[i]];
        for (const pointInfo cur : vec)
        {
            unsigned indirectVarSize = cur.var->getDeclare()->getByteSize();
            assert((indirectVarSize <= (unsigned)getGRFSize()* fg.getKernel()->getNumRegTotal()) && "indirected variables' size is larger than GRF file size");
        }
    }
#endif

#ifdef DEBUG_VERBOSE_ON
    for (unsigned int i = 0; i < numBBs; i++)
    {
        DEBUG_VERBOSE("Indirect uses for BB" << i << "\t");
        const REGVAR_VECTOR &grfVec = getIndrUseVectorForBB(i);
        for (G4_RegVar* grf : grfVec)
        {
            DEBUG_EMIT(grf);
            DEBUG_VERBOSE("\t");
        }
        DEBUG_VERBOSE("\n");
    }
#endif

}
422
isLocalVar(G4_Declare * decl) const423 bool LivenessAnalysis::isLocalVar(G4_Declare* decl) const
424 {
425 if ((decl->isInput() == true &&
426 !(fg.builder->getFCPatchInfo() &&
427 fg.builder->getFCPatchInfo()->getFCComposableKernel() &&
428 !decl->isLiveIn())) &&
429 !(fg.builder->isPreDefArg(decl) &&
430 (fg.builder->getIsKernel() ||
431 (fg.getIsStackCallFunc() &&
432 fg.builder->getArgSize() == 0))))
433 return false;
434
435 if (fg.builder->getOption(vISA_enablePreemption) &&
436 decl == fg.builder->getBuiltinR0())
437 return false;
438
439
440 if (decl->isOutput() == true &&
441 !(fg.builder->isPreDefRet(decl) &&
442 (fg.builder->getIsKernel() ||
443 (fg.getIsStackCallFunc() &&
444 fg.builder->getRetVarSize() == 0))))
445 return false;
446
447 LocalLiveRange* dclLR = gra.getLocalLR(decl);
448 if (decl &&
449 dclLR &&
450 dclLR->isLiveRangeLocal()&&
451 decl->getRegFile() != G4_INPUT)
452 {
453 return true;
454 }
455
456 // Since variable split is done only for local variables, there is no need to analysis for global variables.
457 if (decl->getIsSplittedDcl() || decl->getIsPartialDcl())
458 {
459 return true;
460 }
461
462 return false;
463 }
464
setGlobalVarIDs(bool verifyRA,bool areAllPhyRegAssigned)465 bool LivenessAnalysis::setGlobalVarIDs(bool verifyRA, bool areAllPhyRegAssigned)
466 {
467 bool phyRegAssigned = areAllPhyRegAssigned;
468
469 for (G4_Declare* decl : gra.kernel.Declares)
470 {
471 if (!isLocalVar(decl))
472 {
473 /* Note that we only do local split, so there is no need to handle partial declare and splitted declare */
474 if (livenessCandidate(decl, verifyRA) && decl->getAliasDeclare() == NULL)
475 {
476 decl->getRegVar()->setId(numGlobalVarId++);
477 if (decl->getRegVar()->getPhyReg() == NULL && !decl->getIsPartialDcl())
478 numUnassignedVarId++;
479 if (decl->getRegVar()->isPhyRegAssigned() == false)
480 {
481 phyRegAssigned = false;
482 }
483 }
484 else
485 {
486 decl->getRegVar()->setId(UNDEFINED_VAL);
487 }
488 }
489 }
490
491 return phyRegAssigned;
492 }
493
setLocalVarIDs(bool verifyRA,bool areAllPhyRegAssigned)494 bool LivenessAnalysis::setLocalVarIDs(bool verifyRA, bool areAllPhyRegAssigned)
495 {
496 numVarId = numGlobalVarId;
497 bool phyRegAssigned = areAllPhyRegAssigned;
498
499 for (G4_Declare* decl : gra.kernel.Declares)
500 {
501 if (isLocalVar(decl))
502 {
503 if (livenessCandidate(decl, verifyRA) && decl->getAliasDeclare() == NULL)
504 {
505 if (decl->getIsSplittedDcl())
506 {
507 decl->setSplitVarStartID(0);
508 }
509 if (decl->getIsPartialDcl())
510 {
511 auto declSplitDcl = gra.getSplittedDeclare(decl);
512 if (declSplitDcl && declSplitDcl->getIsSplittedDcl())
513 {
514 if (numSplitStartID == 0)
515 {
516 numSplitStartID = numVarId;
517 }
518
519 if (declSplitDcl->getSplitVarStartID() == 0)
520 {
521 declSplitDcl->setSplitVarStartID(numVarId);
522 }
523 numSplitVar++;
524 }
525 else
526 {
527 assert(0 && "Found child declare without parent");
528 }
529 }
530
531 decl->getRegVar()->setId(numVarId++);
532 if (decl->getRegVar()->getPhyReg() == NULL && !decl->getIsPartialDcl())
533 numUnassignedVarId++;
534 if (decl->getRegVar()->isPhyRegAssigned() == false)
535 {
536 phyRegAssigned = false;
537 }
538 }
539 else
540 {
541 decl->getRegVar()->setId(UNDEFINED_VAL);
542 }
543 }
544 }
545
546 return phyRegAssigned;
547 }
548
setVarIDs(bool verifyRA,bool areAllPhyRegAssigned)549 bool LivenessAnalysis::setVarIDs(bool verifyRA, bool areAllPhyRegAssigned)
550 {
551 bool phyRegAssigned = areAllPhyRegAssigned;
552 for (G4_Declare* decl : gra.kernel.Declares)
553 {
554
555 if (livenessCandidate(decl, verifyRA) && decl->getAliasDeclare() == NULL)
556 {
557 if (decl->getIsSplittedDcl())
558 {
559 decl->setSplitVarStartID(0);
560 }
561 if (decl->getIsPartialDcl())
562 {
563 auto declSplitDcl = gra.getSplittedDeclare(decl);
564 if (declSplitDcl->getIsSplittedDcl())
565 {
566 if (numSplitStartID == 0)
567 {
568 numSplitStartID = numVarId;
569 }
570
571 if (declSplitDcl->getSplitVarStartID() == 0)
572 {
573 declSplitDcl->setSplitVarStartID(numVarId);
574 }
575 numSplitVar++;
576 }
577 else
578 {
579 assert(0 && "Found child declare without parent");
580 }
581 }
582
583 // participate liveness analysis
584 decl->getRegVar()->setId(numVarId++);
585
586 if (decl->getRegVar()->getPhyReg() == NULL && !decl->getIsPartialDcl())
587 numUnassignedVarId++;
588
589 //
590 // dump Reg Var info for debugging
591 //
592
593 if (decl->getRegVar()->isPhyRegAssigned() == false)
594 {
595 phyRegAssigned = false;
596 }
597 #ifdef DEBUG_VERBOSE_ON
598 DEBUG_EMIT(decl->getRegVar());
599 DEBUG_VERBOSE(" id = " << decl->getRegVar()->getId() << std::endl);
600 #endif
601 }
602 //
603 // those reg vars that are not candidates, set their id to
604 // undefined value
605 //
606 else
607 {
608 decl->getRegVar()->setId(UNDEFINED_VAL);
609 }
610 }
611
612 numGlobalVarId = numVarId;
613
614 return phyRegAssigned;
615 }
616
//
// Construct the liveness analysis for the given kernel.
//   kind     - bitmask of register files (G4_GRF, G4_FLAG, ...) to analyze
//   verifyRA - when true, include variables that already have assignments
//              (used for post-RA verification)
//   forceRun - when true, run even if every variable is already assigned
// Assigns liveness ids to all candidate declares and allocates the per-BB
// def/use/indirect bitsets. Returns early (numVarId = 0) when there is
// nothing to allocate.
//
LivenessAnalysis::LivenessAnalysis(
    GlobalRA& g,
    unsigned char kind,
    bool verifyRA,
    bool forceRun) :
    selectedRF(kind),
    pointsToAnalysis(g.pointsToAnalysis), m(4096), gra(g), fg(g.kernel.fg)
{
    //
    // NOTE:
    // The maydef sets are simply aliases to the mayuse sets, since their uses are
    // mutually exclusive.
    //
    // Go over each reg var if it's a liveness candidate, assign id for bitset.
    //
    bool areAllPhyRegAssigned = !forceRun;
    bool hasStackCall = fg.getHasStackCalls() || fg.getIsStackCallFunc();

    // Prefer the two-phase (global ids first, then local ids) numbering for
    // plain GRF allocation; fall back to single-pass numbering otherwise.
    if ((selectedRF & G4_GRF) &&
        !fg.builder->getOption(vISA_GlobalSendVarSplit) && !hasStackCall)
    {
        areAllPhyRegAssigned = setGlobalVarIDs(verifyRA, areAllPhyRegAssigned);
        areAllPhyRegAssigned = setLocalVarIDs(verifyRA, areAllPhyRegAssigned);
    }
    else
    {
        areAllPhyRegAssigned = setVarIDs(verifyRA, areAllPhyRegAssigned);
    }

    // For Alias Dcl
    for (auto decl : gra.kernel.Declares)
    {
        if (livenessCandidate(decl, verifyRA) && decl->getAliasDeclare() != NULL)
        {
            // It is an alias declaration. Set its id = base declaration id
            decl->getRegVar()->setId(decl->getAliasDeclare()->getRegVar()->getId());
        }
#ifdef DEBUG_VERBOSE_ON
        DEBUG_EMIT(decl->getRegVar());
        DEBUG_VERBOSE(" id = " << decl->getRegVar()->getId() << std::endl);
#endif
    }

    //
    // if no chosen candidate for reg allocation return
    //
    if (numVarId == 0 ||
        (verifyRA == false &&
            areAllPhyRegAssigned == true))
    {
        // If all variables have physical register assignments
        // there are no candidates for allocation
        numVarId = 0;
        return;
    }

    //
    // put selected reg vars into vars[]
    //
    vars.resize(numVarId);
    for (auto dcl : gra.kernel.Declares)
    {
        if (livenessCandidate(dcl, verifyRA) &&
            dcl->getAliasDeclare() == NULL)
        {
            G4_RegVar* var = dcl->getRegVar();
            vars[var->getId()] = var;
        }
    }

    addr_taken = BitSet(numVarId, false);

    numBBId = (unsigned) fg.size();

    // Allocate one bitset (indexed by variable id) per BB for each dataflow
    // vector used by computeLiveness().
    def_in.resize(numBBId);
    def_out.resize(numBBId);
    use_in.resize(numBBId);
    use_out.resize(numBBId);
    use_gen.resize(numBBId);
    use_kill.resize(numBBId);
    indr_use.resize(numBBId);

    for (unsigned i = 0; i < numBBId; i++)
    {
        def_in[i] = BitSet(numVarId, false);
        def_out[i] = BitSet(numVarId, false);
        use_in[i] = BitSet(numVarId, false);
        use_out[i] = BitSet(numVarId, false);
        use_gen[i] = BitSet(numVarId, false);
        use_kill[i]= BitSet(numVarId, false);
        indr_use[i]= BitSet(numVarId, false);
    }
}
710
~LivenessAnalysis()711 LivenessAnalysis::~LivenessAnalysis()
712 {
713 //
714 // if no chosen candidate for reg allocation return
715 //
716 if (numVarId == 0)
717 {
718 return;
719 }
720
721 // Remove liveness inserted pseudo kills
722 for (auto bb : fg)
723 {
724 for (auto instIt = bb->begin(); instIt != bb->end();)
725 {
726 auto inst = (*instIt);
727 if (inst->isPseudoKill())
728 {
729 auto src0 = inst->getSrc(0);
730 MUST_BE_TRUE(src0 && src0->isImm(), "expecting src0 immediate for pseudo kill");
731 if (src0->asImm()->getImm() == PseudoKillType::FromLiveness)
732 {
733 instIt = bb->erase(instIt);
734 continue;
735 }
736 }
737 ++instIt;
738 }
739 }
740 }
741
livenessCandidate(const G4_Declare * decl,bool verifyRA) const742 bool LivenessAnalysis::livenessCandidate(const G4_Declare* decl, bool verifyRA) const
743 {
744 const LocalLiveRange* declLR = nullptr;
745 if (verifyRA == false && (declLR = gra.getLocalLR(decl)) && declLR->getAssigned() && !declLR->isEOT())
746 {
747 return false;
748 }
749 else if ((selectedRF & decl->getRegFile()))
750 {
751 if (selectedRF & G4_GRF)
752 {
753 if ((decl->getRegFile() & G4_INPUT) && decl->getRegVar()->isPhyRegAssigned() && !decl->getRegVar()->isGreg())
754 {
755 return false;
756 }
757 if (decl->getByteSize() == 0)
758 {
759 // regrettably, this can happen for arg/retval pre-defined variable
760 return false;
761 }
762 }
763 return true;
764 }
765 else
766 {
767 return false;
768 }
769 }
770
updateKillSetForDcl(G4_Declare * dcl,BitSet * curBBGen,BitSet * curBBKill,G4_BB * curBB,BitSet * entryBBGen,BitSet * entryBBKill,G4_BB * entryBB,unsigned scopeID)771 void LivenessAnalysis::updateKillSetForDcl(G4_Declare* dcl, BitSet* curBBGen, BitSet* curBBKill, G4_BB* curBB, BitSet* entryBBGen, BitSet* entryBBKill, G4_BB* entryBB, unsigned scopeID)
772 {
773 if (scopeID != 0 &&
774 scopeID != UINT_MAX &&
775 dcl->getScopeID() == scopeID)
776 {
777 entryBBKill->set(dcl->getRegVar()->getId(), true);
778 entryBBGen->set(dcl->getRegVar()->getId(), false);
779 #ifdef DEBUG_VERBOSE_ON
780 DEBUG_VERBOSE("Killed sub-routine scope " << dcl->getName() << " at bb with id = " << entryBB->getId() << std::endl);
781 #endif
782 }
783 }
784
785 // Scoping info is stored per decl. A variable can be either global scope (default),
786 // sub-routine scope, or basic block scope. This function iterates over all
787 // instructions and their operands in curBB and if scoping for it is set in symbol
788 // table then it marks kills accordingly. A bb scope variable is killed in the bb it appears
789 // and a sub-routine local variable is killed in entry block of the sub-routine. No
790 // error check is performed currently so if variable scoping information is incorrect
791 // then generated code will be so too.
performScoping(BitSet * curBBGen,BitSet * curBBKill,G4_BB * curBB,BitSet * entryBBGen,BitSet * entryBBKill,G4_BB * entryBB)792 void LivenessAnalysis::performScoping(BitSet* curBBGen, BitSet* curBBKill, G4_BB* curBB, BitSet* entryBBGen, BitSet* entryBBKill, G4_BB* entryBB)
793 {
794 unsigned scopeID = curBB->getScopeID();
795 for (G4_INST* inst : *curBB)
796 {
797 G4_DstRegRegion* dst = inst->getDst();
798
799 if (dst &&
800 dst->getBase()->isRegAllocPartaker())
801 {
802 G4_Declare* dcl = GetTopDclFromRegRegion(dst);
803 updateKillSetForDcl(dcl, curBBGen, curBBKill, curBB, entryBBGen, entryBBKill, entryBB, scopeID);
804 }
805
806 for (int i = 0; i < G4_MAX_SRCS; i++)
807 {
808 G4_Operand* src = inst->getSrc(i);
809
810 if (src)
811 {
812 if (src->isSrcRegRegion() &&
813 src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
814 {
815 G4_Declare* dcl = GetTopDclFromRegRegion(src);
816 updateKillSetForDcl(dcl, curBBGen, curBBKill, curBB, entryBBGen, entryBBKill, entryBB, scopeID);
817 }
818 else if (src->isAddrExp() &&
819 src->asAddrExp()->getRegVar()->isRegAllocPartaker())
820 {
821 G4_Declare* dcl = src->asAddrExp()->getRegVar()->getDeclare()->getRootDeclare();
822
823 updateKillSetForDcl(dcl, curBBGen, curBBKill, curBB, entryBBGen, entryBBKill, entryBB, scopeID);
824 }
825 }
826 }
827 }
828 }
829
detectNeverDefinedVarRows()830 void LivenessAnalysis::detectNeverDefinedVarRows()
831 {
832 // This function records variables and its rows that are never defined
833 // in the kernel. This information helps detect kills for partial
834 // writes when VISA optimizer optimizes away some rows of a variable.
835 // In interest of compile time we only look for full rows that are
836 // not defined rather than sub-regs.
837 std::unordered_map<G4_Declare*, BitSet> largeDefs;
838
839 // Populate largeDefs map with dcls > 1 GRF size
840 for (auto dcl : gra.kernel.Declares)
841 {
842 if (dcl->getAliasDeclare() || dcl->getIsPartialDcl() || dcl->getAddressed())
843 continue;
844
845 if (dcl->getRegFile() != G4_GRF)
846 continue;
847
848 unsigned int dclNumRows = dcl->getNumRows();
849
850 if (dclNumRows < 2)
851 continue;
852
853 BitSet bitset(dclNumRows, false);
854
855 largeDefs.insert(std::make_pair(dcl, std::move(bitset)));
856 }
857
858 if (largeDefs.empty())
859 return;
860
861 const unsigned bytesPerGRF = numEltPerGRF<Type_UB>();
862
863 // Update row usage of each dcl in largeDefs
864 for (auto bb : gra.kernel.fg)
865 {
866 for (auto inst : *bb)
867 {
868 auto dst = inst->getDst();
869 if (!dst)
870 continue;
871
872 auto dstTopDcl = dst->getTopDcl();
873
874 if (dstTopDcl)
875 {
876 auto it = largeDefs.find(dstTopDcl);
877
878 if (it == largeDefs.end())
879 {
880 continue;
881 }
882
883 unsigned int rowStart = dst->getLeftBound() / bytesPerGRF;
884 unsigned int rowEnd = dst->getRightBound() / bytesPerGRF;
885
886 it->second.set(rowStart, rowEnd);
887 }
888 }
889 }
890
891 // Propagate largeDefs to neverDefinedRows bit vector to later bitwise OR it
892 for (auto it : largeDefs)
893 {
894 unsigned int numRows = it.first->getNumRows();
895 BitSet* undefinedRows = nullptr;
896 for (unsigned int i = 0; i < numRows; i++)
897 {
898 if (!it.second.isSet(i))
899 {
900 if (undefinedRows == nullptr)
901 {
902 undefinedRows = &neverDefinedRows.emplace(it.first, BitSet(it.first->getByteSize(), false)).first->second;
903 }
904 undefinedRows->set(i * bytesPerGRF, i * bytesPerGRF + bytesPerGRF - 1);
905 }
906 }
907 }
908 }
909
910 //
911 // compute liveness of reg vars
912 // Each reg var indicates a region within the register file. As such, the case in which two consecutive defs
913 // of a reg region without any use in between does not mean the second def overwrites the first one because the two defs
914 // may write different parts of the region. Def vectors are used to track which definitions of reg vars reach
915 // the entry and the end of a basic block, which tell us the first definitions of reg vars. Use vectors track which
916 // uses of reg vars are anticipated, which tell use the uses of reg vars.Def and Use vectors encapsulate the liveness
917 // of reg vars.
918 //
computeLiveness()919 void LivenessAnalysis::computeLiveness()
920 {
921 //
922 // no reg var is selected, then no need to compute liveness
923 //
924 if (getNumSelectedVar() == 0)
925 {
926 return;
927 }
928
929 startTimer(TimerID::LIVENESS);
930
931 #ifdef DEBUG_VERBOSE_ON
932 std::vector<FuncInfo*>& fns = fg.funcInfoTable;
933 #endif
934 //
935 // mark input arguments live at the entry of kernel
936 // mark output arguments live at the exit of kernel
937 //
938 BitSet inputDefs(numVarId, false);
939 BitSet outputUses(numVarId, false);
940
941 for (unsigned i = 0; i < numVarId; i++)
942 {
943 bool setLiveIn = false;
944
945 G4_Declare *decl = vars[i]->getDeclare();
946
947 if ((decl->isInput() == true &&
948 !(fg.builder->getFCPatchInfo() &&
949 fg.builder->getFCPatchInfo()->getFCComposableKernel() &&
950 !decl->isLiveIn())) &&
951 !(fg.builder->isPreDefArg(decl) &&
952 (fg.builder->getIsKernel() ||
953 (fg.getIsStackCallFunc() &&
954 fg.builder->getArgSize() == 0))))
955 setLiveIn = true;
956
957 if (fg.builder->getOption(vISA_enablePreemption) &&
958 decl == fg.builder->getBuiltinR0())
959 setLiveIn = true;
960
961 if(setLiveIn)
962 {
963 inputDefs.set(i, true);
964 #ifdef DEBUG_VERBOSE_ON
965 DEBUG_VERBOSE("First def input = " << decl->getName() << std::endl);
966 #endif
967 }
968
969 bool setLiveOut = false;
970 if (decl->isOutput() == true &&
971 !(fg.builder->isPreDefRet(decl) &&
972 (fg.builder->getIsKernel() ||
973 (fg.getIsStackCallFunc() &&
974 fg.builder->getRetVarSize() == 0))))
975 setLiveOut = true;
976
977 if (fg.builder->getOption(vISA_enablePreemption) &&
978 decl == fg.builder->getBuiltinR0())
979 setLiveOut = true;
980
981 if(setLiveOut)
982 {
983 outputUses.set(i, true);
984 #ifdef DEBUG_VERBOSE_ON
985 DEBUG_VERBOSE("First def output = " << decl->getName() << std::endl);
986 #endif
987 }
988 }
989
990 //
991 // clean up def_in & def_out that are used in markFirstDef
992 //
993 for (unsigned i = 0; i < numBBId; i++)
994 {
995 def_in[i].clear();
996 def_out[i].clear();
997 }
998
999 if (livenessClass(G4_GRF))
1000 detectNeverDefinedVarRows();
1001
1002 //
1003 // compute def_out and use_in vectors for each BB
1004 //
1005 for (G4_BB * bb : fg)
1006 {
1007 unsigned id = bb->getId();
1008
1009 computeGenKillandPseudoKill(bb, def_out[id], use_in[id], use_gen[id], use_kill[id]);
1010
1011 //
1012 // exit block: mark output parameters live
1013 //
1014 if (bb->Succs.empty())
1015 {
1016 use_out[id] = outputUses;
1017 }
1018 }
1019
1020 G4_BB* subEntryBB = NULL;
1021 BitSet* subEntryKill = NULL;
1022 BitSet* subEntryGen = NULL;
1023
1024 if (fg.getKernel()->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
1025 {
1026 //
1027 // Top-down order of BB list iteration guarantees that
1028 // entry BB of each sub-routine will be seen before any other
1029 // BBs belonging to that sub-routine. This assumes that BBs of
1030 // a sub-routine are laid out back to back in bb list.
1031 //
1032 for (auto bb : fg)
1033 {
1034 unsigned id = bb->getId();
1035
1036 if (bb->getScopeID() != 0 &&
1037 bb->getScopeID() != UINT_MAX)
1038 {
1039 subEntryBB = fg.sortedFuncTable[bb->getScopeID() - 1]->getInitBB();
1040 unsigned entryBBID = subEntryBB->getId();
1041 subEntryKill = &use_kill[entryBBID];
1042 subEntryGen = &use_gen[entryBBID];
1043 }
1044
1045 //
1046 // Mark explicitly scoped variables as kills
1047 //
1048 performScoping(&use_gen[id], &use_kill[id], bb, subEntryGen, subEntryKill, subEntryBB);
1049 }
1050 }
1051
1052 //
1053 // compute indr accesses
1054 //
1055 if (selectedRF & G4_GRF)
1056 {
1057 // only GRF variables can have their address taken
1058 for (auto bb : fg)
1059 {
1060 const REGVAR_VECTOR& grfVec = pointsToAnalysis.getIndrUseVectorForBB(bb->getId());
1061 for (const pointInfo addrTaken : grfVec)
1062 {
1063 indr_use[bb->getId()].set(addrTaken.var->getId(), true);
1064 addr_taken.set(addrTaken.var->getId(), true);
1065 }
1066 }
1067 }
1068 //
1069 // Perform inter-procedural context-sensitive flow analysis.
1070 // This is required when the CFG involves function calls with multiple calling
1071 // contexts for the same function, as peforming just a context-insensitive
1072 // analysis results in uses being propgated along paths that are not feasible
1073 // in the actual program.
1074 //
1075 if (performIPA())
1076 {
1077 hierarchicalIPA(inputDefs, outputUses);
1078 stopTimer(TimerID::LIVENESS);
1079 return;
1080 }
1081
1082
1083 if (fg.getKernel()->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
1084 (selectedRF & G4_GRF || selectedRF & G4_FLAG) &&
1085 (numFnId > 0))
1086 {
1087 // compute the maydef for each subroutine
1088 maydefAnalysis();
1089
1090 //
1091 // dump vectors for debugging
1092 //
1093 #ifdef DEBUG_VERBOSE_ON
1094 dump_bb_vector("MAYDEF IN", maydef_in);
1095 dump_bb_vector("MAYDEF OUT", maydef_out);
1096 dump_fn_vector("MAYDEF", fns, maydef);
1097 #endif
1098 }
1099
1100 //
1101 // backward flow analysis to propagate uses (locate last uses)
1102 //
1103
1104 bool change = true;
1105
1106 while (change)
1107 {
1108 change = false;
1109 BB_LIST::iterator rit = fg.end();
1110 do
1111 {
1112 //
1113 // use_out = use_in(s1) + use_in(s2) + ...
1114 // where s1 s2 ... are the successors of bb
1115 // use_in = use_gen + (use_out - use_kill)
1116 //
1117 --rit;
1118 if (contextFreeUseAnalyze((*rit), change))
1119 {
1120 change = true;
1121 }
1122
1123 } while (rit != fg.begin());
1124 }
1125
1126 //
1127 // forward flow analysis to propagate defs (locate first defs)
1128 //
1129
1130 //
1131 // initialize entry block with payload input
1132 //
1133 def_in[fg.getEntryBB()->getId()] = inputDefs;
1134 change = true;
1135 while (change)
1136 {
1137 change = false;
1138 for (auto bb : fg)
1139 {
1140 //
1141 // def_in = def_out(p1) + def_out(p2) + ... where p1 p2 ... are the predecessors of bb
1142 // def_out |= def_in
1143 //
1144 if (contextFreeDefAnalyze(bb, change))
1145 {
1146 change = true;
1147 }
1148 }
1149 }
1150
1151 #if 0
1152 // debug code to compare old v. new IPA
1153 {
1154 std::vector<G4_Declare*> idToDecl;
1155 idToDecl.resize(numVarId);
1156 for (auto dcl : fg.getKernel()->Declares)
1157 {
1158 auto id = dcl->getRegVar()->getId();
1159 if (id < numVarId)
1160 {
1161 idToDecl[id] = dcl;
1162 }
1163 }
1164
1165 fg.getKernel()->dump(std::cerr);
1166
1167 auto printLive = [this, &idToDecl](int id)
1168 {
1169 std::cerr << "Liveness for " << idToDecl[id]->getName() << "\n";
1170 std::cerr << "Use In: ";
1171 for (int i = 0; i < (int) useInCopy.size(); ++i)
1172 {
1173 if (useInCopy[i].isSet(id))
1174 {
1175 std::cerr << "BB" << i << " ";
1176 }
1177 }
1178 std::cerr << "\n";
1179 std::cerr << "Use Out: ";
1180 for (int i = 0; i < (int) useOutCopy.size(); ++i)
1181 {
1182 if (useOutCopy[i].isSet(id))
1183 {
1184 std::cerr << "BB" << i << " ";
1185 }
1186 }
1187 std::cerr << "\n";
1188 };
1189
1190 auto printSetDiff = [&idToDecl, this](const std::vector<BitSet>& set1,
1191 const std::vector<BitSet>& set2)
1192 {
1193 for (int i = 0, size = (int) set1.size(); i < size; ++i)
1194 {
1195 bool printBB = true;
1196 for (int j = 0; j < (int)numVarId; ++j)
1197 {
1198 if (set1[i].isSet(j) ^ set2[i].isSet(j))
1199 {
1200 if (printBB)
1201 {
1202 std::cerr << "BB" << i << ": ";
1203 printBB = false;
1204 }
1205 std::cerr << idToDecl[j]->getName() << "(" << j << "):" <<
1206 (set1[i].isSet(j) ? 1 : 0) << " ";
1207 }
1208 }
1209 if (!printBB)
1210 {
1211 std::cerr << "\n";
1212 }
1213 }
1214 };
1215
1216 std::cerr << "use-in comparison:\n";
1217 printSetDiff(use_in, useInCopy);
1218
1219 std::cerr << "use-out comparison:\n";
1220 printSetDiff(use_out, useOutCopy);
1221 }
1222 #endif
1223
1224 //
1225 // dump vectors for debugging
1226 //
1227 #if 0
1228 {
1229 dump_bb_vector("DEF IN", def_in);
1230 dump_bb_vector("DEF OUT", def_out);
1231 dump_bb_vector("USE IN", use_in);
1232 dump_bb_vector("USE OUT", use_out);
1233 }
1234 #endif
1235
1236 stopTimer(TimerID::LIVENESS);
1237 }
1238
1239 //
1240 // compute the maydef set for every subroutine
1241 // This includes recursively all the variables that are defined by the
1242 // subroutine, but does not include defs in the caller
1243 // This means this must be called before we do fix-point on def_in/def_out
1244 // and destroy their original values
1245 // This is used by augmentation later to model all variables that may be defined by a call
1246 // FIXME: we should use a separate def set to represent declares defined in each BB
1247 //
maydefAnalysis()1248 void LivenessAnalysis::maydefAnalysis()
1249 {
1250 for (auto func : fg.sortedFuncTable)
1251 {
1252 unsigned fid = func->getId();
1253 if (fid == UINT_MAX)
1254 {
1255 // entry kernel
1256 continue;
1257 }
1258
1259 auto& BV = subroutineMaydef[func];
1260 for (auto&& bb : func->getBBList())
1261 {
1262 BV |= def_out[bb->getId()];
1263 }
1264 for (auto&& callee : func->getCallees())
1265 {
1266 BV |= subroutineMaydef[callee];
1267 }
1268 }
1269 }
1270
1271 //
1272 // Use analysis for this subroutine only
1273 // use_out[call-BB] = use_in[ret-BB]
1274 // use_out[exit-BB] should be initialized by the caller
1275 //
useAnalysis(FuncInfo * subroutine)1276 void LivenessAnalysis::useAnalysis(FuncInfo* subroutine)
1277 {
1278 bool changed = false;
1279 do
1280 {
1281 changed = false;
1282 for (auto BI = subroutine->getBBList().rbegin(), BE = subroutine->getBBList().rend(); BI != BE; ++BI)
1283 {
1284 //
1285 // use_out = use_in(s1) + use_in(s2) + ...
1286 // where s1 s2 ... are the successors of bb
1287 // use_in = use_gen + (use_out - use_kill)
1288 //
1289 G4_BB* bb = *BI;
1290 unsigned bbid = bb->getId();
1291 if (bb->getBBType() & G4_BB_EXIT_TYPE)
1292 {
1293 // use_out is set by caller
1294 }
1295 else if (bb->getBBType() & G4_BB_CALL_TYPE)
1296 {
1297 use_out[bbid] |= use_in[bb->getPhysicalSucc()->getId()];
1298 }
1299 else
1300 {
1301 for (auto succ : bb->Succs)
1302 {
1303 use_out[bbid] |= use_in[succ->getId()];
1304 }
1305 }
1306
1307 if (changed)
1308 {
1309 // no need to update changed, save a copy
1310 use_in[bbid] = use_out[bbid];
1311 use_in[bbid] -= use_kill[bbid];
1312 use_in[bbid] |= use_gen[bbid];
1313 }
1314 else
1315 {
1316 BitSet oldUseIn = use_in[bbid];
1317
1318 use_in[bbid] = use_out[bbid];
1319 use_in[bbid] -= use_kill[bbid];
1320 use_in[bbid] |= use_gen[bbid];
1321
1322 if (!(bb->getBBType() & G4_BB_INIT_TYPE) && oldUseIn != use_in[bbid])
1323 {
1324 changed = true;
1325 }
1326 }
1327 }
1328 } while (changed);
1329 }
1330
1331 //
1332 // Use analysis for this subroutine only, considering both arg/retval of its callees
1333 // use_out[call-BB] = (use_in[ret-BB] | arg[callee]) - retval[callee]
1334 //
useAnalysisWithArgRetVal(FuncInfo * subroutine,const std::unordered_map<FuncInfo *,BitSet> & args,const std::unordered_map<FuncInfo *,BitSet> & retVal)1335 void LivenessAnalysis::useAnalysisWithArgRetVal(FuncInfo* subroutine,
1336 const std::unordered_map<FuncInfo*, BitSet>& args, const std::unordered_map<FuncInfo*, BitSet>& retVal)
1337 {
1338 bool changed = false;
1339 do
1340 {
1341 changed = false;
1342 for (auto BI = subroutine->getBBList().rbegin(), BE = subroutine->getBBList().rend(); BI != BE; ++BI)
1343 {
1344 //
1345 // use_out = use_in(s1) + use_in(s2) + ...
1346 // where s1 s2 ... are the successors of bb
1347 // use_in = use_gen + (use_out - use_kill)
1348 //
1349 G4_BB* bb = *BI;
1350 unsigned bbid = bb->getId();
1351 if (bb->getBBType() & G4_BB_EXIT_TYPE)
1352 {
1353 // use_out is set by previous analysis
1354 }
1355 else if (bb->getBBType() & G4_BB_CALL_TYPE)
1356 {
1357 use_out[bbid] = use_in[bb->getPhysicalSucc()->getId()];
1358 auto callee = bb->getCalleeInfo();
1359 auto BVIt = args.find(callee);
1360 MUST_BE_TRUE(BVIt != args.end(), "Missing entry in map");
1361 use_out[bbid] |= (*BVIt).second;
1362 BVIt = retVal.find(callee);
1363 MUST_BE_TRUE(BVIt != retVal.end(), "Missing entry in map");
1364 use_out[bbid] -= (*BVIt).second;
1365 }
1366 else
1367 {
1368 for (auto succ : bb->Succs)
1369 {
1370 use_out[bbid] |= use_in[succ->getId()];
1371 }
1372 }
1373
1374 if (changed)
1375 {
1376 // no need to update changed, save a copy
1377 use_in[bbid] = use_out[bbid];
1378 use_in[bbid] -= use_kill[bbid];
1379 use_in[bbid] |= use_gen[bbid];
1380 }
1381 else
1382 {
1383 BitSet oldUseIn = use_in[bbid];
1384
1385 use_in[bbid] = use_out[bbid];
1386 use_in[bbid] -= use_kill[bbid];
1387 use_in[bbid] |= use_gen[bbid];
1388
1389 if (!(bb->getBBType() & G4_BB_INIT_TYPE) && oldUseIn != use_in[bbid])
1390 {
1391 changed = true;
1392 }
1393 }
1394 }
1395 } while (changed);
1396 }
1397
1398 //
1399 // Def analysis for each subroutine only
1400 // at a call site, we do
1401 // def_in[ret-BB] = def_out[call-BB] U def_out[callee exit-BB]
1402 // callee's def_in/def_out is not modified
1403 //
defAnalysis(FuncInfo * subroutine)1404 void LivenessAnalysis::defAnalysis(FuncInfo* subroutine)
1405 {
1406
1407 //def_in[bb] = null (inputs for entry BB)
1408 //def_out[bb] is initialized to all defs in the bb
1409 bool changed = false;
1410 do
1411 {
1412 changed = false;
1413 for (auto&& bb : subroutine->getBBList())
1414 {
1415 uint32_t bbid = bb->getId();
1416 std::optional<BitSet> defInOrNull = std::nullopt;
1417 if (!changed)
1418 {
1419 defInOrNull = def_in[bbid];
1420 }
1421 auto phyPredBB = (bb == fg.getEntryBB()) ? nullptr : bb->getPhysicalPred();
1422 if (phyPredBB && (phyPredBB->getBBType() & G4_BB_CALL_TYPE))
1423 {
1424 // this is the return BB, we take the def_out of the callBB + the predecessors
1425 G4_BB* callBB = bb->getPhysicalPred();
1426 def_in[bbid] |= def_out[callBB->getId()];
1427 for (auto&& pred : bb->Preds)
1428 {
1429 def_in[bbid] |= def_out[pred->getId()];
1430 }
1431 }
1432 else if (bb->getBBType() & G4_BB_INIT_TYPE)
1433 {
1434 // do nothing as we don't want to propagate caller defs yet
1435 }
1436 else
1437 {
1438 for (auto&& pred : bb->Preds)
1439 {
1440 def_in[bbid] |= def_out[pred->getId()];
1441 }
1442 }
1443
1444 if (!changed)
1445 {
1446 if (def_in[bbid] != defInOrNull.value())
1447 {
1448 changed = true;
1449 }
1450 }
1451 def_out[bbid] |= def_in[bbid];
1452 }
1453 } while (changed);
1454 }
1455
// Context-sensitive inter-procedural liveness over the subroutine call tree.
// kernelInput seeds def_in of the kernel entry BB; kernelOutput seeds use_out
// of every EOT (successor-less) BB. Runs multiple use-analysis phases to
// derive per-subroutine arg/retVal sets, then a final use pass and a def
// pass. Requires fg.sortedFuncTable to be topologically sorted
// (callees before callers).
void LivenessAnalysis::hierarchicalIPA(const BitSet& kernelInput, const BitSet& kernelOutput)
{

    assert (fg.sortedFuncTable.size() > 0 && "topological sort must already be performed");
    // arg[s]: variables live into subroutine s that callers must provide.
    // retVal[s]: variables s produces that are consumed after the call.
    std::unordered_map<FuncInfo*, BitSet> args;
    std::unordered_map<FuncInfo*, BitSet> retVal;

    // Seed use_out of every successor-less (EOT) BB of the kernel with the
    // kernel's live-out set.
    auto initKernelLiveOut = [this, &kernelOutput]()
    {
        for (auto&& bb : fg.kernelInfo->getBBList())
        {
            if (bb->Succs.empty())
            {
                // EOT BB
                use_out[bb->getId()] = kernelOutput;
            }
        }
    };
    // reset all live-in/out sets except for the kernel live-out
    auto clearLiveSets = [this]()
    {
        for (auto subroutine : fg.sortedFuncTable)
        {
            for (auto bb : subroutine->getBBList())
            {
                use_in[bb->getId()].clear();
                use_out[bb->getId()].clear();
            }
        }
    };

    // top-down traversal to compute retval for each subroutine
    // retval[s] = live_out[s] - live_in[s],
    // where live_out[s] is the union of the live-in of the ret BB at each call site (hence top-down traversal).
    // this is not entirely accurate since we may have pass-through retVals
    // (e.g., A call B call C, C's retVal is pass-through in B and used in A, which
    // means it won't be killed in B if we do top-down)
    // But for now let's trade some loss of accuracy to save one more round of fix-point
    initKernelLiveOut();
    // sortedFuncTable is callee-first, so the reverse iteration below is the
    // top-down (caller-first) order the comment above requires.
    for (auto FI = fg.sortedFuncTable.rbegin(), FE = fg.sortedFuncTable.rend(); FI != FE; ++FI)
    {
        auto subroutine = *FI;
        useAnalysis(subroutine);
        if (subroutine != fg.kernelInfo)
        {
            retVal[subroutine] = use_out[subroutine->getExitBB()->getId()];
            retVal[subroutine] -= use_in[subroutine->getInitBB()->getId()];
        }
        // Propagate each ret BB's live-in to the callee's exit BB so the
        // callee (processed later in this loop) sees its live-out.
        for (auto&& bb : subroutine->getBBList())
        {
            if (bb->getBBType() & G4_BB_CALL_TYPE)
            {
                G4_BB* retBB = bb->getPhysicalSucc();
                G4_BB* exitBB = bb->getCalleeInfo()->getExitBB();
                assert((exitBB->getBBType() & G4_BB_EXIT_TYPE) &&
                    "should be a subroutine's exit BB");
                use_out[exitBB->getId()] |= use_in[retBB->getId()];
            }
        }
    }

    // bottom-up traversal to compute arg for each subroutine
    // arg[s] = live-in[s], except retval of its callees are excluded as by definition they will not be live-in
    // The live-out of each subroutine is initialized to null so that
    // args are limited to variables actually used in this subroutine (and its callees)
    clearLiveSets();
    initKernelLiveOut();
    for (auto subroutine : fg.sortedFuncTable)
    {
        useAnalysisWithArgRetVal(subroutine, args, retVal);
        if (subroutine != fg.kernelInfo)
        {
            args[subroutine] = use_in[subroutine->getInitBB()->getId()];
            args[subroutine] -= use_out[subroutine->getExitBB()->getId()];
        }
    }

    // the real deal -- top-down traversal taking arg/retval/live-through all into consideration
    // again top-down traversal is needed to compute the live-out of each subroutine.
    clearLiveSets();
    initKernelLiveOut();
    for (auto FI = fg.sortedFuncTable.rbegin(), FE = fg.sortedFuncTable.rend(); FI != FE; ++FI)
    {
        auto subroutine = *FI;
        useAnalysisWithArgRetVal(subroutine, args, retVal);
        for (auto&& bb : subroutine->getBBList())
        {
            if (bb->getBBType() & G4_BB_CALL_TYPE)
            {
                G4_BB* retBB = bb->getPhysicalSucc();
                G4_BB* exitBB = bb->getCalleeInfo()->getExitBB();
                assert((exitBB->getBBType() & G4_BB_EXIT_TYPE) &&
                    "should be a subroutine's exit BB");
                use_out[exitBB->getId()] |= use_in[retBB->getId()];
            }
        }
    }

    maydefAnalysis();   // must be done before defAnalysis!

    // algorithm sketch for def-in/def-out:
    // In reverse topological order :
    //  -- Run def analysis on subroutine
    //  -- at each call site:
    //       def_in[ret-BB] |= def_out[call-BB] U def_out[exit-BB]
    // In topological order :
    //  -- At each call site:
    //       add def_out[call-BB] to all of callee's BBs
    def_in[fg.getEntryBB()->getId()] = kernelInput;
    for (auto subroutine : fg.sortedFuncTable)
    {
        defAnalysis(subroutine);
    }


    // FIXME: I assume we consider all caller's defs to be callee's defs too?
    // Top-down pass: splash each call site's def_out across every BB of the
    // callee (leaf subroutines with no callees are skipped).
    for (auto FI = fg.sortedFuncTable.rbegin(), FE = fg.sortedFuncTable.rend(); FI != FE; ++FI)
    {
        auto subroutine = *FI;
        if (subroutine->getCallees().size() == 0)
        {
            continue;
        }
        for (auto&& bb : subroutine->getBBList())
        {
            if (bb->getBBType() & G4_BB_CALL_TYPE)
            {
                auto callee = bb->getCalleeInfo();
                for (auto&& calleeBB : callee->getBBList())
                {
                    def_in[calleeBB->getId()] |= def_out[bb->getId()];
                    def_out[calleeBB->getId()] |= def_out[bb->getId()];
                }
            }
        }
    }

// Disabled debug dump of per-subroutine arg/retVal/live-through sets.
#if 0
    std::vector<G4_Declare*> idToDecl;
    idToDecl.resize(numVarId);
    for (auto dcl : fg.getKernel()->Declares)
    {
        auto id = dcl->getRegVar()->getId();
        if (id < numVarId)
        {
            idToDecl[id] = dcl;
        }
    }
    for (auto subroutine : fg.sortedFuncTable)
    {
        auto printVal = [&idToDecl](const BitSet& bs)
        {
            for (int i = 0, size = (int)bs.getSize(); i < size; ++i)
            {
                if (bs.isSet(i))
                {
                    std::cerr << idToDecl[i]->getName() << "(" << i << ") ";
                }
            }
        };

        subroutine->dump();

        if (subroutine != fg.kernelInfo)
        {
            std::cerr << "\tArgs: ";
            printVal(args[subroutine->getId()]);
            std::cerr << "\n";

            std::cerr << "\tRetVal: ";
            printVal(retVal[subroutine->getId()]);
            std::cerr << "\n";
            std::cerr << "\tLiveThrough: ";
            BitSet liveThrough = use_in[subroutine->getInitBB()->getId()];
            liveThrough &= use_out[subroutine->getExitBB()->getId()];
            printVal(liveThrough);
            //std::cerr << "\n";
            //std::cerr << "\tDef in: ";
            //printVal(def_in[subroutine->getInitBB()->getId()]);
            //std::cerr << "\n";
            //std::cerr << "\tDef out: ";
            //printVal(def_out[subroutine->getInitBB()->getId()]);
            std::cerr << "\n";
        }
    }
#endif
}
1643
1644 //
1645 // determine if the dst writes the whole region of target declare
1646 //
writeWholeRegion(const G4_BB * bb,const G4_INST * inst,G4_DstRegRegion * dst,const Options * opt) const1647 bool LivenessAnalysis::writeWholeRegion(const G4_BB* bb,
1648 const G4_INST* inst,
1649 G4_DstRegRegion* dst,
1650 const Options *opt) const
1651 {
1652 unsigned execSize = inst->getExecSize();
1653 MUST_BE_TRUE(dst->getBase()->isRegVar(), ERROR_REGALLOC);
1654
1655 if (!bb->isAllLaneActive() && !inst->isWriteEnableInst() &&
1656 fg.getKernel()->getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D)
1657 {
1658 // conservatively assume non-nomask instructions in simd control flow
1659 // may not write the whole region
1660 return false;
1661 }
1662
1663 if (inst->getPredicate())
1664 {
1665 return false;
1666 }
1667
1668 if (inst->isFCall())
1669 return true;
1670
1671 // Flags may be partially written when used as the destination
1672 // e.g., setp (M5_NM, 16) P11 V97(8,0)<0;1,0>
1673 // It can be only considered as a complete kill
1674 // if the computed bound diff matches with the number of flag elements
1675 if (dst->isFlag() == true)
1676 {
1677 if ((dst->getRightBound() - dst->getLeftBound() + 1) ==
1678 dst->getBase()->asRegVar()->getDeclare()->getNumberFlagElements())
1679 {
1680 return true;
1681 }
1682 else
1683 {
1684 return false;
1685 }
1686 }
1687
1688 //
1689 // Find Primary Variable Declare
1690 //
1691
1692 const G4_Declare* decl = ((const G4_RegVar*)dst->getBase())->getDeclare();
1693 const G4_Declare* primaryDcl = decl->getRootDeclare();
1694
1695 //
1696 // Cannot write whole register if
1697 // * alias offset in non zero
1698 // * reg or sub-reg offset is non zero
1699 // * horiz stride is non zero
1700 // * predicate is non null
1701 //
1702 if (decl->getAliasOffset() != 0 ||
1703 dst->getRegAccess() != Direct ||
1704 dst->getRegOff() != 0 ||
1705 dst->getSubRegOff() != 0 ||
1706 dst->getHorzStride() != 1 ||
1707 inst->isPartialWrite()) {
1708 return false;
1709 }
1710
1711 //
1712 // For CISA3, pseudo-callee-save and pseudo-caller-save insts
1713 // are kills
1714 //
1715 if (fg.isPseudoDcl(primaryDcl))
1716 {
1717 return true;
1718 }
1719
1720 //
1721 // If the region does not cover the whole declare then it does not write the whole region.
1722 //
1723 if (!primaryDcl->getRegVar()->isRegVarTransient() && (dst->getTypeSize() * execSize !=
1724 primaryDcl->getElemSize() * primaryDcl->getNumElems() * primaryDcl->getNumRows())) {
1725 return false;
1726 }
1727
1728 return true;
1729 }
1730
1731 //
1732 // determine if the dst writes the whole region of target declare
1733 //
writeWholeRegion(const G4_BB * bb,const G4_INST * inst,const G4_VarBase * flagReg) const1734 bool LivenessAnalysis::writeWholeRegion(const G4_BB* bb,
1735 const G4_INST* inst,
1736 const G4_VarBase* flagReg) const
1737 {
1738 if (!bb->isAllLaneActive() && !inst->isWriteEnableInst() && gra.kernel.getKernelType() != VISA_3D)
1739 {
1740 // conservatively assume non-nomask instructions in simd control flow
1741 // may not write the whole region
1742 return false;
1743 }
1744
1745 const G4_Declare* decl = flagReg->asRegVar()->getDeclare();
1746 if (inst->getExecSize() != G4_ExecSize(decl->getNumberFlagElements()))
1747 {
1748 return false;
1749 }
1750
1751 return true;
1752 }
1753
1754 // Set bits in dst footprint based on dst region's left/right bound
footprintDst(const G4_BB * bb,const G4_INST * i,G4_Operand * opnd,BitSet * dstfootprint) const1755 void LivenessAnalysis::footprintDst(const G4_BB* bb,
1756 const G4_INST* i,
1757 G4_Operand* opnd,
1758 BitSet* dstfootprint) const
1759 {
1760 if (dstfootprint &&
1761 !(i->isPartialWrite()) &&
1762 ((bb->isAllLaneActive() ||
1763 i->isWriteEnableInst() == true) ||
1764 gra.kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D))
1765 {
1766 // Bitwise OR left-bound/right-bound with dst footprint to indicate
1767 // bytes that are written in to
1768 opnd->updateFootPrint(*dstfootprint, true);
1769 }
1770 }
1771
1772 // Reset bits in srcfootprint based on src region's left/right bound
footprintSrc(const G4_INST * i,G4_Operand * opnd,BitSet * srcfootprint)1773 void LivenessAnalysis::footprintSrc(const G4_INST* i,
1774 G4_Operand *opnd,
1775 BitSet* srcfootprint)
1776 {
1777 // Reset bits in kill map footprint
1778 opnd->updateFootPrint(*srcfootprint, false);
1779 }
1780
// Compute use_gen/use_kill/def_out for one BB by walking its instructions
// bottom-up, tracking a per-variable byte (or flag-element) footprint so a
// union of partial writes can be recognized as a full kill; inserts
// pseudo_kill instructions at points where a variable becomes fully
// written. On return, use_in is initialized to use_gen.
void LivenessAnalysis::computeGenKillandPseudoKill(G4_BB* bb,
    BitSet& def_out,
    BitSet& use_in,
    BitSet& use_gen,
    BitSet& use_kill) const
{
    //
    // Mark each fcall as using all globals and arg pre-defined var
    //
    if (bb->isEndWithFCall() && (selectedRF & G4_GRF))
    {
        const G4_Declare* arg = fg.builder->getStackCallArg();
        const G4_Declare* ret = fg.builder->getStackCallRet();

        const G4_FCALL* fcall = fg.builder->getFcallInfo(bb->back());
        MUST_BE_TRUE(fcall != NULL, "fcall info not found");

        if (arg->getByteSize() != 0)
        {
            // arg var is a use and a kill at each fcall
            if (fcall->getArgSize() != 0)
            {
                use_gen.set(arg->getRegVar()->getId(), true);
            }
            use_kill.set(arg->getRegVar()->getId(), true);
        }

        if (ret->getByteSize() != 0)
        {
            // ret var is a kill at each fcall
            use_kill.set(ret->getRegVar()->getId(), true);
            if (fcall->getRetSize() != 0)
            {
                def_out.set(ret->getRegVar()->getId(), true);
            }
        }
    }

    // footprints[id] lazily tracks which bytes/flag-elements of variable id
    // are written (set by defs, cleared by uses) as we scan bottom-up.
    std::vector<BitSet> footprints(numVarId);
    // (declare, position) pairs where a pseudo_kill should be inserted.
    std::vector<std::pair<G4_Declare*, INST_LIST_RITER>> pseudoKills;

    for (INST_LIST::reverse_iterator rit = bb->rbegin(), rend = bb->rend(); rit != rend; ++rit)
    {
        G4_INST* i = (*rit);
        if (i->isLifeTimeEnd())
        {
            continue;
        }

        G4_DstRegRegion* dst = i->getDst();
        if (dst)
        {
            G4_DstRegRegion* dstrgn = dst;

            if (dstrgn->getBase()->isRegAllocPartaker())
            {
                G4_Declare* topdcl = GetTopDclFromRegRegion(dstrgn);
                unsigned id = topdcl->getRegVar()->getId();
                if (i->isPseudoKill())
                {
                    // Mark kill, reset gen
                    use_kill.set(id, true);
                    use_gen.set(id, false);

                    continue;
                }

                BitSet* dstfootprint = &footprints[id];

                if (dstfootprint->getSize() == 0)
                {
                    // Write for dst was not seen before, so insert in to map
                    // bitsetSize is in bytes
                    unsigned int bitsetSize = dstrgn->isFlag() ? topdcl->getNumberFlagElements() : topdcl->getByteSize();

                    BitSet newBitSet(bitsetSize, false);

                    auto it = neverDefinedRows.find(topdcl);
                    if (it != neverDefinedRows.end())
                    {
                        // Bitwise OR new bitset with never defined rows
                        newBitSet |= it->second;
                    }

                    footprints[id] = std::move(newBitSet);
                    if (gra.isBlockLocal(topdcl) &&
                        topdcl->getAddressed() == false &&
                        dstrgn->getRegAccess() == Direct)
                    {
                        // Local live ranges are never live-out of the only
                        // basic block they are defined. So in top-down order
                        // the first lexical definition is a kill irrespective
                        // of the footprint. In cases when local live-range
                        // def and use have h-stride != 1, the footprint at this
                        // lexically first definition will not have all bits set.
                        // This prevents that def to be seen as a kill. A simple
                        // solution to this is to set all bits when initializing
                        // the bitvector while iterating in bottom-up order. As
                        // we traverse further up uses will reset bits and defs
                        // will set bits. So when we encounter the lexically first
                        // def, we will be guaranteed to find all bits set, thus
                        // interpreting that def as a kill.
                        dstfootprint->setAll();
                    }
                }

                if (dstrgn->getRegAccess() == Direct)
                {
                    def_out.set(id, true);
                    //
                    // if the inst writes the whole region the var declared, we set use_kill
                    // so that use of var will not pass through (i.e., var's interval starts
                    // at this instruction.
                    //
                    if (writeWholeRegion(bb, i, dstrgn, fg.builder->getOptions()))
                    {
                        use_kill.set(id, true);
                        use_gen.set(id, false);

                        dstfootprint->setAll();
                    }
                    else
                    {
                        footprintDst(bb, i, dstrgn, dstfootprint);

                        use_gen.set(id, true);
                    }
                }
                else
                {
                    // Indirect def: the write may not land on this variable,
                    // so it stays a (potential) use, not a kill.
                    use_gen.set(id, true);
                }
            }
            else if ((selectedRF & G4_GRF) && dst->isIndirect())
            {
                // conservatively add each variable potentially accessed by dst to gen
                const REGVAR_VECTOR& pointsToSet = pointsToAnalysis.getAllInPointsToOrIndrUse(dst, bb);
                for (auto pt : pointsToSet)
                {
                    if (pt.var->isRegAllocPartaker())
                    {
                        use_gen.set(pt.var->getId(), true);
                    }
                }
            }
        }

        //
        // process each source operand
        //
        for (unsigned j = 0; j < G4_MAX_SRCS; j++)
        {
            G4_Operand* src = i->getSrc(j);

            if (!src)
            {
                continue;
            }
            if (src->isSrcRegRegion())
            {
                G4_Declare* topdcl = GetTopDclFromRegRegion(src);
                const G4_VarBase* base = (topdcl != nullptr ? topdcl->getRegVar() :
                    src->asSrcRegRegion()->getBase());
                if (base->isRegAllocPartaker())
                {
                    unsigned id = topdcl->getRegVar()->getId();
                    BitSet* srcfootprint = &footprints[id];

                    if (srcfootprint->getSize() != 0)
                    {
                        // Footprint exists: clear the bytes this read covers.
                        footprintSrc(i, src->asSrcRegRegion(), srcfootprint);
                    }
                    else
                    {
                        // First sighting of this variable in the bottom-up
                        // scan: build its footprint (same scheme as for dst).
                        unsigned int bitsetSize = (src->asSrcRegRegion()->isFlag()) ? topdcl->getNumberFlagElements() : topdcl->getByteSize();

                        BitSet newBitSet(bitsetSize, false);

                        auto it = neverDefinedRows.find(topdcl);
                        if (it != neverDefinedRows.end())
                        {
                            // Bitwise OR new bitset with never defined rows
                            newBitSet |= it->second;
                        }

                        footprints[id] = std::move(newBitSet);
                        if (gra.isBlockLocal(topdcl) &&
                            topdcl->getAddressed() == false &&
                            (topdcl->getRegFile() == G4_ADDRESS ||
                                src->asSrcRegRegion()->getRegAccess() == Direct))
                        {
                            srcfootprint->setAll();
                        }
                        footprintSrc(i, src->asSrcRegRegion(), srcfootprint);
                    }

                    use_gen.set(static_cast<const G4_RegVar*>(base)->getId(), true);
                }

                if ((selectedRF & G4_GRF) && src->getRegAccess() == IndirGRF)
                {
                    int idx = 0;
                    G4_RegVar* grf;
                    G4_Declare* topdcl = GetTopDclFromRegRegion(src);

                    while ((grf = pointsToAnalysis.getPointsTo(topdcl->getRegVar(), idx++)) != NULL)
                    {
                        // grf is a variable that src potentially points to
                        // since we dont know exactly which part of grf is sourced
                        // assume entire grf is sourced
                        // Also add grf to the gen set as it may be potentially used
                        unsigned int id = grf->getId();
                        use_gen.set(id, true);
                        BitSet* srcfootprint = &footprints[id];

                        if (srcfootprint->getSize() != 0)
                        {
                            srcfootprint->clear();

                            DEBUG_VERBOSE("Found potential indirect use of " << grf->getDeclare()->getName() <<
                                " so resetting its footprint" << std::endl);
                        }
                    }
                }
            }
            //
            // treat the addr expr as both a use and a partial def
            //
            else if (src->isAddrExp())
            {
                const G4_RegVar* reg = static_cast<const G4_AddrExp*>(src)->getRegVar();
                if (reg->isRegAllocPartaker() && reg->isSpilled() == false)
                {
                    unsigned srcId = reg->getId();
                    use_gen.set(srcId, true);
                    def_out.set(srcId, true);
                }
            }
        }

        //
        // Process condMod
        //
        G4_CondMod* mod = i->getCondMod();
        if (mod) {
            G4_VarBase *flagReg = mod->getBase();
            if (flagReg)
            {
                if (flagReg->asRegVar()->isRegAllocPartaker())
                {
                    G4_Declare* topdcl = flagReg->asRegVar()->getDeclare();
                    MUST_BE_TRUE(topdcl->getAliasDeclare() == nullptr, "Invalid alias flag decl.");
                    unsigned id = topdcl->getRegVar()->getId();

                    BitSet* dstfootprint = &footprints[id];

                    if (dstfootprint->getSize() == 0)
                    {
                        // Write for dst was not seen before, so insert in to map
                        // bitsetSize is in bits for flag
                        unsigned int bitsetSize = topdcl->getNumberFlagElements();

                        BitSet newBitSet(bitsetSize, false);
                        footprints[id] = std::move(newBitSet);

                        if (gra.isBlockLocal(topdcl))
                        {
                            dstfootprint->setAll();
                        }
                    }

                    def_out.set(id, true);

                    if (writeWholeRegion(bb, i, flagReg))
                    {
                        use_kill.set(id, true);
                        use_gen.set(id, false);

                        dstfootprint->setAll();
                    }
                    else
                    {
                        footprintDst(bb, i, mod, dstfootprint);
                        use_gen.set(id, true);
                    }
                }
            }
            else
            {
                // A null condMod base is only legal for sel/csel.
                MUST_BE_TRUE((i->opcode() == G4_sel ||
                    i->opcode() == G4_csel) &&
                    i->getCondMod() != nullptr,
                    "Invalid CondMod");
            }
        }

        //
        // Process predicate
        //
        G4_Predicate* predicate = i->getPredicate();
        if (predicate) {
            G4_VarBase *flagReg = predicate->getBase();
            MUST_BE_TRUE(flagReg->asRegVar()->getDeclare()->getAliasDeclare() == nullptr, "Invalid alias flag decl.");
            if (flagReg->asRegVar()->isRegAllocPartaker())
            {
                const G4_Declare* topdcl = flagReg->asRegVar()->getDeclare();
                unsigned id = topdcl->getRegVar()->getId();
                auto srcfootprint = &footprints[id];

                if (srcfootprint->getSize() != 0)
                {
                    footprintSrc(i, predicate, srcfootprint);
                }
                else
                {
                    unsigned int bitsetSize = topdcl->getNumberFlagElements();

                    BitSet newBitSet(bitsetSize, false);
                    footprints[id] = std::move(newBitSet);
                    if (gra.isBlockLocal(topdcl))
                    {
                        srcfootprint->setAll();
                    }
                    footprintSrc(i, predicate, srcfootprint);
                }

                use_gen.set(static_cast<const G4_RegVar*>(flagReg)->getId(), true);
            }
        }

        //
        // Check whether dst can be killed at this point
        // A block of code is said to kill a variable when union
        // of all partial writes causes all elements to be written
        // into and any reads in the block can be sourced from
        // writes within that block itself
        //
        if (dst && dst->getBase()->isRegAllocPartaker())
        {
            G4_Declare* topdcl = GetTopDclFromRegRegion(dst);
            unsigned id = topdcl->getRegVar()->getId();
            auto dstfootprint = &footprints[id];

            if (dstfootprint->getSize() != 0)
            {
                // Found dst in map
                // Check whether all bits set
                // pseudo_kill for this dst was not found in this BB yet
                unsigned int first;
                LocalLiveRange* topdclLR = nullptr;

                if ((dstfootprint->isAllset() ||
                    // Check whether local RA marked this range
                    (topdcl &&
                        (topdclLR = gra.getLocalLR(topdcl)) &&
                        topdclLR->isLiveRangeLocal() &&
                        (!topdcl->isInput()) &&
                        topdclLR->getFirstRef(first) == i)) &&
                    // If single inst writes whole region then dont insert pseudo_kill
                    writeWholeRegion(bb, i, dst, fg.builder->getOptions()) == false)
                {
                    // Skip inserting a pseudo_kill if the previous (in program
                    // order: next in reverse order) instruction is already a
                    // pseudo_kill of the same declare.
                    bool foundKill = false;
                    INST_LIST::reverse_iterator nextIt = rit;
                    ++nextIt;
                    if (nextIt != bb->rend())
                    {
                        const G4_INST* nextInst = (*nextIt);
                        if (nextInst->isPseudoKill())
                        {
                            G4_DstRegRegion* nextDst = nextInst->getDst();

                            if (nextDst != NULL &&
                                nextDst->isDstRegRegion() &&
                                nextDst->getBase()->isRegAllocPartaker() &&
                                topdcl == GetTopDclFromRegRegion(nextDst))
                            {
                                foundKill = true;
                            }
                        }
                    }
                    if (!foundKill)
                    {
                        // All bytes of dst written at this point, so this is a good place to insert
                        // pseudo kill inst
                        pseudoKills.emplace_back(topdcl, rit);
                    }

                    // Reset gen
                    use_gen.set(dst->getBase()->asRegVar()->getId(), false);

                    // Set kill
                    use_kill.set(dst->getBase()->asRegVar()->getId(), true);
#ifdef DEBUG_VERBOSE_ON
                    DEBUG_VERBOSE("Found kill at inst ");
                    INST_LIST_ITER fwdIter = rit.base();
                    fwdIter--;
                    (*fwdIter)->emit_inst(std::cout, false, NULL);
                    DEBUG_VERBOSE(" // $" << (*fwdIter)->getCISAOff());
                    DEBUG_VERBOSE(std::endl);
#endif
                }
            }
        }

        // Same full-write kill detection for a condMod flag destination.
        if (mod && mod->getBase() && mod->getBase()->asRegVar()->isRegAllocPartaker())
        {
            G4_VarBase *flagReg = mod->getBase();
            G4_Declare* topdcl = flagReg->asRegVar()->getDeclare();
            unsigned id = topdcl->getRegVar()->getId();
            auto dstfootprint = &footprints[id];

            if (dstfootprint->getSize() != 0)
            {
                unsigned int first;
                const LocalLiveRange* topdclLR = nullptr;
                if ((dstfootprint->isAllset() ||
                    // Check whether local RA marked this range
                    // This may not be necessary as currently local RA is not performed for flags.
                    (topdcl &&
                        (topdclLR = gra.getLocalLR(topdcl)) &&
                        topdclLR->isLiveRangeLocal() &&
                        topdclLR->getFirstRef(first) == i)) &&
                    // If single inst writes whole region then dont insert pseudo_kill
                    writeWholeRegion(bb, i, flagReg) == false)
                {
                    // All bytes of dst written at this point, so this is a good place to insert
                    // pseudo kill inst
                    pseudoKills.emplace_back(topdcl, rit);

                    // Reset gen
                    use_gen.set(flagReg->asRegVar()->getId(), false);

                    // Set kill
                    use_kill.set(flagReg->asRegVar()->getId(), true);
#ifdef DEBUG_VERBOSE_ON
                    DEBUG_VERBOSE("Found kill at inst ");
                    INST_LIST_ITER fwdIter = rit.base();
                    fwdIter--;
                    (*fwdIter)->emit_inst(std::cout, false, NULL);
                    DEBUG_VERBOSE(" // $" << (*fwdIter)->getCISAOff());
                    DEBUG_VERBOSE(std::endl);
#endif
                }
            }
        }
    }

    //
    // Insert pseudo_kill nodes in BB
    //
    for (auto&& pseudoKill : pseudoKills)
    {
        // Step back over any existing pseudo_kills so the new one is placed
        // just before the recorded instruction's kill group.
        INST_LIST_ITER iterToInsert = pseudoKill.second.base();
        do
        {
            --iterToInsert;
        } while ((*iterToInsert)->isPseudoKill());
        G4_INST* killInst = fg.builder->createPseudoKill(pseudoKill.first, PseudoKillType::FromLiveness);
        bb->insertBefore(iterToInsert, killInst);
    }

    //
    // initialize use_in
    //
    use_in = use_gen;
}
2247
2248 //
2249 // use_out = use_in(s1) + use_in(s2) + ... where s1 s2 ... are the successors of bb
2250 // use_in = use_gen + (use_out - use_kill)
2251 //
contextFreeUseAnalyze(G4_BB * bb,bool isChanged)2252 bool LivenessAnalysis::contextFreeUseAnalyze(G4_BB* bb, bool isChanged)
2253 {
2254 bool changed;
2255
2256 unsigned bbid = bb->getId();
2257
2258 if (bb->Succs.empty()) // exit block
2259 {
2260 changed = false;
2261 }
2262 else if (isChanged)
2263 {
2264 // no need to update changed. This saves a memcpy
2265 for (auto succBB : bb->Succs)
2266 {
2267 use_out[bbid] |= use_in[succBB->getId()];
2268 }
2269 changed = true;
2270 }
2271 else
2272 {
2273 BitSet old = use_out[bbid];
2274 for (auto succBB : bb->Succs)
2275 {
2276 use_out[bbid] |= use_in[succBB->getId()];
2277 }
2278
2279 changed = (old != use_out[bbid]);
2280 }
2281
2282 //
2283 // in = gen + (out - kill)
2284 //
2285 use_in[bbid] = use_out[bbid];
2286 use_in[bbid] -= use_kill[bbid];
2287 use_in[bbid] |= use_gen[bbid];
2288
2289 return changed;
2290 }
2291
2292 //
2293 // def_in = def_out(p1) + def_out(p2) + ... where p1 p2 ... are the predecessors of bb
2294 // def_out |= def_in
2295 //
contextFreeDefAnalyze(G4_BB * bb,bool isChanged)2296 bool LivenessAnalysis::contextFreeDefAnalyze(G4_BB* bb, bool isChanged)
2297 {
2298 bool changed = false;
2299 unsigned bbid = bb->getId();
2300
2301 if (bb->Preds.empty())
2302 {
2303 changed = false;
2304 }
2305 else if (isChanged)
2306 {
2307 // no need to update changed. This saves a memcpy
2308 for (auto predBB : bb->Preds)
2309 {
2310 def_in[bbid] |= def_out[predBB->getId()];
2311 }
2312 changed = true;
2313 }
2314 else
2315 {
2316 BitSet old = def_in[bbid];
2317 for (auto predBB : bb->Preds)
2318 {
2319 def_in[bbid] |= def_out[predBB->getId()];
2320 }
2321 changed = (old != def_in[bbid]);
2322 }
2323
2324 def_out[bb->getId()] |= def_in[bb->getId()];
2325
2326 return changed;
2327 }
2328
//
// Debug helper: print one liveness BitSet per BB (e.g. use_in, def_out)
// to stderr under the label vname, in BB-id order, 10 bits per group.
//
void LivenessAnalysis::dump_bb_vector(char* vname, std::vector<BitSet>& vec)
{
    std::cerr << vname << "\n";
    for (BB_LIST_ITER it = fg.begin(); it != fg.end(); it++)
    {
        G4_BB* bb = (*it);
        std::cerr << "    BB" << bb->getId() << "\n";
        const BitSet& in = vec[bb->getId()];
        std::cerr << "    ";
        for (unsigned i = 0; i < in.getSize(); i+= 10)
        {
            //
            // dump 10 bits a group
            //
            for (unsigned j = i; j < in.getSize() && j < i+10; j++)
            {
                std::cerr << (in.isSet(j) ? "1" : "0");
            }
            std::cerr << " ";
        }
        std::cerr << "\n";
    }
}
2352
//
// Debug helper: print one liveness BitSet per function (FuncInfo) via
// DEBUG_VERBOSE under the label vname, 10 bits per group.
//
void LivenessAnalysis::dump_fn_vector(char* vname, std::vector<FuncInfo*>& fns, std::vector<BitSet>& vec)
{
    DEBUG_VERBOSE(vname << std::endl);
    for (std::vector<FuncInfo*>::iterator it = fns.begin(); it != fns.end(); it++)
    {
        FuncInfo* funcInfo = (*it);

        DEBUG_VERBOSE("    FN" << funcInfo->getId() << std::endl);
        const BitSet& in = vec[funcInfo->getId()];
        DEBUG_VERBOSE("    ");
        for (unsigned i = 0; i < in.getSize(); i += 10)
        {
            //
            // dump 10 bits a group
            //
            for (unsigned j = i; j < in.getSize() && j < i + 10; j++)
            {
                DEBUG_VERBOSE(in.isSet(j) ? "1" : "0");
            }
            DEBUG_VERBOSE(" ");
        }
        DEBUG_VERBOSE(std::endl);
    }
}
2377
2378 //
2379 // dump which vars are live at the entry of BB
2380 //
dump() const2381 void LivenessAnalysis::dump() const
2382 {
2383 for (auto bb : fg)
2384 {
2385 std::cerr << "BB" << bb->getId() << "'s live in: ";
2386 unsigned total_size = 0;
2387 auto dumpVar = [&total_size](G4_RegVar* var)
2388 {
2389 int size = var->getDeclare()->getTotalElems() * var->getDeclare()->getElemSize();
2390 std::cerr << var->getName() << "(" << size << "), ";
2391 total_size += size;
2392 };
2393
2394 unsigned count = 0;
2395 for (auto var : vars)
2396 {
2397 if (var->isRegAllocPartaker() && isLiveAtEntry(bb, var->getId()))
2398 {
2399 if (count++ % 10 == 0) std::cerr << "\n";
2400 dumpVar(var);
2401 }
2402 }
2403 std::cerr << "\nBB" << bb->getId() << "'s live in size: " << total_size / numEltPerGRF<Type_UB>() << "\n\n";
2404 std::cerr << "BB" << bb->getId() << "'s live out: ";
2405 total_size = 0;
2406 count = 0;
2407 for (auto var : vars)
2408 {
2409 if (var->isRegAllocPartaker() && isLiveAtExit(bb, var->getId()))
2410 {
2411 if (count++ % 10 == 0) std::cerr << "\n";
2412 dumpVar(var);
2413 }
2414 }
2415 std::cerr << "\nBB" << bb->getId() << "'s live out size: " << total_size / numEltPerGRF<Type_UB>()<< "\n\n";
2416 }
2417 }
2418
//
// Debug dump for a single BB: the RA-partaker variables that are live-in,
// live-out, use-through, and def-through, each printed as
// "name(bytes)[partakerTag], ", 10 per row.
//
void LivenessAnalysis::dumpBB(G4_BB *bb) const
{
    std::cerr << "\n\nBB" << bb->getId() << "'s live in: ";
    unsigned total_size = 0;
    // Prints one variable and accumulates its byte size into total_size.
    auto dumpVar = [&total_size](G4_RegVar* var)
    {
        int size = var->getDeclare()->getTotalElems() * var->getDeclare()->getElemSize();
        std::cerr << var->getName() << "(" << size << ")" << "[" << var->getRegAllocPartaker() << "], ";
        total_size += size;
    };

    unsigned count = 0;
    for (auto var : vars)
    {
        if (var->isRegAllocPartaker() && isLiveAtEntry(bb, var->getId()))
        {
            // 10 variables per output row
            if (count++ % 10 == 0) std::cerr << "\n";
            dumpVar(var);
        }
    }
    std::cerr << "\n\nBB" << bb->getId() << "'s live out: ";
    total_size = 0;
    count = 0;
    for (auto var : vars)
    {
        if (var->isRegAllocPartaker() && isLiveAtExit(bb, var->getId()))
        {
            if (count++ % 10 == 0) std::cerr << "\n";
            dumpVar(var);
        }
    }
    std::cerr << "\n\nBB" << bb->getId() << "'s use through: ";
    total_size = 0;
    count = 0;
    for (auto var : vars)
    {
        if (var->isRegAllocPartaker() && isUseThrough(bb, var->getId()))
        {
            if (count++ % 10 == 0) std::cerr << "\n";
            dumpVar(var);
        }
    }
    std::cerr << "\n\nBB" << bb->getId() << "'s def through: ";
    total_size = 0;
    count = 0;
    for (auto var : vars)
    {
        if (var->isRegAllocPartaker() && isDefThrough(bb, var->getId()))
        {
            if (count++ % 10 == 0) std::cerr << "\n";
            dumpVar(var);
        }
    }
}
2473
dumpLive(BitSet & live) const2474 void LivenessAnalysis::dumpLive(BitSet& live) const
2475 {
2476 auto dumpVar = [](G4_RegVar* var)
2477 {
2478 int size = var->getDeclare()->getTotalElems() * var->getDeclare()->getElemSize();
2479 std::cerr << var->getName() << "(" << size << ")" << "[" << var->getRegAllocPartaker() << "], ";
2480 };
2481
2482 unsigned count = 0;
2483 for (auto var : vars)
2484 {
2485 if (live.isSet(var->getId()))
2486 {
2487 if (count++ % 10 == 0) std::cerr << "\n";
2488 dumpVar(var);
2489 }
2490 }
2491 }
2492
2493 //
2494 // dump which vars are live at the entry of BB
2495 //
dumpGlobalVarNum() const2496 void LivenessAnalysis::dumpGlobalVarNum() const
2497 {
2498 BitSet global_def_out = BitSet(numVarId, false);
2499 BitSet global_use_in = BitSet(numVarId, false);
2500
2501 for (auto bb : fg)
2502 {
2503 BitSet global_in = use_in[bb->getId()];
2504 BitSet global_out = def_out[bb->getId()];
2505 global_in &= def_in[bb->getId()];
2506 global_use_in |= global_in;
2507 global_out &= use_out[bb->getId()];
2508 global_def_out |= global_out;
2509 }
2510
2511 int global_var_num = 0;
2512 for (auto var : vars)
2513 {
2514 if (var->isRegAllocPartaker())
2515 {
2516 if (global_use_in.isSet(var->getId()) || global_def_out.isSet(var->getId()))
2517 {
2518 global_var_num ++;
2519 }
2520 }
2521 }
2522 std::cerr << "total var num: " << numVarId << " global var num: " << global_var_num << "\n";
2523 }
2524
isEmptyLiveness() const2525 bool LivenessAnalysis::isEmptyLiveness() const
2526 {
2527 return numBBId == 0;
2528 }
2529
2530 //
2531 // return true if var is live at the entry of bb
2532 // check both use_in and def_in, if one condition fails then var is not in the live range
2533 //
isLiveAtEntry(const G4_BB * bb,unsigned var_id) const2534 bool LivenessAnalysis::isLiveAtEntry(const G4_BB* bb, unsigned var_id) const
2535 {
2536 return use_in[bb->getId()].isSet(var_id) && def_in[bb->getId()].isSet(var_id);
2537 }
2538 //
2539 // return true if var is live at the exit of bb
2540 //
isLiveAtExit(const G4_BB * bb,unsigned var_id) const2541 bool LivenessAnalysis::isLiveAtExit(const G4_BB* bb, unsigned var_id) const
2542 {
2543 return use_out[bb->getId()].isSet(var_id) && def_out[bb->getId()].isSet(var_id);
2544 }
2545
2546 //
2547 // return true if var is user through the bb
2548 //
isUseOut(const G4_BB * bb,unsigned var_id) const2549 bool LivenessAnalysis::isUseOut(const G4_BB* bb, unsigned var_id) const
2550 {
2551 return use_out[bb->getId()].isSet(var_id);
2552 }
2553
2554 //
2555 // return true if var is user through the bb
2556 //
isUseIn(const G4_BB * bb,unsigned var_id) const2557 bool LivenessAnalysis::isUseIn(const G4_BB* bb, unsigned var_id) const
2558 {
2559 return use_in[bb->getId()].isSet(var_id);
2560 }
2561
2562 //
2563 // return true if var is user through the bb
2564 //
isUseThrough(const G4_BB * bb,unsigned var_id) const2565 bool LivenessAnalysis::isUseThrough(const G4_BB* bb, unsigned var_id) const
2566 {
2567 return use_in[bb->getId()].isSet(var_id) && use_out[bb->getId()].isSet(var_id);
2568 }
2569 //
2570 // return true if var is live at the exit of bb
2571 //
isDefThrough(const G4_BB * bb,unsigned var_id) const2572 bool LivenessAnalysis::isDefThrough(const G4_BB* bb, unsigned var_id) const
2573 {
2574 return def_in[bb->getId()].isSet(var_id) && def_out[bb->getId()].isSet(var_id);
2575 }
2576
2577
markBlockLocalVar(G4_RegVar * var,unsigned bbId)2578 void GlobalRA::markBlockLocalVar(G4_RegVar* var, unsigned bbId)
2579 {
2580 G4_Declare* dcl = var->getDeclare()->getRootDeclare();
2581
2582 if (dcl->isInput() || dcl->isOutput())
2583 {
2584 setBBId(dcl, UINT_MAX - 1);
2585 }
2586 else
2587 {
2588 if (getBBId(dcl) == bbId)
2589 {
2590 // Do nothing.
2591 }
2592 else if (getBBId(dcl) == UINT_MAX)
2593 {
2594 setBBId(dcl, bbId);
2595 }
2596 else {
2597 setBBId(dcl, UINT_MAX - 1);
2598 }
2599 }
2600 }
2601
//
// Walk every instruction of every BB and, for each operand that touches a
// register variable, (a) classify its root declare as block-local or
// global via markBlockLocalVar and (b) create/update its LocalLiveRange
// bookkeeping (first reference, referencing BBs, reference counts).
//
void GlobalRA::markBlockLocalVars()
{
    for (auto bb : kernel.fg)
    {
        for (std::list<G4_INST*>::iterator it = bb->begin(); it != bb->end(); it++)
        {
            G4_INST* inst = *it;

            // Track direct dst references.

            G4_DstRegRegion* dst = inst->getDst();

            if (dst != NULL)
            {
                G4_DstRegRegion* dstRgn = dst->asDstRegRegion();

                if (dstRgn->getBase()->isRegVar()) {
                    markBlockLocalVar(dstRgn->getBase()->asRegVar(), bb->getId());

                    G4_Declare* topdcl = GetTopDclFromRegRegion(dst);
                    if (topdcl)
                    {
                        if (inst->isSend())
                        {
                            // Remember that this dcl appears in a send.
                            topdcl->setIsRefInSendDcl(true);
                        }

                        LocalLiveRange* lr = GetOrCreateLocalLiveRange(topdcl);
                        unsigned int startIdx;
                        // This def becomes the live range's first reference
                        // if none has been recorded yet.
                        if (lr->getFirstRef(startIdx) == NULL)
                        {
                            lr->setFirstRef(inst, 0);
                        }
                        lr->recordRef(bb);
                        recordRef(topdcl);
                    }
                }
            }

            // Track condition-modifier (flag) writes.
            G4_CondMod* condMod = inst->getCondMod();

            if (condMod != NULL &&
                condMod->getBase() != NULL)
            {
                if (condMod->getBase() && condMod->getBase()->isRegVar())
                {
                    markBlockLocalVar(condMod->getBase()->asRegVar(), bb->getId());

                    G4_Declare* topdcl = condMod->getBase()->asRegVar()->getDeclare();
                    if (topdcl)
                    {
                        LocalLiveRange* lr = GetOrCreateLocalLiveRange(topdcl);
                        unsigned int startIdx;
                        if (lr->getFirstRef(startIdx) == NULL)
                        {
                            lr->setFirstRef(inst, 0);
                        }
                        lr->recordRef(bb);
                        recordRef(topdcl);
                    }
                }
            }

            // Track direct src references.
            for (unsigned j = 0; j < G4_MAX_SRCS; j++)
            {
                G4_Operand* src = inst->getSrc(j);

                if (src == NULL)
                {
                    // Do nothing.
                }
                else if (src->isSrcRegRegion() && src->asSrcRegRegion()->getBase()->isRegVar())
                {
                    G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();

                    if (srcRgn->getBase()->isRegVar()) {
                        markBlockLocalVar(src->asSrcRegRegion()->getBase()->asRegVar(), bb->getId());

                        G4_Declare* topdcl = GetTopDclFromRegRegion(src);
                        if (topdcl)
                        {
                            if (inst->isSend())
                            {
                                topdcl->setIsRefInSendDcl(true);
                            }

                            LocalLiveRange* lr = GetOrCreateLocalLiveRange(topdcl);

                            lr->recordRef(bb);
                            recordRef(topdcl);
                            if (inst->isEOT())
                            {
                                // Flag ranges sourced by an EOT send.
                                lr->markEOT();
                            }
                        }
                    }
                }
                else if (src->isAddrExp())
                {
                    // Address-of operand: the referenced var may later be
                    // accessed indirectly, so tag the range accordingly.
                    G4_RegVar* addExpVar = src->asAddrExp()->getRegVar();
                    markBlockLocalVar(addExpVar, bb->getId());

                    G4_Declare* topdcl = addExpVar->getDeclare()->getRootDeclare();
                    MUST_BE_TRUE(topdcl != NULL, "Top dcl was null for addr exp opnd");

                    LocalLiveRange* lr = GetOrCreateLocalLiveRange(topdcl);
                    lr->recordRef(bb);
                    lr->markIndirectRef();
                    recordRef(topdcl);
                }
            }

            // Track predicate (flag) reads.
            G4_Operand* pred = inst->getPredicate();

            if (pred != NULL)
            {
                if (pred->getBase() && pred->getBase()->isRegVar())
                {
                    markBlockLocalVar(pred->getBase()->asRegVar(), bb->getId());
                    G4_Declare* topdcl = pred->getBase()->asRegVar()->getDeclare();
                    if (topdcl)
                    {
                        LocalLiveRange* lr = GetOrCreateLocalLiveRange(topdcl);
                        lr->recordRef(bb);
                        recordRef(topdcl);
                    }
                }
            }

            // Track all indirect references.
            const REGVAR_VECTOR& grfVec = pointsToAnalysis.getIndrUseVectorForBB(bb->getId());
            for (pointInfo grf : grfVec)
            {
                markBlockLocalVar(grf.var, bb->getId());
            }
        }
    }
}
2741
resetGlobalRAStates()2742 void GlobalRA::resetGlobalRAStates()
2743 {
2744 if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA))
2745 {
2746 // remove partial decls
2747 auto isPartialDcl = [](G4_Declare* dcl)
2748 {
2749 return dcl->getIsPartialDcl();
2750 };
2751
2752 kernel.Declares.erase(
2753 std::remove_if (kernel.Declares.begin(), kernel.Declares.end(), isPartialDcl),
2754 kernel.Declares.end());
2755 }
2756
2757 for (auto dcl : kernel.Declares)
2758 {
2759 //Reset all the local live ranges
2760 resetLocalLR(dcl);
2761
2762 if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA))
2763 {
2764 //Remove the split declares
2765 if (dcl->getIsSplittedDcl())
2766 {
2767 dcl->setIsSplittedDcl(false);
2768 clearSubDcl(dcl);
2769 }
2770 }
2771
2772 //Remove the bank assignment
2773 if (builder.getOption(vISA_LocalBankConflictReduction) &&
2774 builder.hasBankCollision())
2775 {
2776 setBankConflict(dcl, BANK_CONFLICT_NONE);
2777 }
2778 clearBundleConflictDcl(dcl);
2779 }
2780
2781 return;
2782 }
2783
2784 //
2785 // Mark block local (temporary) variables.
2786 //
markGraphBlockLocalVars()2787 void GlobalRA::markGraphBlockLocalVars()
2788 {
2789 // Clear stale LocalLiveRange* first to avoid double ref counting
2790 clearStaleLiveRanges();
2791
2792 //Create live ranges and record the reference info
2793 markBlockLocalVars();
2794
2795 #ifdef DEBUG_VERBOSE_ON
2796 std::cout << "\t--LOCAL VARIABLES--\n";
2797 for (auto dcl : kernel.Declares)
2798 {
2799 LocalLiveRange* topdclLR = getLocalLR(dcl);
2800
2801 if (topdclLR != nullptr &&
2802 topdclLR->isLiveRangeLocal())
2803 {
2804 std::cout << dcl->getName() << ",\t";
2805 }
2806 }
2807 std::cout << "\n";
2808 #endif
2809 }
2810
2811 //
2812 // Pre-assign phy regs to stack call function return variable as per ABI.
2813 //
setABIForStackCallFunctionCalls()2814 void FlowGraph::setABIForStackCallFunctionCalls()
2815 {
2816 // For each G4_pseudo_fcall inst, create dst of GRF type
2817 // with physical register 1.0 pre-assigned to it.
2818 // Similarly, for G4_pseudo_fret create src of GRF type
2819 // with physical register 1.0 pre-assigned to it.
2820 // Each will use 2 dwords of r1.0.
2821 int call_id = 0, ret_id = 0;
2822
2823 for (auto bb : *this)
2824 {
2825 if (bb->isEndWithFCall())
2826 {
2827 const char* n = builder->getNameString(mem, 25, "FCALL_RET_LOC_%d", call_id++);
2828
2829 G4_INST* fcall = bb->back();
2830 // Set call dst to r125.0
2831 G4_Declare* r1_dst = builder->createDeclareNoLookup(n, G4_GRF, numEltPerGRF<Type_UD>(), 1, Type_UD);
2832 r1_dst->getRegVar()->setPhyReg(builder->phyregpool.getGreg(builder->kernel.getFPSPGRF()), IR_Builder::SubRegs_Stackcall::Ret_IP);
2833 G4_DstRegRegion* dstRgn = builder->createDst(r1_dst->getRegVar(), 0, 0, 1, Type_UD);
2834 fcall->setDest(dstRgn);
2835 }
2836
2837 if (bb->isEndWithFRet())
2838 {
2839 const char* n = builder->getNameString(mem, 25, "FRET_RET_LOC_%d", ret_id++);
2840 G4_INST* fret = bb->back();
2841 const RegionDesc* rd = builder->createRegionDesc(2, 2, 1);
2842 G4_Declare* r1_src = builder->createDeclareNoLookup(n, G4_INPUT, numEltPerGRF<Type_UD>(), 1, Type_UD);
2843 r1_src->getRegVar()->setPhyReg(builder->phyregpool.getGreg(builder->kernel.getFPSPGRF()), IR_Builder::SubRegs_Stackcall::Ret_IP);
2844 G4_Operand* srcRgn = builder->createSrc(r1_src->getRegVar(), 0, 0, rd, Type_UD);
2845 fret->setSrc(srcRgn, 0);
2846 if (fret->getExecSize() == g4::SIMD1)
2847 {
2848 // due to <2;2,1> regioning we must update exec size as well
2849 fret->setExecSize(g4::SIMD2);
2850 }
2851 if (builder->getOption(vISA_GenerateDebugInfo))
2852 {
2853 pKernel->getKernelDebugInfo()->setFretVar(GetTopDclFromRegRegion(fret->getSrc(0)));
2854 }
2855 }
2856 }
2857 }
2858
2859 // Function to verify RA results
verifyRA(LivenessAnalysis & liveAnalysis)2860 void GlobalRA::verifyRA(LivenessAnalysis & liveAnalysis)
2861 {
2862 for (auto bb : kernel.fg)
2863 {
2864 // Verify PREG assignment
2865 for (auto inst : *bb)
2866 {
2867 G4_DstRegRegion* dst = inst->getDst();
2868 if (dst != NULL &&
2869 dst->getBase()->isRegAllocPartaker())
2870 {
2871 MUST_BE_TRUE(dst->getBase()->asRegVar()->getPhyReg(), "RA verification error: No PREG assigned for variable " << GetTopDclFromRegRegion(dst)->getName() << "!");
2872 }
2873
2874 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
2875 {
2876 G4_Operand* src = inst->getSrc(j);
2877 if (src != NULL &&
2878 src->isSrcRegRegion() &&
2879 src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
2880 {
2881 MUST_BE_TRUE(src->asSrcRegRegion()->getBase()->asRegVar()->getPhyReg(),
2882 "RA verification error: No PREG assigned for variable " << GetTopDclFromRegRegion(src->asSrcRegRegion())->getName() << "!");
2883 }
2884 }
2885 }
2886
2887 int numGRF = kernel.getNumRegTotal();
2888 // Verify Live-in
2889 std::map<uint32_t, G4_Declare*> LiveInRegMap;
2890 std::map<uint32_t, G4_Declare*>::iterator LiveInRegMapIt;
2891 std::vector<uint32_t> liveInRegVec(numGRF * numEltPerGRF<Type_UW>(), UINT_MAX);
2892
2893 for (G4_Declare* dcl : kernel.Declares)
2894 {
2895 if (dcl->getAliasDeclare() != nullptr)
2896 continue;
2897
2898 if (dcl->getRegVar()->isRegAllocPartaker())
2899 {
2900 G4_RegVar* var = dcl->getRegVar();
2901 uint32_t varID = var->getId();
2902 if (liveAnalysis.isLiveAtEntry(bb, dcl->getRegVar()->getId()))
2903 {
2904 MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid preg assignment for variable " << dcl->getName() << "!");
2905
2906 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
2907 uint32_t regOff = var->getPhyRegOff();
2908
2909 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
2910 (regOff * dcl->getElemSize()) / G4_WSIZE;
2911 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
2912 {
2913 LiveInRegMapIt = LiveInRegMap.find(idx);
2914 if (liveInRegVec[idx] != UINT_MAX)
2915 {
2916 MUST_BE_TRUE(LiveInRegMapIt != LiveInRegMap.end(), "RA verification error: Invalid entry in LiveInRegMap!");
2917 if (dcl->isInput())
2918 {
2919 DEBUG_MSG("RA verification warning: Found conflicting input variables: " << dcl->getName()
2920 << " and " << (*LiveInRegMapIt).second->getName() << " assigned to r" << regNum
2921 << "." << regOff << "!\n");
2922 liveInRegVec[idx] = varID;
2923 LiveInRegMapIt->second = dcl;
2924 }
2925 else
2926 {
2927 DEBUG_MSG("RA verification warning: Found conflicting live-in variables: " << dcl->getName()
2928 << " and " << LiveInRegMapIt->second->getName() << " assigned to r" <<
2929 regNum << "." << regOff << "!\n");
2930 }
2931
2932 }
2933 else
2934 {
2935 liveInRegVec[idx] = varID;
2936 MUST_BE_TRUE(LiveInRegMapIt == LiveInRegMap.end(), "RA verification error: Invalid entry in LiveInRegMap!");
2937 LiveInRegMap.emplace(idx, dcl);
2938 }
2939 }
2940 }
2941 }
2942 }
2943
2944 // Verify Live-out
2945 G4_Declare *ret = kernel.fg.builder->getStackCallRet();
2946 std::map<uint32_t, G4_Declare*> liveOutRegMap;
2947 std::map<uint32_t, G4_Declare*>::iterator liveOutRegMapIt;
2948 std::vector<uint32_t> liveOutRegVec(numGRF * numEltPerGRF<Type_UW>(), UINT_MAX);
2949
2950 for (G4_Declare* dcl : kernel.Declares)
2951 {
2952 if (dcl->getAliasDeclare() != NULL)
2953 continue;
2954 if (dcl->getRegVar()->isRegAllocPartaker())
2955 {
2956 G4_RegVar* var = dcl->getRegVar();
2957 uint32_t varID = var->getId();
2958 if (liveAnalysis.isLiveAtExit(bb, varID))
2959 {
2960 MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid preg assignment for variable " << dcl->getName() << "!");
2961
2962 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
2963 uint32_t regOff = var->getPhyRegOff();
2964
2965 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
2966 (regOff * dcl->getElemSize()) / G4_WSIZE;
2967 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
2968 {
2969 liveOutRegMapIt = liveOutRegMap.find(idx);
2970 if (liveOutRegVec[idx] != UINT_MAX)
2971 {
2972 MUST_BE_TRUE(liveOutRegMapIt != liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
2973 if (dcl->isInput())
2974 {
2975 DEBUG_MSG("RA verification warning: Found conflicting input variables: " << dcl->getName()
2976 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" << regNum
2977 << "." << regOff << "!\n");
2978 liveOutRegVec[idx] = varID;
2979 liveOutRegMapIt->second = dcl;
2980 }
2981 else
2982 {
2983 DEBUG_MSG("RA verification warning: Found conflicting live-out variables: " << dcl->getName()
2984 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
2985 regNum << "." << regOff << "!\n");
2986 }
2987
2988 }
2989 else
2990 {
2991 liveOutRegVec[idx] = varID;
2992 MUST_BE_TRUE(liveOutRegMapIt == liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
2993 liveOutRegMap.emplace(idx, dcl);
2994 }
2995 }
2996 }
2997 }
2998 }
2999
3000 for (INST_LIST::reverse_iterator rit = bb->rbegin(); rit != bb->rend(); ++rit)
3001 {
3002 G4_INST* inst = (*rit);
3003 INST_LIST_RITER ritNext = rit;
3004 ritNext++;
3005 G4_INST* rNInst = nullptr;
3006 if (ritNext != bb->rend())
3007 {
3008 rNInst = (*ritNext);
3009 }
3010
3011 if (inst->isPseudoKill())
3012 {
3013 continue;
3014 }
3015 G4_DstRegRegion* dst = inst->getDst();
3016 G4_DstRegRegion* rNDst = nullptr;
3017
3018 if (rNInst && rNInst->isPseudoKill())
3019 {
3020 rNDst = rNInst->getDst();
3021 }
3022
3023 //
3024 // verify dst operand
3025 //
3026 if (dst != NULL)
3027 {
3028 if (dst->getBase()->isRegAllocPartaker())
3029 {
3030 G4_DstRegRegion* dstrgn = dst;
3031 G4_RegVar* var = dstrgn->getBase()->asRegVar();
3032 uint32_t varID = var->getId();
3033 G4_Declare* dcl = GetTopDclFromRegRegion(dstrgn);
3034 G4_Declare* rNDcl = nullptr;
3035 if (rNDst != nullptr)
3036 {
3037 rNDcl = GetTopDclFromRegRegion(rNDst);
3038 }
3039 MUST_BE_TRUE(dcl != nullptr, "Null declare found");
3040 var = dcl->getRegVar();
3041
3042 MUST_BE_TRUE(var->getId() == varID, "RA verification error: Invalid regVar ID!");
3043 MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid dst reg!");
3044
3045 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3046 uint32_t regOff = var->getPhyRegOff();
3047
3048 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
3049 (regOff * dcl->getElemSize()) / G4_WSIZE;
3050 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
3051 {
3052 liveOutRegMapIt = liveOutRegMap.find(idx);
3053 if (liveOutRegVec[idx] == UINT_MAX)
3054 {
3055 if (!inst->isPseudoKill())
3056 {
3057 MUST_BE_TRUE(liveOutRegMapIt == liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3058 DEBUG_MSG("RA verification warning: Found unused variable " << dcl->getName() << "!\n");
3059 }
3060 }
3061 else
3062 {
3063 MUST_BE_TRUE(liveOutRegMapIt != liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3064 if (liveOutRegVec[idx] != varID)
3065 {
3066 const BitSet& indr_use = liveAnalysis.indr_use[bb->getId()];
3067
3068 if (strstr(dcl->getName(), GlobalRA::StackCallStr) != NULL)
3069 {
3070 DEBUG_MSG("RA verification warning: Found conflicting stackCall variable: " << dcl->getName()
3071 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3072 regNum << "." << regOff << "!\n");
3073 }
3074 else if (indr_use.isSet(liveOutRegVec[idx]) == true)
3075 {
3076 MUST_BE_TRUE(false, "RA verification warning: Found conflicting indirect variables: " << dcl->getName()
3077 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3078 regNum << "." << regOff << "!\n");
3079 }
3080 else
3081 {
3082 if (!inst->isPseudoKill())
3083 {
3084 DEBUG_MSG("RA verification error: Found conflicting variables: " << dcl->getName()
3085 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3086 regNum << "." << regOff << "!\n");
3087 }
3088 }
3089 }
3090
3091 if (liveAnalysis.writeWholeRegion(bb, inst, dstrgn, kernel.getOptions()) ||
3092 inst->isPseudoKill() || rNDcl == dcl) {
3093 liveOutRegVec[idx] = UINT_MAX;
3094 MUST_BE_TRUE(liveOutRegMapIt != liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3095 liveOutRegMap.erase(liveOutRegMapIt);
3096 }
3097 }
3098 }
3099 }
3100 else if (dst->getRegAccess() == IndirGRF)
3101 {
3102 G4_DstRegRegion* dstrgn = dst;
3103 G4_Declare* addrdcl = GetTopDclFromRegRegion(dstrgn);
3104 G4_RegVar* ptvar = NULL;
3105 int vid = 0;
3106
3107 while ((ptvar = pointsToAnalysis.getPointsTo(addrdcl->getRegVar(), vid++)) != NULL)
3108 {
3109 uint32_t varID = ptvar->getId();
3110 G4_Declare* dcl = ptvar->getDeclare();
3111 MUST_BE_TRUE(dcl != nullptr, "Null declare found");
3112 while (dcl->getAliasDeclare())
3113 {
3114 dcl = dcl->getAliasDeclare();
3115 }
3116 G4_RegVar* var = dcl->getRegVar();
3117
3118 MUST_BE_TRUE(var->getId() == varID, "RA verification error: Invalid regVar ID!");
3119 MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid dst reg!");
3120
3121 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3122 uint32_t regOff = var->getPhyRegOff();
3123
3124 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
3125 (regOff * dcl->getElemSize()) / G4_WSIZE;
3126 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
3127 {
3128 liveOutRegMapIt = liveOutRegMap.find(idx);
3129 if (liveOutRegVec[idx] == UINT_MAX)
3130 {
3131 MUST_BE_TRUE(liveOutRegMapIt == liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3132 DEBUG_MSG("RA verification warning: Found unused variable " << dcl->getName() << "!\n");
3133 }
3134 else
3135 {
3136 MUST_BE_TRUE(liveOutRegMapIt != liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3137 if (liveOutRegVec[idx] != varID)
3138 {
3139 const BitSet& indr_use = liveAnalysis.indr_use[bb->getId()];
3140
3141 if (strstr(dcl->getName(), GlobalRA::StackCallStr) != NULL)
3142 {
3143 DEBUG_MSG("RA verification warning: Found conflicting stackCall variables: " << dcl->getName()
3144 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3145 regNum << "." << regOff << "!\n");
3146 }
3147 else if (indr_use.isSet(liveOutRegVec[idx]) == true)
3148 {
3149 MUST_BE_TRUE(false, "RA verification warning: Found conflicting indirect variables: " << dcl->getName()
3150 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3151 regNum << "." << regOff << "!\n");
3152 }
3153 else
3154 {
3155 MUST_BE_TRUE(false, "RA verification error: Found conflicting variables: " << dcl->getName()
3156 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3157 regNum << "." << regOff << "!\n");
3158 }
3159 }
3160 }
3161 }
3162 }
3163 }
3164 }
3165
3166 if (inst->opcode() == G4_pseudo_fcall)
3167 {
3168 if (ret != NULL && ret->getRegVar() != NULL)
3169 {
3170 G4_RegVar* var = ret->getRegVar();
3171 uint32_t varID = var->getId();
3172 MUST_BE_TRUE(var->getId() == varID, "RA verification error: Invalid regVar ID!");
3173 MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid dst reg!");
3174
3175 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3176 uint32_t regOff = var->getPhyRegOff();
3177
3178 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
3179 regOff * ret->getElemSize() / G4_WSIZE;
3180 for (uint32_t i = 0; i < ret->getWordSize(); ++i, ++idx)
3181 {
3182 liveOutRegMapIt = liveOutRegMap.find(idx);
3183 liveOutRegVec[idx] = UINT_MAX;
3184 MUST_BE_TRUE(liveOutRegMapIt != liveOutRegMap.end(),
3185 "RA verification error: Invalid entry in liveOutRegMap!");
3186 liveOutRegMap.erase(liveOutRegMapIt);
3187 }
3188 }
3189 }
3190
3191 //
3192 // verify each source operand
3193 //
3194 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
3195 {
3196 G4_Operand* src = inst->getSrc(j);
3197 if (src == NULL)
3198 {
3199 continue;
3200 }
3201 if (src->isAddrExp() && src->asAddrExp()->isRegAllocPartaker())
3202 {
3203 G4_RegVar* var = src->asAddrExp()->getRegVar();
3204 uint32_t varID = UINT_MAX;
3205 G4_Declare* dcl = NULL;
3206
3207 varID = var->getId();
3208 dcl = var->getDeclare();
3209
3210 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3211 uint32_t regOff = var->getPhyRegOff();
3212
3213 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
3214 (regOff * dcl->getElemSize()) / G4_WSIZE;
3215 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
3216 {
3217 liveOutRegMapIt = liveOutRegMap.find(idx);
3218 if (liveOutRegVec[idx] == UINT_MAX)
3219 {
3220 liveOutRegVec[idx] = varID;
3221 MUST_BE_TRUE(liveOutRegMapIt == liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3222 liveOutRegMap.emplace(idx, dcl);
3223 }
3224 else
3225 {
3226 if (liveOutRegVec[idx] != varID)
3227 {
3228 const BitSet& indr_use = liveAnalysis.indr_use[bb->getId()];
3229
3230 if (dcl->isInput())
3231 {
3232 DEBUG_MSG("RA verification warning: Found conflicting input variables: " << dcl->getName()
3233 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" << regNum
3234 << "." << regOff << "!\n");
3235 liveOutRegVec[idx] = varID;
3236 liveOutRegMapIt->second = dcl;
3237 }
3238 else if (strstr(dcl->getName(), GlobalRA::StackCallStr) != NULL)
3239 {
3240 DEBUG_MSG("RA verification warning: Found conflicting stackCall variables: " << dcl->getName()
3241 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3242 regNum << "." << regOff << "!\n");
3243 }
3244 else if (indr_use.isSet(liveOutRegVec[idx]) == true)
3245 {
3246 MUST_BE_TRUE(false, "RA verification warning: Found conflicting indirect variables: " << dcl->getName()
3247 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3248 regNum << "." << regOff << "!\n");
3249 }
3250 else
3251 {
3252 INST_LIST::reverse_iterator succ = rit;
3253 ++succ;
3254 bool idMismatch = false;
3255 G4_Declare* topdcl = GetTopDclFromRegRegion((*succ)->getDst());
3256 if (topdcl != nullptr &&
3257 liveOutRegVec[idx] != topdcl->getRegVar()->getId())
3258 {
3259 idMismatch = true;
3260 }
3261 if (succ == bb->rend() ||
3262 !(*succ)->isPseudoKill() ||
3263 (*succ)->getDst() == NULL ||
3264 idMismatch)
3265 {
3266 MUST_BE_TRUE(false, "RA verification error: Found conflicting variables: " << dcl->getName()
3267 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3268 regNum << "." << regOff << "!\n");
3269 }
3270 }
3271 }
3272 }
3273 }
3274 }
3275 else if (src->isSrcRegRegion() && src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
3276 {
3277 G4_SrcRegRegion* srcrgn = src->asSrcRegRegion();
3278 G4_RegVar* var = srcrgn->getBase()->asRegVar();
3279 uint32_t varID = var->getId();
3280 G4_Declare* dcl = GetTopDclFromRegRegion(srcrgn);
3281 var = dcl->getRegVar();
3282 MUST_BE_TRUE(var->getId() == varID, "RA verification error: Invalid regVar ID!");
3283 MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid dst reg!");
3284
3285 if (!inst->isLifeTimeEnd())
3286 {
3287 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3288 uint32_t regOff = var->getPhyRegOff();
3289
3290 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
3291 (regOff * dcl->getElemSize()) / G4_WSIZE;
3292 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
3293 {
3294 liveOutRegMapIt = liveOutRegMap.find(idx);
3295 if (liveOutRegVec[idx] == UINT_MAX)
3296 {
3297 liveOutRegVec[idx] = varID;
3298 MUST_BE_TRUE(liveOutRegMapIt == liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3299 liveOutRegMap.emplace(idx, dcl);
3300 }
3301 else
3302 {
3303 if (liveOutRegVec[idx] != varID)
3304 {
3305 const BitSet& indr_use = liveAnalysis.indr_use[bb->getId()];
3306
3307 if (dcl->isInput())
3308 {
3309 DEBUG_MSG("RA verification warning: Found conflicting input variables: " << dcl->getName()
3310 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" << regNum
3311 << "." << regOff << "!\n");
3312 liveOutRegVec[idx] = varID;
3313 liveOutRegMapIt->second = dcl;
3314 }
3315 else if (strstr(dcl->getName(), GlobalRA::StackCallStr) != NULL)
3316 {
3317 DEBUG_MSG("RA verification warning: Found conflicting stackCall variables: " << dcl->getName()
3318 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3319 regNum << "." << regOff << "!\n");
3320 }
3321 else if (indr_use.isSet(liveOutRegVec[idx]) == true)
3322 {
3323 MUST_BE_TRUE(false, "RA verification warning: Found conflicting indirect variables: " << dcl->getName()
3324 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3325 regNum << "." << regOff << "!\n");
3326 }
3327 else
3328 {
3329 INST_LIST::reverse_iterator succ = rit;
3330 ++succ;
3331 bool idMismatch = false;
3332 G4_Declare* topdcl = GetTopDclFromRegRegion((*succ)->getDst());
3333 if (topdcl != nullptr &&
3334 liveOutRegVec[idx] != topdcl->getRegVar()->getId())
3335 {
3336 idMismatch = true;
3337 }
3338 if (succ == bb->rbegin() ||
3339 !(*succ)->isPseudoKill() ||
3340 (*succ)->getDst() == NULL ||
3341 idMismatch)
3342 {
3343 DEBUG_MSG("RA verification error: Found conflicting variables: " << dcl->getName()
3344 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3345 regNum << "." << regOff << "!\n");
3346 }
3347 }
3348 }
3349 }
3350 }
3351 }
3352 else
3353 {
3354 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3355 uint32_t regOff = var->getPhyRegOff();
3356
3357 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
3358 (regOff * dcl->getElemSize()) / G4_WSIZE;
3359 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
3360 {
3361 if (liveOutRegVec[idx] != UINT_MAX)
3362 {
3363 liveOutRegMapIt = liveOutRegMap.find(idx);
3364 MUST_BE_TRUE(liveOutRegMapIt != liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3365 MUST_BE_TRUE(false, "RA verification error: Found live variable: " << dcl->getName()
3366 << " after lifetime_end " << " assigned to r" << regNum << "." << regOff << "!\n");
3367 }
3368 }
3369 }
3370
3371 // verify EOT source
3372 if (inst->isEOT() && kernel.fg.builder->hasEOTGRFBinding())
3373 {
3374 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3375 MUST_BE_TRUE(regNum >= 112,
3376 "RA verification error: EOT source: " << dcl->getName()
3377 << " is assigned to r." << regNum << "\n");
3378 }
3379 }
3380 else if (src->isSrcRegRegion() && src->getRegAccess() == IndirGRF)
3381 {
3382 G4_SrcRegRegion* srcrgn = src->asSrcRegRegion();
3383 G4_Declare* addrdcl = GetTopDclFromRegRegion(srcrgn);
3384 G4_RegVar* ptvar = NULL;
3385 int vid = 0;
3386
3387 while ((ptvar = pointsToAnalysis.getPointsTo(addrdcl->getRegVar(), vid++)) != NULL)
3388 {
3389 uint32_t varID = ptvar->getId();
3390 G4_Declare* dcl = ptvar->getDeclare()->getRootDeclare();
3391 G4_RegVar* var = dcl->getRegVar();
3392
3393 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
3394 uint32_t regOff = var->getPhyRegOff();
3395
3396 uint32_t idx = regNum * numEltPerGRF<Type_UW>() +
3397 (regOff * dcl->getElemSize()) / G4_WSIZE;
3398 for (uint32_t i = 0; i < dcl->getWordSize(); ++i, ++idx)
3399 {
3400 liveOutRegMapIt = liveOutRegMap.find(idx);
3401 if (liveOutRegVec[idx] == UINT_MAX)
3402 {
3403 liveOutRegVec[idx] = varID;
3404 MUST_BE_TRUE(liveOutRegMapIt == liveOutRegMap.end(), "RA verification error: Invalid entry in liveOutRegMap!");
3405 liveOutRegMap.emplace(idx, dcl);
3406 }
3407 else
3408 {
3409 if (liveOutRegVec[idx] != varID)
3410 {
3411 const BitSet& indr_use = liveAnalysis.indr_use[bb->getId()];
3412
3413 if (dcl->isInput())
3414 {
3415 DEBUG_MSG("RA verification warning: Found conflicting input variables: " << dcl->getName()
3416 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" << regNum
3417 << "." << regOff << "!\n");
3418 liveOutRegVec[idx] = varID;
3419 liveOutRegMapIt->second = dcl;
3420 }
3421 else if (indr_use.isSet(liveOutRegVec[idx]) == true)
3422 {
3423 MUST_BE_TRUE(false, "RA verification warning: Found conflicting indirect variables: " << dcl->getName()
3424 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3425 regNum << "." << regOff << "!\n");
3426 }
3427 else
3428 {
3429 MUST_BE_TRUE(false, "RA verification error: Found conflicting variables: " << dcl->getName()
3430 << " and " << liveOutRegMapIt->second->getName() << " assigned to r" <<
3431 regNum << "." << regOff << "!\n");
3432 }
3433 }
3434 }
3435 }
3436 }
3437 }
3438 }
3439 }
3440 }
3441 }
3442
recordRAStats(IR_Builder & builder,G4_Kernel & kernel,int RAStatus)3443 static void recordRAStats(IR_Builder& builder,
3444 G4_Kernel& kernel,
3445 int RAStatus)
3446 {
3447 #if COMPILER_STATS_ENABLE
3448 CompilerStats &Stats = builder.getcompilerStats();
3449 int SimdSize = kernel.getSimdSize();
3450 if (RAStatus == VISA_SUCCESS)
3451 {
3452 Stats.SetFlag("IsRAsuccessful", SimdSize);
3453 switch (kernel.getRAType())
3454 {
3455 case RA_Type::TRIVIAL_BC_RA:
3456 case RA_Type::TRIVIAL_RA:
3457 Stats.SetFlag("IsTrivialRA", SimdSize);
3458 break;
3459 case RA_Type::LOCAL_ROUND_ROBIN_BC_RA:
3460 case RA_Type::LOCAL_ROUND_ROBIN_RA:
3461 case RA_Type::LOCAL_FIRST_FIT_BC_RA:
3462 case RA_Type::LOCAL_FIRST_FIT_RA:
3463 Stats.SetFlag("IsLocalRA", SimdSize);
3464 break;
3465 case RA_Type::HYBRID_BC_RA:
3466 case RA_Type::HYBRID_RA:
3467 Stats.SetFlag("IsHybridRA", SimdSize);
3468 break;
3469 case RA_Type::GRAPH_COLORING_RR_RA:
3470 case RA_Type::GRAPH_COLORING_FF_RA:
3471 case RA_Type::GRAPH_COLORING_RR_BC_RA:
3472 case RA_Type::GRAPH_COLORING_FF_BC_RA:
3473 case RA_Type::GRAPH_COLORING_SPILL_RR_RA:
3474 case RA_Type::GRAPH_COLORING_SPILL_FF_RA:
3475 case RA_Type::GRAPH_COLORING_SPILL_RR_BC_RA:
3476 case RA_Type::GRAPH_COLORING_SPILL_FF_BC_RA:
3477 case RA_Type::GLOBAL_LINEAR_SCAN_RA:
3478 case RA_Type::GLOBAL_LINEAR_SCAN_BC_RA:
3479 Stats.SetFlag("IsGlobalRA", SimdSize);
3480 break;
3481 case RA_Type::UNKNOWN_RA:
3482 break;
3483 default:
3484 assert(0 && "Incorrect RA type");
3485 }
3486 }
3487 #endif // COMPILER_STATS_ENABLE
3488 }
3489
replaceSSO(G4_Kernel & kernel)3490 static void replaceSSO(G4_Kernel& kernel)
3491 {
3492 // Invoke function only for XeHP_SDV+
3493 // Replace SSO with r126.7 (scratch reg)
3494
3495 auto dst = kernel.fg.builder->createDst(
3496 kernel.fg.getScratchRegDcl()->getRegVar(), 0, 7, 1, Type_UD);
3497 for (auto bb : kernel.fg)
3498 {
3499 for (auto instIt = bb->begin(); instIt != bb->end(); instIt++)
3500 {
3501 auto inst = (*instIt);
3502 if (inst->getDst() &&
3503 inst->getDst()->getTopDcl() == kernel.fg.builder->getSpillSurfaceOffset())
3504 {
3505 if (kernel.fg.getIsStackCallFunc())
3506 {
3507 bb->erase(instIt);
3508 }
3509 else
3510 inst->setDest(dst);
3511
3512 // Also update scratch msg dcl to be an alias
3513 kernel.fg.builder->getSpillSurfaceOffset()->setAliasDeclare(
3514 kernel.fg.getScratchRegDcl(), 7 * TypeSize(Type_UD));
3515
3516 return;
3517 }
3518 }
3519 }
3520 }
3521
3522
// Top-level register allocation driver for one kernel/function.
// Prepares the flow graph (stack-call ABI, loops, block IDs), runs
// points-to analysis, then performs graph-coloring RA via GlobalRA.
// Returns VISA_SUCCESS or the failure status from coloring.
// NOTE(review): the steps below are order-dependent — CFG-mutating
// passes must all run before block IDs / pred-succ links are frozen.
int regAlloc(IR_Builder& builder, PhyRegPool& regPool, G4_Kernel& kernel)
{
    // Reset per-function frame bookkeeping before any ABI setup.
    kernel.fg.callerSaveAreaOffset = kernel.fg.calleeSaveAreaOffset = kernel.fg.frameSizeInOWord = 0;

    // This must be done before Points-to analysis as it may modify CFG and add new BB!
    if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
    {
        kernel.fg.setABIForStackCallFunctionCalls();
        kernel.fg.addFrameSetupDeclares(builder, regPool);
        kernel.fg.normalizeFlowGraph();
        if (builder.getPlatform() >= XeHP_SDV)
            // XeHP_SDV+: rehome the spill-surface offset into the scratch reg.
            replaceSSO(kernel);
    }

    if (kernel.getOption(vISA_DoSplitOnSpill))
    {
        // loop computation is done here because we may need to add
        // new preheader BBs. later parts of RA assume no change
        // to CFG structure.
        kernel.fg.getLoops().computePreheaders();
    }

    kernel.fg.reassignBlockIDs();
    // do it once for whole RA pass. Assumption is RA should not modify CFG at all
    kernel.fg.setPhysicalPredSucc();

    if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D)
    {
        kernel.fg.findNaturalLoops();

#ifdef DEBUG_VERBOSE_ON
        for (auto backedge : kernel.fg.backEdges)
        {
            DEBUG_VERBOSE("\n===> Found natural loop: ");
            for (auto block : kernel.fg.naturalLoops[backedge])
            {
                DEBUG_VERBOSE("\tBB" << block->getId());
            }
            DEBUG_VERBOSE(std::endl);
        }
#endif
    }

    //
    // Perform flow-insensitive points-to-analysis.
    //
    PointsToAnalysis pointsToAnalysis(kernel.Declares, kernel.fg.getNumBB());
    pointsToAnalysis.doPointsToAnalysis(kernel.fg);

    GlobalRA gra(kernel, regPool, pointsToAnalysis);

    //
    // insert pseudo save/restore return address so that reg alloc
    // can assign registers to hold the return addresses
    //
    gra.assignLocForReturnAddr();

    //FIXME: here is a temp WA
    // Disable local RA for 3D kernels that contain subroutines, unless
    // hybrid-RA-with-spill applies (and no stack calls are involved).
    bool hybridWithSpill = builder.getOption(vISA_HybridRAWithSpill) && !(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc());
    if (kernel.fg.funcInfoTable.size() > 0 &&
        kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D && !hybridWithSpill)
    {
        kernel.getOptions()->setOption(vISAOptions::vISA_LocalRA, false);
    }

    //
    // Mark block local variables for the whole graph prior to performing liveness analysis.
    // 1. Is required for flag/address register allocation
    // 2. We must make sure the reference number, reference BB(which will be identified in local RA as well) happens only one time.
    //    Otherwise, there will be correctness issue
    gra.markGraphBlockLocalVars();

    //Remove the un-referenced declares
    gra.removeUnreferencedDcls();

    if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
    {
        kernel.fg.markScope();
    }

    //
    // perform graph coloring for whole program
    //

    if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
    {
        kernel.fg.addSaveRestorePseudoDeclares(builder);
        gra.fixSrc0IndirFcall();
    }

    int status = gra.coloringRegAlloc();

    // Report GTPin's scratch usage regardless of RA outcome.
    if (auto jitInfo = builder.getJitInfo())
    {
        jitInfo->numBytesScratchGtpin = kernel.getGTPinData()->getNumBytesScratchUse();
    }

    recordRAStats(builder, kernel, status);
    if (status != VISA_SUCCESS)
    {
        return status;
    }

    // Post-RA cleanups run only on success.
    if (auto sp = kernel.getVarSplitPass())
    {
        sp->replaceIntrinsics();
    }
    if (builder.getOption(vISA_VerifyRA))
    {
        // Recompute liveness on the colored program and cross-check
        // that no two live variables share a physical register.
        LivenessAnalysis liveAnalysis(gra,
            G4_GRF | G4_INPUT, true);
        liveAnalysis.computeLiveness();
        gra.verifyRA(liveAnalysis);
    }

    // printf("EU Fusion WA insts for func: %s\n", kernel.getName());
    // Force M16 mask offset on instructions flagged for the EU fusion WA.
    for (auto inst : gra.getEUFusionWAInsts())
    {
        kernel.setMaskOffset(inst, InstOpt_M16);
        // inst->dump();
    }

    return status;
}
3647