1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "BitSet.h"
10 #include "BuildIR.h"
11 #include "CFGStructurizer.h"
12 #include "DebugInfo.h"
13 #include "FlowGraph.h"
14 #include "G4_Kernel.hpp"
15 #include "Option.h"
16 #include "PhyRegUsage.h"
17 #include "visa_wa.h"
18
19 #include "BinaryEncodingIGA.h"
20 #include "iga/IGALibrary/api/iga.h"
21 #include "iga/IGALibrary/api/iga.hpp"
22
23 #include <algorithm>
24 #include <chrono>
25 #include <cstdlib>
26 #include <fstream>
27 #include <functional>
28 #include <iostream>
29 #include <iterator>
30 #include <random>
31 #include <set>
32 #include <sstream>
33 #include <string>
34 #include <unordered_map>
35 #include <unordered_set>
36
37 #if defined(WIN32)
38 #define CDECLATTRIBUTE __cdecl
39 #elif __GNUC__
40 #ifdef __x86_64__
41 #define CDECLATTRIBUTE
42 #else
43 #define CDECLATTRIBUTE __attribute__((__cdecl__))
44 #endif
45 #endif
46
47 using namespace vISA;
48
49
insert(uint16_t newLB,uint16_t newRB)50 void GlobalOpndHashTable::HashNode::insert(uint16_t newLB, uint16_t newRB)
51 {
52 // check if the newLB/RB either subsumes or can be subsumed by an existing bound
53 // ToDo: consider merging bound as well
54 for (uint32_t& bound : bounds)
55 {
56 uint16_t nodeLB = getLB(bound);
57 uint16_t nodeRB = getRB(bound);
58 if (newLB >= nodeLB && newRB <= nodeRB)
59 {
60 return;
61 }
62 else if (newLB <= nodeLB && newRB >= nodeRB)
63 {
64 bound = packBound(newLB, newRB);
65 return;
66 }
67 }
68 bounds.push_back(packBound(newLB, newRB));
69 }
70
isInNode(uint16_t lb,uint16_t rb) const71 bool GlobalOpndHashTable::HashNode::isInNode(uint16_t lb, uint16_t rb) const
72 {
73 for (uint32_t bound : bounds)
74 {
75 uint16_t nodeLB = getLB(bound);
76 uint16_t nodeRB = getRB(bound);
77 if (lb <= nodeLB && rb >= nodeLB)
78 {
79 return true;
80 }
81 else if (lb > nodeLB && lb <= nodeRB)
82 {
83 return true;
84 }
85 }
86 return false;
87 }
88
clearHashTable()89 void GlobalOpndHashTable::clearHashTable()
90 {
91 for (auto& iter : globalVars)
92 {
93 iter.second->~HashNode();
94 }
95 globalVars.clear();
96 globalOpnds.clear();
97 }
98
99 // all of the operand in this table are srcRegion
addGlobalOpnd(G4_Operand * opnd)100 void GlobalOpndHashTable::addGlobalOpnd(G4_Operand *opnd)
101 {
102 G4_Declare *topDcl = opnd->getTopDcl();
103
104 if (topDcl)
105 {
106 // global operands must have a declare
107 auto entry = globalVars.find(topDcl);
108 if (entry != globalVars.end())
109 {
110 entry->second->insert(
111 (uint16_t)opnd->getLeftBound(),
112 (uint16_t)opnd->getRightBound());
113 }
114 else
115 {
116 HashNode* node = new (mem)HashNode(
117 (uint16_t)opnd->getLeftBound(),
118 (uint16_t)opnd->getRightBound(),
119 private_arena_allocator);
120 globalVars[topDcl] = node;
121 }
122 globalOpnds.push_back(opnd);
123 }
124 }
125
126 // if def overlaps with any operand in this table, it is treated as global
isOpndGlobal(G4_Operand * opnd) const127 bool GlobalOpndHashTable::isOpndGlobal(G4_Operand *opnd) const
128 {
129 G4_Declare * dcl = opnd->getTopDcl();
130 if (dcl == NULL)
131 {
132 return false;
133 }
134 else if (dcl->getAddressed())
135 {
136 // Conservatively assume that all address taken
137 // virtual registers are global
138 return true;
139 }
140 else
141 {
142 auto entry = globalVars.find(dcl);
143 if (entry == globalVars.end())
144 {
145 return false;
146 }
147 HashNode* node = entry->second;
148 return node->isInNode(
149 (uint16_t)opnd->getLeftBound(),
150 (uint16_t)opnd->getRightBound());
151 }
152 }
153
dump(std::ostream & os) const154 void GlobalOpndHashTable::dump(std::ostream &os) const
155 {
156 os << "Global variables:\n";
157 for (auto&& entry : globalVars)
158 {
159 G4_Declare* dcl = entry.first;
160 dcl->dump();
161 if ((dcl->getRegFile() & G4_FLAG) == 0)
162 {
163 std::vector<bool> globalElt;
164 globalElt.resize(dcl->getByteSize(), false);
165 auto ranges = entry.second;
166 for (auto bound : ranges->bounds)
167 {
168 uint16_t lb = getLB(bound);
169 uint16_t rb = getRB(bound);
170 for (int i = lb; i <= rb; ++i)
171 {
172 globalElt[i] = true;
173 }
174 }
175 bool inRange = false;
176 for (int i = 0, size = (int)globalElt.size(); i < size; ++i)
177 {
178 if (globalElt[i] && !inRange)
179 {
180 // start of new range
181 os << "[" << i << ",";
182 inRange = true;
183 }
184 else if (!globalElt[i] && inRange)
185 {
186 // end of range
187 os << i - 1 << "], ";
188 inRange = false;
189 }
190 }
191 if (inRange)
192 {
193 // close last range
194 os << globalElt.size() - 1 << "]";
195 }
196 }
197 os << "\n";
198 }
199 os << "Instructions with global operands:\n";
200 for (auto opnd : globalOpnds)
201 {
202 if (opnd->getInst())
203 {
204 opnd->getInst()->print(os);
205 }
206 }
207 }
208
setPhysicalLink(G4_BB * pred,G4_BB * succ)209 void FlowGraph::setPhysicalLink(G4_BB* pred, G4_BB* succ)
210 {
211 if (pred)
212 {
213 pred->setPhysicalSucc(succ);
214 }
215 if (succ)
216 {
217 succ->setPhysicalPred(pred);
218 }
219 }
220
insert(BB_LIST_ITER iter,G4_BB * bb)221 BB_LIST_ITER FlowGraph::insert(BB_LIST_ITER iter, G4_BB* bb)
222 {
223 G4_BB* prev = iter != BBs.begin() ? *std::prev(iter) : nullptr;
224 G4_BB* next = iter != BBs.end() ? *iter : nullptr;
225 setPhysicalLink(prev, bb);
226 setPhysicalLink(bb, next);
227 markStale();
228 return BBs.insert(iter, bb);
229 }
230
erase(BB_LIST_ITER iter)231 void FlowGraph::erase(BB_LIST_ITER iter)
232 {
233 G4_BB* prev = (iter != BBs.begin()) ? *std::prev(iter) : nullptr;
234 G4_BB* next = (std::next(iter) != BBs.end()) ? *std::next(iter) : nullptr;
235 setPhysicalLink(prev, next);
236 BBs.erase(iter);
237 markStale();
238 }
239
addPrologBB(G4_BB * BB)240 void FlowGraph::addPrologBB(G4_BB* BB)
241 {
242 G4_BB* oldEntry = getEntryBB();
243 insert(BBs.begin(), BB);
244 addPredSuccEdges(BB, oldEntry);
245 }
246
append(const FlowGraph & otherFG)247 void FlowGraph::append(const FlowGraph& otherFG)
248 {
249 markStale();
250
251 for (auto I = otherFG.cbegin(), E = otherFG.cend(); I != E; ++I)
252 {
253 auto bb = *I;
254 BBs.push_back(bb);
255 incrementNumBBs();
256 }
257 }
258
259 //
260 // return label's corresponding BB
261 // if label's BB is not yet created, then create one and add map label to BB
262 //
getLabelBB(Label_BB_Map & map,G4_Label * label)263 G4_BB* FlowGraph::getLabelBB(Label_BB_Map& map, G4_Label* label)
264 {
265 if (map.find(label) != map.end())
266 {
267 return map[label];
268 }
269 else
270 {
271 G4_BB* bb = createNewBB();
272 map[label] = bb;
273 return bb;
274 }
275 }
276
277 //
278 // first is the first inst of a BB
279 //
beginBB(Label_BB_Map & map,G4_INST * first)280 G4_BB* FlowGraph::beginBB(Label_BB_Map& map, G4_INST* first)
281 {
282 if (first == NULL) return NULL;
283 G4_INST* labelInst;
284 bool newLabelInst = false;
285 if (first->isLabel())
286 {
287 labelInst = first;
288 }
289 else
290 {
291 // no label for this BB, create one!
292 std::string prefix = builder->kernel.getName();
293 std::string name = prefix + "_auto_" + std::to_string(autoLabelId++);
294 G4_Label* label = builder->createLabel(name, LABEL_BLOCK);
295 labelInst = createNewLabelInst(label);
296 newLabelInst = true;
297 }
298 G4_BB* bb = getLabelBB(map, labelInst->getLabel());
299 push_back(bb); // append to BBs list
300 if (newLabelInst)
301 {
302 bb->push_front(labelInst);
303 }
304 return bb;
305 }
306
createNewLabelInst(G4_Label * label,int lineNo,int CISAOff)307 G4_INST* FlowGraph::createNewLabelInst(G4_Label* label, int lineNo, int CISAOff)
308 {
309 //srcFileName is NULL
310 // TODO: remove this (use createLabelInst)
311 return builder->createInternalInst(NULL, G4_label,
312 NULL, g4::NOSAT, g4::SIMD1, NULL, label, NULL, 0);
313 }
314
removePredSuccEdges(G4_BB * pred,G4_BB * succ)315 void FlowGraph::removePredSuccEdges(G4_BB* pred, G4_BB* succ)
316 {
317 MUST_BE_TRUE(pred != NULL && succ != NULL, ERROR_INTERNAL_ARGUMENT);
318
319 BB_LIST_ITER lt = pred->Succs.begin();
320 for (; lt != pred->Succs.end(); ++lt) {
321 if ((*lt) == succ) {
322 pred->Succs.erase(lt);
323 break;
324 }
325 }
326
327 lt = succ->Preds.begin();
328 for (; lt != succ->Preds.end(); ++lt) {
329 if ((*lt) == pred) {
330 succ->Preds.erase(lt);
331 break;
332 }
333 }
334
335 markStale();
336 }
337
createNewBB(bool insertInFG)338 G4_BB* FlowGraph::createNewBB(bool insertInFG)
339 {
340 G4_BB* bb = new (mem)G4_BB(instListAlloc, numBBId, this);
341
342 // Increment counter only when new BB is inserted in FlowGraph
343 if (insertInFG)
344 numBBId++;
345
346 BBAllocList.push_back(bb);
347 return bb;
348 }
349
createNewBBWithLabel(const char * LabelPrefix,int Lineno,int CISAoff)350 G4_BB* FlowGraph::createNewBBWithLabel(const char* LabelPrefix, int Lineno, int CISAoff)
351 {
352 G4_BB* newBB = createNewBB(true);
353 std::string name = LabelPrefix + std::to_string(newBB->getId());
354 G4_Label* lbl = builder->createLabel(name, LABEL_BLOCK);
355 G4_INST* inst = createNewLabelInst(lbl, Lineno, CISAoff);
356 newBB->push_back(inst);
357 return newBB;
358 }
359
360 static int globalCount = 1;
361
insertDummyUUIDMov()362 int64_t FlowGraph::insertDummyUUIDMov()
363 {
364 // Here when -addKernelId is passed
365 if (builder->getIsKernel())
366 {
367 // Add mov inst as first instruction in kernel. This
368 // has to be *first* instruction in the kernel so debugger
369 // can detect the pattern.
370 //
371 // 32-bit random number is generated and set as src0 operand.
372 // Earlier nop was being generated with 64-bit UUID overloading
373 // MBZ bits and this turned out to be a problem for the
374 // debugger (random clobbering of r0).
375
376 for (auto bb : BBs)
377 {
378 uint32_t seed = (uint32_t)std::chrono::high_resolution_clock::now().time_since_epoch().count();
379 std::mt19937 mt_rand(seed * globalCount);
380 globalCount++;
381
382 G4_DstRegRegion* nullDst = builder->createNullDst(Type_UD);
383 int64_t uuID = (int64_t)mt_rand();
384 G4_Operand* randImm = (G4_Operand*)builder->createImm(uuID, Type_UD);
385 G4_INST* movInst = builder->createMov(g4::SIMD1, nullDst, randImm, InstOpt_NoOpt, false);
386
387 auto instItEnd = bb->end();
388 for (auto it = bb->begin();
389 it != instItEnd;
390 it++)
391 {
392 if ((*it)->isLabel())
393 {
394 bb->insertBefore(++it, movInst);
395 return uuID;
396 }
397
398 bb->push_front(movInst);
399 return uuID;
400 }
401 }
402 }
403 return 0;
404 }
405
406 //
407 // (1) check if-else-endif and iff-endif pairs
408 // (2) add label for those omitted ones
409 //
matchBranch(int & sn,INST_LIST & instlist,INST_LIST_ITER & it)410 bool FlowGraph::matchBranch(int &sn, INST_LIST& instlist, INST_LIST_ITER &it)
411 {
412 G4_INST* inst = *it;
413 //
414 // process if-endif or if-else-endif
415 //
416 if (inst->opcode() == G4_if)
417 {
418 // check label / add label
419 G4_Label *if_label = NULL;
420 G4_Label *else_label = NULL;
421 G4_Label *endif_label = NULL; // the label immediately before the endif
422 G4_INST* ifInst = inst;
423 assert(inst->asCFInst()->getJip() == nullptr && "IF should not have a label at this point");
424
425 // create if_label
426 std::string createdLabel = "_AUTO_GENERATED_IF_LABEL_" + std::to_string(sn);
427 sn++;
428 if_label = builder->createLabel(createdLabel, LABEL_BLOCK);
429 inst->asCFInst()->setJip(if_label);
430
431 // look for else/endif
432 int elseCount = 0;
433 it++;
434 while (it != instlist.end())
435 {
436 inst = *it;
437 // meet iff or if
438 if (inst->opcode() == G4_if)
439 {
440 if (!matchBranch(sn, instlist, it))
441 return false;
442 }
443 else if (inst->opcode() == G4_else)
444 {
445 if (elseCount != 0)
446 {
447 MUST_BE_TRUE(false, "ERROR: Mismatched if-else: more than one else following if!");
448 return false;
449 }
450 elseCount++;
451 INST_LIST_ITER it1 = it;
452 it1++;
453
454 // add endif label to "else"
455 std::string createdLabel = "__AUTO_GENERATED_ELSE_LABEL__" + std::to_string(sn);
456 sn++;
457 else_label = builder->createLabel(createdLabel, LABEL_BLOCK);
458 inst->asCFInst()->setJip(else_label);
459
460 // insert if-else label
461 G4_INST* label = createNewLabelInst(if_label, inst->getLineNo(), inst->getCISAOff());
462 instlist.insert(it1, label);
463
464 // Uip must be the same as Jip for else instructions.
465 inst->asCFInst()->setUip(else_label);
466 }
467 else if (inst->opcode() == G4_endif)
468 {
469 if (elseCount == 0) // if-endif case
470 {
471 // insert endif label
472 G4_INST* label = createNewLabelInst(if_label, inst->getLineNo(), inst->getCISAOff());
473 instlist.insert(it, label);
474 endif_label = if_label;
475 }
476 else // if-else-endif case
477 {
478 // insert endif label
479 G4_INST* label = createNewLabelInst(else_label, inst->getLineNo(), inst->getCISAOff());
480 instlist.insert(it, label);
481 endif_label = else_label;
482 }
483
484 //we must also set the UIP of if to point to its corresponding endif
485 ifInst->asCFInst()->setUip(endif_label);
486 return true;
487 } // if opcode
488 if (it == instlist.end())
489 {
490 MUST_BE_TRUE(false, "ERROR: Can not find endif for if!");
491 return false;
492 }
493 it++;
494 } // while
495 }
496 //
497 // process iff-endif
498 //
499 else
500 MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
501
502 return false;
503 }
504
505 //
506 // HW Rules about the jip and uip for the control flow instructions
507 // if:
508 // <branch_ctrl> set
509 // jip = first join in the if block
510 // uip = endif
511 // <branch_ctrl> not set
512 // jip = instruction right after the else, or the endif if else doesn't exist
513 // uip = endif
514 // else:
515 // <branch_ctrl> set
516 // jip = first join in the else block
517 // uip = endif
518 // <branch_ctrl> not set
519 // jip = uip = endif
520 // endif:
521 // jip = uip = next enclosing endif/while
522 // while:
523 // jip = loop head label
524 // uip = don't care
525 // break:
526 // jip = end of innermost CF block (else/endif/while)
527 // uip = while
528 // cont:
529 // jip = end of innermost CF block (else/endif/while)
530 // uip = while
531 //
532
533 //
534 // Preprocess the instruction and kernel list, including:
535 // 1. Check if the labels and kernel names are unique
536 // 2. Check if all the labels used by jmp, CALL, cont, break, goto is defined, determine if goto is forward or backward
537 // 3. Process the non-label "if-else-endif" cases
538 //
preprocess(INST_LIST & instlist)539 void FlowGraph::preprocess(INST_LIST& instlist)
540 {
541
542 std::unordered_set<G4_Label*> labels; // label inst we have seen so far
543
544 // Mark goto/jmpi/call as either forward or backward
545 for (G4_INST* i : instlist)
546 {
547 if (i->isLabel())
548 {
549 labels.emplace(i->getLabel());
550 }
551 else if (i->opcode() == G4_goto)
552 {
553 if (labels.count(i->asCFInst()->getUip()->asLabel()))
554 {
555 i->asCFInst()->setBackward(true);
556 }
557 }
558 else if ((i->opcode() == G4_jmpi || i->isCall()) && i->getSrc(0) && i->getSrc(0)->isLabel())
559 {
560 if (labels.count(i->getSrc(0)->asLabel()))
561 {
562 i->asCFInst()->setBackward(true);
563 }
564 }
565 }
566
567 #ifdef _DEBUG
568 // sanity check to make sure we don't have undefined labels
569 for (G4_INST* i : instlist)
570 {
571 if (i->opcode() == G4_goto)
572 {
573 G4_Label* target = i->asCFInst()->getUip()->asLabel();
574 assert(labels.count(target) && "undefined goto label");
575 }
576 else if ((i->opcode() == G4_jmpi || i->isCall()) && i->getSrc(0) && i->getSrc(0)->isLabel())
577 {
578 assert(labels.count((G4_Label *)i->getSrc(0)) && "undefined jmpi/call label");
579 }
580 }
581 #endif
582
583 //
584 // Process the non-label "if-else-endif" cases as following.
585 // ToDo: remove this once we stop generating if-else-endif for the IEEE macros
586 //
587 {
588 int sn = 0;
589 for (INST_LIST_ITER it = instlist.begin(), instlistEnd = instlist.end(); it != instlistEnd; ++it)
590 {
591 G4_INST *inst = *it;
592 if (inst->opcode() == G4_if)
593 {
594 if (!matchBranch(sn, instlist, it))
595 {
596 MUST_BE_TRUE2(false, "ERROR: mismatched if-branch", inst);
597 break;
598 }
599 } // fi
600 } // for
601 }
602 }
603
normalizeFlowGraph()604 void FlowGraph::normalizeFlowGraph()
605 {
606 // For CISA 3 flowgraph there will be pseudo_fcall instructions to invoke stack functions.
607 // Save/restore code will be added around this at the time of register allocation.
608 // If fcall's successor has more than 1 predecessor then it will create problems inserting
609 // code. This function handles such patterns by creating new basic block and guaranteeing
610 // that fcall's successor has a single predecessor.
611 for (BB_LIST_ITER it = BBs.begin(); it != BBs.end(); it++)
612 {
613 G4_BB* bb = *it;
614 if (bb->isEndWithFCall())
615 {
616 G4_BB* retBB = bb->Succs.front();
617 G4_INST *lInst = retBB->front();
618 if (retBB->Preds.size() > 1)
619 {
620
621 // To insert restore code we need to guarantee that save code has
622 // been executed. If retBB has multiple preds, this may not be
623 // guaranteed, so insert a new node.
624 G4_BB* newNode = createNewBB();
625
626 // Remove old edge
627 removePredSuccEdges(bb, retBB);
628
629 // Add new edges
630 addPredSuccEdges(bb, newNode);
631 addPredSuccEdges(newNode, retBB);
632
633 // Create and insert label inst
634 std::string name = "L_AUTO_" + std::to_string(newNode->getId());
635 G4_Label* lbl = builder->createLabel(name, LABEL_BLOCK);
636 G4_INST* inst = createNewLabelInst(lbl, lInst->getLineNo(), lInst->getCISAOff());
637 newNode->push_back(inst);
638 insert(++it, newNode);
639 it--;
640
641 retBB = newNode;
642 }
643 }
644 }
645 }
646
647 //
648 // build a control flow graph from the inst list
649 // we want to keep the original BB order (same as shown in assembly code) for linear scan
650 // register allocation. We use beginBB to indicate that we encounter the beginning
651 // a BB and add the BB to the BB list and use getLabelBB to create a block for a label but
652 // not to add to the BB list.For instance, when we come across a "jmp target", getLabelBB
653 // is called to create a BB for target but the target BB is added into the list later (
654 // assume forward jmp) as the target label is visited.
655 //
656 //
constructFlowGraph(INST_LIST & instlist)657 void FlowGraph::constructFlowGraph(INST_LIST& instlist)
658 {
659 MUST_BE_TRUE(!instlist.empty(), ERROR_SYNTAX("empty instruction list"));
660
661 if (builder->hasScratchSurface() && (isStackCallFunc || hasStackCalls))
662 {
663 // unfortunately we can't put this in stack call prolog since this has to be done before RA
664 // ToDo: just hard-wire the scratch-surface offset register?
665 builder->initScratchSurfaceOffset();
666 }
667 if (builder->hasFusedEU() &&
668 getKernel()->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
669 {
670 getKernel()->getOptions()->setOptionInternally(vISA_EnableScalarJmp, false);
671 }
672
673 pKernel->renameAliasDeclares();
674 //
675 // The funcInfoHashTable maintains a map between the id of the function's INIT block
676 // to its FuncInfo definition.
677 //
678 FuncInfoHashTable funcInfoHashTable;
679
680 preprocess(instlist);
681
682 //
683 // map label to its corresponding BB
684 //
685 std::unordered_map<G4_Label*, G4_BB*> labelMap;
686 //
687 // create the entry block of the flow graph
688 //
689 G4_BB* curr_BB = beginBB(labelMap, instlist.front());
690
691 kernelInfo = new (mem)FuncInfo(UINT_MAX, curr_BB, NULL);
692
693 std::vector<G4_BB*> subroutineStartBB; // needed by handleExit()
694
695 bool hasSIMDCF = false, hasNoUniformGoto = false;
696 G4_Label* currSubroutine = nullptr;
697
698 auto addToSubroutine = [this](G4_BB* bb, G4_Label* currSubroutine)
699 {
700 if (currSubroutine)
701 {
702 subroutines[currSubroutine].push_back(bb);
703 }
704 };
705
706 while (!instlist.empty())
707 {
708 INST_LIST_ITER iter = instlist.begin();
709 G4_INST* i = *iter;
710
711 MUST_BE_TRUE(curr_BB != NULL, "Current BB must not be empty");
712 //
713 // inst i belongs to the current BB
714 // remove inst i from instlist and relink it to curr_BB's instList
715 //
716 curr_BB->splice(curr_BB->end(), instlist, iter);
717 G4_INST* next_i = (instlist.empty()) ? nullptr : instlist.front();
718
719 if (i->isSend())
720 {
721 curr_BB->setSendInBB(true);
722 }
723
724 if (i->isLabel() && i->getLabel()->isFuncLabel())
725 {
726 std::vector<G4_BB*> bbvec;
727 subroutines[i->getLabel()] = bbvec;
728 currSubroutine = i->getLabel();
729 subroutineStartBB.push_back(curr_BB);
730 }
731
732 //
733 // do and endif do not have predicate and jump-to label,so we treated them as non-control instruction
734 // the labels around them will decides the beginning of a new BB
735 //
736 if (i->isFlowControl() && i->opcode() != G4_endif)
737 {
738 addToSubroutine(curr_BB, currSubroutine);
739 G4_BB* next_BB = beginBB(labelMap, next_i);
740
741 // next_BB may be null if the kernel ends on an CF inst (e.g., backward goto/jmpi)
742 // This should be ok because we should not fall-through to next_BB in such case
743 // (i.e., goto/jmpi must not be predicated)
744 {
745 if (i->opcode() == G4_jmpi || i->isCall())
746 {
747 //
748 // add control flow edges <curr_BB,target>
749 //
750 if (i->getSrc(0)->isLabel())
751 {
752 addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->getSrc(0)->asLabel()));
753 }
754 else if (i->asCFInst()->isIndirectJmp())
755 {
756 // indirect jmp
757 // For each label in the switch table, we insert a jmpi
758 // to that label. We keep the jmpi in the same
759 // block as the indirect jmp instead of creaing a new block for
760 // each of them, as the offset actually determines which jmpi
761 // instruction we will hit.
762 // FIXME: We may want to delay the emission of the jmps to
763 // each individual labels, so that we still maintain the property
764 // that every basic block ends with a control flow instruction
765 const std::list<G4_Label*>& jmpTargets = i->asCFInst()->getIndirectJmpLabels();
766 for (auto it = jmpTargets.begin(), jmpTrgEnd = jmpTargets.end(); it != jmpTrgEnd; ++it)
767 {
768 G4_INST* jmpInst = builder->createJmp(nullptr, *it, InstOpt_NoOpt, true);
769 indirectJmpTarget.emplace(jmpInst);
770 INST_LIST_ITER jmpInstIter = builder->instList.end();
771 curr_BB->splice(curr_BB->end(), builder->instList, --jmpInstIter);
772 addPredSuccEdges(curr_BB, getLabelBB(labelMap, (*it)));
773 }
774 }
775
776 if (i->isCall())
777 {
778 //
779 // the <CALL,return addr> link is added temporarily to facilitate linking
780 // RETURN with return addresses. The link will be removed after it
781 // serves its purpose
782 // NOTE: No matter it has predicate, we add this link. We will check the predicate in handleReturn()
783 // and only remove the link when it is not a conditional call
784 //
785 addPredSuccEdges(curr_BB, next_BB);
786 if (i->getSrc(0)->isLabel())
787 {
788 i->getSrc(0)->asLabel()->setFuncLabel(true);
789 }
790 }
791 else if (i->getPredicate())
792 {
793 // add fall through edge
794 addUniquePredSuccEdges(curr_BB, next_BB);
795 }
796 } // if (i->opcode()
797 else if (i->opcode() == G4_if || i->opcode() == G4_while || i->opcode() == G4_else)
798 {
799 hasSIMDCF = true;
800 if (i->asCFInst()->getJip()->isLabel())
801 {
802 if (i->opcode() == G4_else || i->opcode() == G4_while)
803 {
804 // for G4_while, jump no matter predicate
805 addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getJip()));
806 }
807 else if ((i->getPredicate() != NULL) ||
808 ((i->getCondMod() != NULL) &&
809 (i->getSrc(0) != NULL) &&
810 (i->getSrc(1) != NULL)))
811 {
812 addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getJip()));
813 }
814 }
815 if (i->opcode() == G4_while)
816 {
817 // always do this since break jumps to while, otherwise code after while without predicate appears unreachable.
818 // add fall through edge if while is not the last instruction
819 if (next_BB)
820 {
821 addPredSuccEdges(curr_BB, next_BB);
822 }
823 }
824 else
825 {
826 // always fall through
827 addPredSuccEdges(curr_BB, next_BB); // add fall through edge
828 }
829 }
830 else if (i->opcode() == G4_break || i->opcode() == G4_cont || i->opcode() == G4_halt)
831 {
832 // JIP and UIP must have been computed at this point
833 MUST_BE_TRUE(i->asCFInst()->getJip() != NULL && i->asCFInst()->getUip() != NULL,
834 "null JIP or UIP for break/cont instruction");
835 addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getJip()));
836
837 if (strcmp(i->asCFInst()->getJipLabelStr(), i->asCFInst()->getUipLabelStr()) != 0)
838 addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getUip()));
839
840 //
841 // pred means conditional branch
842 //
843 if (i->getPredicate() != NULL)
844 {
845 // add fall through edge
846 addPredSuccEdges(curr_BB, next_BB);
847 }
848 }
849 else if (i->isReturn() || i->opcode() == G4_pseudo_exit)
850 {
851 // does nothing for unconditional return;
852 // later phase will link the return and the return address
853 if (i->getPredicate() != NULL)
854 {
855 // add fall through edge
856 addPredSuccEdges(curr_BB, next_BB);
857 }
858 }
859 else if (i->opcode() == G4_pseudo_fcall || i->opcode() == G4_pseudo_fc_call)
860 {
861 addPredSuccEdges(curr_BB, next_BB);
862 }
863 else if (i->opcode() == G4_pseudo_fret || i->opcode() == G4_pseudo_fc_ret)
864 {
865 if (i->getPredicate())
866 {
867 // need to add fall through edge
868 addPredSuccEdges(curr_BB, next_BB);
869 }
870 }
871 else if (i->opcode() == G4_goto)
872 {
873 hasNoUniformGoto = true;
874 hasSIMDCF = true;
875 addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getUip()));
876
877 if (i->getPredicate())
878 {
879 // fall through, but check if goto target is same as fall-thru
880 // FIXME: replace all addPredSuccEdges with addUniquePredSuccEdges?
881 addUniquePredSuccEdges(curr_BB, next_BB);
882 }
883 }
884 else
885 {
886 assert(false && "should not reach here");
887 }
888 } // need edge
889 curr_BB = next_BB;
890 } // flow control
891 else if (curr_BB->isLastInstEOT())
892 {
893 addToSubroutine(curr_BB, currSubroutine);
894 //this is a send instruction that marks end of thread.
895 //the basic block will end here with no outgoing links
896 curr_BB = beginBB(labelMap, next_i);
897 }
898 else if (next_i && next_i->isLabel())
899 {
900 addToSubroutine(curr_BB, currSubroutine);
901 G4_BB* next_BB = beginBB(labelMap, next_i);
902 addPredSuccEdges(curr_BB, next_BB);
903 curr_BB = next_BB;
904 }
905 }
906 if (curr_BB)
907 {
908 addToSubroutine(curr_BB, currSubroutine);
909 }
910
911 // we can do this only after fg is constructed
912 pKernel->calculateSimdSize();
913
914 // Jmpi instruction cannot be used when EU Fusion is enabled.
915 bool hasGoto = hasNoUniformGoto;
916 if (builder->noScalarJmp())
917 {
918 // No jmpi should be inserted after this point.
919 hasGoto |= convertJmpiToGoto();
920 }
921
922 pKernel->dumpToFile("after.CFGConstruction");
923
924 removeRedundantLabels();
925
926 pKernel->dumpToFile("after.RemoveRedundantLabels");
927
928 handleExit(subroutineStartBB.size() > 1 ? subroutineStartBB[1] : nullptr);
929
930 handleReturn(labelMap, funcInfoHashTable);
931 mergeReturn(funcInfoHashTable);
932 normalizeSubRoutineBB(funcInfoHashTable);
933
934 handleWait();
935
936 if (isStackCallFunc)
937 {
938 mergeFReturns();
939 }
940
941 setPhysicalPredSucc();
942 removeUnreachableBlocks(funcInfoHashTable);
943
944 pKernel->dumpToFile("after.RemoveUnreachableBlocks");
945
946 //
947 // build the table of function info nodes
948 //
949
950 for (FuncInfoHashTable::iterator it = funcInfoHashTable.begin(), end = funcInfoHashTable.end(); it != end; ++it) {
951 FuncInfo* funcInfo = (*it).second;
952 funcInfo->getInitBB()->setFuncInfo(funcInfo);
953 funcInfo->getExitBB()->setFuncInfo(funcInfo);
954 funcInfoTable.push_back(funcInfo);
955 }
956
957 if (hasGoto)
958 {
959 // Structurizer requires that the last BB has no goto (ie, the
960 // last BB is either a return or an exit).
961 if (builder->getOption(vISA_EnableStructurizer) &&
962 !endWithGotoInLastBB())
963 {
964 doCFGStructurize(this);
965 pKernel->dumpToFile("after.CFGStructurizer");
966
967 removeRedundantLabels();
968 pKernel->dumpToFile("after.PostStructurizerRedundantLabels");
969 }
970 else
971 {
972 processGoto(hasSIMDCF);
973 pKernel->dumpToFile("after.ProcessGoto");
974 }
975 }
976
977 // This finds back edges and populates blocks into function info objects.
978 // The latter will be used to mark SIMD blocks. Prior to RA, no transformation
979 // shall invalidate back edges (G4_BB -> G4_BB).
980 reassignBlockIDs();
981 findBackEdges();
982
983 if (hasSIMDCF && pKernel->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
984 {
985 processSCF(funcInfoHashTable);
986 addSIMDEdges();
987 }
988
989 // patch the last BB for the kernel
990 if (funcInfoTable.size() > 0)
991 {
992 kernelInfo->updateExitBB(subroutineStartBB[1]->getPhysicalPred());
993 topologicalSortCallGraph();
994 }
995 else
996 {
997 kernelInfo->updateExitBB(BBs.back());
998 }
999
1000 // For non-kernel function, always invoke markDivergentBBs to
1001 // conservatively assume divergence.
1002 if ((hasSIMDCF || hasGoto || !builder->getIsKernel()) &&
1003 builder->getOption(vISA_divergentBB))
1004 {
1005 markDivergentBBs();
1006 }
1007
1008 builder->materializeGlobalImm(getEntryBB());
1009 normalizeRegionDescriptors();
1010 localDataFlowAnalysis();
1011
1012 markStale();
1013 }
1014
normalizeRegionDescriptors()1015 void FlowGraph::normalizeRegionDescriptors()
1016 {
1017 for (auto bb : BBs)
1018 {
1019 for (auto inst : *bb)
1020 {
1021 uint16_t execSize = inst->getExecSize();
1022 for (unsigned i = 0, e = (unsigned)inst->getNumSrc(); i < e; ++i)
1023 {
1024 G4_Operand *src = inst->getSrc(i);
1025 if (!src || !src->isSrcRegRegion())
1026 continue;
1027
1028 G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
1029 const RegionDesc *desc = srcRegion->getRegion();
1030 auto normDesc = builder->getNormalizedRegion(execSize, desc);
1031 if (normDesc && normDesc != desc)
1032 {
1033 srcRegion->setRegion(normDesc, /*invariant*/ true);
1034 }
1035 }
1036 }
1037 }
1038 }
1039
1040 // materialize the values in global Imm at entry BB
materializeGlobalImm(G4_BB * entryBB)1041 void IR_Builder::materializeGlobalImm(G4_BB* entryBB)
1042 {
1043 InstSplitPass instSplitter(this);
1044 for (int i = 0, numImm = immPool.size(); i < numImm; ++i)
1045 {
1046 auto&& immVal = immPool.getImmVal(i);
1047 auto dcl = immPool.getImmDcl(i);
1048 G4_INST* inst = createMov(
1049 G4_ExecSize((unsigned)immVal.numElt),
1050 createDstRegRegion(dcl, 1), immVal.imm, InstOpt_WriteEnable, false);
1051 auto iter = std::find_if(entryBB->begin(), entryBB->end(),
1052 [](G4_INST* inst) { return !inst->isLabel(); });
1053 INST_LIST_ITER newMov = entryBB->insertBefore(iter, inst);
1054 instSplitter.splitInstruction(newMov, entryBB->getInstList());
1055 }
1056 }
1057
1058 //
1059 // attempt to sink the pseudo-wait into the immediate succeeding send instruction (in same BB)
1060 // by changing it into a sendc.
1061 // If sinking fails, generate a fence with sendc.
1062 //
handleWait()1063 void FlowGraph::handleWait()
1064 {
1065 for (auto bb : BBs)
1066 {
1067 auto iter = bb->begin(), instEnd = bb->end();
1068 while (iter != instEnd)
1069 {
1070 auto inst = *iter;
1071 if (inst->isIntrinsic() &&
1072 inst->asIntrinsicInst()->getIntrinsicId() == Intrinsic::Wait)
1073 {
1074 bool sunk = false;
1075 auto nextIter = iter;
1076 ++nextIter;
1077 while (nextIter != instEnd)
1078 {
1079 G4_INST* nextInst = *nextIter;
1080 if (nextInst->isSend())
1081 {
1082 nextInst->asSendInst()->setSendc();
1083 sunk = true;
1084 break;
1085 }
1086 else if (nextInst->isOptBarrier())
1087 {
1088 break;
1089 }
1090 ++nextIter;
1091 }
1092 if (!sunk)
1093 {
1094 auto fenceInst = builder->createSLMFence();
1095 auto sendInst = fenceInst->asSendInst();
1096 if (sendInst != NULL)
1097 {
1098 sendInst->setSendc();
1099 bb->insertBefore(iter, fenceInst);
1100 }
1101 }
1102 iter = bb->erase(iter);
1103 }
1104 else
1105 {
1106 ++iter;
1107 }
1108 }
1109 }
1110 }
1111
1112 //
1113 // Each g4_pseudo_exit instruction will be translated into one of the following:
1114 // -- a unconditional simd1 ret: translated into a EOT send (may be optionally merged with
1115 // an immediately preceding send)
1116 // -- a conditional simd1 ret: translated into a jmpi to the exit block
1117 // -- a non-uniforom ret: translated into a halt to the exit block
1118 // for case 2 and 3 an exit block will be inserted, and it will consist of a EOT send
1119 // plus a join if it's targeted by another goto instruction
1120 //
handleExit(G4_BB * firstSubroutineBB)1121 void FlowGraph::handleExit(G4_BB* firstSubroutineBB)
1122 {
1123
1124 // blocks that end with non-uniform or conditional return
1125 std::vector<G4_BB*> exitBlocks;
1126 BB_LIST_ITER iter = BBs.begin(), iterEnd = BBs.end();
1127 for (; iter != iterEnd; ++iter)
1128 {
1129 G4_BB* bb = *iter;
1130 if (bb->size() == 0)
1131 {
1132 continue;
1133 }
1134
1135 if (bb == firstSubroutineBB)
1136 {
1137 // we've reached the first subroutine's entry BB,
1138 break;
1139 }
1140
1141 G4_INST* lastInst = bb->back();
1142 if (lastInst->opcode() == G4_pseudo_exit)
1143 {
1144 if (lastInst->getExecSize() == g4::SIMD1 &&
1145 !(builder->getFCPatchInfo() && builder->getFCPatchInfo()->getFCComposableKernel()))
1146 {
1147 //uniform return
1148 if (lastInst->getPredicate() != NULL)
1149 {
1150 exitBlocks.push_back(bb);
1151 }
1152 else
1153 {
1154 //generate EOT send
1155 G4_INST* lastInst = bb->back();
1156 bb->pop_back();
1157 bool needsEOTSend = true;
1158 // the EOT may be folded into the BB's last instruction if it's a send
1159 // that supports EOT
1160 if (builder->getOption(vISA_foldEOTtoPrevSend) && bb->size() > 1)
1161 {
1162 G4_InstSend* secondToLastInst = bb->back()->asSendInst();
1163 if (secondToLastInst && secondToLastInst->canBeEOT() &&
1164 !(secondToLastInst->getMsgDesc()->getSrc1LenRegs() > 2 &&
1165 VISA_WA_CHECK(builder->getPWaTable(), WaSendsSrc1SizeLimitWhenEOT)))
1166 {
1167 G4_SendDescRaw *rawDesc = secondToLastInst->getMsgDescRaw();
1168 MUST_BE_TRUE(rawDesc, "expected raw descriptor");
1169 rawDesc->setEOT();
1170 if (secondToLastInst->isSplitSend())
1171 {
1172 if (secondToLastInst->getSrc(3)->isImm())
1173 {
1174 secondToLastInst->setSrc(
1175 builder->createImm(rawDesc->getExtendedDesc(), Type_UD), 3);
1176 }
1177 }
1178 needsEOTSend = false;
1179 if (builder->getHasNullReturnSampler() && VISA_WA_CHECK(builder->getPWaTable(), Wa_1607871015))
1180 {
1181 bb->addSamplerFlushBeforeEOT();
1182 }
1183 }
1184 }
1185
1186 if (needsEOTSend)
1187 {
1188 bb->addEOTSend(lastInst);
1189 }
1190 }
1191 }
1192 else
1193 {
1194 exitBlocks.push_back(bb);
1195 }
1196 }
1197 }
1198
1199 // create an exit BB
1200 if (exitBlocks.size() > 0)
1201 {
1202 G4_BB *exitBB = createNewBB();
1203
1204 if (builder->getFCPatchInfo() &&
1205 builder->getFCPatchInfo()->getFCComposableKernel())
1206 {
1207 // For FC composable kernels insert exitBB as
1208 // last BB in BBs list. This automatically does
1209 // jump threading.
1210 push_back(exitBB);
1211 }
1212 else
1213 {
1214 insert(iter, exitBB);
1215 }
1216
1217 std::string exitBBStr("__EXIT_BB");
1218 G4_Label *exitLabel = builder->createLabel(exitBBStr, LABEL_BLOCK);
1219 G4_INST* label = createNewLabelInst(exitLabel);
1220 exitBB->push_back(label);
1221
1222 if (!(builder->getFCPatchInfo() &&
1223 builder->getFCPatchInfo()->getFCComposableKernel()))
1224 {
1225 // Dont insert EOT send for FC composable kernels
1226 exitBB->addEOTSend();
1227 }
1228
1229 for (int i = 0, size = (int)exitBlocks.size(); i < size; i++)
1230 {
1231 G4_BB* retBB = exitBlocks[i];
1232 addPredSuccEdges(retBB, exitBB, false);
1233 G4_INST* retInst = retBB->back();
1234 retBB->pop_back();
1235
1236 // Insert jump only if newly inserted exitBB is not lexical
1237 // successor of current BB
1238 auto lastBBIt = BBs.end();
1239 lastBBIt--;
1240 if ((*lastBBIt) == exitBB)
1241 {
1242 lastBBIt--;
1243 if ((*lastBBIt) == retBB)
1244 {
1245 // This condition is BB layout dependent.
1246 // However, we dont change BB layout in JIT
1247 // and in case we do it in future, we
1248 // will need to insert correct jumps
1249 // there to preserve correctness.
1250 continue;
1251 }
1252 }
1253
1254 if (retInst->getExecSize() == g4::SIMD1)
1255 {
1256 G4_INST* jmpInst = builder->createJmp(retInst->getPredicate(), exitLabel, InstOpt_NoOpt, false);
1257 retBB->push_back(jmpInst);
1258 }
1259 else
1260 {
1261 // uip for goto will be fixed later
1262 G4_INST* gotoInst = builder->createInternalCFInst(retInst->getPredicate(),
1263 G4_goto, retInst->getExecSize(), exitLabel, exitLabel, InstOpt_NoOpt);
1264 retBB->push_back(gotoInst);
1265 }
1266 }
1267 }
1268
1269 #ifdef _DEBUG
1270
1271 // sanity check
1272 for (const G4_BB* bb : BBs)
1273 {
1274 if (bb->size() == 0)
1275 {
1276 continue;
1277 }
1278 const G4_INST* lastInst = bb->back();
1279 if (lastInst->opcode() == G4_pseudo_exit)
1280 {
1281 MUST_BE_TRUE(false, "All pseudo exits should be removed at this point");
1282 }
1283 }
1284
1285 #endif
1286 }
1287
1288 //
1289 // This phase iterates each BB and checks if the last inst of a BB is a "call foo". If yes,
1290 // the algorithm traverses from the block of foo to search for RETURN and link the block of
1291 // RETURN with the block of the return address
1292 //
handleReturn(Label_BB_Map & labelMap,FuncInfoHashTable & funcInfoHashTable)1293 void FlowGraph::handleReturn(Label_BB_Map& labelMap, FuncInfoHashTable& funcInfoHashTable)
1294 {
1295 for (std::list<G4_BB*>::iterator it = BBs.begin(), itEnd = BBs.end(); it != itEnd; ++it)
1296 {
1297 G4_BB* bb = (*it);
1298
1299 if (bb->isEndWithCall())
1300 {
1301 bb->setBBType(G4_BB_CALL_TYPE);
1302 G4_INST* last = bb->back();
1303 if (last->getSrc(0)->isLabel())
1304 {
1305 // make sure bb has only two successors, one subroutine and one return addr
1306 MUST_BE_TRUE1(bb->Succs.size() == 2, last->getLineNo(),
1307 ERROR_FLOWGRAPH);
1308
1309 // find the subroutine BB and return Addr BB
1310 G4_BB* subBB = labelMap[last->getSrc(0)->asLabel()];
1311 //
1312 // the fall through BB must be the front
1313 //
1314 G4_BB* retAddr = bb->Succs.front();
1315 if (retAddr->Preds.size() > 1)
1316 {
1317 // Create an empty BB as a new RETURN BB as we want RETURN BB
1318 // to be reached only by CALL BB. Note that at this time, call
1319 // return edge has not been added yet.
1320 G4_INST* I0 = retAddr->getFirstInst();
1321 if (I0 == nullptr) I0 = last;
1322 G4_BB* newRetBB = createNewBBWithLabel("Label_return_BB", I0->getLineNo(), I0->getCISAOff());
1323 bb->removeSuccEdge(retAddr);
1324 retAddr->removePredEdge(bb);
1325 addPredSuccEdges(bb, newRetBB, true);
1326 addPredSuccEdges(newRetBB, retAddr, true);
1327
1328 insert(std::next(it), newRetBB);
1329 retAddr = newRetBB;
1330 }
1331 // Add EXIT->RETURN edge.
1332 linkReturnAddr(subBB, retAddr);
1333
1334 // set callee info for CALL
1335 FuncInfoHashTable::iterator calleeInfoLoc = funcInfoHashTable.find(subBB->getId());
1336
1337 if (calleeInfoLoc != funcInfoHashTable.end())
1338 {
1339 (*calleeInfoLoc).second->incrementCallCount();
1340 bb->setCalleeInfo((*calleeInfoLoc).second);
1341 doIPA = true;
1342 }
1343 else
1344 {
1345 unsigned funcId = (unsigned)funcInfoHashTable.size();
1346 FuncInfo *funcInfo = new (mem)FuncInfo(
1347 funcId, subBB, retAddr->Preds.front());
1348 std::pair<FuncInfoHashTable::iterator, bool> loc =
1349 funcInfoHashTable.insert(
1350 std::make_pair(subBB->getId(), funcInfo));
1351 subBB->setBBType(G4_BB_INIT_TYPE);
1352 retAddr->Preds.front()->setBBType(G4_BB_EXIT_TYPE);
1353 MUST_BE_TRUE(loc.second, ERROR_FLOWGRAPH);
1354 bb->setCalleeInfo((*(loc.first)).second);
1355 }
1356 retAddr->setBBType(G4_BB_RETURN_TYPE);
1357 }
1358 }
1359
1360 if (bb->isEndWithFCall())
1361 {
1362 // normalizeFlowGraph() process FCALL BB. (Maybe should be processed here?)
1363 // Here just set BB type
1364 bb->setBBType(G4_BB_FCALL_TYPE);
1365 }
1366 }
1367 //
1368 // remove <CALL, return addr> link when it is not a conditional call
1369 //
1370 for (G4_BB* bb : BBs)
1371 {
1372 if (bb->isEndWithCall())
1373 {
1374 G4_INST* last = bb->back();
1375 if (last->getPredicate() == NULL)
1376 {
1377 MUST_BE_TRUE(!bb->Succs.empty(), ERROR_FLOWGRAPH);
1378 G4_BB* retAddr = bb->Succs.front(); // return BB must be the front
1379 bb->removeSuccEdge(retAddr);
1380 retAddr->removePredEdge(bb);
1381 }
1382 }
1383 }
1384 }
1385
linkReturnAddr(G4_BB * entryBB,G4_BB * returnAddr)1386 void FlowGraph::linkReturnAddr(G4_BB* entryBB, G4_BB* returnAddr)
1387 {
1388
1389 assert(entryBB->size() > 0 && entryBB->front()->isLabel() && "BB should start with a label");
1390 auto label = entryBB->front()->getLabel();
1391 assert(subroutines.count(label) && "can't find subroutine label");
1392 auto subroutineBBs = subroutines[label];
1393
1394 for (auto bb : subroutineBBs)
1395 {
1396 if (!bb->empty() && bb->back()->isReturn())
1397 {
1398 //
1399 // check the direct recursive call here!
1400 //
1401 if (bb == returnAddr && hasStackCalls == false)
1402 {
1403 MUST_BE_TRUE(false,
1404 "ERROR: Do not support recursive subroutine call!");
1405 }
1406 addPredSuccEdges(bb, returnAddr, false); // IMPORTANT: add to the back to allow fall through edge is in the front, which is used by fallThroughBB()
1407 }
1408 }
1409 }
1410
1411
1412 //
1413 // This phase iterates each BB and checks if the last inst of a BB is a "call foo". If yes,
1414 // the algorith traverses from the block of foo to search for RETURN. If multiple returns exist,
1415 // the algorithm will merge returns into one
1416 // [Reason]:to handle the case when spill codes are needed between multiple return BBs of one subroutine
1417 // and the afterCAll BB. It is impossible to insert spill codes of different return BBs all before
1418 // afterCall BB.
1419 //
mergeReturn(FuncInfoHashTable & funcInfoHashTable)1420 void FlowGraph::mergeReturn(FuncInfoHashTable& funcInfoHashTable)
1421 {
1422
1423 for (auto I = subroutines.begin(), E = subroutines.end(); I != E; ++I)
1424 {
1425 auto label = I->first;
1426 G4_BB* subBB = I->second.front();
1427 G4_BB* mergedExitBB = mergeSubRoutineReturn(label);
1428
1429 // update callee exit block info
1430 FuncInfoHashTable::iterator calleeInfoLoc = funcInfoHashTable.find(subBB->getId());
1431 // it's possible for the subroutine to never be called (e.g., kernel function)
1432 if (calleeInfoLoc != funcInfoHashTable.end() && mergedExitBB)
1433 {
1434 calleeInfoLoc->second->getExitBB()->unsetBBType(G4_BB_EXIT_TYPE);
1435 mergedExitBB->setBBType(G4_BB_EXIT_TYPE);
1436 calleeInfoLoc->second->updateExitBB(mergedExitBB);
1437 }
1438 }
1439 }
1440
mergeSubRoutineReturn(G4_Label * subroutineLabel)1441 G4_BB* FlowGraph::mergeSubRoutineReturn(G4_Label* subroutineLabel)
1442 {
1443
1444 G4_BB* newBB = nullptr;
1445
1446 auto subroutineBB = subroutines[subroutineLabel];
1447
1448 std::vector<G4_BB*> retBBList;
1449 for (auto bb : subroutineBB)
1450 {
1451 if (!bb->empty() && bb->back()->isReturn())
1452 {
1453 retBBList.push_back(bb);
1454 }
1455 }
1456
1457 if (retBBList.size() > 1) // For return number >1, need to merge returns
1458 {
1459 builder->instList.clear();
1460
1461 //
1462 // insert newBB to fg.BBs list
1463 //
1464 newBB = createNewBB();
1465 insert(BBs.end(), newBB);
1466 // choose the last BB in retBBList as a candidate
1467 G4_BB* candidateBB = *(retBBList.rbegin());
1468 // Add <newBB, succBB> edges
1469 G4_INST* last = candidateBB->back();
1470 BB_LIST_ITER succIt = (last->getPredicate() == NULL) ? candidateBB->Succs.begin() : (++candidateBB->Succs.begin());
1471 BB_LIST_ITER succItEnd = candidateBB->Succs.end();
1472 // link new ret BB with each call site
1473 for (; succIt != succItEnd; ++succIt) {
1474 addPredSuccEdges(newBB, (*succIt), false);
1475 }
1476
1477 //
1478 // Create a label for the newBB and insert return to new BB
1479 //
1480 std::string str = "LABEL__" + std::to_string(newBB->getId());
1481 G4_Label* lab = builder->createLabel(str, LABEL_BLOCK);
1482 G4_INST* labelInst = createNewLabelInst(lab);
1483
1484 // exitBB is really just a dummy one for analysis, and does not need a return
1485 // we will instead leave the return in each of its predecessors
1486 newBB->push_back(labelInst);
1487
1488 //
1489 // Deal with all return BBs
1490 //
1491 for (G4_BB * retBB : retBBList)
1492 {
1493 if (retBB->getId() == newBB->getId())
1494 {
1495 continue;
1496 }
1497 last = retBB->back();
1498 // remove <retBB,its succBB> edges, do not remove the fall through edge if predicated
1499 BB_LIST retSuccsList;
1500 retSuccsList.assign(retBB->Succs.begin(), retBB->Succs.end());
1501 for (BB_LIST_ITER retSuccIt = retSuccsList.begin(); retSuccIt != retSuccsList.end(); ++retSuccIt)
1502 {
1503 G4_BB * retSuccBB = (*retSuccIt);
1504 if (last->getPredicate() != NULL && retSuccIt == retSuccsList.begin()) continue;
1505 retBB->removeSuccEdge(retSuccBB);
1506 retSuccBB->removePredEdge(retBB);
1507 }
1508 // Add <retBB,newBB> edges
1509 addPredSuccEdges(retBB, newBB, false);
1510 }
1511 }
1512
1513 return newBB;
1514 }
1515
decoupleInitBlock(G4_BB * bb,FuncInfoHashTable & funcInfoHashTable)1516 void FlowGraph::decoupleInitBlock(G4_BB* bb, FuncInfoHashTable& funcInfoHashTable)
1517 {
1518 G4_BB* oldInitBB = bb;
1519 G4_BB* newInitBB = createNewBB();
1520 BB_LIST_ITER insertBefore = std::find(BBs.begin(), BBs.end(), bb);
1521 MUST_BE_TRUE(insertBefore != BBs.end(), ERROR_FLOWGRAPH);
1522 insert(insertBefore, newInitBB);
1523
1524 BB_LIST_ITER kt = oldInitBB->Preds.begin();
1525 while (kt != oldInitBB->Preds.end())
1526 {
1527 // the pred of this new INIT BB are all call BB
1528 if ((*kt)->getBBType() & G4_BB_CALL_TYPE) {
1529
1530 newInitBB->Preds.push_back((*kt));
1531
1532 BB_LIST_ITER jt = (*kt)->Succs.begin();
1533 while (jt != (*kt)->Succs.end()) {
1534 if ((*jt) == oldInitBB)
1535 {
1536 break;
1537 }
1538 jt++;
1539 }
1540 MUST_BE_TRUE(jt != (*kt)->Succs.end(), ERROR_FLOWGRAPH);
1541 (*kt)->Succs.insert(jt, newInitBB);
1542 (*kt)->Succs.erase(jt);
1543
1544 BB_LIST_ITER tmp_kt = kt;
1545 ++kt;
1546 // erase this pred from old INIT BB's pred
1547 oldInitBB->Preds.erase(tmp_kt);
1548 }
1549 else
1550 {
1551 ++kt;
1552 }
1553 }
1554
1555 FuncInfoHashTable::iterator old_iter = funcInfoHashTable.find(oldInitBB->getId());
1556 MUST_BE_TRUE(old_iter != funcInfoHashTable.end(), " Function info is not in hashtable.");
1557 FuncInfo* funcInfo = old_iter->second;
1558
1559 // Erase the old item from unordered_map and add the new one.
1560 funcInfo->updateInitBB(newInitBB);
1561 funcInfoHashTable.erase(old_iter);
1562 funcInfoHashTable.insert(std::make_pair(newInitBB->getId(), funcInfo));
1563
1564 oldInitBB->unsetBBType(G4_BB_INIT_TYPE);
1565 newInitBB->setBBType(G4_BB_INIT_TYPE);
1566 addPredSuccEdges(newInitBB, oldInitBB);
1567
1568 std::string blockName = "LABEL__EMPTYBB__" + std::to_string(newInitBB->getId());
1569 G4_Label* label = builder->createLabel(blockName, LABEL_BLOCK);
1570 G4_INST* labelInst = createNewLabelInst(label);
1571 newInitBB->push_back(labelInst);
1572 }
1573
decoupleReturnBlock(G4_BB * bb)1574 void FlowGraph::decoupleReturnBlock(G4_BB* bb)
1575 {
1576 G4_BB* oldRetBB = bb;
1577 G4_BB* itsExitBB = oldRetBB->BBBeforeCall()->getCalleeInfo()->getExitBB();
1578 G4_BB* newRetBB = createNewBB();
1579 BB_LIST_ITER insertBefore = std::find(BBs.begin(), BBs.end(), bb);
1580 MUST_BE_TRUE(insertBefore != BBs.end(), ERROR_FLOWGRAPH);
1581 insert(insertBefore, newRetBB);
1582
1583 std::replace(itsExitBB->Succs.begin(), itsExitBB->Succs.end(), oldRetBB, newRetBB);
1584 newRetBB->Preds.push_back(itsExitBB);
1585 newRetBB->Succs.push_back(oldRetBB);
1586
1587 std::replace(oldRetBB->Preds.begin(), oldRetBB->Preds.end(), itsExitBB, newRetBB);
1588 oldRetBB->Preds.unique();
1589
1590 oldRetBB->unsetBBType(G4_BB_RETURN_TYPE);
1591 newRetBB->setBBType(G4_BB_RETURN_TYPE);
1592
1593 std::string str = "LABEL__EMPTYBB__" + std::to_string(newRetBB->getId());
1594 G4_Label* label = builder->createLabel(str, LABEL_BLOCK);
1595 G4_INST* labelInst = createNewLabelInst(label);
1596 newRetBB->push_back(labelInst);
1597 }
1598
1599 // Make sure that a BB is at most one of CALL/RETURN/EXIT/INIT. If any
1600 // BB is both, say INIT and CALL, decouple them by inserting a new BB.
1601 // [The above comments are post-added from reading the code.]
1602 //
1603 // The inserted BB must be in the original place so that each subroutine
1604 // has a list of consecutive BBs.
normalizeSubRoutineBB(FuncInfoHashTable & funcInfoTable)1605 void FlowGraph::normalizeSubRoutineBB(FuncInfoHashTable& funcInfoTable)
1606 {
1607 setPhysicalPredSucc();
1608 for (G4_BB* bb : BBs)
1609 {
1610 if ((bb->getBBType() & G4_BB_CALL_TYPE))
1611 {
1612 if (bb->getBBType() & G4_BB_EXIT_TYPE)
1613 {
1614 // As call BB has RETURN BB as fall-thru, cannot be EXIT
1615 MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
1616 }
1617
1618 // BB could be either INIT or RETURN, but not both.
1619 if (bb->getBBType() & G4_BB_INIT_TYPE)
1620 {
1621 decoupleInitBlock(bb, funcInfoTable);
1622 }
1623 else if (bb->getBBType() & G4_BB_RETURN_TYPE)
1624 {
1625 decoupleReturnBlock(bb);
1626 }
1627 }
1628 else if ((bb->getBBType() & G4_BB_INIT_TYPE))
1629 {
1630 if (bb->getBBType() & G4_BB_RETURN_TYPE)
1631 {
1632 // As retrun BB must have a pred, it cannot be init BB
1633 MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
1634 }
1635
1636 // Two possible combinations: INIT & CALL, or INIT & EXIT. INIT & CALL has
1637 // been processed in the previous IF, here only INIT and EXIT is possible.
1638 if (bb->getBBType() & G4_BB_EXIT_TYPE)
1639 {
1640 decoupleInitBlock(bb, funcInfoTable);
1641 }
1642 }
1643 else if ((bb->getBBType() & G4_BB_EXIT_TYPE))
1644 {
1645 // Only EXIT & RETURN are possible. (INIT & EXIT
1646 // has been processed)
1647 if (bb->getBBType() & G4_BB_RETURN_TYPE)
1648 {
1649 decoupleReturnBlock(bb);
1650 }
1651 }
1652 else if ((bb->getBBType() & G4_BB_RETURN_TYPE))
1653 {
1654 // Do we need to do this ?
1655 if (bb->Preds.size() > 1)
1656 {
1657 decoupleReturnBlock(bb);
1658 }
1659 }
1660 }
1661 }
1662
1663 //
1664 // This function does a DFS and any blocks that get visited will have their
1665 // preId set according to the ordering. Blocks that never get visited will
1666 // have their preId unmodified.
1667 //
doDFS(G4_BB * startBB,unsigned int p)1668 void doDFS(G4_BB* startBB, unsigned int p)
1669 {
1670 unsigned int currP = p;
1671 std::stack<G4_BB*> traversalStack;
1672 traversalStack.push(startBB);
1673
1674 while (!traversalStack.empty())
1675 {
1676 G4_BB* currBB = traversalStack.top();
1677 traversalStack.pop();
1678 if (currBB->getPreId() != UINT_MAX)
1679 {
1680 continue;
1681 }
1682 currBB->setPreId(currP++);
1683
1684 for (G4_BB* tmp : currBB->Succs)
1685 {
1686 if (tmp->getPreId() == UINT_MAX)
1687 {
1688 traversalStack.push(tmp);
1689 }
1690 }
1691 }
1692 }
1693
1694 //
1695 // The optimization pass will remove unreachable blocks from functions. Later compilation phases
1696 // assume that the only unreachable code that can exist in a function is the block with
1697 // return/pseudo_fret instruction. All other unreachable code should be removed. The only time
1698 // blocks with return/pseudo_fret will be removed is when the header of that function itself
1699 // is deemed unreachable.
1700 //
removeUnreachableBlocks(FuncInfoHashTable & funcInfoHT)1701 void FlowGraph::removeUnreachableBlocks(FuncInfoHashTable& funcInfoHT)
1702 {
1703 unsigned preId = 0;
1704 //
1705 // initializations
1706 //
1707 for (G4_BB* bb : BBs)
1708 {
1709 bb->setPreId(UINT_MAX);
1710 }
1711 //
1712 // assign DFS based pre/rpost ids to all blocks in the main program
1713 //
1714 doDFS(getEntryBB(), preId);
1715
1716 //
1717 // Basic blocks with preId/rpostId set to UINT_MAX are unreachable.
1718 // Special handling:
1719 // 1. If CALL_BB isn't dead, don't remove RETURN_BB;
1720 // 2. If INIT_BB isn't dead, don't remove EXIT_BB.
1721 // 3. For BB with EOT, don't remove it.
1722 // (Note that in BB list (BBs), CALL_BB appears before RETURN_BB and INIT_BB
1723 // appears before EXIT_BB. The algo works under this assumpion.)
1724 BB_LIST_ITER it = BBs.begin();
1725 while (it != BBs.end())
1726 {
1727 G4_BB* bb = (*it);
1728 if (bb->getPreId() == UINT_MAX)
1729 {
1730 if (bb->getBBType() & G4_BB_INIT_TYPE)
1731 {
1732 // Remove it from funcInfoHT.
1733 int funcId = bb->getId();
1734 unsigned numErased = funcInfoHT.erase(funcId);
1735 assert(numErased == 1);
1736 }
1737 else if (bb->getBBType() & G4_BB_CALL_TYPE)
1738 {
1739 // If call bb is removed, its return BB shuld be removed as well.
1740 G4_BB* retBB = bb->getPhysicalSucc();
1741 assert(retBB && "vISA ICE: missing Return BB");
1742 if (retBB->getPreId() != UINT_MAX)
1743 {
1744 retBB->setPreId(UINT_MAX);
1745 }
1746 }
1747
1748 // EOT should be in entry function, we keep it for now.
1749 if (bb->size() > 0 && bb->back()->isEOT())
1750 {
1751 ++it;
1752 continue;
1753 }
1754
1755 // Now, remove bb.
1756 while (bb->Succs.size() > 0)
1757 {
1758 removePredSuccEdges(bb, bb->Succs.front());
1759 }
1760 if (bb->getBBType() & G4_BB_RETURN_TYPE)
1761 {
1762 // As callee may be live, need to remove pred edges from exit BB.
1763 for (auto& II : bb->Preds)
1764 {
1765 G4_BB* exitBB = II;
1766 if (exitBB->getBBType() & G4_BB_EXIT_TYPE)
1767 {
1768 removePredSuccEdges(exitBB, bb);
1769 break;
1770 }
1771 }
1772 }
1773
1774 BB_LIST_ITER prev = it;
1775 ++it;
1776 erase(prev);
1777 }
1778 else
1779 {
1780 if (bb->getBBType() & G4_BB_CALL_TYPE)
1781 {
1782 // CALL BB isn't dead, don't remove return BB.
1783 G4_BB* retBB = bb->getPhysicalSucc();
1784 assert(retBB && (retBB->getBBType() & G4_BB_RETURN_TYPE) &&
1785 "vISA ICE: missing RETURN BB");
1786 if (retBB->getPreId() == UINT_MAX)
1787 {
1788 // For example,
1789 // CALL_BB : call A succ: init_BB
1790 // RETURN_BB: pred: exit_BB
1791 //
1792 // // subroutine A
1793 // init_BB:
1794 // while (true) ...; // inifinite loop
1795 // exit_BB: succ : RETURN_BB
1796 // Both RETURN_BB and exit_BB are unreachable, but
1797 // we keep both!
1798 retBB->setPreId(UINT_MAX - 1);
1799 }
1800 }
1801 else if (bb->getBBType() & G4_BB_INIT_TYPE)
1802 {
1803 // function isn't dead, don't remove exit BB.
1804 int funcId = bb->getId();
1805 auto entry = funcInfoHT.find(funcId);
1806 assert(entry != funcInfoHT.end());
1807 FuncInfo* finfo = entry->second;
1808 G4_BB* exitBB = finfo->getExitBB();
1809 assert(exitBB && "vISA ICE: missing exit BB");
1810 if (exitBB->getPreId() == UINT_MAX)
1811 {
1812 // See the example above for G4_BB_CALL_TYPE
1813 exitBB->setPreId(UINT_MAX - 1);
1814 }
1815 }
1816 it++;
1817 }
1818 }
1819 reassignBlockIDs();
1820 setPhysicalPredSucc();
1821 }
1822
1823 //
1824 // Remove redundant goto/jmpi
1825 // Remove the fall through edges between subroutine and its non-caller preds
1826 // Remove basic blocks that only contain a label, function labels are untouched.
1827 // Remove empty blocks.
1828 //
removeRedundantLabels()1829 void FlowGraph::removeRedundantLabels()
1830 {
1831 // first, remove redundant goto
1832 // goto L0
1833 // ....
1834 // goto L1 <-- redundant goto
1835 // L0: <empty BB>
1836 // L1:
1837 BB_LIST_ITER Next = BBs.begin(), IE = BBs.end();
1838 for (BB_LIST_ITER II = Next; II != IE; II = Next)
1839 {
1840 ++Next;
1841
1842 G4_BB* BB = *II;
1843 G4_opcode lastop = BB->getLastOpcode();
1844 if (BB->Succs.size() == 1 && (lastop == G4_goto || lastop == G4_jmpi))
1845 {
1846 G4_BB* SuccBB = BB->Succs.back();
1847 G4_BB* BB1 = nullptr;
1848 // Skip over empty BBs
1849 for (auto iter = Next; iter != IE; ++iter)
1850 {
1851 BB1 = *iter;
1852 if (BB1 != SuccBB &&
1853 (BB1->empty() ||
1854 (BB1->size() == 1 && BB1->getLastOpcode() == G4_label)))
1855 {
1856 continue;
1857 }
1858 break;
1859 }
1860 if (BB1 && SuccBB == BB1)
1861 {
1862 // Remove goto and its fall-thru will be its succ.
1863 G4_BB* fallThruBB = *Next;
1864 if (fallThruBB != SuccBB)
1865 {
1866 // Reconnect succ/pred for BB
1867 removePredSuccEdges(BB, SuccBB);
1868 addPredSuccEdges(BB, fallThruBB);
1869 }
1870 BB->pop_back();
1871 }
1872 }
1873 }
1874
1875 for (BB_LIST_ITER nextit = BBs.begin(), ite = BBs.end(); nextit != ite; )
1876 {
1877 BB_LIST_ITER it = nextit++;
1878 G4_BB* bb = *it;
1879
1880 if (bb == getEntryBB())
1881 {
1882 continue;
1883 }
1884 if (bb->Succs.size() == 0 && bb->Preds.size() == 0) {
1885 //leaving dangling BBs with return in for now.
1886 //workaround to handle unreachable return
1887 //for example return after infinite loop.
1888 if (bb->isEndWithFRet() || (bb->size() > 0 && ((G4_INST*)bb->back())->isReturn()))
1889 {
1890 continue;
1891 }
1892
1893 bb->clear();
1894 erase(it);
1895
1896 continue;
1897 }
1898
1899 assert(bb->size() > 0 && bb->front()->isLabel() &&
1900 "Every BB should at least have a label inst!");
1901
1902 // Possible kernel's entry, don't delete.
1903 if (strcmp(bb->front()->getLabelStr(), "per-thread-prolog") == 0)
1904 {
1905 continue;
1906 }
1907
1908 if (bb->getBBType() &
1909 (G4_BB_CALL_TYPE | G4_BB_EXIT_TYPE | G4_BB_INIT_TYPE | G4_BB_RETURN_TYPE))
1910 {
1911 // Keep those BBs
1912 continue;
1913 }
1914
1915 //
1916 // The removal candidates will have a single successor and a single inst
1917 //
1918 if (bb->Succs.size() == 1 && bb->size() == 1)
1919 {
1920 G4_INST* removedBlockInst = bb->front();
1921 if (removedBlockInst->getLabel()->isFuncLabel() ||
1922 strncmp(removedBlockInst->getLabelStr(), "LABEL__EMPTYBB", 14) == 0 ||
1923 strncmp(removedBlockInst->getLabelStr(), "__AUTO_GENERATED_DUMMY_LAST_BB", 30) == 0)
1924 {
1925 continue;
1926 }
1927
1928 G4_Label *succ_label = bb->Succs.front()->front()->getLabel();
1929
1930 // check if the last inst of pred is a control flow inst
1931 for (auto pred : bb->Preds)
1932 {
1933 auto jt = std::find(pred->Succs.begin(), pred->Succs.end(), bb);
1934 if (jt == pred->Succs.end())
1935 {
1936 // Just skip!
1937 //
1938 // Each unique pred is processed just once. If a pred appears
1939 // more than once in bb->Preds, it is only handled the first time
1940 // the pred is processed.
1941 // Note that if jt == end(), it means that this pred appears
1942 // more than once in the Preds list AND it has been handled
1943 // before (see code at the end of this loop). So, it is safe to skip.
1944 continue;
1945 }
1946
1947 G4_INST *i = pred->back();
1948 // replace label in instructions
1949 if (i->isFlowControl())
1950 {
1951 if (isIndirectJmpTarget(i))
1952 {
1953 // due to the switchjmp we may have multiple jmpi
1954 // at the end of a block.
1955 bool foundMatchingJmp = false;
1956 for (INST_LIST::iterator iter = --pred->end();
1957 iter != pred->begin(); --iter)
1958 {
1959 i = *iter;
1960 if (i->opcode() == G4_jmpi)
1961 {
1962 if (i->getSrc(0)->isLabel() &&
1963 i->getSrc(0) == removedBlockInst->getLabel())
1964 {
1965 i->setSrc(succ_label, 0);
1966 foundMatchingJmp = true;
1967 break;
1968 }
1969 }
1970 else
1971 {
1972 break;
1973 }
1974 }
1975 MUST_BE_TRUE(foundMatchingJmp, "Can't find the matching jmpi to the given label");
1976 }
1977 else if (i->opcode() == G4_jmpi || i->isCall())
1978 {
1979 if (i->getSrc(0)->isLabel())
1980 {
1981 if (i->getSrc(0) == removedBlockInst->getLabel())
1982 {
1983 i->setSrc(succ_label, 0);
1984 }
1985 }
1986 }
1987 else if (i->opcode() == G4_if || i->opcode() == G4_while ||
1988 i->opcode() == G4_else)
1989 {
1990 if (i->asCFInst()->getJip()->isLabel())
1991 {
1992 if (i->asCFInst()->getJip() == removedBlockInst->getLabel())
1993 {
1994 if (i->opcode() == G4_else || i->opcode() == G4_while)
1995 {
1996 // for G4_while, jump no matter predicate
1997 i->asCFInst()->setJip(succ_label);
1998 }
1999 // for G4_if, jump only when it has predictate; if no predicate, no jump
2000 else if (i->getPredicate())
2001 {
2002 i->asCFInst()->setJip(succ_label);
2003 }
2004 }
2005 }
2006 }
2007 else if (i->opcode() == G4_break || i->opcode() == G4_cont || i->opcode() == G4_halt)
2008 {
2009 // JIP and UIP must have been computed at this point
2010 MUST_BE_TRUE(i->asCFInst()->getJip() != NULL && i->asCFInst()->getUip() != NULL,
2011 "null JIP or UIP for break/cont instruction");
2012 if (i->asCFInst()->getJip() == removedBlockInst->getLabel())
2013 {
2014 i->asCFInst()->setJip(succ_label);
2015 }
2016
2017 if (i->asCFInst()->getUip() == removedBlockInst->getLabel())
2018 {
2019 i->asCFInst()->setUip(succ_label);
2020 }
2021 }
2022 else if (i->opcode() == G4_goto)
2023 {
2024 // UIP must have been computed at this point
2025 MUST_BE_TRUE(i->asCFInst()->getUip() != NULL,
2026 "null UIP for goto instruction");
2027 if (i->asCFInst()->getUip() == removedBlockInst->getLabel())
2028 {
2029 i->asCFInst()->setUip(succ_label);
2030 }
2031 if (i->asCFInst()->getUip() == removedBlockInst->getLabel())
2032 {
2033 i->asCFInst()->setUip(succ_label);
2034 }
2035 }
2036 }
2037
2038 pred->Succs.insert(jt, bb->Succs.front());
2039 pred->Succs.erase(jt);
2040
2041 // [Bug1915]: In rare case the precessor may have more than one Succ edge pointing
2042 // to the same BB, due to empty block being eliminated. For example, with
2043 // BB1:
2044 // (P) jmp BB3
2045 // BB2:
2046 // BB3:
2047 // BB4:
2048 // ...
2049 // After BB2 is eliminated BB1's two succ will both point to BB3.
2050 // When we get rid of BB3,
2051 // we have to make sure we update both Succ edges as we'd otherwise create an
2052 // edge to a non-existing BB. Note that we don't just delete the edge because
2053 // elsewhere there may be assumptions that if a BB ends with a jump it must have
2054 // two successors
2055 {
2056 BB_LIST_ITER succs = pred->Succs.begin();
2057 BB_LIST_ITER end = pred->Succs.end();
2058 while (succs != end)
2059 {
2060 BB_LIST_ITER iter = succs;
2061 ++succs;
2062 if ((*iter) == bb)
2063 {
2064 pred->Succs.insert(iter, bb->Succs.front());
2065 pred->Succs.erase(iter);
2066 }
2067 }
2068 }
2069 }
2070
2071 //
2072 // Replace the unique successor's predecessor links with the removed block's predessors.
2073 //
2074 BB_LIST_ITER kt = std::find(bb->Succs.front()->Preds.begin(), bb->Succs.front()->Preds.end(), bb);
2075
2076 BB_LIST_ITER mt = bb->Preds.begin();
2077
2078 for (; mt != bb->Preds.end(); ++mt)
2079 {
2080 bb->Succs.front()->Preds.insert(kt, *mt);
2081 }
2082
2083 bb->Succs.front()->Preds.erase(kt);
2084 bb->Succs.front()->Preds.unique();
2085 //
2086 // Propagate the removed block's type to its unique successor.
2087 //
2088 bb->Succs.front()->setBBType(bb->getBBType());
2089
2090 bb->Succs.clear();
2091 bb->Preds.clear();
2092 bb->clear();
2093
2094 erase(it);
2095 }
2096 else if (bb->Preds.size() == 1 && bb->Preds.front()->Succs.size() == 1)
2097 {
2098 // Merge bb into singlePred and delete bb if all the following are true:
2099 // 1. singlePred has no control-flow inst (at the end),
2100 // 2. bb's label is not used at all.
2101 //
2102 // singlePred:
2103 // ....
2104 // bb:
2105 // ....
2106 //
2107 // If singlePred does not end with a control-flow inst, bb's label is not used except
2108 // bb ends with while. For while bb, we need further to check if any break uses label.
2109 // As break should jump to while bb's fall-thru (not while bb), we need to follow all
2110 // preds of while's fall-thru BB to see if any pred has a break.
2111 //
2112 G4_BB* singlePred = bb->Preds.front();
2113 G4_INST* labelInst = bb->front();
2114 assert(labelInst->isLabel());
2115 if (!singlePred->back()->isFlowControl() &&
2116 singlePred->getPhysicalSucc() == bb /* sanity */ &&
2117 !labelInst->getLabel()->isFuncLabel() /* skip special bb */ &&
2118 bb != singlePred /* [special] skip dead single-BB loop */)
2119 {
2120 bool doMerging = true;
2121 G4_INST* whileInst = bb->back();
2122 if (whileInst->opcode() == G4_while)
2123 {
2124 // If there is any break inst for this while, the break uses the label. No merging.
2125 // Note that any break inst of this while will jump to the BB right after while BB
2126 // (this BB is the fall-thru BB of while if while BB has fall-thru, or just its physical
2127 // succ if it has no fall-thru).
2128 G4_BB* whilePhySucc = bb->getPhysicalSucc();
2129 if (whilePhySucc)
2130 {
2131 for (auto breakPred : whilePhySucc->Preds)
2132 {
2133 if (breakPred->getLastOpcode() == G4_break) {
2134 doMerging = false;
2135 break;
2136 }
2137 }
2138 }
2139 }
2140
2141 if (doMerging)
2142 {
2143 removePredSuccEdges(singlePred, bb);
2144 assert(singlePred->Succs.size() == 0);
2145 std::vector<G4_BB*> allSuccBBs(bb->Succs.begin(), bb->Succs.end());
2146 for (auto S : allSuccBBs)
2147 {
2148 removePredSuccEdges(bb, S);
2149 addPredSuccEdges(singlePred, S, false);
2150 }
2151
2152 // remove bb's label before splice
2153 bb->remove(labelInst);
2154 singlePred->getInstList().splice(singlePred->end(), bb->getInstList());
2155
2156 bb->Succs.clear();
2157 bb->Preds.clear();
2158
2159 erase(it);
2160 }
2161 }
2162 }
2163 }
2164
2165 reassignBlockIDs();
2166 }
2167
2168 //
2169 // remove any mov with the same src and dst opnds
2170 //
removeRedundMov()2171 void FlowGraph::removeRedundMov()
2172 {
2173 for (G4_BB* bb : BBs)
2174 {
2175 INST_LIST_ITER curr_iter = bb->begin();
2176 while (curr_iter != bb->end())
2177 {
2178 G4_INST* inst = (*curr_iter);
2179 if (inst->isRawMov())
2180 {
2181 G4_Operand *src = inst->getSrc(0);
2182 G4_DstRegRegion *dst = inst->getDst();
2183
2184 if (src->isSrcRegRegion())
2185 {
2186 G4_SrcRegRegion* srcRgn = (G4_SrcRegRegion*)src;
2187 if (!dst->isIndirect() &&
2188 !srcRgn->isIndirect() &&
2189 dst->isGreg() &&
2190 src->isGreg())
2191 {
2192 if (dst->getLinearizedStart() == srcRgn->getLinearizedStart() &&
2193 dst->getLinearizedEnd() == srcRgn->getLinearizedEnd())
2194 {
2195 uint16_t stride = 0;
2196 const RegionDesc *rd = srcRgn->getRegion();
2197 unsigned ExSize = inst->getExecSize();
2198 if (ExSize == 1 ||
2199 (rd->isSingleStride(ExSize, stride) &&
2200 (dst->getHorzStride() == stride)))
2201 {
2202 curr_iter = bb->erase(curr_iter);
2203 continue;
2204 }
2205 }
2206 }
2207 }
2208 }
2209 ++curr_iter;
2210 }
2211 }
2212 }
2213
2214 //
2215 // Remove any placeholder empty blocks that could have been inserted to aid analysis
2216 //
removeEmptyBlocks()2217 void FlowGraph::removeEmptyBlocks()
2218 {
2219 bool changed = true;
2220
2221 while (changed)
2222 {
2223 changed = false;
2224 for (BB_LIST_ITER it = BBs.begin(); it != BBs.end();)
2225 {
2226 G4_BB* bb = *it;
2227
2228 //
2229 // The removal candidates will have a unique successor and a single label
2230 // starting with LABEL__EMPTYBB as the only instruction besides a JMP.
2231 //
2232 if (bb->size() > 0 && bb->size() < 3)
2233 {
2234 INST_LIST::iterator removedBlockInst = bb->begin();
2235
2236 if ((*removedBlockInst)->isLabel() == false ||
2237 strncmp((*removedBlockInst)->getLabelStr(),
2238 "LABEL__EMPTYBB", 14) != 0)
2239 {
2240 ++it;
2241 continue;
2242 }
2243
2244 ++removedBlockInst;
2245
2246 if (removedBlockInst != bb->end())
2247 {
2248 // if the BB is not empty, it must end with a unconditional jump
2249 if ((*removedBlockInst)->opcode() != G4_jmpi || bb->Succs.size() > 1)
2250 {
2251 ++it;
2252 continue;
2253 }
2254 }
2255
2256 // remove redundant succs and preds for the empty block
2257 bb->Succs.unique();
2258 bb->Preds.unique();
2259
2260 for (auto predBB : bb->Preds)
2261 {
2262 //
2263 // Replace the predecessors successor links to the removed block's unique successor.
2264 //
2265 BB_LIST_ITER jt = std::find(predBB->Succs.begin(), predBB->Succs.end(), bb);
2266
2267 for (auto succBB : bb->Succs)
2268 {
2269 predBB->Succs.insert(jt, succBB);
2270 }
2271 predBB->Succs.erase(jt);
2272 predBB->Succs.unique();
2273 }
2274
2275 for (auto succBB : bb->Succs)
2276 {
2277 //
2278 // Replace the unique successor's predecessor links with the removed block's predessors.
2279 //
2280 BB_LIST_ITER kt = std::find(succBB->Preds.begin(), succBB->Preds.end(), bb);
2281
2282 if (kt != succBB->Preds.end())
2283 {
2284 for (auto predBB : bb->Preds)
2285 {
2286 succBB->Preds.insert(kt, predBB);
2287 }
2288
2289 succBB->Preds.erase(kt);
2290 succBB->Preds.unique();
2291
2292 //
2293 // Propagate the removed block's type to its unique successor.
2294 //
2295 succBB->setBBType(bb->getBBType());
2296 }
2297 }
2298 //
2299 // Remove the block to be removed.
2300 //
2301 bb->Succs.clear();
2302 bb->Preds.clear();
2303 bb->clear();
2304
2305 BB_LIST_ITER rt = it++;
2306 erase(rt);
2307 changed = true;
2308 }
2309 else
2310 {
2311 ++it;
2312 }
2313 }
2314 }
2315 }
2316
2317 //
2318 // If multiple freturns exist in a flowgraph create a new basic block
2319 // with an freturn. Replace all freturns with jumps.
2320 //
mergeFReturns()2321 void FlowGraph::mergeFReturns()
2322 {
2323 std::list<G4_BB*> exitBBs;
2324 G4_BB* candidateFretBB = NULL;
2325 G4_Label *dumLabel = NULL;
2326
2327 for (G4_BB* cur : BBs)
2328 {
2329 if (cur->size() > 0 && cur->back()->isFReturn())
2330 {
2331 exitBBs.push_back(cur);
2332
2333 if (cur->size() == 2 && cur->front()->isLabel())
2334 {
2335 // An fret already exists that can be shared among all
2336 // so skip creating a new block with fret.
2337 dumLabel = cur->front()->getSrc(0)->asLabel();
2338 candidateFretBB = cur;
2339 }
2340 }
2341 }
2342
2343 if (exitBBs.size() > 1)
2344 {
2345 if (candidateFretBB == NULL)
2346 {
2347 G4_BB* newExit = createNewBB();
2348 assert(!builder->getIsKernel() && "Not expecting fret in kernel");
2349 std::string str = "__MERGED_FRET_EXIT_BLOCK";
2350 dumLabel = builder->createLabel(str, LABEL_BLOCK);
2351 G4_INST* label = createNewLabelInst(dumLabel);
2352 newExit->push_back(label);
2353 G4_INST* fret = builder->createInternalCFInst(
2354 nullptr, G4_pseudo_fret, g4::SIMD1, nullptr, nullptr, InstOpt_NoOpt);
2355 newExit->push_back(fret);
2356 BBs.push_back(newExit);
2357 candidateFretBB = newExit;
2358 }
2359
2360 for (G4_BB* cur : exitBBs)
2361 {
2362 if (cur != candidateFretBB)
2363 {
2364 G4_INST* last = cur->back();
2365 addPredSuccEdges(cur, candidateFretBB);
2366
2367 last->setOpcode(G4_jmpi);
2368 last->setSrc(dumLabel, 0);
2369 last->setExecSize(g4::SIMD1);
2370 }
2371 }
2372 }
2373 }
2374
2375
2376 //
2377 // Add a dummy BB for multiple-exit flow graph
2378 // The criteria of a valid multiple-exit flow graph is:
2379 // (1). equal or less than one BB w/o successor has non-EOT last instruction;
2380 // (2). Other BBs w/o successor must end with EOT
2381 //
linkDummyBB()2382 void FlowGraph::linkDummyBB()
2383 {
2384 //
2385 // check the flow graph to find if it satify the criteria and record the exit BB
2386 //
2387 std::list<G4_BB*> exitBBs;
2388 int nonEotExitBB = 0;
2389 for (G4_BB *bb : BBs)
2390 {
2391 if (bb->Succs.empty())
2392 {
2393 exitBBs.push_back(bb); // record exit BBs
2394 G4_INST* last = bb->back();
2395 if (last == NULL)
2396 {
2397 MUST_BE_TRUE(false, "ERROR: Invalid flow graph with empty exit BB!");
2398 }
2399 if (!bb->isLastInstEOT())
2400 {
2401 nonEotExitBB++;
2402 if (nonEotExitBB > 1)
2403 {
2404 MUST_BE_TRUE(false,
2405 "ERROR: Invalid flow graph with more than one exit BB not end with EOT!");
2406 }
2407 }
2408 }
2409 }
2410
2411 //
2412 // create the dummy BB and link the exit BBs to it
2413 //
2414 if (nonEotExitBB == 1 && exitBBs.size() > 1)
2415 {
2416 G4_BB *dumBB = createNewBB();
2417 MUST_BE_TRUE(dumBB != NULL, ERROR_FLOWGRAPH);
2418 std::string str("__AUTO_GENERATED_DUMMY_LAST_BB");
2419 G4_Label *dumLabel = builder->createLabel(str, LABEL_BLOCK);
2420 G4_INST* label = createNewLabelInst(dumLabel);
2421 dumBB->push_back(label);
2422 BBs.push_back(dumBB);
2423
2424 for (G4_BB *bb : exitBBs)
2425 {
2426 dumBB->Preds.push_back(bb);
2427 bb->Succs.push_back(dumBB);
2428 }
2429 }
2430 }
2431
2432 //
2433 // Re-assign block ID so that we can use id to determine the ordering of two blocks in the code layout
2434 //
reassignBlockIDs()2435 void FlowGraph::reassignBlockIDs()
2436 {
2437 //
2438 // re-assign block id so that we can use id to determine the ordering of
2439 // two blocks in the code layout; namely which one is ahead of the other.
2440 // Important: since we re-assign id, there MUST NOT exist any instruction
2441 // that depends on BB id. Or the code will be incorrect once we reassign id.
2442 //
2443 unsigned i = 0;
2444 for (G4_BB* bb : BBs)
2445 {
2446 bb->setId(i);
2447 i++;
2448 MUST_BE_TRUE(i <= getNumBB(), ERROR_FLOWGRAPH);
2449 }
2450
2451 numBBId = i;
2452 }
2453
findLabelBB(BB_LIST_ITER StartIter,BB_LIST_ITER EndIter,const char * Label)2454 G4_BB* FlowGraph::findLabelBB(
2455 BB_LIST_ITER StartIter, BB_LIST_ITER EndIter, const char* Label)
2456 {
2457 for (BB_LIST_ITER it = StartIter; it != EndIter; ++it)
2458 {
2459 G4_BB* bb = *it;
2460 G4_INST* first = bb->empty() ? NULL : bb->front();
2461
2462 if (first && first->isLabel())
2463 {
2464 const char* currLabel = first->getLabelStr();
2465 if (strcmp(Label, currLabel) == 0)
2466 {
2467 return bb;
2468 }
2469 }
2470 }
2471 return NULL;
2472 }
2473
2474
2475 typedef enum
2476 {
2477 STRUCTURED_CF_IF = 0,
2478 STRUCTURED_CF_LOOP = 1
2479 } STRUCTURED_CF_TYPE;
2480
2481 struct StructuredCF
2482 {
2483 STRUCTURED_CF_TYPE mType;
2484 // for if this is the block that ends with if
2485 // for while this is the loop block
2486 G4_BB* mStartBB;
2487 // for if this is the endif block
2488 // for while this is the block that ends with while
2489 G4_BB* mEndBB;
2490 // it's possible for a BB to have multiple endifs, so we need
2491 // to know which endif corresponds to this CF
2492 G4_INST* mEndInst;
2493
2494 StructuredCF* enclosingCF;
2495
2496 // ToDo: can add more information (else, break, cont, etc.) as needed later
2497
2498 // endBB is set when we encounter the endif/while
StructuredCFStructuredCF2499 StructuredCF(STRUCTURED_CF_TYPE type, G4_BB* startBB) :
2500 mType(type), mStartBB(startBB), mEndBB(NULL), mEndInst(NULL), enclosingCF(NULL) {}
2501
operator newStructuredCF2502 void *operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); }
2503
setEndStructuredCF2504 void setEnd(G4_BB* endBB, G4_INST* endInst)
2505 {
2506 mEndBB = endBB;
2507 mEndInst = endInst;
2508 }
2509 }; // StructuredCF
2510
2511 /*
2512 * This function sets the JIP for Structured CF (SCF) instructions, Unstructured
2513 * Control Flow (UCF) instructions (goto) are handled in processGoto().
2514 *
2515 * Note: we currently do not consider goto/join when adding the JIP for endif,
2516 * since structure analysis should not allow goto into/out of if-endif. This means
2517 * we only need to set the the JIP of the endif to its immediately enclosing endif/while
2518 *
2519 * The simd control flow blocks must be well-structured
2520 *
2521 */
processSCF(FuncInfoHashTable & FuncInfoMap)2522 void FlowGraph::processSCF(FuncInfoHashTable &FuncInfoMap)
2523 {
2524 std::stack<StructuredCF*> ifAndLoops;
2525 std::vector<StructuredCF*> structuredSimdCF;
2526
2527 for (G4_BB* bb : BBs)
2528 {
2529 for (G4_INST* inst : *bb)
2530 {
2531 // check if first non-label inst is an endif
2532 if (inst->opcode() != G4_label && inst->opcode() != G4_join)
2533 {
2534 if (inst->opcode() == G4_endif)
2535 {
2536
2537 MUST_BE_TRUE(ifAndLoops.size() > 0, "endif without matching if");
2538 StructuredCF* cf = ifAndLoops.top();
2539 MUST_BE_TRUE(cf->mType == STRUCTURED_CF_IF, "ill-formed if");
2540 cf->setEnd(bb, inst);
2541 ifAndLoops.pop();
2542 if (ifAndLoops.size() > 0)
2543 {
2544 cf->enclosingCF = ifAndLoops.top();
2545 }
2546 }
2547 else
2548 {
2549 // stop at the first non-endif instruction (there may be multiple endifs)
2550 break;
2551 }
2552 }
2553 }
2554
2555 // check if bb is SIMD loop head
2556 for (G4_BB* predBB : bb->Preds)
2557 {
2558 // check if one of the pred ends with a while
2559 if (predBB->getId() >= bb->getId())
2560 {
2561 if (predBB->size() != 0 &&
2562 predBB->back()->opcode() == G4_while)
2563 {
2564 StructuredCF* cf = new (mem)StructuredCF(STRUCTURED_CF_LOOP, bb);
2565 structuredSimdCF.push_back(cf);
2566 ifAndLoops.push(cf);
2567 }
2568 }
2569 }
2570
2571 if (bb->size() > 0)
2572 {
2573 G4_INST* lastInst = bb->back();
2574 if (lastInst->opcode() == G4_if)
2575 {
2576 StructuredCF* cf = new (mem)StructuredCF(STRUCTURED_CF_IF, bb);
2577 structuredSimdCF.push_back(cf);
2578 ifAndLoops.push(cf);
2579 }
2580 else if (lastInst->opcode() == G4_while)
2581 {
2582 MUST_BE_TRUE(ifAndLoops.size() > 0, "while without matching do");
2583 StructuredCF* cf = ifAndLoops.top();
2584 MUST_BE_TRUE(cf->mType == STRUCTURED_CF_LOOP, "ill-formed while loop");
2585 cf->setEnd(bb, lastInst);
2586 ifAndLoops.pop();
2587 if (ifAndLoops.size() > 0)
2588 {
2589 cf->enclosingCF = ifAndLoops.top();
2590 }
2591 }
2592 }
2593 }
2594
2595 MUST_BE_TRUE(ifAndLoops.size() == 0, "not well-structured SIMD CF");
2596
2597 for (StructuredCF* cf : structuredSimdCF)
2598 {
2599 if (cf->mType == STRUCTURED_CF_IF && cf->enclosingCF != NULL)
2600 {
2601 setJIPForEndif(cf->mEndInst, cf->enclosingCF->mEndInst, cf->enclosingCF->mEndBB);
2602 }
2603 }
2604 }
2605
2606 // markDivergentBBs()
2607 // If BB is on the divergent path, mark it as divergent.
2608 // Divergent:
2609 // If all active simd lanes on entry to shader/kernel are
2610 // active in a BB, that BB is NOT divergent; otherwise,
2611 // it is divergent.
2612 //
2613 // Note that this does not require the number of all the
2614 // active simd lanes equals to the simd dispatch width!
2615 // For example, simd8 kernel might have 4 active lanes
2616 // on entry to the kernel, thus if any BB of the kernel
2617 // has less than 4 active lanes, it is divergent! As
2618 // matter of fact, the entry BB must not be divergent.
2619 //
markDivergentBBs()2620 void FlowGraph::markDivergentBBs()
2621 {
2622 if (BBs.empty())
2623 {
2624 // Sanity check
2625 return;
2626 }
2627
2628 // fcall'ed Function: to be conservative and assume that any function
2629 // that is fcall'ed is divergent on entry
2630 // (Note that each function has its own CFG)
2631 if (!builder->getIsKernel())
2632 {
2633 for (G4_BB* BB : BBs)
2634 {
2635 BB->setDivergent(true);
2636 }
2637 return;
2638 }
2639
2640 // Now, handle kernel function.
2641 // 1. It would be perfered to have a single return/exit for kernel and
2642 // and all its subroutines in the last BB (IGC does, cm does not),
2643 // although the code can handle return in the middle of kernel.
2644 // 2. The entry function will appear first in the BB list, and if there
2645 // is a call from A to B, subroutine A shall appear prior to
2646 // subroutine B in BB list.
2647 //
2648 // Required: need to set BB id.
2649 //
2650 // Key variables:
2651 // LastJoinBB:
2652 // LastJoinBB is the fartherest joinBB of any goto/break/if/while
2653 // so far, as described below in the algorithm.
2654 // LastJoinBBId: Id(LastJoinBB).
2655 //
2656 // The algorithm initializes LastJoinBBId to -1 and scans all BBs in order.
2657 // It checks control-flow instructions to see if it diverges (has join).
2658 // If so, the algorithm sets LastJoinBBId to be the larger one of its join BB
2659 // and LastJoinBBId; For a non-negative LastJoinBBId, it means that there is
2660 // an active join in that BB, and therefore, all BBs from the current BB to
2661 // that LastJoinBB (LastJoinBB not included) are divergent.
2662 //
2663 // The algorithm checks the following cases and their join BBs are:
2664 // case 1: cf inst = goto
2665 // <currBB> [(p)] goto L
2666 // ...
2667 // <joinBB> L:
2668 // case 2: cf inst = if
2669 // <currBB> if
2670 // ...
2671 // <joinBB> endif
2672 //
2673 // case 3: cf inst = break
2674 // <currBB> break
2675 // ...
2676 // [(p)] while
2677 // <joinBB>
2678 // case 4:
2679 // <currBB> L:
2680 // ....
2681 // [(p)] while/goto L
2682 // <joinBB>
2683 //
2684 // The presence of loop makes the checking complicated. Before checking the
2685 // divergence of a loop head, we need to know if the loop has ANY NON-UNIFORM
2686 // OUT-OF-LOOP BRANCH or OUT-OF-LOOP BRANCH in DIVERGENT BB, which includes
2687 // checking the loop's backedge and any out-going branches inside the loop body.
2688 // For example,
2689 // L0:
2690 // B0: (uniform cond) goto B1
2691 // ......
2692 // B1:
2693 // B2:
2694 // L1:
2695 // B3: goto OUT
2696 // B4: (non-uniform cond) goto L1
2697 // B5:
2698 // B6: (uniform cond) goto L0
2699 //
2700 // OUT: B7:
2701 //
2702 // At B0, we don't know whether B0 is divergent even though B0's goto is uniform. Once
2703 // we find out that B3 is divergent, "goto OUT" will make Loop L0 divergent, which means
2704 // any of L0's BBs, including B0, is divergent. In order to know B3 is divergent, we need
2705 // to check the back branch of loop L1. The fact that L1's backedge is non-uniform indicates
2706 // L1 is divergent, therefore B3 is divergent. This means that WE NEED TO DO PRE_SCAN in
2707 // order to know whether any BBs of a loop is divergent.
2708 //
2709 int LastJoinBBId;
2710
2711 auto pushJoin = [&](G4_BB* joinBB) {
2712 LastJoinBBId = std::max(LastJoinBBId, (int)joinBB->getId());
2713 };
2714 auto popJoinIfMatch = [&](G4_BB* BB) {
2715 if ((int)BB->getId() == LastJoinBBId) {
2716 LastJoinBBId = -1;
2717 }
2718 };
2719 auto isPriorToLastJoin = [&](G4_BB* aBB) ->bool {
2720 return (int)aBB->getId() < LastJoinBBId;
2721 };
2722
2723 reassignBlockIDs();
2724
2725 // Analyze function in topological order. As there is no recursion
2726 // and no indirect call, a function will be analyzed only if all
2727 // its callers have been analyzed.
2728 //
2729 // If no subroutine, sortedFuncTable is empty. Here keep all functions
2730 // in a vector first (it works with and without subroutines), then scan
2731 // functions in topological order.
2732 struct StartEndIter {
2733 BB_LIST_ITER StartI;
2734 BB_LIST_ITER EndI;
2735 bool InvokedFromDivergentBB;
2736 };
2737 int numFuncs = (int)sortedFuncTable.size();
2738 std::vector<StartEndIter> allFuncs;
2739 std::unordered_map<FuncInfo*, uint32_t> funcInfoIndex;
2740 if (numFuncs == 0)
2741 {
2742 numFuncs = 1;
2743 allFuncs.resize(1);
2744 allFuncs[0].StartI = BBs.begin();
2745 allFuncs[0].EndI = BBs.end();
2746 allFuncs[0].InvokedFromDivergentBB = false;
2747 }
2748 else
2749 {
2750 allFuncs.resize(numFuncs);
2751 for (int i = numFuncs; i > 0; --i)
2752 {
2753 uint32_t ix = (uint32_t)(i - 1);
2754 FuncInfo* pFInfo = sortedFuncTable[ix];
2755 G4_BB* StartBB = pFInfo->getInitBB();
2756 G4_BB* EndBB = pFInfo->getExitBB();
2757 uint32_t ui = (uint32_t)(numFuncs - i);
2758 allFuncs[ui].StartI = std::find(BBs.begin(), BBs.end(), StartBB);
2759 auto nextI = std::find(BBs.begin(), BBs.end(), EndBB);
2760 assert(nextI != BBs.end() && "ICE: subroutine's end BB not found!");
2761 allFuncs[ui].EndI = (++nextI);
2762 allFuncs[ui].InvokedFromDivergentBB = false;
2763
2764 funcInfoIndex[pFInfo] = ui;
2765 }
2766 }
2767
2768 // Update LastJoin for the backward branch of BB referred to by IT.
2769 auto updateLastJoinForBwdBrInBB = [&](BB_LIST_ITER& IT)
2770 {
2771 G4_BB* predBB = *IT;
2772 G4_INST* bInst = predBB->back();
2773 assert(bInst->isCFInst() &&
2774 (bInst->opcode() == G4_while || bInst->asCFInst()->isBackward()) &&
2775 "ICE: expected backward branch/while!");
2776
2777 // joinBB of a loop is the BB right after tail BB
2778 BB_LIST_ITER loopJoinIter = IT;
2779 ++loopJoinIter;
2780 if (loopJoinIter == BBs.end())
2781 {
2782 // Loop end is the last BB (no Join BB). This happens in the
2783 // following cases:
2784 // 1. For CM, CM's return is in the middle of code! For IGC,
2785 // this never happen as a return, if present, must be in
2786 // the last BB.
2787 // 2. The last segment code is an infinite loop (static infinite
2788 // loop so that compiler knows it is an infinite loop).
2789 // In either case, no join is needed.
2790 return;
2791 }
2792
2793 // Update LastJoin if the branch itself is divergent.
2794 // (todo: checking G4_jmpi is redundant as jmpi must have isUniform()
2795 // return true.)
2796 if ((!bInst->asCFInst()->isUniform() && bInst->opcode() != G4_jmpi))
2797 {
2798 G4_BB* joinBB = *loopJoinIter;
2799 pushJoin(joinBB);
2800 }
2801 return;
2802 };
2803
2804 // Update LastJoin for BB referred to by IT. It is called either from normal
2805 // scan (setting BB's divergent field) or from loop pre-scan (no setting to
2806 // BB's divergent field). IsPreScan indicates whether it is called from
2807 // the pre-scan or not.
2808 //
2809 // The normal scan (IsPreScan is false), this function also update the divergence
2810 // info for any called subroutines.
2811 //
2812 auto updateLastJoinForFwdBrInBB = [&](BB_LIST_ITER& IT, bool IsPreScan)
2813 {
2814 G4_BB* aBB = *IT;
2815 if (aBB->size() == 0)
2816 {
2817 return;
2818 }
2819
2820 // Using isPriorToLastJoin() works for both loop pre-scan and normal scan.
2821 // (aBB->isDivergent() works for normal scan only.)
2822 bool isBBDivergent = isPriorToLastJoin(aBB);
2823
2824 G4_INST* lastInst = aBB->back();
2825 if (((lastInst->opcode() == G4_goto && !lastInst->asCFInst()->isBackward()) ||
2826 lastInst->opcode() == G4_break) &&
2827 (!lastInst->asCFInst()->isUniform() || isBBDivergent))
2828 {
2829 // forward goto/break : the last Succ BB is our target BB
2830 // For break, it should be the BB right after while inst.
2831 G4_BB* joinBB = aBB->Succs.back();
2832 pushJoin(joinBB);
2833 }
2834 else if (lastInst->opcode() == G4_if &&
2835 (!lastInst->asCFInst()->isUniform() || isBBDivergent))
2836 {
2837 G4_Label* labelInst = lastInst->asCFInst()->getUip();
2838 G4_BB* joinBB = findLabelBB(IT, BBs.end(), labelInst->getLabel());
2839 assert(joinBB && "ICE(vISA) : missing endif label!");
2840 pushJoin(joinBB);
2841 }
2842 else if (!IsPreScan && lastInst->opcode() == G4_call)
2843 {
2844 // If this function is already in divergent branch, the callee
2845 // must be in a divergent branch!.
2846 if (isBBDivergent || lastInst->getPredicate() != nullptr)
2847 {
2848 FuncInfo* calleeFunc = aBB->getCalleeInfo();
2849 if (funcInfoIndex.count(calleeFunc))
2850 {
2851 int ix = funcInfoIndex[calleeFunc];
2852 allFuncs[ix].InvokedFromDivergentBB = true;
2853 }
2854 }
2855 }
2856 return;
2857 };
2858
2859
2860 // Now, scan all subroutines (kernels or functions)
2861 for (int i = 0; i < numFuncs; ++i)
2862 {
2863 // each function: [IT, IE)
2864 BB_LIST_ITER& IS = allFuncs[i].StartI;
2865 BB_LIST_ITER& IE = allFuncs[i].EndI;
2866 if (IS == IE)
2867 {
2868 // sanity check
2869 continue;
2870 }
2871
2872 if (allFuncs[i].InvokedFromDivergentBB)
2873 {
2874 // subroutine's divergent on entry. Mark every BB as divergent
2875 for (auto IT = IS; IT != IE; ++IT)
2876 {
2877 G4_BB* BB = *IT;
2878
2879 BB->setDivergent(true);
2880 if (BB->size() == 0)
2881 {
2882 // sanity check
2883 continue;
2884 }
2885 if (BB->isEndWithCall() || BB->isEndWithFCall())
2886 {
2887 FuncInfo* calleeFunc = BB->getCalleeInfo();
2888 if (funcInfoIndex.count(calleeFunc))
2889 {
2890 int ix = funcInfoIndex[calleeFunc];
2891 allFuncs[ix].InvokedFromDivergentBB = true;
2892 }
2893 }
2894 }
2895 // continue for next subroutine
2896 continue;
2897 }
2898
2899 // Scaning all BBs of a single subroutine (or kernel, or function).
2900 LastJoinBBId = -1;
2901 for (auto IT = IS; IT != IE; ++IT)
2902 {
2903 G4_BB* BB = *IT;
2904
2905 // join could be: explicit (join inst) or implicit (endif/while)
2906 popJoinIfMatch(BB);
2907
2908 // Handle loop
2909 // Loop needs to be scanned twice in order to get an accurate marking.
2910 // In pre-scan (1st scan), it finds out divergent out-of-loop branch.
2911 // If found, it updates LastJoin to the target of that out-of-loop branch
2912 // and restart the normal scan. If not, it restarts the normal scan with
2913 // the original LastJoin unchanged.
2914
2915 // BB could be head of several loops, and the following does pre-scan for
2916 // every one of those loops.
2917 for (G4_BB* predBB : BB->Preds)
2918 {
2919 if (!isBackwardBranch(predBB, BB)) {
2920 G4_opcode t_opc = predBB->getLastOpcode();
2921 bool isBr = (t_opc == G4_goto || t_opc == G4_jmpi);
2922 assert((!isBr || (BB->getId() > predBB->getId())) && "backward Branch did not set correctly!");
2923 continue;
2924 }
2925 assert(BB->getId() <= predBB->getId() && "Branch incorrectly set to be backward!");
2926
2927 BB_LIST_ITER LoopITEnd = std::find(BBs.begin(), BBs.end(), predBB);
2928 updateLastJoinForBwdBrInBB(LoopITEnd);
2929
2930 // If lastJoin is already after loop end, no need to scan loop twice
2931 // as all BBs in the loop must be divergent
2932 if (isPriorToLastJoin(predBB))
2933 {
2934 continue;
2935 }
2936
2937 // pre-scan loop (BB, predBB)
2938 //
2939 // Observation:
2940 // pre-scan loop once and as a BB is scanned. Each backward
2941 // branch to this BB and a forward branch from this BB are processed.
2942 // Doing so finds out any divergent out-of-loop branch iff the
2943 // loop has one. In addition, if a loop has more than one divergent
2944 // out-of-loop branches, using any of those branches would get us
2945 // precise divergence during the normal scan.
2946 //
2947 // LastJoinBBId will be updated iff there is an out-of-loop branch that is
2948 // is also divergent. For example,
2949 // a) "goto OUT" is out-of-loop branch, but not divergent. Thus, LastJoinBB
2950 // will not be updated.
2951 //
2952 // L :
2953 // B0
2954 // if (uniform cond) goto OUT;
2955 // if (non-uniform cond)
2956 // B1
2957 // else
2958 // B2
2959 // B3
2960 // (p) jmpi L
2961 // OUT:
2962 //
2963 // b) "goto OUT" is out-of-loop branch and divergent. Thus, update LastJoinBB.
2964 // Note that "goto OUT" is divergent because loop L1 is divergent, which
2965 // makes every BB in L1 divergent. And any branch from a divergent BB
2966 // must be divergent.
2967 //
2968 // L :
2969 // B0
2970 // L1:
2971 // B1
2972 // if (uniform cond) goto OUT;
2973 // if (cond)
2974 // B2
2975 // else
2976 // B3
2977 // if (non-uniform cond) goto L1
2978 // B4
2979 // (uniform cond) while L
2980 // OUT:
2981 //
2982 int orig_LastJoinBBId = LastJoinBBId;
2983 for (auto LoopIT = IT; LoopIT != LoopITEnd; ++LoopIT)
2984 {
2985 if (LoopIT != IT)
2986 {
2987 // Check loops that are fully inside the current loop.
2988 G4_BB* H = *LoopIT;
2989 for (G4_BB* T : H->Preds)
2990 {
2991 if (!isBackwardBranch(T, H)) {
2992 continue;
2993 }
2994 assert(H->getId() <= T->getId() && "Branch incorrectly set to be backward!");
2995 BB_LIST_ITER TIter = std::find(BBs.begin(), BBs.end(), T);
2996 updateLastJoinForBwdBrInBB(TIter);
2997 }
2998 }
2999 updateLastJoinForFwdBrInBB(LoopIT, true);
3000 }
3001
3002 // After scan, if no branch out of loop, restore the original LastJoinBBId
3003 if (!isPriorToLastJoin(predBB))
3004 { // case a) above.
3005 LastJoinBBId = orig_LastJoinBBId;
3006 }
3007 }
3008
3009 // normal scan of BB
3010 if (isPriorToLastJoin(BB)) {
3011 BB->setDivergent(true);
3012 }
3013 // Temporary for debugging, toBeDelete!
3014 // if (pKernel->getIntKernelAttribute(Attributes::ATTR_Target) == VISA_CM)
3015 // {
3016 // assert((BB->isDivergent() && BB->isInSimdFlow() ||
3017 // (!BB->isDivergent() && !BB->isInSimdFlow())) && " isDivergent != isInSimdFlow!");
3018 // }
3019 updateLastJoinForFwdBrInBB(IT, false);
3020 }
3021 }
3022 return;
3023 }
3024
getUniqueReturnBlock()3025 G4_BB* FlowGraph::getUniqueReturnBlock()
3026 {
3027 // Return block that has a return instruction
3028 // Return NULL if multiple return instructions found
3029 G4_BB* uniqueReturnBlock = NULL;
3030
3031 for (G4_BB* curBB : BBs) {
3032 if (!curBB->empty())
3033 {
3034 G4_INST* last_inst = curBB->back();
3035
3036 if (last_inst->opcode() == G4_pseudo_fret) {
3037 if (uniqueReturnBlock == NULL) {
3038 uniqueReturnBlock = curBB;
3039 }
3040 else {
3041 uniqueReturnBlock = NULL;
3042 break;
3043 }
3044 }
3045 }
3046 }
3047
3048 return uniqueReturnBlock;
3049 }
3050
3051 /*
3052 * Insert a join at the beginning of this basic block, immediately after the label
3053 * If a join is already present, nothing will be done
3054 */
insertJoinToBB(G4_BB * bb,G4_ExecSize execSize,G4_Label * jip)3055 void FlowGraph::insertJoinToBB(G4_BB* bb, G4_ExecSize execSize, G4_Label* jip)
3056 {
3057 MUST_BE_TRUE(bb->size() > 0, "empty block");
3058 INST_LIST_ITER iter = bb->begin();
3059
3060 // Skip label if any.
3061 if ((*iter)->isLabel())
3062 {
3063 iter++;
3064 }
3065
3066 if (iter == bb->end())
3067 {
3068 // insert join at the end
3069 G4_INST* jInst = builder->createInternalCFInst(NULL, G4_join, execSize, jip, NULL, InstOpt_NoOpt);
3070 bb->push_back(jInst);
3071 }
3072 else
3073 {
3074 G4_INST* secondInst = *iter;
3075
3076 if (secondInst->opcode() == G4_join)
3077 {
3078 if (execSize > secondInst->getExecSize())
3079 {
3080 secondInst->setExecSize(execSize);
3081 }
3082 }
3083 else
3084 {
3085 G4_INST* jInst = builder->createInternalCFInst(NULL, G4_join, execSize, jip, NULL, InstOpt_NoOpt);
3086 bb->insertBefore(iter, jInst);
3087 }
3088 }
3089 }
3090
3091 typedef std::pair<G4_BB*, G4_ExecSize> BlockSizePair;
3092
addBBToActiveJoinList(std::list<BlockSizePair> & activeJoinBlocks,G4_BB * bb,G4_ExecSize execSize)3093 static void addBBToActiveJoinList(std::list<BlockSizePair>& activeJoinBlocks, G4_BB* bb, G4_ExecSize execSize)
3094 {
3095 // add goto target to list of active blocks that need a join
3096 std::list<BlockSizePair>::iterator listIter;
3097 for (listIter = activeJoinBlocks.begin(); listIter != activeJoinBlocks.end(); ++listIter)
3098 {
3099 G4_BB* aBB = (*listIter).first;
3100 if (aBB->getId() == bb->getId())
3101 {
3102 // block already in list, update exec size if necessary
3103 if (execSize > (*listIter).second)
3104 {
3105 (*listIter).second = execSize;
3106 }
3107 break;
3108 }
3109 else if (aBB->getId() > bb->getId())
3110 {
3111 activeJoinBlocks.insert(listIter, BlockSizePair(bb, execSize));
3112 break;
3113 }
3114 }
3115
3116 if (listIter == activeJoinBlocks.end())
3117 {
3118 activeJoinBlocks.push_back(BlockSizePair(bb, execSize));
3119 }
3120 }
3121
setPhysicalPredSucc()3122 void FlowGraph::setPhysicalPredSucc()
3123 {
3124 BB_LIST_CITER it = BBs.cbegin();
3125 BB_LIST_CITER cend = BBs.cend();
3126 if (it != cend)
3127 {
3128 // first, set up head BB
3129 G4_BB* pred = *it;
3130 pred->setPhysicalPred(NULL);
3131
3132 for (++it; it != cend; ++it)
3133 {
3134 G4_BB* bb = *it;
3135 bb->setPhysicalPred(pred);
3136 pred->setPhysicalSucc(bb);
3137 pred = bb;
3138 }
3139
3140 // last, set up the last BB
3141 pred->setPhysicalSucc(NULL);
3142 }
3143 }
3144
insertEndif(G4_BB * bb,G4_ExecSize execSize,bool createLabel)3145 G4_Label* FlowGraph::insertEndif(G4_BB* bb, G4_ExecSize execSize, bool createLabel)
3146 {
3147 // endif is placed immediately after the label
3148 G4_INST* endifInst = builder->createInternalCFInst(NULL, G4_endif, execSize, NULL, NULL, InstOpt_NoOpt);
3149 INST_LIST_ITER iter = bb->begin();
3150 MUST_BE_TRUE(iter != bb->end(), "empty BB");
3151 iter++;
3152 bb->insertBefore(iter, endifInst);
3153
3154 // this block may be a target of multiple ifs, in which case we will need to insert
3155 // one endif for each if. The innermost endif will use the BB label, while for the other
3156 // endifs a new label will be created for each of them.
3157 if (createLabel)
3158 {
3159 std::string name = "_auto" + std::to_string(autoLabelId++);
3160 G4_Label* label = builder->createLabel(name, LABEL_BLOCK);
3161 endifWithLabels.emplace(endifInst, label);
3162 return label;
3163 }
3164 else
3165 {
3166 return bb->getLabel();
3167 }
3168 }
3169
3170 // This function set the JIP of the endif to the target instruction (either endif or while)
setJIPForEndif(G4_INST * endif,G4_INST * target,G4_BB * targetBB)3171 void FlowGraph::setJIPForEndif(G4_INST* endif, G4_INST* target, G4_BB* targetBB)
3172 {
3173 MUST_BE_TRUE(endif->opcode() == G4_endif, "must be an endif instruction");
3174 G4_Label* label = getLabelForEndif(target);
3175 if (label)
3176 {
3177 // see if there's another label before the inst that we can reuse
3178 // FIXME: we really should associate labels with inst instead of having special label inst,
3179 // so we can avoid ugly code like this
3180 G4_INST* prevInst = NULL;
3181 if (target->opcode() == G4_endif)
3182 {
3183 for (G4_INST* inst : *targetBB)
3184 {
3185 if (inst == target)
3186 {
3187 if (prevInst != NULL && prevInst->isLabel())
3188 {
3189 label = prevInst->getLabel();
3190 }
3191 break;
3192 }
3193 prevInst = inst;
3194 }
3195 }
3196 else
3197 {
3198 MUST_BE_TRUE(target->opcode() == G4_while, "must be a while instruction");
3199 INST_LIST_RITER it = ++(targetBB->rbegin());
3200 if (it != targetBB->rend())
3201 {
3202 G4_INST* inst = *it;
3203 if (inst->isLabel())
3204 {
3205 label = inst->getLabel();
3206 }
3207 }
3208 }
3209
3210 if (label == NULL)
3211 {
3212 std::string name = "_auto" + std::to_string(autoLabelId++);
3213 label = builder->createLabel(name, LABEL_BLOCK);
3214 endifWithLabels.emplace(target, label);
3215 }
3216 }
3217 endif->asCFInst()->setJip(label);
3218
3219 #ifdef DEBUG_VERBOSE_ON
3220 cout << "set JIP for: \n";
3221 endif->emit(cout);
3222 cout << "\n";
3223 #endif
3224 }
3225
convertGotoToJmpi(G4_INST * gotoInst)3226 void FlowGraph::convertGotoToJmpi(G4_INST *gotoInst)
3227 {
3228 gotoInst->setOpcode(G4_jmpi);
3229 gotoInst->setSrc(gotoInst->asCFInst()->getUip(), 0);
3230 gotoInst->asCFInst()->setJip(NULL);
3231 gotoInst->asCFInst()->setUip(NULL);
3232 gotoInst->setExecSize(g4::SIMD1);
3233 gotoInst->setOptions(InstOpt_NoOpt | InstOpt_WriteEnable);
3234 }
3235
3236 /*
3237 * This function generates UCF (unstructurized control flow, that is, goto/join/jmpi)
3238 * This function inserts the join for each goto as well as compute the JIP for the
3239 * goto and join instructions. It additionally converts uniform (simd1 or non-predicated)
3240 * gotos into scalar jumps. If it's a forward goto, it may be converted into a jmpi only
3241 * if it is uniform and it does not overlap with another divergent goto. All uniform
3242 * backward gotos may be converted into scalar jumps.
3243 *
3244 * - This function does *not* alter the CFG.
3245 * - This function does *not* consider SCF (structured control flow), as a well-formed
3246 * vISA program should not have overlapped goto and structured CF instructions.
3247 *
3248 */
processGoto(bool HasSIMDCF)3249 void FlowGraph::processGoto(bool HasSIMDCF)
3250 {
3251 // For all BBs in [StartBB, EndBB) (including StartBB, but not including EndBB) that
3252 // jump after EndBB (forward jump), return the earliest (closest to EndBB). If no such
3253 // BB, return nullptr.
3254 // Assumption: 1. BB's id is in the increasing order; and 2) physical succ has been set.
3255 auto getEarliestJmpOutBB = [](
3256 const std::list<BlockSizePair> &activeJoins,
3257 const G4_BB* StartBB,
3258 const G4_BB* EndBB) -> G4_BB*
3259 {
3260 // Existing active joins must be considered. For example,
3261 // goto L0 (non-uniform)
3262 // ...
3263 // Loop:
3264 // ...
3265 // goto L1 (uniform goto)
3266 // ...
3267 // L0:
3268 // ...
3269 // goto Loop
3270 // ...
3271 // L1:
3272 // Since 'goto L0' is non-uniform, 'goto L1' must be tranlated into goto and a join must
3273 // be inserted at L1. If goto L0 is not considered (no active join at L0) and 'goto L1' can
3274 // be converted into jmpi (thus no join will be inserted at L1, which is wrong.
3275 std::list<BlockSizePair> tmpActiveJoins(activeJoins);
3276
3277 const G4_BB* bb = StartBB;
3278 while (bb != EndBB)
3279 {
3280 const G4_BB* currBB = bb;
3281 if (!tmpActiveJoins.empty())
3282 {
3283 // adjust active join lists if a join is reached.
3284 if (bb == tmpActiveJoins.front().first)
3285 {
3286 tmpActiveJoins.pop_front();
3287 }
3288 }
3289
3290 bb = bb->getPhysicalSucc();
3291 if (currBB->empty())
3292 {
3293 continue;
3294 }
3295
3296 const G4_INST* lastInst = currBB->back();
3297 if (lastInst->opcode() != G4_goto || lastInst->asCFInst()->isBackward())
3298 {
3299 continue;
3300 }
3301 G4_BB* targetBB = currBB->Succs.back();
3302 assert(lastInst->asCFInst()->getUip() == targetBB->getLabel());
3303 if (lastInst->asCFInst()->isUniform() &&
3304 (tmpActiveJoins.empty() || tmpActiveJoins.front().first->getId() >= targetBB->getId()))
3305 {
3306 // Non-crossing uniform goto will be jmpi, and thus no join needed.
3307 //
3308 // For the crossing gotos, a uniform goto cannot be translated to jmpi.
3309 // For example:
3310 // goto L0: // non-uniform goto (need a join)
3311 // ....
3312 // uniform-goto L1:
3313 // ...
3314 // L0:
3315 // ...
3316 // L1:
3317 // Here, uniform-goto L1 is uniform. Since it crosses the non-uniform joins at L0,
3318 // it must be translated to goto, not jmpi. Thus, join is needed at L1.
3319 continue;
3320 }
3321 // Here, use SIMD1 as execsize does not matter here.
3322 addBBToActiveJoinList(tmpActiveJoins, targetBB, g4::SIMD1);
3323 }
3324
3325 // Need to remove join at EndBB if present (looking for joins after EndBB)
3326 if (!tmpActiveJoins.empty() && tmpActiveJoins.front().first == EndBB)
3327 {
3328 tmpActiveJoins.pop_front();
3329 }
3330
3331 if (tmpActiveJoins.empty())
3332 {
3333 // no new join
3334 return nullptr;
3335 }
3336 return tmpActiveJoins.front().first;
3337 };
3338
3339 // list of active blocks where a join needs to be inserted, sorted in lexical order
3340 std::list<BlockSizePair> activeJoinBlocks;
3341 bool doScalarJmp = !builder->noScalarJmp();
3342
3343 for (G4_BB* bb : BBs)
3344 {
3345 if (bb->size() == 0)
3346 {
3347 continue;
3348 }
3349
3350 if (activeJoinBlocks.size() > 0)
3351 {
3352 if (bb == activeJoinBlocks.front().first)
3353 {
3354 // This block is the target of one or more forward goto,
3355 // or the fall-thru of a backward goto, needs to insert a join
3356 G4_ExecSize execSize = activeJoinBlocks.front().second;
3357 G4_Label* joinJIP = NULL;
3358
3359 activeJoinBlocks.pop_front();
3360 if (activeJoinBlocks.size() > 0)
3361 {
3362 //set join JIP to the next active join
3363 G4_BB* joinBlock = activeJoinBlocks.front().first;
3364 joinJIP = joinBlock->getLabel();
3365 }
3366
3367 insertJoinToBB(bb, execSize, joinJIP);
3368 }
3369 }
3370
3371 // check to see if this block is the target of one (or more) backward goto
3372 // If so, we process the backward goto and push its fall-thru block to the
3373 // active join list
3374 for (std::list<G4_BB*>::iterator iter = bb->Preds.begin(), iterEnd = bb->Preds.end(); iter != iterEnd; ++iter)
3375 {
3376 G4_BB* predBB = *iter;
3377 G4_INST* lastInst = predBB->back();
3378 if (lastInst->opcode() == G4_goto && lastInst->asCFInst()->isBackward() &&
3379 lastInst->asCFInst()->getUip() == bb->getLabel())
3380 {
3381 // backward goto
3382 G4_ExecSize eSize = lastInst->getExecSize() > g4::SIMD1 ?
3383 lastInst->getExecSize() : pKernel->getSimdSize();
3384 bool isUniform = lastInst->getExecSize() == g4::SIMD1 || lastInst->getPredicate() == NULL;
3385 if (isUniform && doScalarJmp)
3386 {
3387 // can always convert a uniform backward goto into a jmp
3388 convertGotoToJmpi(lastInst);
3389
3390 // we still have to add a join at the BB immediately after the back edge,
3391 // since there may be subsequent loop breaks that are waiting there.
3392 // example:
3393 // L1:
3394 // (P1) goto exit
3395 // (P2) goto L2
3396 // goto L1
3397 // L2: <-- earliest after-loop goto target
3398 // ...
3399 // exit:
3400 //
3401 // In this case, L2 shall be the 1st after-loop join (not necessarily the one immediately
3402 // after the backward BB). The goto exit's JIP should be set to L2.
3403 //
3404 // Here, we will add the 1st after-loop join so that any out-loop JIP (either goto or
3405 // join) within the loop body will has its JIP set to this join.
3406 if (G4_BB* afterLoopJoinBB = getEarliestJmpOutBB(activeJoinBlocks, bb, predBB))
3407 {
3408 addBBToActiveJoinList(activeJoinBlocks, afterLoopJoinBB, eSize);
3409 }
3410 }
3411 else
3412 {
3413 if (lastInst->getExecSize() == g4::SIMD1)
3414 { // For simd1 goto, convert it to a goto with the right execSize.
3415 lastInst->setExecSize(eSize);
3416 // This should have noMask removed if any
3417 lastInst->setOptions(InstOpt_M0);
3418 }
3419 // add join to the fall-thru BB
3420 if (G4_BB* fallThruBB = predBB->getPhysicalSucc())
3421 {
3422 addBBToActiveJoinList(activeJoinBlocks, fallThruBB, eSize);
3423 lastInst->asCFInst()->setJip(fallThruBB->getLabel());
3424 }
3425 }
3426 }
3427 }
3428
3429 G4_INST* lastInst = bb->back();
3430 if (lastInst->opcode() == G4_goto && !lastInst->asCFInst()->isBackward())
3431 {
3432 // forward goto
3433 // the last Succ BB is our goto target
3434 G4_BB* gotoTargetBB = bb->Succs.back();
3435 bool isUniform = lastInst->getExecSize() == g4::SIMD1 || lastInst->getPredicate() == NULL;
3436
3437 if (isUniform && doScalarJmp &&
3438 (activeJoinBlocks.size() == 0 || activeJoinBlocks.front().first->getId() > gotoTargetBB->getId()))
3439 {
3440 // can convert goto into a scalar jump to UIP, if the jmp will not make us skip any joins
3441 // CFG itself does not need to be updated
3442 convertGotoToJmpi(lastInst);
3443 }
3444 else
3445 {
3446 // set goto JIP to the first active block
3447 G4_ExecSize eSize = lastInst->getExecSize() > g4::SIMD1 ?
3448 lastInst->getExecSize() : pKernel->getSimdSize();
3449 addBBToActiveJoinList(activeJoinBlocks, gotoTargetBB, eSize);
3450 G4_BB* joinBlock = activeJoinBlocks.front().first;
3451 if (lastInst->getExecSize() == g4::SIMD1)
3452 { // For simd1 goto, convert it to a goto with the right execSize.
3453 lastInst->setExecSize(eSize);
3454 lastInst->setOptions(InstOpt_M0);
3455 }
3456 lastInst->asCFInst()->setJip(joinBlock->getLabel());
3457
3458 if (!builder->gotoJumpOnTrue())
3459 {
3460 // For BDW/SKL goto, the false channels are the ones that actually will take the jump,
3461 // and we thus have to flip the predicate
3462 G4_Predicate* pred = lastInst->getPredicate();
3463 if (pred != NULL)
3464 {
3465 pred->setState(pred->getState() == PredState_Plus ? PredState_Minus : PredState_Plus);
3466 }
3467 else
3468 {
3469 // if there is no predicate, generate a predicate with all 0s.
3470 // if predicate is SIMD32, we have to use a :ud dst type for the move
3471 G4_ExecSize execSize = lastInst->getExecSize() > g4::SIMD16 ? g4::SIMD2 : g4::SIMD1;
3472 G4_Declare* tmpFlagDcl = builder->createTempFlag(execSize);
3473 G4_DstRegRegion* newPredDef = builder->createDst(
3474 tmpFlagDcl->getRegVar(), 0, 0, 1, execSize == g4::SIMD2 ? Type_UD : Type_UW);
3475 G4_INST* predInst = builder->createMov(g4::SIMD1, newPredDef, builder->createImm(0, Type_UW),
3476 InstOpt_WriteEnable, false);
3477 INST_LIST_ITER iter = bb->end();
3478 iter--;
3479 bb->insertBefore(iter, predInst);
3480
3481 pred = builder->createPredicate(
3482 PredState_Plus,
3483 tmpFlagDcl->getRegVar(),
3484 0);
3485 lastInst->setPredicate(pred);
3486 }
3487 }
3488 }
3489 }
3490 }
3491 }
3492
3493 // TGL NoMask WA : to identify which BB needs WA
3494 //
3495 // nestedDivergentBBs[BB] = 2;
3496 // BB is in a nested divergent branch
3497 // nestedDivergentBBs[BB] = 1;
3498 // BB is not in a nested divergent branch, but in a divergent branch.
findNestedDivergentBBs(std::unordered_map<G4_BB *,int> & nestedDivergentBBs)3499 void FlowGraph::findNestedDivergentBBs(std::unordered_map<G4_BB*, int>& nestedDivergentBBs)
3500 {
3501 // Control-Flow state
3502 // Used for keeping the current state of control flow during
3503 // traversing BBs in the layout order. Assuming the current
3504 // BB is 'currBB', this class shows whether currBB is in any
3505 // divergent branch/nested divergent branch.
3506 class CFState
3507 {
3508 // If currBB is the head of loop or currBB has a cf inst such as goto/break/if,
3509 // the code will diverge from currBB to the joinBB. The following cases shows where
3510 // joinBB is:
3511 // case 1: cf inst = goto
3512 // <currBB> [(p)] goto L
3513 // ...
3514 // <joinBB> L:
3515 // case 2: cf inst = if
3516 // <currBB> if
3517 // ...
3518 // <joinBB> endif
3519 //
3520 // Note that else does not increase nor decrease divergence level.
3521 // case 3: cf inst = break
3522 // <currBB> break
3523 // ...
3524 // [(p)] while
3525 // <joinBB>
3526 // case 4:
3527 // <currBB> L:
3528 // ....
3529 // [(p)] while/goto L
3530 // <joinBB>
3531 // Case 1/2/3 will increase divergence level, while case 4 does not.
3532 //
3533 // The following are used for tracking nested divergence
3534 // LastDivergentBBId: Id(LastDivergentBB).
3535 // LastDivergentBB is the fartherest joinBB of any goto/break/if/while
3536 // so far, as described above. That is, [currBB, LastDivergentBB) is
3537 // a divergent code path. LastDivergentBB is not included in this
3538 // divergent path.
3539 // LastNestedBBId: Id(LastNestedBB).
3540 // LastNestedBB is the current fartherest join BB that is inside
3541 // [currBB, LastDivergentBB). We have LastNestedBB if a goto/break/if (no
3542 // loop) is encountered inside an already divergent code path. It is also
3543 // called nested divergent (or just nested).
3544 //
3545 // The following is always true:
3546 // LastDivergentBBId >= LastNestedBBId
3547 int LastDivergentBBId;
3548 int LastNestedBBId;
3549
3550 void setLastDivergentBBId(G4_BB* toBB)
3551 {
3552 LastDivergentBBId = std::max(LastDivergentBBId, (int)toBB->getId());
3553 }
3554
3555 void setLastNestedBBId(G4_BB* toBB)
3556 {
3557 LastNestedBBId = std::max(LastNestedBBId, (int)toBB->getId());
3558 setLastDivergentBBId(toBB);
3559 }
3560
3561 // Increase divergence level
3562 void incDivergenceLevel(G4_BB* toBB)
3563 {
3564 if (isInDivergentBranch())
3565 {
3566 setLastNestedBBId(toBB);
3567 }
3568 else
3569 {
3570 setLastDivergentBBId(toBB);
3571 }
3572 }
3573
3574 public:
3575 CFState() : LastNestedBBId(-1), LastDivergentBBId(-1) {}
3576
3577 bool isInDivergentBranch() const { return (LastDivergentBBId > 0); }
3578 bool isInNestedDivergentBranch() const { return LastNestedBBId > 0; }
3579
3580 // pushLoop: for case 4
3581 // pushJoin: for case 1/2/3
3582 void pushLoop(G4_BB* joinBB) { setLastDivergentBBId(joinBB); }
3583 void pushJoin(G4_BB* joinBB) { incDivergenceLevel(joinBB); }
3584 void popJoinIfMatching(G4_BB* currBB)
3585 {
3586 if ((int)currBB->getId() == LastNestedBBId)
3587 {
3588 LastNestedBBId = -1;
3589 }
3590 if ((int)currBB->getId() == LastDivergentBBId)
3591 {
3592 LastDivergentBBId = -1;
3593 }
3594 assert(!(LastDivergentBBId == -1 && LastNestedBBId >= 0) &&
3595 "ICE: something wrong in setting divergent BB Id!");
3596 }
3597 };
3598
3599 // For loop with backedge Tail->Head, if Tail is divergent, its divergence should be
3600 // propagated to the entire loop as Tail jumps to head, which could go all BBs in the loop.
3601 //
3602 // In another word, the whole loop's divergence level is the same as Tail's. Once the
3603 // entire loop has been handled and Tail's divergence is known, invoking this lambda func
3604 // to carry out propagation.
3605 //
3606 // An example to show WA is needed (nested divergence for Tail):
3607 // Head: // initial fuseMask = 11; 2nd iter: fuseMask = 11 (should be 10)
3608 // if (...) goto Tail; // fuseMask = 11
3609 // // After if, fusedMask = 01 (bigEU is Off)
3610 // ...
3611 // goto out // fuseMask = 01 (BigEU off, SmallEU on)
3612 // // after goto, fuseMask = 00, but HW remains 01
3613 // Tail:
3614 // goto Head // fuseMask should 10, but HW remains 11, and jump to Head at 2nd iter
3615 // out:
3616 auto propLoopDivergence = [&](G4_BB* LoopTail)
3617 {
3618 // LoopTail must be divergent.
3619 std::vector<G4_BB*> workset;
3620 workset.push_back(LoopTail);
3621 while (!workset.empty())
3622 {
3623 std::vector<G4_BB*> newWorkset;
3624 for (auto iter : workset)
3625 {
3626 G4_BB* Tail = iter;
3627 G4_InstCF* cfInst = Tail->back()->asCFInst();
3628
3629 assert(nestedDivergentBBs.count(Tail) > 0 &&
3630 "Only divergent Tail shall invoke this func!");
3631
3632 // Find loop head
3633 G4_BB* Head = nullptr;
3634 for (G4_BB* succBB : Tail->Succs)
3635 {
3636 if ((cfInst->opcode() == G4_goto && cfInst->getUip() == succBB->getLabel()) ||
3637 (cfInst->opcode() == G4_while && cfInst->getJip() == succBB->getLabel()) ||
3638 (cfInst->opcode() == G4_jmpi && cfInst->getSrc(0) == succBB->getLabel()))
3639 {
3640 Head = succBB;
3641 break;
3642 }
3643 }
3644 assert(Head != nullptr);
3645
3646 // If Head's divergence level is already higher than Tail, no propagation needed
3647 // as Head's divergence has been propagated already.
3648 if (nestedDivergentBBs.count(Head) > 0 &&
3649 nestedDivergentBBs[Head] >= nestedDivergentBBs[Tail])
3650 {
3651 continue;
3652 }
3653
3654 // Follow physical succs to visit all BBs in this loop
3655 for (G4_BB* tBB = Head; tBB != nullptr && tBB != Tail; tBB = tBB->getPhysicalSucc())
3656 {
3657 auto miter = nestedDivergentBBs.find(tBB);
3658 if (miter == nestedDivergentBBs.end() || miter->second < nestedDivergentBBs[Tail])
3659 {
3660 // Propagate Tail's divergence. If the new BB is a tail (another loop),
3661 // add it to workset for the further propagation.
3662 nestedDivergentBBs[tBB] = nestedDivergentBBs[Tail];
3663 auto lastOp = tBB->getLastOpcode();
3664 if (lastOp == G4_while ||
3665 ((lastOp == G4_goto || lastOp == G4_jmpi) &&
3666 tBB->back()->asCFInst()->isBackward()))
3667 {
3668 newWorkset.push_back(tBB);
3669 }
3670 }
3671 }
3672 }
3673
3674 workset = newWorkset;
3675 }
3676 };
3677
3678 if (BBs.empty())
3679 {
3680 // Sanity check
3681 return;
3682 }
3683
3684 // Analyze subroutines in topological order. As there is no recursion
3685 // and no indirect call, a subroutine will be analyzed only if all
3686 // its callers have been analyzed.
3687 //
3688 // If no subroutine, sortedFuncTable is empty. Here keep the entry and
3689 // subroutines in a vector first (it works with and without subroutines),
3690 // then scan subroutines in topological order.
3691 struct StartEndIter {
3692 BB_LIST_ITER StartI;
3693 BB_LIST_ITER EndI;
3694
3695 // When a subroutine is entered, the fusedMask must not be 00, but it
3696 // could be 01. This field shows it could be 01 if set to true.
3697 bool isInDivergentBranch;
3698 };
3699 int numFuncs = (int)sortedFuncTable.size();
3700 std::vector<StartEndIter> allFuncs;
3701 std::unordered_map<FuncInfo*, uint32_t> funcInfoIndex;
3702 if (numFuncs == 0)
3703 {
3704 numFuncs = 1;
3705 allFuncs.resize(1);
3706 allFuncs[0].StartI = BBs.begin();
3707 allFuncs[0].EndI = BBs.end();
3708 allFuncs[0].isInDivergentBranch = false;
3709 }
3710 else
3711 {
3712 allFuncs.resize(numFuncs);
3713 for (int i = numFuncs; i > 0; --i)
3714 {
3715 uint32_t ix = (uint32_t)(i - 1);
3716 FuncInfo* pFInfo = sortedFuncTable[ix];
3717 G4_BB* StartBB = pFInfo->getInitBB();
3718 G4_BB* EndBB = pFInfo->getExitBB();
3719 uint32_t ui = (uint32_t)(numFuncs - i);
3720 allFuncs[ui].StartI = std::find(BBs.begin(), BBs.end(), StartBB);
3721 auto nextI = std::find(BBs.begin(), BBs.end(), EndBB);
3722 assert(nextI != BBs.end() && "ICE: subroutine's end BB not found!");
3723 allFuncs[ui].EndI = (++nextI);
3724 allFuncs[ui].isInDivergentBranch = false;
3725
3726 funcInfoIndex[pFInfo] = ui;
3727 }
3728 }
3729
3730 for (int i = 0; i < numFuncs; ++i)
3731 {
3732 // each function: [IT, IE)
3733 BB_LIST_ITER& IT = allFuncs[i].StartI;
3734 BB_LIST_ITER& IE = allFuncs[i].EndI;
3735 if (IT == IE)
3736 {
3737 // Sanity check
3738 continue;
3739 }
3740
3741 CFState cfs;
3742 if (allFuncs[i].isInDivergentBranch)
3743 {
3744 // subroutine's divergent on entry. Mark [StartBB, EndBB) as divergent
3745 auto prevI = IE;
3746 --prevI;
3747 G4_BB* EndBB = *prevI;
3748 cfs.pushJoin(EndBB);
3749 }
3750
3751 for (; IT != IE; ++IT)
3752 {
3753 G4_BB* BB = *IT;
3754
3755 // This handles cases in which BB has endif/while/join as well as others
3756 // so we don't need to explicitly check whether BB has endif/while/join, etc.
3757 cfs.popJoinIfMatching(BB);
3758
3759 G4_INST* firstInst = BB->getFirstInst();
3760 if (firstInst == nullptr)
3761 {
3762 // empty BB or BB with only label inst
3763 continue;
3764 }
3765
3766 // Handle loop
3767 for (G4_BB* predBB : BB->Preds)
3768 {
3769 G4_INST* lastInst = predBB->back();
3770 if ((lastInst->opcode() == G4_while &&
3771 lastInst->asCFInst()->getJip() == BB->getLabel()) ||
3772 (lastInst->opcode() == G4_goto && lastInst->asCFInst()->isBackward() &&
3773 lastInst->asCFInst()->getUip() == BB->getLabel()))
3774 {
3775 // joinBB is the BB right after goto/while
3776 BB_LIST_ITER predIter = std::find(BBs.begin(), BBs.end(), predBB);
3777 ++predIter;
3778 // if predBB is the last BB then joinBB is not available
3779 if (predIter == BBs.end())
3780 continue;
3781 G4_BB* joinBB = *predIter;
3782 cfs.pushLoop(joinBB);
3783 }
3784 }
3785
3786 if (cfs.isInNestedDivergentBranch())
3787 {
3788 nestedDivergentBBs[BB] = 2;
3789 }
3790 else if (cfs.isInDivergentBranch())
3791 {
3792 nestedDivergentBBs[BB] = 1;
3793 }
3794
3795 G4_INST* lastInst = BB->back();
3796
3797 // Need to check whether to propagate WA marking to entire loop!
3798 //
3799 // Do it for CM now, need to apply to all!
3800 // if (nestedDivergentBBs.count(BB) > 0 && nestedDivergentBBs[BB] >= 2 &&
3801 // getKernel()->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
3802 if (nestedDivergentBBs.count(BB) > 0 && nestedDivergentBBs[BB] >= 2)
3803 {
3804 if (lastInst->opcode() == G4_while ||
3805 ((lastInst->opcode() == G4_goto || lastInst->opcode() == G4_jmpi) &&
3806 lastInst->asCFInst()->isBackward()))
3807 {
3808 propLoopDivergence(BB);
3809 }
3810 }
3811
3812 if ((lastInst->opcode() == G4_goto && !lastInst->asCFInst()->isBackward()) ||
3813 lastInst->opcode() == G4_break)
3814 {
3815 // forward goto/break : the last Succ BB is our target BB
3816 // For break, it should be the BB right after while inst.
3817 G4_BB* joinBB = BB->Succs.back();
3818 cfs.pushJoin(joinBB);
3819 }
3820 else if (lastInst->opcode() == G4_if)
3821 {
3822 G4_Label* labelInst = lastInst->asCFInst()->getUip();
3823 G4_BB* joinBB = findLabelBB(IT, IE, labelInst->getLabel());
3824 assert(joinBB && "ICE(vISA) : missing endif label!");
3825 cfs.pushJoin(joinBB);
3826 }
3827 else if (lastInst->opcode() == G4_call)
3828 {
3829 // If this function is already in divergent branch, the callee
3830 // must be in a divergent branch!.
3831 if (allFuncs[i].isInDivergentBranch ||
3832 cfs.isInDivergentBranch() ||
3833 lastInst->getPredicate() != nullptr)
3834 {
3835 FuncInfo* calleeFunc = BB->getCalleeInfo();
3836 if (funcInfoIndex.count(calleeFunc))
3837 {
3838 int ix = funcInfoIndex[calleeFunc];
3839 allFuncs[ix].isInDivergentBranch = true;
3840 }
3841 }
3842 }
3843 }
3844
3845 // Once all BBs are precessed, cfs should be clear
3846 assert((!cfs.isInDivergentBranch()) &&
3847 "ICE(vISA): there is an error in divergence tracking!");
3848 }
3849
3850 for (auto MI : nestedDivergentBBs)
3851 {
3852 G4_BB* bb = MI.first;
3853 if (MI.second >= 2)
3854 {
3855 bb->setBBType(G4_BB_NM_WA_TYPE);
3856 }
3857 }
3858 return;
3859 }
3860
3861 //
3862 // Add declares for the stack and frame pointers.
3863 //
addFrameSetupDeclares(IR_Builder & builder,PhyRegPool & regPool)3864 void FlowGraph::addFrameSetupDeclares(IR_Builder& builder, PhyRegPool& regPool)
3865 {
3866 if (framePtrDcl == NULL)
3867 {
3868 framePtrDcl = builder.getBEFP();
3869 }
3870 if (stackPtrDcl == NULL)
3871 {
3872 stackPtrDcl = builder.getBESP();
3873 }
3874 if (scratchRegDcl == NULL)
3875 {
3876 scratchRegDcl = builder.createDeclareNoLookup("SR", G4_GRF, numEltPerGRF<Type_UD>(), 1, Type_UD);
3877 scratchRegDcl->getRegVar()->setPhyReg(regPool.getGreg(builder.kernel.getSpillHeaderGRF()), 0);
3878 }
3879 }
3880
3881 //
3882 // Insert pseudo dcls to represent the caller-save and callee-save registers.
3883 // This is only required when there is more than one graph cut due to the presence
3884 // of function calls using a stack calling convention.
3885 //
addSaveRestorePseudoDeclares(IR_Builder & builder)3886 void FlowGraph::addSaveRestorePseudoDeclares(IR_Builder& builder)
3887 {
3888 //
3889 // VCE_SAVE (r60.0-r125.0) [r125-127 is reserved]
3890 //
3891 if (pseudoVCEDcl == NULL)
3892 {
3893 unsigned numRowsVCE = getKernel()->getNumCalleeSaveRegs();
3894 pseudoVCEDcl = builder.createDeclareNoLookup("VCE_SAVE", G4_GRF, numEltPerGRF<Type_UD>(), static_cast<unsigned short>(numRowsVCE), Type_UD);
3895 }
3896 else
3897 {
3898 pseudoVCEDcl->getRegVar()->setPhyReg(NULL, 0);
3899 }
3900
3901 //
3902 // Insert caller save decls for VCA/A0/Flag (one per call site)
3903 // VCA_SAVE (r1.0-r60.0) [r0 is reserved] - one required per stack call,
3904 // but will be reused across cuts.
3905 //
3906 INST_LIST callSites;
3907 for (auto bb : builder.kernel.fg)
3908 {
3909 if (bb->isEndWithFCall())
3910 {
3911 callSites.push_back(bb->back());
3912 }
3913 }
3914
3915 unsigned i = 0;
3916 for (auto callSite : callSites)
3917 {
3918 const char* nameBase = "VCA_SAVE"; // sizeof(nameBase) = 9, including ending 0
3919 const int maxIdLen = 3;
3920 const char* name = builder.getNameString(mem, sizeof(nameBase) + maxIdLen, "%s_%d", nameBase, i);
3921 G4_Declare* VCA = builder.createDeclareNoLookup(name, G4_GRF, numEltPerGRF<Type_UD>(), builder.kernel.getCallerSaveLastGRF(), Type_UD);
3922 name = builder.getNameString(mem, 50, "SA0_%d", i);
3923 G4_Declare* saveA0 = builder.createDeclareNoLookup(name, G4_ADDRESS, (uint16_t)getNumAddrRegisters(), 1, Type_UW);
3924 name = builder.getNameString(mem, 64, "SFLAG_%d", i);
3925 G4_Declare* saveFLAG = builder.createDeclareNoLookup(name, G4_FLAG, (uint16_t) builder.getNumFlagRegisters(), 1, Type_UW);
3926 fcallToPseudoDclMap[callSite->asCFInst()] = { VCA, saveA0, saveFLAG };
3927 i++;
3928 }
3929 }
3930
3931 //
3932 // Since we don't do SIMD augmentation in RA for CM, we have to add an edge
3933 // between the then and else block of an if branch to ensure that liveness is
3934 // computed correctly, if conservatively. This also applies to any goto BB and
3935 // its JIP join block
3936 //
addSIMDEdges()3937 void FlowGraph::addSIMDEdges()
3938 {
3939 std::map<G4_Label*, G4_BB*> joinBBMap;
3940 for (auto bb : BBs)
3941 {
3942 if (bb->size() > 0 && bb->back()->opcode() == G4_else)
3943 {
3944 addUniquePredSuccEdges(bb, bb->getPhysicalSucc());
3945 }
3946 else
3947 {
3948 // check goto case
3949 auto instIter = std::find_if(bb->begin(), bb->end(),
3950 [](G4_INST* inst) { return !inst->isLabel(); });
3951 if (instIter != bb->end() && (*instIter)->opcode() == G4_join)
3952 {
3953 G4_INST* firstInst = bb->front();
3954 if (firstInst->isLabel())
3955 {
3956 joinBBMap[firstInst->getLabel()] = bb;
3957 }
3958 }
3959 }
3960 }
3961
3962 if (!joinBBMap.empty())
3963 {
3964 for (auto bb : BBs)
3965 {
3966 if (bb->isEndWithGoto())
3967 {
3968 G4_INST* gotoInst = bb->back();
3969 auto iter = joinBBMap.find(gotoInst->asCFInst()->getJip()->asLabel());
3970 if (iter != joinBBMap.end())
3971 {
3972 addUniquePredSuccEdges(bb, iter->second);
3973 }
3974 }
3975 }
3976 }
3977 }
3978
3979 //
3980 // Perform DFS traversal on the flow graph (do not enter subroutine, but mark subroutine blocks
3981 // so that they will be processed independently later)
3982 //
DFSTraverse(G4_BB * startBB,unsigned & preId,unsigned & postId,FuncInfo * fn)3983 void FlowGraph::DFSTraverse(G4_BB* startBB, unsigned &preId, unsigned &postId, FuncInfo* fn)
3984 {
3985 MUST_BE_TRUE(fn, "Invalid func info");
3986 std::stack<G4_BB*> traversalStack;
3987 traversalStack.push(startBB);
3988
3989 while (!traversalStack.empty())
3990 {
3991 G4_BB* bb = traversalStack.top();
3992 if (bb->getPreId() != UINT_MAX)
3993 {
3994 // Pre-processed already and continue to the next one.
3995 // Before doing so, set postId if not set before.
3996 traversalStack.pop();
3997 if (bb->getRPostId() == UINT_MAX)
3998 {
3999 // All bb's succ has been visited (PreId is set) at this time.
4000 // if any of its succ has not been finished (RPostId not set),
4001 // bb->succ forms a backedge.
4002 //
4003 // Note: originally, CALL and EXIT will not check back-edges, here
4004 // we skip checking for them as well. (INIT & RETURN should
4005 // be checked as well ?)
4006 if (!(bb->getBBType() & (G4_BB_CALL_TYPE | G4_BB_EXIT_TYPE)))
4007 {
4008 for (auto succBB : bb->Succs)
4009 {
4010 if (succBB->getRPostId() == UINT_MAX)
4011 {
4012 backEdges.push_back(Edge(bb, succBB));
4013 }
4014 }
4015 }
4016
4017 // Need to keep this after backedge checking so that self-backedge
4018 // (single-bb loop) will not be missed.
4019 bb->setRPostId(postId++);
4020 }
4021 continue;
4022 }
4023
4024 fn->addBB(bb);
4025 bb->setPreId(preId++);
4026
4027 if (bb->getBBType() & G4_BB_CALL_TYPE)
4028 {
4029 G4_BB* returnBB = bb->BBAfterCall();
4030 // If call is predicated, first item in Succs is physically consecutive BB and second (or last)
4031 // item is sub-routine entry BB.
4032 MUST_BE_TRUE(bb->Succs.back()->getBBType() & G4_BB_INIT_TYPE, ERROR_FLOWGRAPH);
4033 // bb->Succs size may be 2 if call is predicated.
4034 MUST_BE_TRUE(bb->Succs.size() == 1 || bb->Succs.size() == 2, ERROR_FLOWGRAPH);
4035
4036 {
4037 bool found = false;
4038 for (auto func : fn->getCallees())
4039 {
4040 if (func == bb->getCalleeInfo())
4041 found = true;
4042 }
4043 if (!found)
4044 {
4045 fn->addCallee(bb->getCalleeInfo());
4046 }
4047 }
4048
4049 if (returnBB->getPreId() == UINT_MAX)
4050 {
4051 traversalStack.push(returnBB);
4052 }
4053 else
4054 {
4055 MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
4056 }
4057 }
4058 else if (bb->getBBType() & G4_BB_EXIT_TYPE)
4059 {
4060 // Skip
4061 }
4062 else
4063 {
4064 // To be consistent with previous behavior, use reverse_iter.
4065 BB_LIST_RITER RIE = bb->Succs.rend();
4066 for (BB_LIST_RITER rit = bb->Succs.rbegin(); rit != RIE; ++rit)
4067 {
4068 G4_BB* succBB = *rit;
4069 if (succBB->getPreId() == UINT_MAX)
4070 {
4071 traversalStack.push(succBB);
4072 }
4073 }
4074 }
4075 // As the top of stack may be different than that at the
4076 // beginning of this iteration, cannot do pop here. Instead,
4077 // do pop and set RPostId at the beginning of each iteration.
4078 //
4079 // traversalStack.pop();
4080 // bb->setRPostId(postId++);
4081 }
4082 }
4083
markStale()4084 void vISA::FlowGraph::markStale()
4085 {
4086 // invoked whenever CFG structures changes.
4087 // structural changes include addition/removal of BB or edge.
4088 // mark analysis passes as stale so getters lazily re-run
4089 // analysis when queried.
4090 dom.setStale();
4091 immDom.setStale();
4092 pDom.setStale();
4093 loops.setStale();
4094
4095 // any other analysis that becomes stale when FlowGraph changes
4096 // should be marked as stale here.
4097 }
4098
markRPOTraversal()4099 void FlowGraph::markRPOTraversal()
4100 {
4101 MUST_BE_TRUE(numBBId == BBs.size(), ERROR_FLOWGRAPH);
4102
4103 unsigned postID = 0;
4104 backEdges.clear();
4105
4106 for (auto curBB : BBs)
4107 {
4108 curBB->setRPostId(postID++);
4109
4110 if (curBB->size() > 0)
4111 {
4112 if (curBB->getBBType() & G4_BB_CALL_TYPE)
4113 {
4114 // skip
4115 }
4116 else if (curBB->getBBType() & G4_BB_EXIT_TYPE)
4117 {
4118 // Skip
4119 }
4120 else
4121 {
4122 for (auto succBB : curBB->Succs)
4123 {
4124 if (curBB->getId() >= succBB->getId())
4125 {
4126 backEdges.push_back(Edge(curBB, succBB));
4127 }
4128 }
4129 }
4130 }
4131 }
4132 }
4133
4134 //
4135 // Find back-edges in the flow graph.
4136 //
findBackEdges()4137 void FlowGraph::findBackEdges()
4138 {
4139 MUST_BE_TRUE(numBBId == BBs.size(), ERROR_FLOWGRAPH);
4140
4141 for (auto bb : BBs)
4142 {
4143 bb->setPreId(UINT_MAX);
4144 bb->setRPostId(UINT_MAX);
4145 }
4146
4147 unsigned preId = 0;
4148 unsigned postID = 0;
4149 backEdges.clear();
4150
4151 setPhysicalPredSucc();
4152 DFSTraverse(getEntryBB(), preId, postID, kernelInfo);
4153
4154 for (auto fn : funcInfoTable)
4155 {
4156 DFSTraverse(fn->getInitBB(), preId, postID, fn);
4157 }
4158 }
4159
4160 //
4161 // Find natural loops in the flow graph.
4162 // Assumption: the input FG is reducible.
4163 //
findNaturalLoops()4164 void FlowGraph::findNaturalLoops()
4165 {
4166 setPhysicalPredSucc();
4167 for (auto&& backEdge : backEdges)
4168 {
4169 G4_BB* head = backEdge.second;
4170 G4_BB* tail = backEdge.first;
4171 std::list<G4_BB*> loopBlocks;
4172 Blocks loopBody;
4173 loopBlocks.push_back(tail);
4174 loopBody.insert(tail);
4175
4176 while (!loopBlocks.empty())
4177 {
4178 G4_BB* loopBlock = loopBlocks.front();
4179 loopBlocks.pop_front();
4180 loopBlock->setInNaturalLoop(true);
4181 loopBlock->setNestLevel();
4182
4183 if ((loopBlock == head) || (loopBlock->getBBType() & G4_BB_INIT_TYPE))
4184 {
4185 // Skip
4186 }
4187 else if (loopBlock->getBBType() & G4_BB_RETURN_TYPE)
4188 {
4189 auto callBB = loopBlock->BBBeforeCall();
4190 if (!callBB->isInNaturalLoop())
4191 {
4192 loopBlocks.push_front(callBB);
4193 loopBody.insert(callBB);
4194 }
4195 }
4196 else
4197 {
4198 auto entryBB = getEntryBB();
4199 for (auto predBB : loopBlock->Preds)
4200 {
4201 if (!predBB->isInNaturalLoop())
4202 {
4203 if (predBB == entryBB && head != entryBB)
4204 {
4205 // graph is irreducible, punt natural loop detection for entire CFG
4206 this->reducible = false;
4207 naturalLoops.clear();
4208 for (auto BB : BBs)
4209 {
4210 BB->setInNaturalLoop(false);
4211 BB->resetNestLevel();
4212 }
4213 return;
4214 }
4215 MUST_BE_TRUE(predBB != entryBB || head == entryBB, ERROR_FLOWGRAPH);
4216 loopBlocks.push_front(predBB);
4217 loopBody.insert(predBB);
4218 }
4219 }
4220 }
4221 }
4222
4223 for (auto loopBB : loopBody)
4224 {
4225 loopBB->setInNaturalLoop(false);
4226 }
4227
4228 naturalLoops.emplace(backEdge, loopBody);
4229 }
4230 }
4231
traverseFunc(FuncInfo * func,unsigned * ptr)4232 void FlowGraph::traverseFunc(FuncInfo* func, unsigned *ptr)
4233 {
4234 func->setPreID((*ptr)++);
4235 func->setVisited();
4236 for (auto callee : func->getCallees())
4237 {
4238 if (!(callee->getVisited()))
4239 {
4240 traverseFunc(callee, ptr);
4241 }
4242 }
4243 sortedFuncTable.push_back(func);
4244 func->setPostID((*ptr)++);
4245 }
4246
4247 //
4248 // Sort subroutines in topological order based on DFS
4249 // a topological order is guaranteed as recursion is not allowed for subroutine calls
4250 // results are stored in sortedFuncTable in reverse topological order
4251 //
topologicalSortCallGraph()4252 void FlowGraph::topologicalSortCallGraph()
4253 {
4254 unsigned visitID = 1;
4255 traverseFunc(kernelInfo, &visitID);
4256 }
4257
4258 //
4259 // This should be called only after pre-/post-visit ID are set
4260 //
checkVisitID(FuncInfo * func1,FuncInfo * func2)4261 static bool checkVisitID(FuncInfo* func1, FuncInfo* func2)
4262 {
4263 if (func1->getPreID() < func2->getPreID() &&
4264 func1->getPostID() > func2->getPostID())
4265 {
4266 return true;
4267 }
4268 else
4269 {
4270 return false;
4271 }
4272 }
4273
4274 //
4275 // Find dominators for each function
4276 //
findDominators(std::map<FuncInfo *,std::set<FuncInfo * >> & domMap)4277 void FlowGraph::findDominators(std::map<FuncInfo*, std::set<FuncInfo*>>& domMap)
4278 {
4279 std::map<FuncInfo*, std::set<FuncInfo*>> predMap;
4280
4281 for (auto func : sortedFuncTable)
4282 {
4283 if (func == kernelInfo)
4284 {
4285 std::set<FuncInfo*> initSet;
4286 initSet.insert(kernelInfo);
4287 domMap.insert(std::make_pair(kernelInfo, initSet));
4288 }
4289 else
4290 {
4291 std::set<FuncInfo*> initSet;
4292 for (auto funcTmp : sortedFuncTable)
4293 {
4294 initSet.insert(funcTmp);
4295 }
4296 domMap.insert(std::make_pair(func, initSet));
4297 }
4298
4299 for (auto callee : func->getCallees())
4300 {
4301 std::map<FuncInfo*, std::set<FuncInfo*>>::iterator predMapIter = predMap.find(callee);
4302 if (predMapIter == predMap.end())
4303 {
4304 std::set<FuncInfo*> initSet;
4305 initSet.insert(func);
4306 predMap.insert(std::make_pair(callee, initSet));
4307 }
4308 else
4309 {
4310 predMapIter->second.insert(func);
4311 }
4312 }
4313 }
4314
4315 bool changed = false;
4316 do
4317 {
4318 changed = false;
4319
4320 unsigned funcTableSize = static_cast<unsigned> (sortedFuncTable.size());
4321 unsigned funcID = funcTableSize - 1;
4322 do
4323 {
4324 funcID--;
4325 FuncInfo* func = sortedFuncTable[funcID];
4326
4327 std::map<FuncInfo*, std::set<FuncInfo*>>::iterator predMapIter = predMap.find(func);
4328 if (predMapIter != predMap.end())
4329 {
4330 std::set<FuncInfo*>& domSet = (*domMap.find(func)).second;
4331 std::set<FuncInfo*> oldDomSet = domSet;
4332 domSet.clear();
4333 domSet.insert(func);
4334
4335 std::vector<unsigned> funcVec(funcTableSize);
4336 for (unsigned i = 0; i < funcTableSize; i++)
4337 {
4338 funcVec[i] = 0;
4339 }
4340
4341 std::set<FuncInfo*>& predSet = (*predMapIter).second;
4342 for (auto pred : predSet)
4343 {
4344 for (auto predDom : (*domMap.find(pred)).second)
4345 {
4346 unsigned domID = (predDom->getScopeID() == UINT_MAX) ? funcTableSize - 1 : predDom->getScopeID() - 1;
4347 funcVec[domID]++;
4348 }
4349 }
4350
4351 unsigned predSetSize = static_cast<unsigned> (predSet.size());
4352 for (unsigned i = 0; i < funcTableSize; i++)
4353 {
4354 if (funcVec[i] == predSetSize)
4355 {
4356 FuncInfo* newFunc = sortedFuncTable[i];
4357 domSet.insert(newFunc);
4358 if (oldDomSet.find(newFunc) == oldDomSet.end())
4359 changed = true;
4360 }
4361 }
4362
4363 if (oldDomSet.size() != domSet.size())
4364 {
4365 changed = true;
4366 }
4367 }
4368 } while (funcID != 0);
4369
4370 } while (changed);
4371 }
4372
4373 //
4374 // Check if func1 is a dominator of func2
4375 //
checkDominator(FuncInfo * func1,FuncInfo * func2,std::map<FuncInfo *,std::set<FuncInfo * >> & domMap)4376 static bool checkDominator(FuncInfo* func1, FuncInfo* func2, std::map<FuncInfo*, std::set<FuncInfo*>>& domMap)
4377 {
4378 std::map<FuncInfo*, std::set<FuncInfo*>>::iterator domMapIter = domMap.find(func2);
4379
4380 if (domMapIter != domMap.end())
4381 {
4382 std::set<FuncInfo*> domSet = (*domMapIter).second;
4383 std::set<FuncInfo*>::iterator domSetIter = domSet.find(func1);
4384
4385 if (domSetIter != domSet.end())
4386 {
4387 return true;
4388 }
4389 }
4390
4391 return false;
4392 }
4393
4394 //
4395 // Determine the scope of a varaible based on different contexts
4396 //
resolveVarScope(G4_Declare * dcl,FuncInfo * func)4397 unsigned FlowGraph::resolveVarScope(G4_Declare* dcl, FuncInfo* func)
4398 {
4399 unsigned oldID = dcl->getScopeID();
4400 unsigned newID = func->getScopeID();
4401
4402 if (oldID == newID)
4403 {
4404 return oldID;
4405 }
4406 else if (oldID == 0)
4407 {
4408 return newID;
4409 }
4410 else if (oldID == UINT_MAX ||
4411 newID == UINT_MAX)
4412 {
4413 return UINT_MAX;
4414 }
4415 else if (builder->getOption(vISA_EnableGlobalScopeAnalysis))
4416 {
4417 // This is safe if the global variable usage is
4418 // self-contained under the calling function
4419 std::map<FuncInfo*, std::set<FuncInfo*>> domMap;
4420
4421 findDominators(domMap);
4422
4423 FuncInfo* oldFunc = sortedFuncTable[oldID - 1];
4424
4425 if (checkVisitID(func, oldFunc) &&
4426 checkDominator(func, oldFunc, domMap))
4427 {
4428 return newID;
4429 }
4430 else if (checkVisitID(oldFunc, func) &&
4431 checkDominator(oldFunc, func, domMap))
4432 {
4433 return oldID;
4434 }
4435 else
4436 {
4437 unsigned start = (newID > oldID) ? newID : oldID;
4438 unsigned end = static_cast<unsigned> (sortedFuncTable.size());
4439
4440 for (unsigned funcID = start; funcID != end; funcID++)
4441 {
4442 FuncInfo* currFunc = sortedFuncTable[funcID];
4443 if (checkVisitID(currFunc, func) &&
4444 checkDominator(currFunc, func, domMap) &&
4445 checkVisitID(currFunc, oldFunc) &&
4446 checkDominator(currFunc, oldFunc, domMap))
4447 {
4448 return currFunc->getScopeID();
4449 }
4450 }
4451 }
4452 }
4453
4454 return UINT_MAX;
4455 }
4456
4457 //
4458 // Visit all operands referenced in a function and update the varaible scope
4459 //
markVarScope(std::vector<G4_BB * > & BBList,FuncInfo * func)4460 void FlowGraph::markVarScope(std::vector<G4_BB*>& BBList, FuncInfo* func)
4461 {
4462 for (auto bb : BBList)
4463 {
4464 for (auto it = bb->begin(), end = bb->end(); it != end; it++)
4465 {
4466 G4_INST* inst = (*it);
4467
4468 G4_DstRegRegion* dst = inst->getDst();
4469
4470 if (dst &&
4471 !dst->isAreg() &&
4472 dst->getBase())
4473 {
4474 G4_Declare* dcl = GetTopDclFromRegRegion(dst);
4475 unsigned scopeID = resolveVarScope(dcl, func);
4476 dcl->updateScopeID(scopeID);
4477 }
4478
4479 for (int i = 0; i < G4_MAX_SRCS; i++)
4480 {
4481 G4_Operand* src = inst->getSrc(i);
4482
4483 if (src && !src->isAreg())
4484 {
4485 if (src->isSrcRegRegion() &&
4486 src->asSrcRegRegion()->getBase())
4487 {
4488 G4_Declare* dcl = GetTopDclFromRegRegion(src);
4489 unsigned scopeID = resolveVarScope(dcl, func);
4490 dcl->updateScopeID(scopeID);
4491 }
4492 else if (src->isAddrExp() &&
4493 src->asAddrExp()->getRegVar())
4494 {
4495 G4_Declare* dcl = src->asAddrExp()->getRegVar()->getDeclare()->getRootDeclare();
4496 unsigned scopeID = resolveVarScope(dcl, func);
4497 dcl->updateScopeID(scopeID);
4498 }
4499 }
4500 }
4501 }
4502 }
4503 }
4504
4505 //
4506 // Traverse the call graph and mark varaible scope
4507 //
markScope()4508 void FlowGraph::markScope()
4509 {
4510 if (funcInfoTable.size() == 0)
4511 {
4512 // no subroutines
4513 return;
4514 }
4515 unsigned id = 1;
4516 std::vector<FuncInfo *>::iterator kernelIter = sortedFuncTable.end();
4517 kernelIter--;
4518 for (std::vector<FuncInfo *>::iterator funcIter = sortedFuncTable.begin();
4519 funcIter != sortedFuncTable.end();
4520 ++funcIter)
4521 {
4522 if (funcIter == kernelIter)
4523 {
4524 id = UINT_MAX;
4525 }
4526
4527 FuncInfo* func = (*funcIter);
4528 func->setScopeID(id);
4529
4530 for (auto bb : func->getBBList())
4531 {
4532 bb->setScopeID(id);
4533 }
4534
4535 id++;
4536 }
4537
4538 for (auto func : sortedFuncTable)
4539 {
4540 markVarScope(func->getBBList(), func);
4541 }
4542 }
4543
4544 // Return the mask for anyH or allH predicate control in the goto to be emitted.
getFlagMask(G4_Predicate_Control pCtrl)4545 static uint32_t getFlagMask(G4_Predicate_Control pCtrl)
4546 {
4547 switch (pCtrl)
4548 {
4549 case G4_Predicate_Control::PRED_ALL2H:
4550 return 0xFFFFFFFC;
4551 case G4_Predicate_Control::PRED_ALL4H:
4552 return 0xFFFFFFF0;
4553 case G4_Predicate_Control::PRED_ALL8H:
4554 return 0xFFFFFF00;
4555 case G4_Predicate_Control::PRED_ALL16H:
4556 return 0xFFFF0000;
4557 case G4_Predicate_Control::PRED_ALL32H:
4558 return 0x00000000;
4559 case G4_Predicate_Control::PRED_ANY2H:
4560 return 0x00000003;
4561 case G4_Predicate_Control::PRED_ANY4H:
4562 return 0x0000000F;
4563 case G4_Predicate_Control::PRED_ANY8H:
4564 return 0x000000FF;
4565 case G4_Predicate_Control::PRED_ANY16H:
4566 return 0x0000FFFF;
4567 case G4_Predicate_Control::PRED_ANY32H:
4568 return 0xFFFFFFFF;
4569 default:
4570 MUST_BE_TRUE(false, "only for AllH or AnyH predicate control");
4571 break;
4572 }
4573 return 0;
4574 }
4575
4576 // Given a predicate ctrl for jmpi, return the adjusted predicate ctrl in a new
4577 // simd size.
getPredCtrl(unsigned simdSize,G4_Predicate_Control pCtrl,bool predCtrlHasWidth)4578 static G4_Predicate_Control getPredCtrl(unsigned simdSize,
4579 G4_Predicate_Control pCtrl, bool predCtrlHasWidth)
4580 {
4581 if (!predCtrlHasWidth)
4582 {
4583 return G4_Predicate::isAllH(pCtrl) ? PRED_ALL_WHOLE : PRED_ANY_WHOLE;
4584 }
4585
4586 if (G4_Predicate::isAllH(pCtrl))
4587 return (simdSize == 8)
4588 ? PRED_ALL8H
4589 : (simdSize == 16) ? PRED_ALL16H
4590 : G4_Predicate_Control::PRED_ALL32H;
4591
4592 // Any or default
4593 return (simdSize == 8)
4594 ? PRED_ANY8H
4595 : (simdSize == 16) ? PRED_ANY16H
4596 : G4_Predicate_Control::PRED_ANY32H;
4597 }
4598
4599 // If BB has only one predecessor, return it;
4600 // if BB has two predecessors and one is ExcludedPred, return the other predecessor;
4601 // otherwise, return nullptr.
getSinglePredecessor(G4_BB * BB,G4_BB * ExcludedPred) const4602 G4_BB* FlowGraph::getSinglePredecessor(G4_BB* BB, G4_BB* ExcludedPred) const
4603 {
4604 if (BB->Preds.size() != 1) {
4605 if (BB->Preds.size() == 2) {
4606 if (BB->Preds.front() == ExcludedPred)
4607 return BB->Preds.back();
4608 if (BB->Preds.back() == ExcludedPred)
4609 return BB->Preds.front();
4610 }
4611 return nullptr;
4612 }
4613 return BB->Preds.front();
4614 }
4615
4616 // Convert jmpi to goto. E.g.
4617 //
4618 // Case1:
4619 // .decl P1 v_type=P num_elts=2
4620 //
4621 // cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4622 // (!P1.any) jmpi (M1, 1) BB1
4623 //
4624 // ===>
4625 //
4626 // cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4627 // and (1) P1 P1 0b00000011
4628 // (!P2.any) goto (M1, 8) BB1
4629 //
4630 // Case2:
4631 // .decl P1 v_type=P num_elts=2
4632 //
4633 // cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4634 // (!P1.all) jmpi (M1, 1) BB1
4635 //
4636 // ===>
4637 //
4638 // cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4639 // or (1) P1 P1 0b11111100
4640 // (!P1.all) goto (M1, 8) BB1
4641 //
convertJmpiToGoto()4642 bool FlowGraph::convertJmpiToGoto()
4643 {
4644 bool Changed = false;
4645 for (auto bb : BBs)
4646 {
4647 for (auto I = bb->begin(), IEnd = bb->end(); I != IEnd; ++I)
4648 {
4649 G4_INST *inst = *I;
4650 if (inst->opcode() != G4_jmpi)
4651 continue;
4652
4653 unsigned predSize = pKernel->getSimdSize();
4654 G4_Predicate *newPred = nullptr;
4655
4656 if (G4_Predicate *pred = inst->getPredicate())
4657 {
4658 // The number of bool elements in vISA decl.
4659 unsigned nElts = pred->getTopDcl()->getNumberFlagElements();
4660
4661 // Since we need to turn this into goto, set high bits properly.
4662 if (nElts != predSize)
4663 {
4664 // The underlying dcl type is either uw or ud.
4665 G4_Type SrcTy = pred->getTopDcl()->getElemType();
4666 G4_Type DstTy = (predSize > 16) ? Type_UD : Type_UW;
4667
4668 G4_Predicate_Control pCtrl = pred->getControl();
4669 MUST_BE_TRUE(nElts == 1 ||
4670 G4_Predicate::isAnyH(pCtrl) ||
4671 G4_Predicate::isAllH(pCtrl),
4672 "predicate control not handled yet");
4673
4674 // Common dst and src0 operand for flag.
4675 G4_Declare *newDcl = builder->createTempFlag(predSize > 16 ? 2 : 1);
4676 auto pDst = builder->createDst(newDcl->getRegVar(), 0, 0, 1, DstTy);
4677 auto pSrc0 = builder->createSrc(
4678 pred->getBase(), 0, 0, builder->getRegionScalar(), SrcTy);
4679
4680 auto truncMask = [](uint32_t mask, G4_Type Ty) -> uint64_t
4681 {
4682 return (Ty == Type_UW) ? uint16_t(mask) : mask;
4683 };
4684
4685 if (pCtrl == G4_Predicate_Control::PRED_DEFAULT)
4686 {
4687 // P = P & 1
4688 auto pSrc1 = builder->createImm(1, Type_UW);
4689 auto pInst = builder->createBinOp(
4690 G4_and, g4::SIMD1, pDst, pSrc0, pSrc1,
4691 InstOpt_M0 | InstOpt_WriteEnable, false);
4692 bb->insertBefore(I, pInst);
4693 }
4694 else if (G4_Predicate::isAnyH(pCtrl))
4695 {
4696 // P = P & mask
4697 uint32_t mask = getFlagMask(pCtrl);
4698 auto pSrc1 = builder->createImm(truncMask(mask, DstTy), DstTy);
4699 auto pInst = builder->createBinOp(
4700 G4_and, g4::SIMD1, pDst, pSrc0, pSrc1,
4701 InstOpt_M0 | InstOpt_WriteEnable, false);
4702 bb->insertBefore(I, pInst);
4703 }
4704 else
4705 {
4706 // AllH
4707 // P = P | mask
4708 uint32_t mask = getFlagMask(pCtrl);
4709 auto pSrc1 = builder->createImm(truncMask(mask, DstTy), DstTy);
4710 auto pInst = builder->createBinOp(
4711 G4_or, g4::SIMD1, pDst, pSrc0, pSrc1,
4712 InstOpt_M0 | InstOpt_WriteEnable, false);
4713 bb->insertBefore(I, pInst);
4714 }
4715
4716 // Adjust pred control to the new execution size and build the
4717 // new predicate.
4718 pCtrl = getPredCtrl(predSize, pCtrl, builder->predCtrlHasWidth());
4719 newPred = builder->createPredicate(
4720 pred->getState(), newDcl->getRegVar(), 0, pCtrl);
4721 }
4722 }
4723
4724 // (!P) jmpi L
4725 // becomes:
4726 // P = P & MASK
4727 // (!P.anyN) goto (N) L
4728 inst->setOpcode(G4_goto);
4729 inst->setExecSize(G4_ExecSize(predSize));
4730 if (newPred)
4731 inst->setPredicate(newPred);
4732 inst->asCFInst()->setUip(inst->getSrc(0)->asLabel());
4733 inst->setSrc(nullptr, 0);
4734 inst->setOptions(InstOpt_M0);
4735 Changed = true;
4736 }
4737 }
4738 return Changed;
4739 }
4740
print(std::ostream & OS) const4741 void FlowGraph::print(std::ostream& OS) const
4742 {
4743 const char* kname = nullptr;
4744 if (getKernel()) {
4745 kname = getKernel()->getName();
4746 }
4747 kname = kname ? kname : "unnamed";
4748 OS << "\n\nCFG: "
4749 << kname
4750 << (builder->getIsKernel() ? " [kernel]" : " [non-kernel function]")
4751 << "\n\n";
4752 for (auto BB : BBs) {
4753 BB->print(OS);
4754 }
4755 }
4756
dump() const4757 void FlowGraph::dump() const
4758 {
4759 print(std::cerr);
4760 }
4761
~FlowGraph()4762 FlowGraph::~FlowGraph()
4763 {
4764 // even though G4_BBs are allocated in a mem pool and freed in one shot,
4765 // we must call each BB's desstructor explicitly to free up the memory used
4766 // by the STL objects(list, vector, etc.) in each BB
4767 for (G4_BB* bb : BBAllocList)
4768 {
4769 bb->~G4_BB();
4770 }
4771 BBAllocList.clear();
4772 globalOpndHT.clearHashTable();
4773 for (auto funcInfo : funcInfoTable)
4774 {
4775 funcInfo->~FuncInfo();
4776 }
4777 if (kernelInfo)
4778 {
4779 kernelInfo->~FuncInfo();
4780 }
4781 for (auto summary : localRASummaries)
4782 {
4783 summary->~PhyRegSummary();
4784 }
4785 }
4786
createRelocation(G4_Kernel & kernel,G4_INST & inst,int opndPos,const std::string & symbolName,RelocationType type)4787 RelocationEntry& RelocationEntry::createRelocation(
4788 G4_Kernel& kernel, G4_INST& inst,
4789 int opndPos, const std::string& symbolName,
4790 RelocationType type)
4791 {
4792 kernel.getRelocationTable().emplace_back(
4793 RelocationEntry(&inst, opndPos, type, symbolName));
4794 return kernel.getRelocationTable().back();
4795 }
4796
doRelocation(const G4_Kernel & kernel,void * binary,uint32_t binarySize)4797 void RelocationEntry::doRelocation(
4798 const G4_Kernel& kernel, void* binary, uint32_t binarySize)
4799 {
4800 // FIXME: nothing to do here
4801 // we have only dynamic relocations now, which cannot be resolved at compilation time
4802 }
4803
getTargetOffset(const IR_Builder & builder) const4804 uint32_t RelocationEntry::getTargetOffset(const IR_Builder& builder) const
4805 {
4806 // instruction being relocated must not be compacted, or the offset need to be re-adjusted
4807 // FIXME: This only check if vISA force to compact the instruction, it cannot make sure
4808 // the Binary encoder won't compact it
4809 assert(inst->isCompactedInst() == false);
4810
4811 G4_Operand* target_operand = inst->getSrc(opndPos);
4812 assert(target_operand->isRelocImm());
4813
4814 switch (inst->opcode()) {
4815 case G4_mov:
4816 // When src0 type is 64 bits:
4817 // On Pre-Xe:
4818 // Src0.imm[31:0] mapped to Instruction [95:64]
4819 // Src0.imm[63:32] mapped to Instruction [127:96]
4820 // On Xe+:
4821 // Src0.imm[31:0] mapped to Instruction [127:96]
4822 // Src0.imm[63:32] mapped to Instruction [95:64]
4823 // When src0 type is 32 bits:
4824 // Src0.imm[31:0] mapped to instruction [127:96]
4825 assert((target_operand->getType() == Type_UD) || (target_operand->getType() == Type_D) ||
4826 (target_operand->getType() == Type_UQ) || (target_operand->getType() == Type_Q));
4827 assert(opndPos == 0);
4828 return (target_operand->getType() == Type_UD || target_operand->getType() == Type_D) ? 12 : 8;
4829
4830 case G4_add:
4831 // add instruction cannot have 64-bit imm
4832 assert(relocType != R_SYM_ADDR && relocType != R_SYM_ADDR_32_HI);
4833 assert(opndPos == 1);
4834 // Src1.imm[31:0] mapped to Instruction [127:96]
4835 return 12;
4836 default:
4837 break;
4838 }
4839
4840 // currently we only support relocation on mov or add instruction
4841 assert(false && "Unreachable");
4842 return 0;
4843 }
4844
getTypeString() const4845 const char * RelocationEntry::getTypeString() const
4846 {
4847 switch (relocType)
4848 {
4849 case RelocationType::R_SYM_ADDR:
4850 return "R_SYM_ADDR";
4851 case RelocationType::R_SYM_ADDR_32:
4852 return "R_SYM_ADDR_32";
4853 case RelocationType::R_SYM_ADDR_32_HI:
4854 return "R_SYM_ADDR_32_HI";
4855 case RelocationType::R_PER_THREAD_PAYLOAD_OFFSET_32:
4856 return "R_PER_THREAD_PAYLOAD_OFFSET_32";
4857 case RelocationType::R_GLOBAL_IMM_32:
4858 return "R_GLOBAL_IMM_32";
4859 default:
4860 assert(false && "unhandled relocation type");
4861 return "";
4862 }
4863 }
4864
dump(std::ostream & os) const4865 void RelocationEntry::dump(std::ostream &os) const
4866 {
4867 os << "Relocation entry: " << getTypeString() << "\n";
4868 os << "\t";
4869 inst->dump();
4870 switch (relocType)
4871 {
4872 case RelocationType::R_NONE:
4873 os << "R_NONE: symbol name = " << symName;
4874 break;
4875 case RelocationType::R_SYM_ADDR:
4876 os << "R_SYM_ADDR: symbol name = " << symName;
4877 break;
4878 case RelocationType::R_SYM_ADDR_32:
4879 os << "R_SYM_ADDR_32: symbol name = " << symName;
4880 break;
4881 case RelocationType::R_SYM_ADDR_32_HI:
4882 os << "R_SYM_ADDR_32_HI: symbol name = " << symName;
4883 break;
4884 case RelocationType::R_PER_THREAD_PAYLOAD_OFFSET_32:
4885 os << "R_PER_THREAD_PAYLOAD_OFFSET_32: symbol name = " << symName;
4886 break;
4887 case RelocationType::R_GLOBAL_IMM_32:
4888 os << "R_GLOBAL_IMM_32: symbol name = " << symName;
4889 break;
4890 }
4891 os << "\n";
4892 }
4893
dump() const4894 void FuncInfo::dump() const
4895 {
4896 std::cerr << "subroutine " << getId() << "(" << getInitBB()->front()->getLabelStr() << ")\n";
4897 std::cerr << "\tentryBB=" << getInitBB()->getId() << ", exitBB=" << getExitBB()->getId() << "\n";
4898 std::cerr << "\tCallees: ";
4899 for (auto callee : callees)
4900 {
4901 std::cerr << callee->getId() << " ";
4902 }
4903 std::cerr << "\n\tBB list: ";
4904 for (auto bb : BBList)
4905 {
4906 std::cerr << bb->getId() << " ";
4907 }
4908 std::cerr << "\n";
4909 }
4910