1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "BitSet.h"
10 #include "BuildIR.h"
11 #include "CFGStructurizer.h"
12 #include "DebugInfo.h"
13 #include "FlowGraph.h"
14 #include "G4_Kernel.hpp"
15 #include "Option.h"
16 #include "PhyRegUsage.h"
17 #include "visa_wa.h"
18 
19 #include "BinaryEncodingIGA.h"
20 #include "iga/IGALibrary/api/iga.h"
21 #include "iga/IGALibrary/api/iga.hpp"
22 
23 #include <algorithm>
24 #include <chrono>
25 #include <cstdlib>
26 #include <fstream>
27 #include <functional>
28 #include <iostream>
29 #include <iterator>
30 #include <random>
31 #include <set>
32 #include <sstream>
33 #include <string>
34 #include <unordered_map>
35 #include <unordered_set>
36 
37 #if defined(WIN32)
38 #define CDECLATTRIBUTE __cdecl
39 #elif __GNUC__
40 #ifdef __x86_64__
41 #define CDECLATTRIBUTE
42 #else
43 #define CDECLATTRIBUTE                 __attribute__((__cdecl__))
44 #endif
45 #endif
46 
47 using namespace vISA;
48 
49 
insert(uint16_t newLB,uint16_t newRB)50 void GlobalOpndHashTable::HashNode::insert(uint16_t newLB, uint16_t newRB)
51 {
52     // check if the newLB/RB either subsumes or can be subsumed by an existing bound
53     // ToDo: consider merging bound as well
54     for (uint32_t& bound : bounds)
55     {
56         uint16_t nodeLB = getLB(bound);
57         uint16_t nodeRB = getRB(bound);
58         if (newLB >= nodeLB && newRB <= nodeRB)
59         {
60             return;
61         }
62         else if (newLB <= nodeLB && newRB >= nodeRB)
63         {
64             bound = packBound(newLB, newRB);
65             return;
66         }
67     }
68     bounds.push_back(packBound(newLB, newRB));
69 }
70 
isInNode(uint16_t lb,uint16_t rb) const71 bool GlobalOpndHashTable::HashNode::isInNode(uint16_t lb, uint16_t rb) const
72 {
73     for (uint32_t bound : bounds)
74     {
75         uint16_t nodeLB = getLB(bound);
76         uint16_t nodeRB = getRB(bound);
77         if (lb <= nodeLB && rb >= nodeLB)
78         {
79             return true;
80         }
81         else if (lb > nodeLB && lb <= nodeRB)
82         {
83             return true;
84         }
85     }
86     return false;
87 }
88 
clearHashTable()89 void GlobalOpndHashTable::clearHashTable()
90 {
91     for (auto& iter : globalVars)
92     {
93         iter.second->~HashNode();
94     }
95     globalVars.clear();
96     globalOpnds.clear();
97 }
98 
99 // all of the operand in this table are srcRegion
addGlobalOpnd(G4_Operand * opnd)100 void GlobalOpndHashTable::addGlobalOpnd(G4_Operand *opnd)
101 {
102     G4_Declare *topDcl = opnd->getTopDcl();
103 
104     if (topDcl)
105     {
106         // global operands must have a declare
107         auto entry = globalVars.find(topDcl);
108         if (entry != globalVars.end())
109         {
110             entry->second->insert(
111                 (uint16_t)opnd->getLeftBound(),
112                 (uint16_t)opnd->getRightBound());
113         }
114         else
115         {
116             HashNode* node = new (mem)HashNode(
117                 (uint16_t)opnd->getLeftBound(),
118                 (uint16_t)opnd->getRightBound(),
119                 private_arena_allocator);
120             globalVars[topDcl] = node;
121         }
122         globalOpnds.push_back(opnd);
123     }
124 }
125 
126 // if def overlaps with any operand in this table, it is treated as global
isOpndGlobal(G4_Operand * opnd) const127 bool GlobalOpndHashTable::isOpndGlobal(G4_Operand *opnd) const
128 {
129     G4_Declare * dcl = opnd->getTopDcl();
130     if (dcl == NULL)
131     {
132         return false;
133     }
134     else if (dcl->getAddressed())
135     {
136         // Conservatively assume that all address taken
137         // virtual registers are global
138         return true;
139     }
140     else
141     {
142         auto entry = globalVars.find(dcl);
143         if (entry == globalVars.end())
144         {
145             return false;
146         }
147         HashNode* node = entry->second;
148         return node->isInNode(
149             (uint16_t)opnd->getLeftBound(),
150             (uint16_t)opnd->getRightBound());
151     }
152 }
153 
dump(std::ostream & os) const154 void GlobalOpndHashTable::dump(std::ostream &os) const
155 {
156     os << "Global variables:\n";
157     for (auto&& entry : globalVars)
158     {
159         G4_Declare* dcl = entry.first;
160         dcl->dump();
161         if ((dcl->getRegFile() & G4_FLAG) == 0)
162         {
163             std::vector<bool> globalElt;
164             globalElt.resize(dcl->getByteSize(), false);
165             auto ranges = entry.second;
166             for (auto bound : ranges->bounds)
167             {
168                 uint16_t lb = getLB(bound);
169                 uint16_t rb = getRB(bound);
170                 for (int i = lb; i <= rb; ++i)
171                 {
172                     globalElt[i] = true;
173                 }
174             }
175             bool inRange = false;
176             for (int i = 0, size = (int)globalElt.size(); i < size; ++i)
177             {
178                 if (globalElt[i] && !inRange)
179                 {
180                     // start of new range
181                     os << "[" << i << ",";
182                     inRange = true;
183                 }
184                 else if (!globalElt[i] && inRange)
185                 {
186                     // end of range
187                     os << i - 1 << "], ";
188                     inRange = false;
189                 }
190             }
191             if (inRange)
192             {
193                 // close last range
194                 os << globalElt.size() - 1 << "]";
195             }
196         }
197         os << "\n";
198     }
199     os << "Instructions with global operands:\n";
200     for (auto opnd : globalOpnds)
201     {
202         if (opnd->getInst())
203         {
204             opnd->getInst()->print(os);
205         }
206     }
207 }
208 
setPhysicalLink(G4_BB * pred,G4_BB * succ)209 void FlowGraph::setPhysicalLink(G4_BB* pred, G4_BB* succ)
210 {
211     if (pred)
212     {
213         pred->setPhysicalSucc(succ);
214     }
215     if (succ)
216     {
217         succ->setPhysicalPred(pred);
218     }
219 }
220 
insert(BB_LIST_ITER iter,G4_BB * bb)221 BB_LIST_ITER FlowGraph::insert(BB_LIST_ITER iter, G4_BB* bb)
222 {
223     G4_BB* prev = iter != BBs.begin() ? *std::prev(iter) : nullptr;
224     G4_BB* next = iter != BBs.end() ? *iter : nullptr;
225     setPhysicalLink(prev, bb);
226     setPhysicalLink(bb, next);
227     markStale();
228     return BBs.insert(iter, bb);
229 }
230 
erase(BB_LIST_ITER iter)231 void FlowGraph::erase(BB_LIST_ITER iter)
232 {
233     G4_BB* prev = (iter != BBs.begin()) ? *std::prev(iter) : nullptr;
234     G4_BB* next = (std::next(iter) != BBs.end()) ? *std::next(iter) : nullptr;
235     setPhysicalLink(prev, next);
236     BBs.erase(iter);
237     markStale();
238 }
239 
addPrologBB(G4_BB * BB)240 void FlowGraph::addPrologBB(G4_BB* BB)
241 {
242     G4_BB* oldEntry = getEntryBB();
243     insert(BBs.begin(), BB);
244     addPredSuccEdges(BB, oldEntry);
245 }
246 
append(const FlowGraph & otherFG)247 void FlowGraph::append(const FlowGraph& otherFG)
248 {
249     markStale();
250 
251     for (auto I = otherFG.cbegin(), E = otherFG.cend(); I != E; ++I)
252     {
253         auto bb = *I;
254         BBs.push_back(bb);
255         incrementNumBBs();
256     }
257 }
258 
259 //
260 // return label's corresponding BB
261 // if label's BB is not yet created, then create one and add map label to BB
262 //
getLabelBB(Label_BB_Map & map,G4_Label * label)263 G4_BB* FlowGraph::getLabelBB(Label_BB_Map& map, G4_Label* label)
264 {
265     if (map.find(label) != map.end())
266     {
267         return map[label];
268     }
269     else
270     {
271         G4_BB* bb = createNewBB();
272         map[label] = bb;
273         return bb;
274     }
275 }
276 
277 //
278 // first is the first inst of a BB
279 //
beginBB(Label_BB_Map & map,G4_INST * first)280 G4_BB* FlowGraph::beginBB(Label_BB_Map& map, G4_INST* first)
281 {
282     if (first == NULL) return NULL;
283     G4_INST* labelInst;
284     bool newLabelInst = false;
285     if (first->isLabel())
286     {
287         labelInst = first;
288     }
289     else
290     {
291         // no label for this BB, create one!
292         std::string prefix = builder->kernel.getName();
293         std::string name = prefix + "_auto_" + std::to_string(autoLabelId++);
294         G4_Label* label = builder->createLabel(name, LABEL_BLOCK);
295         labelInst = createNewLabelInst(label);
296         newLabelInst = true;
297     }
298     G4_BB* bb = getLabelBB(map, labelInst->getLabel());
299     push_back(bb); // append to BBs list
300     if (newLabelInst)
301     {
302         bb->push_front(labelInst);
303     }
304     return bb;
305 }
306 
createNewLabelInst(G4_Label * label,int lineNo,int CISAOff)307 G4_INST* FlowGraph::createNewLabelInst(G4_Label* label, int lineNo, int CISAOff)
308 {
309     //srcFileName is NULL
310     // TODO: remove this (use createLabelInst)
311     return builder->createInternalInst(NULL, G4_label,
312         NULL, g4::NOSAT, g4::SIMD1, NULL, label, NULL, 0);
313 }
314 
removePredSuccEdges(G4_BB * pred,G4_BB * succ)315 void FlowGraph::removePredSuccEdges(G4_BB* pred, G4_BB* succ)
316 {
317     MUST_BE_TRUE(pred != NULL && succ != NULL, ERROR_INTERNAL_ARGUMENT);
318 
319     BB_LIST_ITER lt = pred->Succs.begin();
320     for (; lt != pred->Succs.end(); ++lt) {
321         if ((*lt) == succ) {
322             pred->Succs.erase(lt);
323             break;
324         }
325     }
326 
327     lt = succ->Preds.begin();
328     for (; lt != succ->Preds.end(); ++lt) {
329         if ((*lt) == pred) {
330             succ->Preds.erase(lt);
331             break;
332         }
333     }
334 
335     markStale();
336 }
337 
createNewBB(bool insertInFG)338 G4_BB* FlowGraph::createNewBB(bool insertInFG)
339 {
340     G4_BB* bb = new (mem)G4_BB(instListAlloc, numBBId, this);
341 
342     // Increment counter only when new BB is inserted in FlowGraph
343     if (insertInFG)
344         numBBId++;
345 
346     BBAllocList.push_back(bb);
347     return bb;
348 }
349 
createNewBBWithLabel(const char * LabelPrefix,int Lineno,int CISAoff)350 G4_BB* FlowGraph::createNewBBWithLabel(const char* LabelPrefix, int Lineno, int CISAoff)
351 {
352     G4_BB* newBB = createNewBB(true);
353     std::string name = LabelPrefix + std::to_string(newBB->getId());
354     G4_Label* lbl = builder->createLabel(name, LABEL_BLOCK);
355     G4_INST* inst = createNewLabelInst(lbl, Lineno, CISAoff);
356     newBB->push_back(inst);
357     return newBB;
358 }
359 
360 static int globalCount = 1;
361 
insertDummyUUIDMov()362 int64_t FlowGraph::insertDummyUUIDMov()
363 {
364     // Here when -addKernelId is passed
365     if (builder->getIsKernel())
366     {
367         // Add mov inst as first instruction in kernel. This
368         // has to be *first* instruction in the kernel so debugger
369         // can detect the pattern.
370         //
371         // 32-bit random number is generated and set as src0 operand.
372         // Earlier nop was being generated with 64-bit UUID overloading
373         // MBZ bits and this turned out to be a problem for the
374         // debugger (random clobbering of r0).
375 
376         for (auto bb : BBs)
377         {
378             uint32_t seed = (uint32_t)std::chrono::high_resolution_clock::now().time_since_epoch().count();
379             std::mt19937 mt_rand(seed * globalCount);
380             globalCount++;
381 
382             G4_DstRegRegion* nullDst = builder->createNullDst(Type_UD);
383             int64_t uuID = (int64_t)mt_rand();
384             G4_Operand* randImm = (G4_Operand*)builder->createImm(uuID, Type_UD);
385             G4_INST* movInst = builder->createMov(g4::SIMD1, nullDst, randImm, InstOpt_NoOpt, false);
386 
387             auto instItEnd = bb->end();
388             for (auto it = bb->begin();
389                 it != instItEnd;
390                 it++)
391             {
392                 if ((*it)->isLabel())
393                 {
394                     bb->insertBefore(++it, movInst);
395                     return uuID;
396                 }
397 
398                 bb->push_front(movInst);
399                 return uuID;
400             }
401         }
402     }
403     return 0;
404 }
405 
406 //
407 // (1) check if-else-endif and iff-endif pairs
408 // (2) add label for those omitted ones
409 //
matchBranch(int & sn,INST_LIST & instlist,INST_LIST_ITER & it)410 bool FlowGraph::matchBranch(int &sn, INST_LIST& instlist, INST_LIST_ITER &it)
411 {
412     G4_INST* inst = *it;
413     //
414     // process if-endif or if-else-endif
415     //
416     if (inst->opcode() == G4_if)
417     {
418         // check label / add label
419         G4_Label *if_label = NULL;
420         G4_Label *else_label = NULL;
421         G4_Label *endif_label = NULL;   // the label immediately before the endif
422         G4_INST* ifInst = inst;
423         assert(inst->asCFInst()->getJip() == nullptr && "IF should not have a label at this point");
424 
425         // create if_label
426         std::string createdLabel = "_AUTO_GENERATED_IF_LABEL_" + std::to_string(sn);
427         sn++;
428         if_label = builder->createLabel(createdLabel, LABEL_BLOCK);
429         inst->asCFInst()->setJip(if_label);
430 
431         // look for else/endif
432         int elseCount = 0;
433         it++;
434         while (it != instlist.end())
435         {
436             inst = *it;
437             // meet iff or if
438             if (inst->opcode() == G4_if)
439             {
440                 if (!matchBranch(sn, instlist, it))
441                     return false;
442             }
443             else if (inst->opcode() == G4_else)
444             {
445                 if (elseCount != 0)
446                 {
447                     MUST_BE_TRUE(false, "ERROR: Mismatched if-else: more than one else following if!");
448                     return false;
449                 }
450                 elseCount++;
451                 INST_LIST_ITER it1 = it;
452                 it1++;
453 
454                 // add endif label to "else"
455                 std::string createdLabel = "__AUTO_GENERATED_ELSE_LABEL__" + std::to_string(sn);
456                 sn++;
457                 else_label = builder->createLabel(createdLabel, LABEL_BLOCK);
458                 inst->asCFInst()->setJip(else_label);
459 
460                 // insert if-else label
461                 G4_INST* label = createNewLabelInst(if_label, inst->getLineNo(), inst->getCISAOff());
462                 instlist.insert(it1, label);
463 
464                 // Uip must be the same as Jip for else instructions.
465                 inst->asCFInst()->setUip(else_label);
466             }
467             else if (inst->opcode() == G4_endif)
468             {
469                 if (elseCount == 0)                 // if-endif case
470                 {
471                     // insert endif label
472                     G4_INST* label = createNewLabelInst(if_label, inst->getLineNo(), inst->getCISAOff());
473                     instlist.insert(it, label);
474                     endif_label = if_label;
475                 }
476                 else                                    // if-else-endif case
477                 {
478                     // insert endif label
479                     G4_INST* label = createNewLabelInst(else_label, inst->getLineNo(), inst->getCISAOff());
480                     instlist.insert(it, label);
481                     endif_label = else_label;
482                 }
483 
484                 //we must also set the UIP of if to point to its corresponding endif
485                 ifInst->asCFInst()->setUip(endif_label);
486                 return true;
487             }   // if opcode
488             if (it == instlist.end())
489             {
490                 MUST_BE_TRUE(false, "ERROR: Can not find endif for if!");
491                 return false;
492             }
493             it++;
494         }   // while
495     }
496     //
497     // process iff-endif
498     //
499     else
500         MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
501 
502     return false;
503 }
504 
505 //
506 //  HW Rules about the jip and uip for the control flow instructions
507 //  if:
508 //      <branch_ctrl> set
509 //          jip = first join in the if block
510 //          uip = endif
511 //      <branch_ctrl> not set
512 //          jip = instruction right after the else, or the endif if else doesn't exist
513 //          uip = endif
514 //  else:
515 //      <branch_ctrl> set
516 //          jip = first join in the else block
517 //          uip = endif
518 //      <branch_ctrl> not set
519 //          jip = uip = endif
520 //  endif:
521 //          jip = uip = next enclosing endif/while
522 //  while:
523 //          jip = loop head label
524 //          uip = don't care
525 //  break:
526 //          jip = end of innermost CF block (else/endif/while)
527 //          uip = while
528 //  cont:
529 //          jip = end of innermost CF block (else/endif/while)
530 //          uip = while
531 //
532 
533 //
534 // Preprocess the instruction and kernel list, including:
535 // 1. Check if the labels and kernel names are unique
536 // 2. Check if all the labels used by jmp, CALL, cont, break, goto is defined, determine if goto is forward or backward
537 // 3. Process the non-label "if-else-endif" cases
538 //
preprocess(INST_LIST & instlist)539 void FlowGraph::preprocess(INST_LIST& instlist)
540 {
541 
542     std::unordered_set<G4_Label*> labels;   // label inst we have seen so far
543 
544     // Mark goto/jmpi/call as either forward or backward
545     for (G4_INST* i : instlist)
546     {
547         if (i->isLabel())
548         {
549             labels.emplace(i->getLabel());
550         }
551         else if (i->opcode() == G4_goto)
552         {
553             if (labels.count(i->asCFInst()->getUip()->asLabel()))
554             {
555                 i->asCFInst()->setBackward(true);
556             }
557         }
558         else if ((i->opcode() == G4_jmpi || i->isCall()) && i->getSrc(0) && i->getSrc(0)->isLabel())
559         {
560             if (labels.count(i->getSrc(0)->asLabel()))
561             {
562                 i->asCFInst()->setBackward(true);
563             }
564         }
565     }
566 
567 #ifdef _DEBUG
568     // sanity check to make sure we don't have undefined labels
569     for (G4_INST* i : instlist)
570     {
571         if (i->opcode() == G4_goto)
572         {
573             G4_Label* target = i->asCFInst()->getUip()->asLabel();
574             assert(labels.count(target) && "undefined goto label");
575         }
576         else if ((i->opcode() == G4_jmpi || i->isCall()) && i->getSrc(0) && i->getSrc(0)->isLabel())
577         {
578             assert(labels.count((G4_Label *)i->getSrc(0)) && "undefined jmpi/call label");
579         }
580     }
581 #endif
582 
583     //
584     // Process the non-label "if-else-endif" cases as following.
585     // ToDo: remove this once we stop generating if-else-endif for the IEEE macros
586     //
587     {
588         int sn = 0;
589         for (INST_LIST_ITER it = instlist.begin(), instlistEnd = instlist.end(); it != instlistEnd; ++it)
590         {
591             G4_INST *inst = *it;
592             if (inst->opcode() == G4_if)
593             {
594                 if (!matchBranch(sn, instlist, it))
595                 {
596                     MUST_BE_TRUE2(false, "ERROR: mismatched if-branch", inst);
597                     break;
598                 }
599             }   // fi
600         }   // for
601     }
602 }
603 
normalizeFlowGraph()604 void FlowGraph::normalizeFlowGraph()
605 {
606     // For CISA 3 flowgraph there will be pseudo_fcall instructions to invoke stack functions.
607     // Save/restore code will be added around this at the time of register allocation.
608     // If fcall's successor has more than 1 predecessor then it will create problems inserting
609     // code. This function handles such patterns by creating new basic block and guaranteeing
610     // that fcall's successor has a single predecessor.
611     for (BB_LIST_ITER it = BBs.begin(); it != BBs.end(); it++)
612     {
613         G4_BB* bb = *it;
614         if (bb->isEndWithFCall())
615         {
616             G4_BB* retBB = bb->Succs.front();
617             G4_INST *lInst = retBB->front();
618             if (retBB->Preds.size() > 1)
619             {
620 
621                 // To insert restore code we need to guarantee that save code has
622                 // been executed. If retBB has multiple preds, this may not be
623                 // guaranteed, so insert a new node.
624                 G4_BB* newNode = createNewBB();
625 
626                 // Remove old edge
627                 removePredSuccEdges(bb, retBB);
628 
629                 // Add new edges
630                 addPredSuccEdges(bb, newNode);
631                 addPredSuccEdges(newNode, retBB);
632 
633                 // Create and insert label inst
634                 std::string name = "L_AUTO_" + std::to_string(newNode->getId());
635                 G4_Label* lbl = builder->createLabel(name, LABEL_BLOCK);
636                 G4_INST* inst = createNewLabelInst(lbl, lInst->getLineNo(), lInst->getCISAOff());
637                 newNode->push_back(inst);
638                 insert(++it, newNode);
639                 it--;
640 
641                 retBB = newNode;
642             }
643         }
644     }
645 }
646 
647 //
648 // build a control flow graph from the inst list
649 // we want to keep the original BB order (same as shown in assembly code) for linear scan
650 // register allocation. We use beginBB to indicate that we encounter the beginning
651 // a BB and add the BB to the BB list and use getLabelBB to create a block for a label but
652 // not to add to the BB list.For instance, when we come across a "jmp target", getLabelBB
653 // is called to create a BB for target but the target BB is added into the list later (
654 // assume forward jmp) as the target label is visited.
655 //
656 //
constructFlowGraph(INST_LIST & instlist)657 void FlowGraph::constructFlowGraph(INST_LIST& instlist)
658 {
659     MUST_BE_TRUE(!instlist.empty(), ERROR_SYNTAX("empty instruction list"));
660 
661     if (builder->hasScratchSurface() && (isStackCallFunc || hasStackCalls))
662     {
663         // unfortunately we can't put this in stack call prolog since this has to be done before RA
664         // ToDo: just hard-wire the scratch-surface offset register?
665         builder->initScratchSurfaceOffset();
666     }
667     if (builder->hasFusedEU() &&
668         getKernel()->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
669     {
670         getKernel()->getOptions()->setOptionInternally(vISA_EnableScalarJmp, false);
671     }
672 
673     pKernel->renameAliasDeclares();
674     //
675     // The funcInfoHashTable maintains a map between the id of the function's INIT block
676     // to its FuncInfo definition.
677     //
678     FuncInfoHashTable funcInfoHashTable;
679 
680     preprocess(instlist);
681 
682     //
683     // map label to its corresponding BB
684     //
685     std::unordered_map<G4_Label*, G4_BB*> labelMap;
686     //
687     // create the entry block of the flow graph
688     //
689     G4_BB* curr_BB = beginBB(labelMap, instlist.front());
690 
691     kernelInfo = new (mem)FuncInfo(UINT_MAX, curr_BB, NULL);
692 
693     std::vector<G4_BB*> subroutineStartBB; // needed by handleExit()
694 
695     bool hasSIMDCF = false, hasNoUniformGoto = false;
696     G4_Label* currSubroutine = nullptr;
697 
698     auto addToSubroutine = [this](G4_BB* bb, G4_Label* currSubroutine)
699     {
700         if (currSubroutine)
701         {
702             subroutines[currSubroutine].push_back(bb);
703         }
704     };
705 
706     while (!instlist.empty())
707     {
708         INST_LIST_ITER iter = instlist.begin();
709         G4_INST* i = *iter;
710 
711         MUST_BE_TRUE(curr_BB != NULL, "Current BB must not be empty");
712         //
713         // inst i belongs to the current BB
714         // remove inst i from instlist and relink it to curr_BB's instList
715         //
716         curr_BB->splice(curr_BB->end(), instlist, iter);
717         G4_INST* next_i = (instlist.empty()) ? nullptr : instlist.front();
718 
719         if (i->isSend())
720         {
721             curr_BB->setSendInBB(true);
722         }
723 
724         if (i->isLabel() && i->getLabel()->isFuncLabel())
725         {
726             std::vector<G4_BB*> bbvec;
727             subroutines[i->getLabel()] = bbvec;
728             currSubroutine = i->getLabel();
729             subroutineStartBB.push_back(curr_BB);
730         }
731 
732         //
733         // do and endif do not have predicate and jump-to label,so we treated them as non-control instruction
734         // the labels around them will decides the beginning of a new BB
735         //
736         if (i->isFlowControl() && i->opcode() != G4_endif)
737         {
738             addToSubroutine(curr_BB, currSubroutine);
739             G4_BB* next_BB = beginBB(labelMap, next_i);
740 
741             // next_BB may be null if the kernel ends on an CF inst (e.g., backward goto/jmpi)
742             // This should be ok because we should not fall-through to next_BB in such case
743             // (i.e., goto/jmpi must not be predicated)
744             {
745                 if (i->opcode() == G4_jmpi || i->isCall())
746                 {
747                     //
748                     // add control flow edges <curr_BB,target>
749                     //
750                     if (i->getSrc(0)->isLabel())
751                     {
752                         addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->getSrc(0)->asLabel()));
753                     }
754                     else if (i->asCFInst()->isIndirectJmp())
755                     {
756                         // indirect jmp
757                         // For each label in the switch table, we insert a jmpi
758                         // to that label. We keep the jmpi in the same
759                         // block as the indirect jmp instead of creaing a new block for
760                         // each of them, as the offset actually determines which jmpi
761                         // instruction we will hit.
762                         // FIXME: We may want to delay the emission of the jmps to
763                         // each individual labels, so that we still maintain the property
764                         // that every basic block ends with a control flow instruction
765                         const std::list<G4_Label*>& jmpTargets = i->asCFInst()->getIndirectJmpLabels();
766                         for (auto it = jmpTargets.begin(), jmpTrgEnd = jmpTargets.end(); it != jmpTrgEnd; ++it)
767                         {
768                             G4_INST* jmpInst = builder->createJmp(nullptr, *it, InstOpt_NoOpt, true);
769                             indirectJmpTarget.emplace(jmpInst);
770                             INST_LIST_ITER jmpInstIter = builder->instList.end();
771                             curr_BB->splice(curr_BB->end(), builder->instList, --jmpInstIter);
772                             addPredSuccEdges(curr_BB, getLabelBB(labelMap, (*it)));
773                         }
774                     }
775 
776                     if (i->isCall())
777                     {
778                         //
779                         // the <CALL,return addr> link is added temporarily to facilitate linking
780                         // RETURN with return addresses. The link will be removed after it
781                         // serves its purpose
782                         // NOTE: No matter it has predicate, we add this link. We will check the predicate in handleReturn()
783                         // and only remove the link when it is not a conditional call
784                         //
785                         addPredSuccEdges(curr_BB, next_BB);
786                         if (i->getSrc(0)->isLabel())
787                         {
788                             i->getSrc(0)->asLabel()->setFuncLabel(true);
789                         }
790                     }
791                     else if (i->getPredicate())
792                     {
793                         // add fall through edge
794                         addUniquePredSuccEdges(curr_BB, next_BB);
795                     }
796                 }    // if (i->opcode()
797                 else if (i->opcode() == G4_if || i->opcode() == G4_while || i->opcode() == G4_else)
798                 {
799                     hasSIMDCF = true;
800                     if (i->asCFInst()->getJip()->isLabel())
801                     {
802                         if (i->opcode() == G4_else || i->opcode() == G4_while)
803                         {
804                             // for G4_while, jump no matter predicate
805                             addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getJip()));
806                         }
807                         else if ((i->getPredicate() != NULL) ||
808                             ((i->getCondMod() != NULL) &&
809                             (i->getSrc(0) != NULL) &&
810                                 (i->getSrc(1) != NULL)))
811                         {
812                             addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getJip()));
813                         }
814                     }
815                     if (i->opcode() == G4_while)
816                     {
817                         // always do this since break jumps to while, otherwise code after while without predicate appears unreachable.
818                         // add fall through edge if while is not the last instruction
819                         if (next_BB)
820                         {
821                             addPredSuccEdges(curr_BB, next_BB);
822                         }
823                     }
824                     else
825                     {
826                         //  always fall through
827                         addPredSuccEdges(curr_BB, next_BB);     // add fall through edge
828                     }
829                 }
830                 else if (i->opcode() == G4_break || i->opcode() == G4_cont || i->opcode() == G4_halt)
831                 {
832                     // JIP and UIP must have been computed at this point
833                     MUST_BE_TRUE(i->asCFInst()->getJip() != NULL && i->asCFInst()->getUip() != NULL,
834                         "null JIP or UIP for break/cont instruction");
835                     addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getJip()));
836 
837                     if (strcmp(i->asCFInst()->getJipLabelStr(), i->asCFInst()->getUipLabelStr()) != 0)
838                         addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getUip()));
839 
840                     //
841                     // pred means conditional branch
842                     //
843                     if (i->getPredicate() != NULL)
844                     {
845                         // add fall through edge
846                         addPredSuccEdges(curr_BB, next_BB);
847                     }
848                 }
849                 else if (i->isReturn() || i->opcode() == G4_pseudo_exit)
850                 {
851                     // does nothing for unconditional return;
852                     // later phase will link the return and the return address
853                     if (i->getPredicate() != NULL)
854                     {
855                         // add fall through edge
856                         addPredSuccEdges(curr_BB, next_BB);
857                     }
858                 }
859                 else if (i->opcode() == G4_pseudo_fcall || i->opcode() == G4_pseudo_fc_call)
860                 {
861                     addPredSuccEdges(curr_BB, next_BB);
862                 }
863                 else if (i->opcode() == G4_pseudo_fret || i->opcode() == G4_pseudo_fc_ret)
864                 {
865                     if (i->getPredicate())
866                     {
867                         // need to add fall through edge
868                         addPredSuccEdges(curr_BB, next_BB);
869                     }
870                 }
871                 else if (i->opcode() == G4_goto)
872                 {
873                     hasNoUniformGoto = true;
874                     hasSIMDCF = true;
875                     addPredSuccEdges(curr_BB, getLabelBB(labelMap, i->asCFInst()->getUip()));
876 
877                     if (i->getPredicate())
878                     {
879                         // fall through, but check if goto target is same as fall-thru
880                         // FIXME: replace all addPredSuccEdges with addUniquePredSuccEdges?
881                         addUniquePredSuccEdges(curr_BB, next_BB);
882                     }
883                 }
884                 else
885                 {
886                     assert(false && "should not reach here");
887                 }
888             } // need edge
889             curr_BB = next_BB;
890         }  // flow control
891         else if (curr_BB->isLastInstEOT())
892         {
893             addToSubroutine(curr_BB, currSubroutine);
894             //this is a send instruction that marks end of thread.
895             //the basic block will end here with no outgoing links
896             curr_BB = beginBB(labelMap, next_i);
897         }
898         else if (next_i && next_i->isLabel())
899         {
900             addToSubroutine(curr_BB, currSubroutine);
901             G4_BB* next_BB = beginBB(labelMap, next_i);
902             addPredSuccEdges(curr_BB, next_BB);
903             curr_BB = next_BB;
904         }
905     }
906     if (curr_BB)
907     {
908         addToSubroutine(curr_BB, currSubroutine);
909     }
910 
911     // we can do this only after fg is constructed
912     pKernel->calculateSimdSize();
913 
914     // Jmpi instruction cannot be used when EU Fusion is enabled.
915     bool hasGoto = hasNoUniformGoto;
916     if (builder->noScalarJmp())
917     {
918         // No jmpi should be inserted after this point.
919         hasGoto |= convertJmpiToGoto();
920     }
921 
922     pKernel->dumpToFile("after.CFGConstruction");
923 
924     removeRedundantLabels();
925 
926     pKernel->dumpToFile("after.RemoveRedundantLabels");
927 
928     handleExit(subroutineStartBB.size() > 1 ? subroutineStartBB[1] : nullptr);
929 
930     handleReturn(labelMap, funcInfoHashTable);
931     mergeReturn(funcInfoHashTable);
932     normalizeSubRoutineBB(funcInfoHashTable);
933 
934     handleWait();
935 
936     if (isStackCallFunc)
937     {
938         mergeFReturns();
939     }
940 
941     setPhysicalPredSucc();
942     removeUnreachableBlocks(funcInfoHashTable);
943 
944     pKernel->dumpToFile("after.RemoveUnreachableBlocks");
945 
946     //
947     // build the table of function info nodes
948     //
949 
950     for (FuncInfoHashTable::iterator it = funcInfoHashTable.begin(), end = funcInfoHashTable.end(); it != end; ++it) {
951         FuncInfo* funcInfo = (*it).second;
952         funcInfo->getInitBB()->setFuncInfo(funcInfo);
953         funcInfo->getExitBB()->setFuncInfo(funcInfo);
954         funcInfoTable.push_back(funcInfo);
955     }
956 
957     if (hasGoto)
958     {
959         // Structurizer requires that the last BB has no goto (ie, the
960         // last BB is either a return or an exit).
961         if (builder->getOption(vISA_EnableStructurizer) &&
962             !endWithGotoInLastBB())
963         {
964             doCFGStructurize(this);
965             pKernel->dumpToFile("after.CFGStructurizer");
966 
967             removeRedundantLabels();
968             pKernel->dumpToFile("after.PostStructurizerRedundantLabels");
969         }
970         else
971         {
972             processGoto(hasSIMDCF);
973             pKernel->dumpToFile("after.ProcessGoto");
974         }
975     }
976 
977     // This finds back edges and populates blocks into function info objects.
978     // The latter will be used to mark SIMD blocks. Prior to RA, no transformation
979     // shall invalidate back edges (G4_BB -> G4_BB).
980     reassignBlockIDs();
981     findBackEdges();
982 
983     if (hasSIMDCF && pKernel->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
984     {
985         processSCF(funcInfoHashTable);
986         addSIMDEdges();
987     }
988 
989     // patch the last BB for the kernel
990     if (funcInfoTable.size() > 0)
991     {
992         kernelInfo->updateExitBB(subroutineStartBB[1]->getPhysicalPred());
993         topologicalSortCallGraph();
994     }
995     else
996     {
997         kernelInfo->updateExitBB(BBs.back());
998     }
999 
1000     // For non-kernel function, always invoke markDivergentBBs to
1001     // conservatively assume divergence.
1002     if ((hasSIMDCF || hasGoto || !builder->getIsKernel()) &&
1003         builder->getOption(vISA_divergentBB))
1004     {
1005         markDivergentBBs();
1006     }
1007 
1008     builder->materializeGlobalImm(getEntryBB());
1009     normalizeRegionDescriptors();
1010     localDataFlowAnalysis();
1011 
1012     markStale();
1013 }
1014 
normalizeRegionDescriptors()1015 void FlowGraph::normalizeRegionDescriptors()
1016 {
1017     for (auto bb : BBs)
1018     {
1019         for (auto inst : *bb)
1020         {
1021             uint16_t execSize = inst->getExecSize();
1022             for (unsigned i = 0, e = (unsigned)inst->getNumSrc(); i < e; ++i)
1023             {
1024                 G4_Operand *src = inst->getSrc(i);
1025                 if (!src || !src->isSrcRegRegion())
1026                     continue;
1027 
1028                 G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
1029                 const RegionDesc *desc = srcRegion->getRegion();
1030                 auto normDesc = builder->getNormalizedRegion(execSize, desc);
1031                 if (normDesc && normDesc != desc)
1032                 {
1033                     srcRegion->setRegion(normDesc, /*invariant*/ true);
1034                 }
1035             }
1036         }
1037     }
1038 }
1039 
1040 // materialize the values in global Imm at entry BB
materializeGlobalImm(G4_BB * entryBB)1041 void IR_Builder::materializeGlobalImm(G4_BB* entryBB)
1042 {
1043     InstSplitPass instSplitter(this);
1044     for (int i = 0, numImm = immPool.size(); i < numImm; ++i)
1045     {
1046         auto&& immVal = immPool.getImmVal(i);
1047         auto dcl = immPool.getImmDcl(i);
1048         G4_INST* inst = createMov(
1049             G4_ExecSize((unsigned)immVal.numElt),
1050             createDstRegRegion(dcl, 1), immVal.imm, InstOpt_WriteEnable, false);
1051         auto iter = std::find_if(entryBB->begin(), entryBB->end(),
1052             [](G4_INST* inst) { return !inst->isLabel(); });
1053         INST_LIST_ITER newMov = entryBB->insertBefore(iter, inst);
1054         instSplitter.splitInstruction(newMov, entryBB->getInstList());
1055     }
1056 }
1057 
1058 //
1059 // attempt to sink the pseudo-wait into the immediate succeeding send instruction (in same BB)
1060 // by changing it into a sendc.
1061 // If sinking fails, generate a fence with sendc.
1062 //
handleWait()1063 void FlowGraph::handleWait()
1064 {
1065     for (auto bb : BBs)
1066     {
1067         auto iter = bb->begin(), instEnd = bb->end();
1068         while (iter != instEnd)
1069         {
1070             auto inst = *iter;
1071             if (inst->isIntrinsic() &&
1072                 inst->asIntrinsicInst()->getIntrinsicId() == Intrinsic::Wait)
1073             {
1074                 bool sunk = false;
1075                 auto nextIter = iter;
1076                 ++nextIter;
1077                 while (nextIter != instEnd)
1078                 {
1079                     G4_INST* nextInst = *nextIter;
1080                     if (nextInst->isSend())
1081                     {
1082                         nextInst->asSendInst()->setSendc();
1083                         sunk = true;
1084                         break;
1085                     }
1086                     else if (nextInst->isOptBarrier())
1087                     {
1088                         break;
1089                     }
1090                     ++nextIter;
1091                 }
1092                 if (!sunk)
1093                 {
1094                     auto fenceInst = builder->createSLMFence();
1095                     auto sendInst = fenceInst->asSendInst();
1096                     if (sendInst != NULL)
1097                     {
1098                         sendInst->setSendc();
1099                         bb->insertBefore(iter, fenceInst);
1100                     }
1101                 }
1102                 iter = bb->erase(iter);
1103             }
1104             else
1105             {
1106                 ++iter;
1107             }
1108         }
1109     }
1110 }
1111 
1112 //
1113 // Each g4_pseudo_exit instruction will be translated into one of the following:
1114 // -- a unconditional simd1 ret: translated into a EOT send (may be optionally merged with
1115 //    an immediately preceding send)
1116 // -- a conditional simd1 ret: translated into a jmpi to the exit block
1117 // -- a non-uniforom ret: translated into a halt to the exit block
1118 // for case 2 and 3 an exit block will be inserted, and it will consist of a EOT send
1119 // plus a join if it's targeted by another goto instruction
1120 //
handleExit(G4_BB * firstSubroutineBB)1121 void FlowGraph::handleExit(G4_BB* firstSubroutineBB)
1122 {
1123 
1124     // blocks that end with non-uniform or conditional return
1125     std::vector<G4_BB*> exitBlocks;
1126     BB_LIST_ITER iter = BBs.begin(), iterEnd = BBs.end();
1127     for (; iter != iterEnd; ++iter)
1128     {
1129         G4_BB* bb = *iter;
1130         if (bb->size() == 0)
1131         {
1132             continue;
1133         }
1134 
1135         if (bb == firstSubroutineBB)
1136         {
1137             // we've reached the first subroutine's entry BB,
1138             break;
1139         }
1140 
1141         G4_INST* lastInst = bb->back();
1142         if (lastInst->opcode() == G4_pseudo_exit)
1143         {
1144             if (lastInst->getExecSize() == g4::SIMD1 &&
1145                 !(builder->getFCPatchInfo() && builder->getFCPatchInfo()->getFCComposableKernel()))
1146             {
1147                 //uniform return
1148                 if (lastInst->getPredicate() != NULL)
1149                 {
1150                     exitBlocks.push_back(bb);
1151                 }
1152                 else
1153                 {
1154                     //generate EOT send
1155                     G4_INST* lastInst = bb->back();
1156                     bb->pop_back();
1157                     bool needsEOTSend = true;
1158                     // the EOT may be folded into the BB's last instruction if it's a send
1159                     // that supports EOT
1160                     if (builder->getOption(vISA_foldEOTtoPrevSend) && bb->size() > 1)
1161                     {
1162                         G4_InstSend* secondToLastInst = bb->back()->asSendInst();
1163                         if (secondToLastInst && secondToLastInst->canBeEOT() &&
1164                             !(secondToLastInst->getMsgDesc()->getSrc1LenRegs() > 2 &&
1165                                 VISA_WA_CHECK(builder->getPWaTable(), WaSendsSrc1SizeLimitWhenEOT)))
1166                         {
1167                             G4_SendDescRaw *rawDesc = secondToLastInst->getMsgDescRaw();
1168                             MUST_BE_TRUE(rawDesc, "expected raw descriptor");
1169                             rawDesc->setEOT();
1170                             if (secondToLastInst->isSplitSend())
1171                             {
1172                                 if (secondToLastInst->getSrc(3)->isImm())
1173                                 {
1174                                     secondToLastInst->setSrc(
1175                                         builder->createImm(rawDesc->getExtendedDesc(), Type_UD), 3);
1176                                 }
1177                             }
1178                             needsEOTSend = false;
1179                             if (builder->getHasNullReturnSampler() && VISA_WA_CHECK(builder->getPWaTable(), Wa_1607871015))
1180                             {
1181                                 bb->addSamplerFlushBeforeEOT();
1182                             }
1183                         }
1184                     }
1185 
1186                     if (needsEOTSend)
1187                     {
1188                         bb->addEOTSend(lastInst);
1189                     }
1190                 }
1191             }
1192             else
1193             {
1194                 exitBlocks.push_back(bb);
1195             }
1196         }
1197     }
1198 
1199     // create an exit BB
1200     if (exitBlocks.size() > 0)
1201     {
1202         G4_BB *exitBB = createNewBB();
1203 
1204         if (builder->getFCPatchInfo() &&
1205             builder->getFCPatchInfo()->getFCComposableKernel())
1206         {
1207             // For FC composable kernels insert exitBB as
1208             // last BB in BBs list. This automatically does
1209             // jump threading.
1210             push_back(exitBB);
1211         }
1212         else
1213         {
1214             insert(iter, exitBB);
1215         }
1216 
1217         std::string exitBBStr("__EXIT_BB");
1218         G4_Label *exitLabel = builder->createLabel(exitBBStr, LABEL_BLOCK);
1219         G4_INST* label = createNewLabelInst(exitLabel);
1220         exitBB->push_back(label);
1221 
1222         if (!(builder->getFCPatchInfo() &&
1223             builder->getFCPatchInfo()->getFCComposableKernel()))
1224         {
1225             // Dont insert EOT send for FC composable kernels
1226             exitBB->addEOTSend();
1227         }
1228 
1229         for (int i = 0, size = (int)exitBlocks.size(); i < size; i++)
1230         {
1231             G4_BB* retBB = exitBlocks[i];
1232             addPredSuccEdges(retBB, exitBB, false);
1233             G4_INST* retInst = retBB->back();
1234             retBB->pop_back();
1235 
1236             // Insert jump only if newly inserted exitBB is not lexical
1237             // successor of current BB
1238             auto lastBBIt = BBs.end();
1239             lastBBIt--;
1240             if ((*lastBBIt) == exitBB)
1241             {
1242                 lastBBIt--;
1243                 if ((*lastBBIt) == retBB)
1244                 {
1245                     // This condition is BB layout dependent.
1246                     // However, we dont change BB layout in JIT
1247                     // and in case we do it in future, we
1248                     // will need to insert correct jumps
1249                     // there to preserve correctness.
1250                     continue;
1251                 }
1252             }
1253 
1254             if (retInst->getExecSize() == g4::SIMD1)
1255             {
1256                 G4_INST* jmpInst = builder->createJmp(retInst->getPredicate(), exitLabel, InstOpt_NoOpt, false);
1257                 retBB->push_back(jmpInst);
1258             }
1259             else
1260             {
1261                 // uip for goto will be fixed later
1262                 G4_INST* gotoInst = builder->createInternalCFInst(retInst->getPredicate(),
1263                     G4_goto, retInst->getExecSize(), exitLabel, exitLabel, InstOpt_NoOpt);
1264                 retBB->push_back(gotoInst);
1265             }
1266         }
1267     }
1268 
1269 #ifdef _DEBUG
1270 
1271     // sanity check
1272     for (const G4_BB* bb : BBs)
1273     {
1274         if (bb->size() == 0)
1275         {
1276             continue;
1277         }
1278         const G4_INST* lastInst = bb->back();
1279         if (lastInst->opcode() == G4_pseudo_exit)
1280         {
1281             MUST_BE_TRUE(false, "All pseudo exits should be removed at this point");
1282         }
1283     }
1284 
1285 #endif
1286 }
1287 
1288 //
1289 // This phase iterates each BB and checks if the last inst of a BB is a "call foo".  If yes,
1290 // the algorithm traverses from the block of foo to search for RETURN and link the block of
1291 // RETURN with the block of the return address
1292 //
handleReturn(Label_BB_Map & labelMap,FuncInfoHashTable & funcInfoHashTable)1293 void FlowGraph::handleReturn(Label_BB_Map& labelMap, FuncInfoHashTable& funcInfoHashTable)
1294 {
1295     for (std::list<G4_BB*>::iterator it = BBs.begin(), itEnd = BBs.end(); it != itEnd; ++it)
1296     {
1297         G4_BB* bb = (*it);
1298 
1299         if (bb->isEndWithCall())
1300         {
1301             bb->setBBType(G4_BB_CALL_TYPE);
1302             G4_INST* last = bb->back();
1303             if (last->getSrc(0)->isLabel())
1304             {
1305                 // make sure bb has only two successors, one subroutine and one return addr
1306                 MUST_BE_TRUE1(bb->Succs.size() == 2, last->getLineNo(),
1307                     ERROR_FLOWGRAPH);
1308 
1309                 // find the subroutine BB and return Addr BB
1310                 G4_BB* subBB = labelMap[last->getSrc(0)->asLabel()];
1311                 //
1312                 // the fall through BB must be the front
1313                 //
1314                 G4_BB* retAddr = bb->Succs.front();
1315                 if (retAddr->Preds.size() > 1)
1316                 {
1317                     // Create an empty BB as a new RETURN BB as we want RETURN BB
1318                     // to be reached only by CALL BB.  Note that at this time, call
1319                     // return edge has not been added yet.
1320                     G4_INST* I0 = retAddr->getFirstInst();
1321                     if (I0 == nullptr) I0 = last;
1322                     G4_BB* newRetBB = createNewBBWithLabel("Label_return_BB", I0->getLineNo(), I0->getCISAOff());
1323                     bb->removeSuccEdge(retAddr);
1324                     retAddr->removePredEdge(bb);
1325                     addPredSuccEdges(bb, newRetBB, true);
1326                     addPredSuccEdges(newRetBB, retAddr, true);
1327 
1328                     insert(std::next(it), newRetBB);
1329                     retAddr = newRetBB;
1330                 }
1331                 // Add EXIT->RETURN edge.
1332                 linkReturnAddr(subBB, retAddr);
1333 
1334                 // set callee info for CALL
1335                 FuncInfoHashTable::iterator calleeInfoLoc = funcInfoHashTable.find(subBB->getId());
1336 
1337                 if (calleeInfoLoc != funcInfoHashTable.end())
1338                 {
1339                     (*calleeInfoLoc).second->incrementCallCount();
1340                     bb->setCalleeInfo((*calleeInfoLoc).second);
1341                     doIPA = true;
1342                 }
1343                 else
1344                 {
1345                     unsigned funcId = (unsigned)funcInfoHashTable.size();
1346                     FuncInfo *funcInfo = new (mem)FuncInfo(
1347                         funcId, subBB, retAddr->Preds.front());
1348                     std::pair<FuncInfoHashTable::iterator, bool> loc =
1349                         funcInfoHashTable.insert(
1350                             std::make_pair(subBB->getId(), funcInfo));
1351                     subBB->setBBType(G4_BB_INIT_TYPE);
1352                     retAddr->Preds.front()->setBBType(G4_BB_EXIT_TYPE);
1353                     MUST_BE_TRUE(loc.second, ERROR_FLOWGRAPH);
1354                     bb->setCalleeInfo((*(loc.first)).second);
1355                 }
1356                 retAddr->setBBType(G4_BB_RETURN_TYPE);
1357             }
1358         }
1359 
1360         if (bb->isEndWithFCall())
1361         {
1362             // normalizeFlowGraph() process FCALL BB. (Maybe should be processed here?)
1363             // Here just set BB type
1364             bb->setBBType(G4_BB_FCALL_TYPE);
1365         }
1366     }
1367     //
1368     // remove <CALL, return addr> link when it is not a conditional call
1369     //
1370     for (G4_BB* bb : BBs)
1371     {
1372         if (bb->isEndWithCall())
1373         {
1374             G4_INST* last = bb->back();
1375             if (last->getPredicate() == NULL)
1376             {
1377                 MUST_BE_TRUE(!bb->Succs.empty(), ERROR_FLOWGRAPH);
1378                 G4_BB* retAddr = bb->Succs.front();     // return BB must be the front
1379                 bb->removeSuccEdge(retAddr);
1380                 retAddr->removePredEdge(bb);
1381             }
1382         }
1383     }
1384 }
1385 
linkReturnAddr(G4_BB * entryBB,G4_BB * returnAddr)1386 void FlowGraph::linkReturnAddr(G4_BB* entryBB, G4_BB* returnAddr)
1387 {
1388 
1389     assert(entryBB->size() > 0 && entryBB->front()->isLabel() && "BB should start with a label");
1390     auto label = entryBB->front()->getLabel();
1391     assert(subroutines.count(label) && "can't find subroutine label");
1392     auto subroutineBBs = subroutines[label];
1393 
1394     for (auto bb : subroutineBBs)
1395     {
1396         if (!bb->empty() && bb->back()->isReturn())
1397         {
1398             //
1399             // check the direct recursive call here!
1400             //
1401             if (bb == returnAddr && hasStackCalls == false)
1402             {
1403                 MUST_BE_TRUE(false,
1404                     "ERROR: Do not support recursive subroutine call!");
1405             }
1406             addPredSuccEdges(bb, returnAddr, false); // IMPORTANT: add to the back to allow fall through edge is in the front, which is used by fallThroughBB()
1407         }
1408     }
1409 }
1410 
1411 
1412 //
1413 // This phase iterates each BB and checks if the last inst of a BB is a "call foo".  If yes,
1414 // the algorith traverses from the block of foo to search for RETURN. If multiple returns exist,
1415 // the algorithm will merge returns into one
1416 // [Reason]:to handle the case when spill codes are needed between multiple return BBs of one subroutine
1417 //          and the afterCAll BB. It is impossible to insert spill codes of different return BBs all before
1418 //          afterCall BB.
1419 //
mergeReturn(FuncInfoHashTable & funcInfoHashTable)1420 void FlowGraph::mergeReturn(FuncInfoHashTable& funcInfoHashTable)
1421 {
1422 
1423     for (auto I = subroutines.begin(), E = subroutines.end(); I != E; ++I)
1424     {
1425         auto label = I->first;
1426         G4_BB* subBB = I->second.front();
1427         G4_BB* mergedExitBB = mergeSubRoutineReturn(label);
1428 
1429         // update callee exit block info
1430         FuncInfoHashTable::iterator calleeInfoLoc = funcInfoHashTable.find(subBB->getId());
1431         // it's possible for the subroutine to never be called (e.g., kernel function)
1432         if (calleeInfoLoc != funcInfoHashTable.end() && mergedExitBB)
1433         {
1434             calleeInfoLoc->second->getExitBB()->unsetBBType(G4_BB_EXIT_TYPE);
1435             mergedExitBB->setBBType(G4_BB_EXIT_TYPE);
1436             calleeInfoLoc->second->updateExitBB(mergedExitBB);
1437         }
1438     }
1439 }
1440 
mergeSubRoutineReturn(G4_Label * subroutineLabel)1441 G4_BB* FlowGraph::mergeSubRoutineReturn(G4_Label* subroutineLabel)
1442 {
1443 
1444     G4_BB* newBB = nullptr;
1445 
1446     auto subroutineBB = subroutines[subroutineLabel];
1447 
1448     std::vector<G4_BB*> retBBList;
1449     for (auto bb : subroutineBB)
1450     {
1451         if (!bb->empty() && bb->back()->isReturn())
1452         {
1453             retBBList.push_back(bb);
1454         }
1455     }
1456 
1457     if (retBBList.size() > 1)     // For return number >1, need to merge returns
1458     {
1459         builder->instList.clear();
1460 
1461         //
1462         // insert newBB to fg.BBs list
1463         //
1464         newBB = createNewBB();
1465         insert(BBs.end(), newBB);
1466         // choose the last BB in retBBList as a candidate
1467         G4_BB* candidateBB = *(retBBList.rbegin());
1468         // Add <newBB, succBB> edges
1469         G4_INST* last = candidateBB->back();
1470         BB_LIST_ITER succIt = (last->getPredicate() == NULL) ? candidateBB->Succs.begin() : (++candidateBB->Succs.begin());
1471         BB_LIST_ITER succItEnd = candidateBB->Succs.end();
1472         // link new ret BB with each call site
1473         for (; succIt != succItEnd; ++succIt) {
1474             addPredSuccEdges(newBB, (*succIt), false);
1475         }
1476 
1477         //
1478         // Create a label for the newBB and insert return to new BB
1479         //
1480         std::string str = "LABEL__" + std::to_string(newBB->getId());
1481         G4_Label* lab = builder->createLabel(str, LABEL_BLOCK);
1482         G4_INST* labelInst = createNewLabelInst(lab);
1483 
1484         // exitBB is really just a dummy one for analysis, and does not need a return
1485         // we will instead leave the return in each of its predecessors
1486         newBB->push_back(labelInst);
1487 
1488         //
1489         // Deal with all return BBs
1490         //
1491         for (G4_BB * retBB : retBBList)
1492         {
1493             if (retBB->getId() == newBB->getId())
1494             {
1495                 continue;
1496             }
1497             last = retBB->back();
1498             // remove <retBB,its succBB> edges, do not remove the fall through edge if predicated
1499             BB_LIST retSuccsList;
1500             retSuccsList.assign(retBB->Succs.begin(), retBB->Succs.end());
1501             for (BB_LIST_ITER retSuccIt = retSuccsList.begin(); retSuccIt != retSuccsList.end(); ++retSuccIt)
1502             {
1503                 G4_BB * retSuccBB = (*retSuccIt);
1504                 if (last->getPredicate() != NULL && retSuccIt == retSuccsList.begin()) continue;
1505                 retBB->removeSuccEdge(retSuccBB);
1506                 retSuccBB->removePredEdge(retBB);
1507             }
1508             // Add <retBB,newBB> edges
1509             addPredSuccEdges(retBB, newBB, false);
1510         }
1511     }
1512 
1513     return newBB;
1514 }
1515 
decoupleInitBlock(G4_BB * bb,FuncInfoHashTable & funcInfoHashTable)1516 void FlowGraph::decoupleInitBlock(G4_BB* bb, FuncInfoHashTable& funcInfoHashTable)
1517 {
1518     G4_BB* oldInitBB = bb;
1519     G4_BB* newInitBB = createNewBB();
1520     BB_LIST_ITER insertBefore = std::find(BBs.begin(), BBs.end(), bb);
1521     MUST_BE_TRUE(insertBefore != BBs.end(), ERROR_FLOWGRAPH);
1522     insert(insertBefore, newInitBB);
1523 
1524     BB_LIST_ITER kt = oldInitBB->Preds.begin();
1525     while (kt != oldInitBB->Preds.end())
1526     {
1527         // the pred of this new INIT BB are all call BB
1528         if ((*kt)->getBBType() & G4_BB_CALL_TYPE) {
1529 
1530             newInitBB->Preds.push_back((*kt));
1531 
1532             BB_LIST_ITER jt = (*kt)->Succs.begin();
1533             while (jt != (*kt)->Succs.end()) {
1534                 if ((*jt) == oldInitBB)
1535                 {
1536                     break;
1537                 }
1538                 jt++;
1539             }
1540             MUST_BE_TRUE(jt != (*kt)->Succs.end(), ERROR_FLOWGRAPH);
1541             (*kt)->Succs.insert(jt, newInitBB);
1542             (*kt)->Succs.erase(jt);
1543 
1544             BB_LIST_ITER tmp_kt = kt;
1545             ++kt;
1546             // erase this pred from old INIT BB's pred
1547             oldInitBB->Preds.erase(tmp_kt);
1548         }
1549         else
1550         {
1551             ++kt;
1552         }
1553     }
1554 
1555     FuncInfoHashTable::iterator old_iter = funcInfoHashTable.find(oldInitBB->getId());
1556     MUST_BE_TRUE(old_iter != funcInfoHashTable.end(), " Function info is not in hashtable.");
1557     FuncInfo* funcInfo = old_iter->second;
1558 
1559     // Erase the old item from unordered_map and add the new one.
1560     funcInfo->updateInitBB(newInitBB);
1561     funcInfoHashTable.erase(old_iter);
1562     funcInfoHashTable.insert(std::make_pair(newInitBB->getId(), funcInfo));
1563 
1564     oldInitBB->unsetBBType(G4_BB_INIT_TYPE);
1565     newInitBB->setBBType(G4_BB_INIT_TYPE);
1566     addPredSuccEdges(newInitBB, oldInitBB);
1567 
1568     std::string blockName = "LABEL__EMPTYBB__" + std::to_string(newInitBB->getId());
1569     G4_Label* label = builder->createLabel(blockName, LABEL_BLOCK);
1570     G4_INST* labelInst = createNewLabelInst(label);
1571     newInitBB->push_back(labelInst);
1572 }
1573 
decoupleReturnBlock(G4_BB * bb)1574 void FlowGraph::decoupleReturnBlock(G4_BB* bb)
1575 {
1576     G4_BB* oldRetBB = bb;
1577     G4_BB* itsExitBB = oldRetBB->BBBeforeCall()->getCalleeInfo()->getExitBB();
1578     G4_BB* newRetBB = createNewBB();
1579     BB_LIST_ITER insertBefore = std::find(BBs.begin(), BBs.end(), bb);
1580     MUST_BE_TRUE(insertBefore != BBs.end(), ERROR_FLOWGRAPH);
1581     insert(insertBefore, newRetBB);
1582 
1583     std::replace(itsExitBB->Succs.begin(), itsExitBB->Succs.end(), oldRetBB, newRetBB);
1584     newRetBB->Preds.push_back(itsExitBB);
1585     newRetBB->Succs.push_back(oldRetBB);
1586 
1587     std::replace(oldRetBB->Preds.begin(), oldRetBB->Preds.end(), itsExitBB, newRetBB);
1588     oldRetBB->Preds.unique();
1589 
1590     oldRetBB->unsetBBType(G4_BB_RETURN_TYPE);
1591     newRetBB->setBBType(G4_BB_RETURN_TYPE);
1592 
1593     std::string str = "LABEL__EMPTYBB__" + std::to_string(newRetBB->getId());
1594     G4_Label* label = builder->createLabel(str, LABEL_BLOCK);
1595     G4_INST* labelInst = createNewLabelInst(label);
1596     newRetBB->push_back(labelInst);
1597 }
1598 
1599 // Make sure that a BB is at most one of CALL/RETURN/EXIT/INIT. If any
1600 // BB is both, say INIT and CALL, decouple them by inserting a new BB.
1601 // [The above comments are post-added from reading the code.]
1602 //
1603 // The inserted BB must be in the original place so that each subroutine
1604 // has a list of consecutive BBs.
normalizeSubRoutineBB(FuncInfoHashTable & funcInfoTable)1605 void FlowGraph::normalizeSubRoutineBB(FuncInfoHashTable& funcInfoTable)
1606 {
1607     setPhysicalPredSucc();
1608     for (G4_BB* bb : BBs)
1609     {
1610         if ((bb->getBBType() & G4_BB_CALL_TYPE))
1611         {
1612             if (bb->getBBType() & G4_BB_EXIT_TYPE)
1613             {
1614                 // As call BB has RETURN BB as fall-thru, cannot be EXIT
1615                 MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
1616             }
1617 
1618             // BB could be either INIT or RETURN, but not both.
1619             if (bb->getBBType() & G4_BB_INIT_TYPE)
1620             {
1621                 decoupleInitBlock(bb, funcInfoTable);
1622             }
1623             else if (bb->getBBType() & G4_BB_RETURN_TYPE)
1624             {
1625                 decoupleReturnBlock(bb);
1626             }
1627         }
1628         else if ((bb->getBBType() & G4_BB_INIT_TYPE))
1629         {
1630             if (bb->getBBType() & G4_BB_RETURN_TYPE)
1631             {
1632                 // As retrun BB must have a pred, it cannot be init BB
1633                 MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
1634             }
1635 
1636             // Two possible combinations: INIT & CALL, or INIT & EXIT. INIT & CALL has
1637             // been processed in the previous IF, here only INIT and EXIT is possible.
1638             if (bb->getBBType() & G4_BB_EXIT_TYPE)
1639             {
1640                 decoupleInitBlock(bb, funcInfoTable);
1641             }
1642         }
1643         else if ((bb->getBBType() & G4_BB_EXIT_TYPE))
1644         {
1645             // Only EXIT & RETURN are possible. (INIT & EXIT
1646             // has been processed)
1647             if (bb->getBBType() & G4_BB_RETURN_TYPE)
1648             {
1649                 decoupleReturnBlock(bb);
1650             }
1651         }
1652         else if ((bb->getBBType() & G4_BB_RETURN_TYPE))
1653         {
1654             // Do we need to do this ?
1655             if (bb->Preds.size() > 1)
1656             {
1657                 decoupleReturnBlock(bb);
1658             }
1659         }
1660     }
1661 }
1662 
1663 //
1664 // This function does a DFS and any blocks that get visited will have their
1665 // preId set according to the ordering. Blocks that never get visited will
1666 // have their preId unmodified.
1667 //
doDFS(G4_BB * startBB,unsigned int p)1668 void doDFS(G4_BB* startBB, unsigned int p)
1669 {
1670     unsigned int currP = p;
1671     std::stack<G4_BB*> traversalStack;
1672     traversalStack.push(startBB);
1673 
1674     while (!traversalStack.empty())
1675     {
1676         G4_BB* currBB = traversalStack.top();
1677         traversalStack.pop();
1678         if (currBB->getPreId() != UINT_MAX)
1679         {
1680             continue;
1681         }
1682         currBB->setPreId(currP++);
1683 
1684         for (G4_BB* tmp : currBB->Succs)
1685         {
1686             if (tmp->getPreId() == UINT_MAX)
1687             {
1688                 traversalStack.push(tmp);
1689             }
1690         }
1691     }
1692 }
1693 
1694 //
1695 // The optimization pass will remove unreachable blocks from functions. Later compilation phases
1696 // assume that the only unreachable code that can exist in a function is the block with
1697 // return/pseudo_fret instruction. All other unreachable code should be removed. The only time
1698 // blocks with return/pseudo_fret will be removed is when the header of that function itself
1699 // is deemed unreachable.
1700 //
removeUnreachableBlocks(FuncInfoHashTable & funcInfoHT)1701 void FlowGraph::removeUnreachableBlocks(FuncInfoHashTable& funcInfoHT)
1702 {
1703     unsigned preId = 0;
1704     //
1705     // initializations
1706     //
1707     for (G4_BB* bb : BBs)
1708     {
1709         bb->setPreId(UINT_MAX);
1710     }
1711     //
1712     // assign DFS based pre/rpost ids to all blocks in the main program
1713     //
1714     doDFS(getEntryBB(), preId);
1715 
1716     //
1717     // Basic blocks with preId/rpostId set to UINT_MAX are unreachable.
1718     // Special handling:
1719     //   1. If CALL_BB isn't dead, don't remove RETURN_BB;
1720     //   2. If INIT_BB isn't dead, don't remove EXIT_BB.
1721     //   3. For BB with EOT, don't remove it.
1722     // (Note that in BB list (BBs), CALL_BB appears before RETURN_BB and INIT_BB
1723     //  appears before EXIT_BB. The algo works under this assumpion.)
1724     BB_LIST_ITER it = BBs.begin();
1725     while (it != BBs.end())
1726     {
1727         G4_BB* bb = (*it);
1728         if (bb->getPreId() == UINT_MAX)
1729         {
1730             if (bb->getBBType() & G4_BB_INIT_TYPE)
1731             {
1732                 // Remove it from funcInfoHT.
1733                 int funcId = bb->getId();
1734                 unsigned numErased = funcInfoHT.erase(funcId);
1735                 assert(numErased == 1);
1736             }
1737             else if (bb->getBBType() & G4_BB_CALL_TYPE)
1738             {
1739                 // If call bb is removed, its return BB shuld be removed as well.
1740                 G4_BB* retBB = bb->getPhysicalSucc();
1741                 assert(retBB && "vISA ICE: missing Return BB");
1742                 if (retBB->getPreId() != UINT_MAX)
1743                 {
1744                     retBB->setPreId(UINT_MAX);
1745                 }
1746             }
1747 
1748             // EOT should be in entry function, we keep it for now.
1749             if (bb->size() > 0 && bb->back()->isEOT())
1750             {
1751                 ++it;
1752                 continue;
1753             }
1754 
1755             // Now, remove bb.
1756             while (bb->Succs.size() > 0)
1757             {
1758                 removePredSuccEdges(bb, bb->Succs.front());
1759             }
1760             if (bb->getBBType() & G4_BB_RETURN_TYPE)
1761             {
1762                 // As callee may be live, need to remove pred edges from exit BB.
1763                 for (auto& II : bb->Preds)
1764                 {
1765                     G4_BB* exitBB = II;
1766                     if (exitBB->getBBType() & G4_BB_EXIT_TYPE)
1767                     {
1768                         removePredSuccEdges(exitBB, bb);
1769                         break;
1770                     }
1771                 }
1772             }
1773 
1774             BB_LIST_ITER prev = it;
1775             ++it;
1776             erase(prev);
1777         }
1778         else
1779         {
1780             if (bb->getBBType() & G4_BB_CALL_TYPE)
1781             {
1782                 // CALL BB isn't dead, don't remove return BB.
1783                 G4_BB* retBB = bb->getPhysicalSucc();
1784                 assert(retBB && (retBB->getBBType() & G4_BB_RETURN_TYPE) &&
1785                     "vISA ICE: missing RETURN BB");
1786                 if (retBB->getPreId() == UINT_MAX)
1787                 {
1788                     // For example,
1789                     //    CALL_BB  : call A   succ: init_BB
1790                     //    RETURN_BB: pred: exit_BB
1791                     //
1792                     //    // subroutine A
1793                     //    init_BB:
1794                     //       while (true) ...;  // inifinite loop
1795                     //    exit_BB:  succ : RETURN_BB
1796                     // Both RETURN_BB and exit_BB are unreachable, but
1797                     // we keep both!
1798                     retBB->setPreId(UINT_MAX - 1);
1799                 }
1800             }
1801             else if (bb->getBBType() & G4_BB_INIT_TYPE)
1802             {
1803                 // function isn't dead, don't remove exit BB.
1804                 int funcId = bb->getId();
1805                 auto entry = funcInfoHT.find(funcId);
1806                 assert(entry != funcInfoHT.end());
1807                 FuncInfo* finfo = entry->second;
1808                 G4_BB* exitBB = finfo->getExitBB();
1809                 assert(exitBB && "vISA ICE: missing exit BB");
1810                 if (exitBB->getPreId() == UINT_MAX)
1811                 {
1812                     // See the example above for G4_BB_CALL_TYPE
1813                     exitBB->setPreId(UINT_MAX - 1);
1814                 }
1815             }
1816             it++;
1817         }
1818     }
1819     reassignBlockIDs();
1820     setPhysicalPredSucc();
1821 }
1822 
1823 //
1824 // Remove redundant goto/jmpi
1825 // Remove the fall through edges between subroutine and its non-caller preds
1826 // Remove basic blocks that only contain a label, function labels are untouched.
1827 // Remove empty blocks.
1828 //
removeRedundantLabels()1829 void FlowGraph::removeRedundantLabels()
1830 {
1831     // first,  remove redundant goto
1832     //    goto L0
1833     //      ....
1834     //    goto L1     <-- redundant goto
1835     // L0: <empty BB>
1836     // L1:
1837     BB_LIST_ITER Next = BBs.begin(), IE = BBs.end();
1838     for (BB_LIST_ITER II = Next; II != IE; II = Next)
1839     {
1840         ++Next;
1841 
1842         G4_BB* BB = *II;
1843         G4_opcode lastop = BB->getLastOpcode();
1844         if (BB->Succs.size() == 1 && (lastop == G4_goto || lastop == G4_jmpi))
1845         {
1846             G4_BB* SuccBB = BB->Succs.back();
1847             G4_BB* BB1 = nullptr;
1848             // Skip over empty BBs
1849             for (auto iter = Next; iter != IE; ++iter)
1850             {
1851                 BB1 = *iter;
1852                 if (BB1 != SuccBB &&
1853                     (BB1->empty() ||
1854                      (BB1->size() == 1 && BB1->getLastOpcode() == G4_label)))
1855                 {
1856                     continue;
1857                 }
1858                 break;
1859             }
1860             if (BB1 && SuccBB == BB1)
1861             {
1862                 // Remove goto and its fall-thru will be its succ.
1863                 G4_BB* fallThruBB = *Next;
1864                 if (fallThruBB != SuccBB)
1865                 {
1866                     // Reconnect succ/pred for BB
1867                     removePredSuccEdges(BB, SuccBB);
1868                     addPredSuccEdges(BB, fallThruBB);
1869                 }
1870                 BB->pop_back();
1871             }
1872         }
1873     }
1874 
1875     for (BB_LIST_ITER nextit = BBs.begin(), ite = BBs.end(); nextit != ite; )
1876     {
1877         BB_LIST_ITER it = nextit++;
1878         G4_BB* bb = *it;
1879 
1880         if (bb == getEntryBB())
1881         {
1882             continue;
1883         }
1884         if (bb->Succs.size() == 0 && bb->Preds.size() == 0) {
1885             //leaving dangling BBs with return in for now.
1886             //workaround to handle unreachable return
1887             //for example return after infinite loop.
1888             if (bb->isEndWithFRet() || (bb->size() > 0 && ((G4_INST*)bb->back())->isReturn()))
1889             {
1890                 continue;
1891             }
1892 
1893             bb->clear();
1894             erase(it);
1895 
1896             continue;
1897         }
1898 
1899         assert(bb->size() > 0 && bb->front()->isLabel() &&
1900                "Every BB should at least have a label inst!");
1901 
1902         // Possible kernel's entry, don't delete.
1903         if (strcmp(bb->front()->getLabelStr(), "per-thread-prolog") == 0)
1904         {
1905             continue;
1906         }
1907 
1908         if (bb->getBBType() &
1909             (G4_BB_CALL_TYPE | G4_BB_EXIT_TYPE | G4_BB_INIT_TYPE | G4_BB_RETURN_TYPE))
1910         {
1911             // Keep those BBs
1912             continue;
1913         }
1914 
1915         //
1916         // The removal candidates will have a single successor and a single inst
1917         //
1918         if (bb->Succs.size() == 1 && bb->size() == 1)
1919         {
1920             G4_INST* removedBlockInst = bb->front();
1921             if (removedBlockInst->getLabel()->isFuncLabel() ||
1922                 strncmp(removedBlockInst->getLabelStr(), "LABEL__EMPTYBB", 14) == 0 ||
1923                 strncmp(removedBlockInst->getLabelStr(), "__AUTO_GENERATED_DUMMY_LAST_BB", 30) == 0)
1924             {
1925                 continue;
1926             }
1927 
1928             G4_Label *succ_label = bb->Succs.front()->front()->getLabel();
1929 
1930             // check if the last inst of pred is a control flow inst
1931             for (auto pred : bb->Preds)
1932             {
1933                 auto jt = std::find(pred->Succs.begin(), pred->Succs.end(), bb);
1934                 if (jt == pred->Succs.end())
1935                 {
1936                     // Just skip!
1937                     //
1938                     // Each unique pred is processed just once.  If a pred appears
1939                     // more than once in bb->Preds, it is only handled the first time
1940                     // the pred is processed.
1941                     //   Note that if jt == end(), it means that this pred appears
1942                     //   more than once in the Preds list AND it has been handled
1943                     //   before (see code at the end of this loop). So, it is safe to skip.
1944                     continue;
1945                 }
1946 
1947                 G4_INST *i = pred->back();
1948                 // replace label in instructions
1949                 if (i->isFlowControl())
1950                 {
1951                     if (isIndirectJmpTarget(i))
1952                     {
1953                         // due to the switchjmp we may have multiple jmpi
1954                         // at the end of a block.
1955                         bool foundMatchingJmp = false;
1956                         for (INST_LIST::iterator iter = --pred->end();
1957                             iter != pred->begin(); --iter)
1958                         {
1959                             i = *iter;
1960                             if (i->opcode() == G4_jmpi)
1961                             {
1962                                 if (i->getSrc(0)->isLabel() &&
1963                                     i->getSrc(0) == removedBlockInst->getLabel())
1964                                 {
1965                                     i->setSrc(succ_label, 0);
1966                                     foundMatchingJmp = true;
1967                                     break;
1968                                 }
1969                             }
1970                             else
1971                             {
1972                                 break;
1973                             }
1974                         }
1975                         MUST_BE_TRUE(foundMatchingJmp, "Can't find the matching jmpi to the given label");
1976                     }
1977                     else if (i->opcode() == G4_jmpi || i->isCall())
1978                     {
1979                         if (i->getSrc(0)->isLabel())
1980                         {
1981                             if (i->getSrc(0) == removedBlockInst->getLabel())
1982                             {
1983                                 i->setSrc(succ_label, 0);
1984                             }
1985                         }
1986                     }
1987                     else if (i->opcode() == G4_if || i->opcode() == G4_while ||
1988                         i->opcode() == G4_else)
1989                     {
1990                         if (i->asCFInst()->getJip()->isLabel())
1991                         {
1992                             if (i->asCFInst()->getJip() == removedBlockInst->getLabel())
1993                             {
1994                                 if (i->opcode() == G4_else || i->opcode() == G4_while)
1995                                 {
1996                                     // for G4_while, jump no matter predicate
1997                                     i->asCFInst()->setJip(succ_label);
1998                                 }
1999                                 // for G4_if, jump only when it has predictate; if no predicate, no jump
2000                                 else if (i->getPredicate())
2001                                 {
2002                                     i->asCFInst()->setJip(succ_label);
2003                                 }
2004                             }
2005                         }
2006                     }
2007                     else if (i->opcode() == G4_break || i->opcode() == G4_cont || i->opcode() == G4_halt)
2008                     {
2009                         // JIP and UIP must have been computed at this point
2010                         MUST_BE_TRUE(i->asCFInst()->getJip() != NULL && i->asCFInst()->getUip() != NULL,
2011                             "null JIP or UIP for break/cont instruction");
2012                         if (i->asCFInst()->getJip() == removedBlockInst->getLabel())
2013                         {
2014                             i->asCFInst()->setJip(succ_label);
2015                         }
2016 
2017                         if (i->asCFInst()->getUip() == removedBlockInst->getLabel())
2018                         {
2019                             i->asCFInst()->setUip(succ_label);
2020                         }
2021                     }
2022                     else if (i->opcode() == G4_goto)
2023                     {
2024                         // UIP must have been computed at this point
2025                         MUST_BE_TRUE(i->asCFInst()->getUip() != NULL,
2026                             "null UIP for goto instruction");
2027                         if (i->asCFInst()->getUip() == removedBlockInst->getLabel())
2028                         {
2029                             i->asCFInst()->setUip(succ_label);
2030                         }
2031                         if (i->asCFInst()->getUip() == removedBlockInst->getLabel())
2032                         {
2033                             i->asCFInst()->setUip(succ_label);
2034                         }
2035                     }
2036                 }
2037 
2038                 pred->Succs.insert(jt, bb->Succs.front());
2039                 pred->Succs.erase(jt);
2040 
2041                 // [Bug1915]: In rare case the precessor may have more than one Succ edge pointing
2042                 // to the same BB, due to empty block being eliminated.  For example, with
2043                 // BB1:
2044                 // (P) jmp BB3
2045                 // BB2:
2046                 // BB3:
2047                 // BB4:
2048                 // ...
2049                 // After BB2 is eliminated BB1's two succ will both point to BB3.
2050                 // When we get rid of BB3,
2051                 // we have to make sure we update both Succ edges as we'd otherwise create an
2052                 // edge to a non-existing BB.  Note that we don't just delete the edge because
2053                 // elsewhere there may be assumptions that if a BB ends with a jump it must have
2054                 // two successors
2055                 {
2056                     BB_LIST_ITER succs = pred->Succs.begin();
2057                     BB_LIST_ITER end = pred->Succs.end();
2058                     while (succs != end)
2059                     {
2060                         BB_LIST_ITER iter = succs;
2061                         ++succs;
2062                         if ((*iter) == bb)
2063                         {
2064                             pred->Succs.insert(iter, bb->Succs.front());
2065                             pred->Succs.erase(iter);
2066                         }
2067                     }
2068                 }
2069             }
2070 
2071             //
2072             // Replace the unique successor's predecessor links with the removed block's predessors.
2073             //
2074             BB_LIST_ITER kt = std::find(bb->Succs.front()->Preds.begin(), bb->Succs.front()->Preds.end(), bb);
2075 
2076             BB_LIST_ITER mt = bb->Preds.begin();
2077 
2078             for (; mt != bb->Preds.end(); ++mt)
2079             {
2080                 bb->Succs.front()->Preds.insert(kt, *mt);
2081             }
2082 
2083             bb->Succs.front()->Preds.erase(kt);
2084             bb->Succs.front()->Preds.unique();
2085             //
2086             // Propagate the removed block's type to its unique successor.
2087             //
2088             bb->Succs.front()->setBBType(bb->getBBType());
2089 
2090             bb->Succs.clear();
2091             bb->Preds.clear();
2092             bb->clear();
2093 
2094             erase(it);
2095         }
2096         else if (bb->Preds.size() == 1 && bb->Preds.front()->Succs.size() == 1)
2097         {
2098             // Merge bb into singlePred and delete bb if all the following are true:
2099             //   1. singlePred has no control-flow inst (at the end),
2100             //   2. bb's label is not used at all.
2101             //
2102             //     singlePred:
2103             //        ....
2104             //     bb:
2105             //        ....
2106             //
2107             // If singlePred does not end with a control-flow inst, bb's label is not used except
2108             // bb ends with while. For while bb, we need further to check if any break uses label.
2109             // As break should jump to while bb's fall-thru (not while bb), we need to follow all
2110             // preds of while's fall-thru BB to see if any pred has a break.
2111             //
2112             G4_BB* singlePred = bb->Preds.front();
2113             G4_INST* labelInst = bb->front();
2114             assert(labelInst->isLabel());
2115             if (!singlePred->back()->isFlowControl() &&
2116                 singlePred->getPhysicalSucc() == bb /* sanity */ &&
2117                 !labelInst->getLabel()->isFuncLabel() /* skip special bb */ &&
2118                 bb != singlePred /* [special] skip dead single-BB loop */)
2119             {
2120                 bool doMerging = true;
2121                 G4_INST* whileInst = bb->back();
2122                 if (whileInst->opcode() == G4_while)
2123                 {
2124                     // If there is any break inst for this while, the break uses the label. No merging.
2125                     // Note that any break inst of this while will jump to the BB right after while BB
2126                     // (this BB is the fall-thru BB of while if while BB has fall-thru, or just its physical
2127                     // succ if it has no fall-thru).
2128                     G4_BB* whilePhySucc = bb->getPhysicalSucc();
2129                     if (whilePhySucc)
2130                     {
2131                         for (auto breakPred : whilePhySucc->Preds)
2132                         {
2133                             if (breakPred->getLastOpcode() == G4_break) {
2134                                 doMerging = false;
2135                                 break;
2136                             }
2137                         }
2138                     }
2139                 }
2140 
2141                 if (doMerging)
2142                 {
2143                     removePredSuccEdges(singlePred, bb);
2144                     assert(singlePred->Succs.size() == 0);
2145                     std::vector<G4_BB*> allSuccBBs(bb->Succs.begin(), bb->Succs.end());
2146                     for (auto S : allSuccBBs)
2147                     {
2148                         removePredSuccEdges(bb, S);
2149                         addPredSuccEdges(singlePred, S, false);
2150                     }
2151 
2152                     // remove bb's label before splice
2153                     bb->remove(labelInst);
2154                     singlePred->getInstList().splice(singlePred->end(), bb->getInstList());
2155 
2156                     bb->Succs.clear();
2157                     bb->Preds.clear();
2158 
2159                     erase(it);
2160                 }
2161             }
2162         }
2163     }
2164 
2165     reassignBlockIDs();
2166 }
2167 
2168 //
2169 // remove any mov with the same src and dst opnds
2170 //
removeRedundMov()2171 void FlowGraph::removeRedundMov()
2172 {
2173     for (G4_BB* bb : BBs)
2174     {
2175         INST_LIST_ITER curr_iter = bb->begin();
2176         while (curr_iter != bb->end())
2177         {
2178             G4_INST* inst = (*curr_iter);
2179             if (inst->isRawMov())
2180             {
2181                 G4_Operand *src = inst->getSrc(0);
2182                 G4_DstRegRegion *dst = inst->getDst();
2183 
2184                 if (src->isSrcRegRegion())
2185                 {
2186                     G4_SrcRegRegion* srcRgn = (G4_SrcRegRegion*)src;
2187                     if (!dst->isIndirect() &&
2188                         !srcRgn->isIndirect() &&
2189                         dst->isGreg() &&
2190                         src->isGreg())
2191                     {
2192                         if (dst->getLinearizedStart() == srcRgn->getLinearizedStart() &&
2193                             dst->getLinearizedEnd() == srcRgn->getLinearizedEnd())
2194                         {
2195                             uint16_t stride = 0;
2196                             const RegionDesc *rd = srcRgn->getRegion();
2197                             unsigned ExSize = inst->getExecSize();
2198                             if (ExSize == 1 ||
2199                                 (rd->isSingleStride(ExSize, stride) &&
2200                                 (dst->getHorzStride() == stride)))
2201                             {
2202                                 curr_iter = bb->erase(curr_iter);
2203                                 continue;
2204                             }
2205                         }
2206                     }
2207                 }
2208             }
2209             ++curr_iter;
2210         }
2211     }
2212 }
2213 
2214 //
2215 // Remove any placeholder empty blocks that could have been inserted to aid analysis
2216 //
removeEmptyBlocks()2217 void FlowGraph::removeEmptyBlocks()
2218 {
2219     bool changed = true;
2220 
2221     while (changed)
2222     {
2223         changed = false;
2224         for (BB_LIST_ITER it = BBs.begin(); it != BBs.end();)
2225         {
2226             G4_BB* bb = *it;
2227 
2228             //
2229             // The removal candidates will have a unique successor and a single label
2230             // starting with LABEL__EMPTYBB as the only instruction besides a JMP.
2231             //
2232             if (bb->size() > 0 && bb->size() < 3)
2233             {
2234                 INST_LIST::iterator removedBlockInst = bb->begin();
2235 
2236                 if ((*removedBlockInst)->isLabel() == false ||
2237                     strncmp((*removedBlockInst)->getLabelStr(),
2238                         "LABEL__EMPTYBB", 14) != 0)
2239                 {
2240                     ++it;
2241                     continue;
2242                 }
2243 
2244                 ++removedBlockInst;
2245 
2246                 if (removedBlockInst != bb->end())
2247                 {
2248                     // if the BB is not empty, it must end with a unconditional jump
2249                     if ((*removedBlockInst)->opcode() != G4_jmpi || bb->Succs.size() > 1)
2250                     {
2251                         ++it;
2252                         continue;
2253                     }
2254                 }
2255 
2256                 // remove redundant succs and preds for the empty block
2257                 bb->Succs.unique();
2258                 bb->Preds.unique();
2259 
2260                 for (auto predBB : bb->Preds)
2261                 {
2262                     //
2263                     // Replace the predecessors successor links to the removed block's unique successor.
2264                     //
2265                     BB_LIST_ITER jt = std::find(predBB->Succs.begin(), predBB->Succs.end(), bb);
2266 
2267                     for (auto succBB : bb->Succs)
2268                     {
2269                         predBB->Succs.insert(jt, succBB);
2270                     }
2271                     predBB->Succs.erase(jt);
2272                     predBB->Succs.unique();
2273                 }
2274 
2275                 for (auto succBB : bb->Succs)
2276                 {
2277                     //
2278                     // Replace the unique successor's predecessor links with the removed block's predessors.
2279                     //
2280                     BB_LIST_ITER kt = std::find(succBB->Preds.begin(), succBB->Preds.end(), bb);
2281 
2282                     if (kt != succBB->Preds.end())
2283                     {
2284                         for (auto predBB : bb->Preds)
2285                         {
2286                             succBB->Preds.insert(kt, predBB);
2287                         }
2288 
2289                         succBB->Preds.erase(kt);
2290                         succBB->Preds.unique();
2291 
2292                         //
2293                         // Propagate the removed block's type to its unique successor.
2294                         //
2295                         succBB->setBBType(bb->getBBType());
2296                     }
2297                 }
2298                 //
2299                 // Remove the block to be removed.
2300                 //
2301                 bb->Succs.clear();
2302                 bb->Preds.clear();
2303                 bb->clear();
2304 
2305                 BB_LIST_ITER rt = it++;
2306                 erase(rt);
2307                 changed = true;
2308             }
2309             else
2310             {
2311                 ++it;
2312             }
2313         }
2314     }
2315 }
2316 
2317 //
2318 // If multiple freturns exist in a flowgraph create a new basic block
2319 // with an freturn. Replace all freturns with jumps.
2320 //
mergeFReturns()2321 void FlowGraph::mergeFReturns()
2322 {
2323     std::list<G4_BB*> exitBBs;
2324     G4_BB* candidateFretBB = NULL;
2325     G4_Label *dumLabel = NULL;
2326 
2327     for (G4_BB* cur : BBs)
2328     {
2329         if (cur->size() > 0 && cur->back()->isFReturn())
2330         {
2331             exitBBs.push_back(cur);
2332 
2333             if (cur->size() == 2 && cur->front()->isLabel())
2334             {
2335                 // An fret already exists that can be shared among all
2336                 // so skip creating a new block with fret.
2337                 dumLabel = cur->front()->getSrc(0)->asLabel();
2338                 candidateFretBB = cur;
2339             }
2340         }
2341     }
2342 
2343     if (exitBBs.size() > 1)
2344     {
2345         if (candidateFretBB == NULL)
2346         {
2347             G4_BB* newExit = createNewBB();
2348             assert(!builder->getIsKernel() && "Not expecting fret in kernel");
2349             std::string str = "__MERGED_FRET_EXIT_BLOCK";
2350             dumLabel = builder->createLabel(str, LABEL_BLOCK);
2351             G4_INST* label = createNewLabelInst(dumLabel);
2352             newExit->push_back(label);
2353             G4_INST* fret = builder->createInternalCFInst(
2354                 nullptr, G4_pseudo_fret, g4::SIMD1, nullptr, nullptr, InstOpt_NoOpt);
2355             newExit->push_back(fret);
2356             BBs.push_back(newExit);
2357             candidateFretBB = newExit;
2358         }
2359 
2360         for (G4_BB* cur : exitBBs)
2361         {
2362             if (cur != candidateFretBB)
2363             {
2364                 G4_INST* last = cur->back();
2365                 addPredSuccEdges(cur, candidateFretBB);
2366 
2367                 last->setOpcode(G4_jmpi);
2368                 last->setSrc(dumLabel, 0);
2369                 last->setExecSize(g4::SIMD1);
2370             }
2371         }
2372     }
2373 }
2374 
2375 
2376 //
2377 // Add a dummy BB for multiple-exit flow graph
2378 // The criteria of a valid multiple-exit flow graph is:
2379 //  (1). equal or less than one BB w/o successor has non-EOT last instruction;
2380 //  (2). Other BBs w/o successor must end with EOT
2381 //
linkDummyBB()2382 void FlowGraph::linkDummyBB()
2383 {
2384     //
2385     // check the flow graph to find if it satify the criteria and record the exit BB
2386     //
2387     std::list<G4_BB*> exitBBs;
2388     int nonEotExitBB = 0;
2389     for (G4_BB *bb : BBs)
2390     {
2391         if (bb->Succs.empty())
2392         {
2393             exitBBs.push_back(bb);      // record exit BBs
2394             G4_INST* last = bb->back();
2395             if (last == NULL)
2396             {
2397                 MUST_BE_TRUE(false, "ERROR: Invalid flow graph with empty exit BB!");
2398             }
2399             if (!bb->isLastInstEOT())
2400             {
2401                 nonEotExitBB++;
2402                 if (nonEotExitBB > 1)
2403                 {
2404                     MUST_BE_TRUE(false,
2405                         "ERROR: Invalid flow graph with more than one exit BB not end with EOT!");
2406                 }
2407             }
2408         }
2409     }
2410 
2411     //
2412     // create the dummy BB and link the exit BBs to it
2413     //
2414     if (nonEotExitBB == 1 && exitBBs.size() > 1)
2415     {
2416         G4_BB *dumBB = createNewBB();
2417         MUST_BE_TRUE(dumBB != NULL, ERROR_FLOWGRAPH);
2418         std::string str("__AUTO_GENERATED_DUMMY_LAST_BB");
2419         G4_Label *dumLabel = builder->createLabel(str, LABEL_BLOCK);
2420         G4_INST* label = createNewLabelInst(dumLabel);
2421         dumBB->push_back(label);
2422         BBs.push_back(dumBB);
2423 
2424         for (G4_BB *bb : exitBBs)
2425         {
2426             dumBB->Preds.push_back(bb);
2427             bb->Succs.push_back(dumBB);
2428         }
2429     }
2430 }
2431 
2432 //
2433 // Re-assign block ID so that we can use id to determine the ordering of two blocks in the code layout
2434 //
reassignBlockIDs()2435 void FlowGraph::reassignBlockIDs()
2436 {
2437     //
2438     // re-assign block id so that we can use id to determine the ordering of
2439     // two blocks in the code layout; namely which one is ahead of the other.
2440     // Important: since we re-assign id, there MUST NOT exist any instruction
2441     // that depends on BB id. Or the code will be incorrect once we reassign id.
2442     //
2443     unsigned i = 0;
2444     for (G4_BB* bb : BBs)
2445     {
2446         bb->setId(i);
2447         i++;
2448         MUST_BE_TRUE(i <= getNumBB(), ERROR_FLOWGRAPH);
2449     }
2450 
2451     numBBId = i;
2452 }
2453 
findLabelBB(BB_LIST_ITER StartIter,BB_LIST_ITER EndIter,const char * Label)2454 G4_BB* FlowGraph::findLabelBB(
2455     BB_LIST_ITER StartIter, BB_LIST_ITER EndIter, const char* Label)
2456 {
2457     for (BB_LIST_ITER it = StartIter; it != EndIter; ++it)
2458     {
2459         G4_BB* bb = *it;
2460         G4_INST* first = bb->empty() ? NULL : bb->front();
2461 
2462         if (first && first->isLabel())
2463         {
2464             const char* currLabel = first->getLabelStr();
2465             if (strcmp(Label, currLabel) == 0)
2466             {
2467                 return bb;
2468             }
2469         }
2470     }
2471     return NULL;
2472 }
2473 
2474 
2475 typedef enum
2476 {
2477     STRUCTURED_CF_IF = 0,
2478     STRUCTURED_CF_LOOP = 1
2479 } STRUCTURED_CF_TYPE;
2480 
2481 struct StructuredCF
2482 {
2483     STRUCTURED_CF_TYPE mType;
2484     // for if this is the block that ends with if
2485     // for while this is the loop block
2486     G4_BB* mStartBB;
2487     // for if this is the endif block
2488     // for while this is the block that ends with while
2489     G4_BB* mEndBB;
2490     // it's possible for a BB to have multiple endifs, so we need
2491     // to know which endif corresponds to this CF
2492     G4_INST* mEndInst;
2493 
2494     StructuredCF* enclosingCF;
2495 
2496     // ToDo: can add more information (else, break, cont, etc.) as needed later
2497 
2498     // endBB is set when we encounter the endif/while
StructuredCFStructuredCF2499     StructuredCF(STRUCTURED_CF_TYPE type, G4_BB* startBB) :
2500         mType(type), mStartBB(startBB), mEndBB(NULL), mEndInst(NULL), enclosingCF(NULL) {}
2501 
operator newStructuredCF2502     void *operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); }
2503 
setEndStructuredCF2504     void setEnd(G4_BB* endBB, G4_INST* endInst)
2505     {
2506         mEndBB = endBB;
2507         mEndInst = endInst;
2508     }
2509 }; // StructuredCF
2510 
2511 /*
2512 *  This function sets the JIP for Structured CF (SCF) instructions, Unstructured
2513 *  Control Flow (UCF) instructions (goto) are handled in processGoto().
2514 *
2515 *  Note: we currently do not consider goto/join when adding the JIP for endif,
2516 *  since structure analysis should not allow goto into/out of if-endif.  This means
2517 *  we only need to set the the JIP of the endif to its immediately enclosing endif/while
2518 *
2519 *  The simd control flow blocks must be well-structured
2520 *
2521 */
processSCF(FuncInfoHashTable & FuncInfoMap)2522 void FlowGraph::processSCF(FuncInfoHashTable &FuncInfoMap)
2523 {
2524     std::stack<StructuredCF*> ifAndLoops;
2525     std::vector<StructuredCF*> structuredSimdCF;
2526 
2527     for (G4_BB* bb : BBs)
2528     {
2529         for (G4_INST* inst : *bb)
2530         {
2531             // check if first non-label inst is an endif
2532             if (inst->opcode() != G4_label && inst->opcode() != G4_join)
2533             {
2534                 if (inst->opcode() == G4_endif)
2535                 {
2536 
2537                     MUST_BE_TRUE(ifAndLoops.size() > 0, "endif without matching if");
2538                     StructuredCF* cf = ifAndLoops.top();
2539                     MUST_BE_TRUE(cf->mType == STRUCTURED_CF_IF, "ill-formed if");
2540                     cf->setEnd(bb, inst);
2541                     ifAndLoops.pop();
2542                     if (ifAndLoops.size() > 0)
2543                     {
2544                         cf->enclosingCF = ifAndLoops.top();
2545                     }
2546                 }
2547                 else
2548                 {
2549                     // stop at the first non-endif instruction (there may be multiple endifs)
2550                     break;
2551                 }
2552             }
2553         }
2554 
2555         // check if bb is SIMD loop head
2556         for (G4_BB* predBB : bb->Preds)
2557         {
2558             // check if one of the pred ends with a while
2559             if (predBB->getId() >= bb->getId())
2560             {
2561                 if (predBB->size() != 0 &&
2562                     predBB->back()->opcode() == G4_while)
2563                 {
2564                     StructuredCF* cf = new (mem)StructuredCF(STRUCTURED_CF_LOOP, bb);
2565                     structuredSimdCF.push_back(cf);
2566                     ifAndLoops.push(cf);
2567                 }
2568             }
2569         }
2570 
2571         if (bb->size() > 0)
2572         {
2573             G4_INST* lastInst = bb->back();
2574             if (lastInst->opcode() == G4_if)
2575             {
2576                 StructuredCF* cf = new (mem)StructuredCF(STRUCTURED_CF_IF, bb);
2577                 structuredSimdCF.push_back(cf);
2578                 ifAndLoops.push(cf);
2579             }
2580             else if (lastInst->opcode() == G4_while)
2581             {
2582                 MUST_BE_TRUE(ifAndLoops.size() > 0, "while without matching do");
2583                 StructuredCF* cf = ifAndLoops.top();
2584                 MUST_BE_TRUE(cf->mType == STRUCTURED_CF_LOOP, "ill-formed while loop");
2585                 cf->setEnd(bb, lastInst);
2586                 ifAndLoops.pop();
2587                 if (ifAndLoops.size() > 0)
2588                 {
2589                     cf->enclosingCF = ifAndLoops.top();
2590                 }
2591             }
2592         }
2593     }
2594 
2595     MUST_BE_TRUE(ifAndLoops.size() == 0, "not well-structured SIMD CF");
2596 
2597     for (StructuredCF* cf : structuredSimdCF)
2598     {
2599         if (cf->mType == STRUCTURED_CF_IF && cf->enclosingCF != NULL)
2600         {
2601             setJIPForEndif(cf->mEndInst, cf->enclosingCF->mEndInst, cf->enclosingCF->mEndBB);
2602         }
2603     }
2604 }
2605 
2606 // markDivergentBBs()
2607 //    If BB is on the divergent path, mark it as divergent.
2608 // Divergent:
2609 //    If all active simd lanes on entry to shader/kernel are
2610 //    active in a BB,  that BB is NOT divergent;  otherwise,
2611 //    it is divergent.
2612 //
2613 //    Note that this does not require the number of all the
2614 //    active simd lanes equals to the simd dispatch width!
2615 //    For example,  simd8 kernel might have 4 active lanes
2616 //    on entry to the kernel, thus if any BB of the kernel
2617 //    has less than 4 active lanes,  it is divergent! As
2618 //    matter of fact, the entry BB must not be divergent.
2619 //
markDivergentBBs()2620 void FlowGraph::markDivergentBBs()
2621 {
2622     if (BBs.empty())
2623     {
2624         // Sanity check
2625         return;
2626     }
2627 
2628     // fcall'ed Function:  to be conservative and assume that any function
2629     // that is fcall'ed is divergent on entry
2630     // (Note that each function has its own CFG)
2631     if (!builder->getIsKernel())
2632     {
2633         for (G4_BB* BB : BBs)
2634         {
2635             BB->setDivergent(true);
2636         }
2637         return;
2638     }
2639 
2640     //  Now, handle kernel function.
2641     //       1.  It would be perfered to have a single return/exit for kernel and
2642     //           and all its subroutines in the last BB (IGC does, cm does not),
2643     //           although the code  can handle return in the middle of kernel.
2644     //       2.  The entry function will appear first in the BB list, and if there
2645     //           is a call from A to B,   subroutine A shall appear prior to
2646     //           subroutine B in BB list.
2647     //
2648     // Required:  need to set BB id.
2649     //
2650     // Key variables:
2651     //   LastJoinBB:
2652     //     LastJoinBB is the fartherest joinBB of any goto/break/if/while
2653     //     so far, as described below in the algorithm.
2654     //   LastJoinBBId:  Id(LastJoinBB).
2655     //
2656     // The algorithm initializes LastJoinBBId to -1 and scans all BBs in order.
2657     // It checks control-flow instructions to see if it diverges (has join).
2658     // If so, the algorithm sets LastJoinBBId to be the larger one of its join BB
2659     // and LastJoinBBId; For a non-negative LastJoinBBId, it means that there is
2660     // an active join in that BB, and therefore, all BBs from the current BB to
2661     // that LastJoinBB (LastJoinBB not included) are divergent.
2662     //
2663     // The algorithm checks the following cases and their join BBs are:
2664     //     case 1: cf inst = goto
2665     //          <currBB>    [(p)] goto L
2666     //                           ...
2667     //          <joinBB>    L:
2668     //     case 2: cf inst = if
2669     //          <currBB>    if
2670     //                          ...
2671     //          <joinBB>    endif
2672     //
2673     //     case 3:  cf inst = break
2674     //          <currBB>   break
2675     //                     ...
2676     //                     [(p)] while
2677     //          <joinBB>
2678     //     case 4:
2679     //          <currBB>  L:
2680     //                     ....
2681     //                    [(p)] while/goto L
2682     //          <joinBB>
2683     //
2684     //  The presence of loop makes the checking complicated. Before checking the
2685     //  divergence of a loop head, we need to know if the loop has ANY NON-UNIFORM
2686     //  OUT-OF-LOOP BRANCH or OUT-OF-LOOP BRANCH in DIVERGENT BB, which includes
2687     //  checking the loop's backedge and any out-going branches inside the loop body.
2688     //  For example,
2689     //      L0:
2690     //          B0: (uniform cond) goto B1
2691     //               ......
2692     //          B1:
2693     //             B2:
2694     //      L1:
2695     //             B3: goto OUT
2696     //             B4: (non-uniform cond) goto  L1
2697     //             B5:
2698     //          B6: (uniform cond) goto L0
2699     //
2700     //     OUT: B7:
2701     //
2702     //  At B0, we don't know whether B0 is divergent even though B0's goto is uniform. Once
2703     //  we find out that B3 is divergent, "goto OUT" will make Loop L0 divergent, which means
2704     //  any of L0's BBs, including B0, is divergent. In order to know B3 is divergent, we need
2705     //  to check the back branch of loop L1. The fact that L1's backedge is non-uniform indicates
2706     //  L1 is divergent, therefore B3 is divergent. This means that WE NEED TO DO PRE_SCAN in
2707     //  order to know whether any BBs of a loop is divergent.
2708     //
2709     int LastJoinBBId;
2710 
2711     auto pushJoin = [&](G4_BB* joinBB) {
2712         LastJoinBBId = std::max(LastJoinBBId, (int)joinBB->getId());
2713     };
2714     auto popJoinIfMatch = [&](G4_BB* BB) {
2715         if ((int)BB->getId() == LastJoinBBId) {
2716             LastJoinBBId = -1;
2717         }
2718     };
2719     auto isPriorToLastJoin = [&](G4_BB* aBB) ->bool {
2720         return (int)aBB->getId() < LastJoinBBId;
2721     };
2722 
2723     reassignBlockIDs();
2724 
2725     // Analyze function in topological order. As there is no recursion
2726     // and no indirect call,  a function will be analyzed only if all
2727     // its callers have been analyzed.
2728     //
2729     // If no subroutine, sortedFuncTable is empty. Here keep all functions
2730     // in a vector first (it works with and without subroutines), then scan
2731     // functions in topological order.
2732     struct StartEndIter {
2733         BB_LIST_ITER StartI;
2734         BB_LIST_ITER EndI;
2735         bool  InvokedFromDivergentBB;
2736     };
2737     int numFuncs = (int)sortedFuncTable.size();
2738     std::vector<StartEndIter> allFuncs;
2739     std::unordered_map<FuncInfo*, uint32_t> funcInfoIndex;
2740     if (numFuncs == 0)
2741     {
2742         numFuncs = 1;
2743         allFuncs.resize(1);
2744         allFuncs[0].StartI = BBs.begin();
2745         allFuncs[0].EndI = BBs.end();
2746         allFuncs[0].InvokedFromDivergentBB = false;
2747     }
2748     else
2749     {
2750         allFuncs.resize(numFuncs);
2751         for (int i = numFuncs; i > 0; --i)
2752         {
2753             uint32_t ix = (uint32_t)(i - 1);
2754             FuncInfo* pFInfo = sortedFuncTable[ix];
2755             G4_BB* StartBB = pFInfo->getInitBB();
2756             G4_BB* EndBB = pFInfo->getExitBB();
2757             uint32_t ui = (uint32_t)(numFuncs - i);
2758             allFuncs[ui].StartI = std::find(BBs.begin(), BBs.end(), StartBB);
2759             auto nextI = std::find(BBs.begin(), BBs.end(), EndBB);
2760             assert(nextI != BBs.end() && "ICE: subroutine's end BB not found!");
2761             allFuncs[ui].EndI = (++nextI);
2762             allFuncs[ui].InvokedFromDivergentBB = false;
2763 
2764             funcInfoIndex[pFInfo] = ui;
2765         }
2766     }
2767 
2768     // Update LastJoin for the backward branch of BB referred to by IT.
2769     auto updateLastJoinForBwdBrInBB = [&](BB_LIST_ITER& IT)
2770     {
2771         G4_BB* predBB = *IT;
2772         G4_INST* bInst = predBB->back();
2773         assert(bInst->isCFInst() &&
2774             (bInst->opcode() == G4_while || bInst->asCFInst()->isBackward()) &&
2775             "ICE: expected backward branch/while!");
2776 
2777         // joinBB of a loop is the BB right after tail BB
2778         BB_LIST_ITER loopJoinIter = IT;
2779         ++loopJoinIter;
2780         if (loopJoinIter == BBs.end())
2781         {
2782             // Loop end is the last BB (no Join BB). This happens in the
2783             // following cases:
2784             //    1. For CM, CM's return is in the middle of code! For IGC,
2785             //       this never happen as a return, if present, must be in
2786             //       the last BB.
2787             //    2. The last segment code is an infinite loop (static infinite
2788             //       loop so that compiler knows it is an infinite loop).
2789             // In either case, no join is needed.
2790             return;
2791         }
2792 
2793         // Update LastJoin if the branch itself is divergent.
2794         // (todo: checking G4_jmpi is redundant as jmpi must have isUniform()
2795         //  return true.)
2796         if ((!bInst->asCFInst()->isUniform() && bInst->opcode() != G4_jmpi))
2797         {
2798             G4_BB* joinBB = *loopJoinIter;
2799             pushJoin(joinBB);
2800         }
2801         return;
2802     };
2803 
2804     // Update LastJoin for BB referred to by IT. It is called either from normal
2805     // scan (setting BB's divergent field) or from loop pre-scan (no setting to
2806     // BB's divergent field).  IsPreScan indicates whether it is called from
2807     // the pre-scan or not.
2808     //
2809     // The normal scan (IsPreScan is false), this function also update the divergence
2810     // info for any called subroutines.
2811     //
2812     auto updateLastJoinForFwdBrInBB = [&](BB_LIST_ITER& IT, bool IsPreScan)
2813     {
2814         G4_BB* aBB = *IT;
2815         if (aBB->size() == 0)
2816         {
2817             return;
2818         }
2819 
2820         // Using isPriorToLastJoin() works for both loop pre-scan and normal scan.
2821         // (aBB->isDivergent() works for normal scan only.)
2822         bool isBBDivergent = isPriorToLastJoin(aBB);
2823 
2824         G4_INST* lastInst = aBB->back();
2825         if (((lastInst->opcode() == G4_goto && !lastInst->asCFInst()->isBackward()) ||
2826              lastInst->opcode() == G4_break) &&
2827             (!lastInst->asCFInst()->isUniform() || isBBDivergent))
2828         {
2829             // forward goto/break : the last Succ BB is our target BB
2830             // For break, it should be the BB right after while inst.
2831             G4_BB* joinBB = aBB->Succs.back();
2832             pushJoin(joinBB);
2833         }
2834         else if (lastInst->opcode() == G4_if &&
2835             (!lastInst->asCFInst()->isUniform() || isBBDivergent))
2836         {
2837             G4_Label* labelInst = lastInst->asCFInst()->getUip();
2838             G4_BB* joinBB = findLabelBB(IT, BBs.end(), labelInst->getLabel());
2839             assert(joinBB && "ICE(vISA) : missing endif label!");
2840             pushJoin(joinBB);
2841         }
2842         else if (!IsPreScan && lastInst->opcode() == G4_call)
2843         {
2844             // If this function is already in divergent branch, the callee
2845             // must be in a divergent branch!.
2846             if (isBBDivergent || lastInst->getPredicate() != nullptr)
2847             {
2848                 FuncInfo* calleeFunc = aBB->getCalleeInfo();
2849                 if (funcInfoIndex.count(calleeFunc))
2850                 {
2851                     int ix = funcInfoIndex[calleeFunc];
2852                     allFuncs[ix].InvokedFromDivergentBB = true;
2853                 }
2854             }
2855         }
2856         return;
2857     };
2858 
2859 
2860     // Now, scan all subroutines (kernels or functions)
2861     for (int i = 0; i < numFuncs; ++i)
2862     {
2863         // each function: [IT, IE)
2864         BB_LIST_ITER& IS = allFuncs[i].StartI;
2865         BB_LIST_ITER& IE = allFuncs[i].EndI;
2866         if (IS == IE)
2867         {
2868             // sanity check
2869             continue;
2870         }
2871 
2872         if (allFuncs[i].InvokedFromDivergentBB)
2873         {
2874             // subroutine's divergent on entry. Mark every BB as divergent
2875             for (auto IT = IS; IT != IE; ++IT)
2876             {
2877                 G4_BB* BB = *IT;
2878 
2879                 BB->setDivergent(true);
2880                 if (BB->size() == 0)
2881                 {
2882                     // sanity check
2883                     continue;
2884                 }
2885                 if (BB->isEndWithCall() || BB->isEndWithFCall())
2886                 {
2887                     FuncInfo* calleeFunc = BB->getCalleeInfo();
2888                     if (funcInfoIndex.count(calleeFunc))
2889                     {
2890                         int ix = funcInfoIndex[calleeFunc];
2891                         allFuncs[ix].InvokedFromDivergentBB = true;
2892                     }
2893                 }
2894             }
2895             // continue for next subroutine
2896             continue;
2897         }
2898 
2899         // Scaning all BBs of a single subroutine (or kernel, or function).
2900         LastJoinBBId = -1;
2901         for (auto IT = IS; IT != IE; ++IT)
2902         {
2903             G4_BB* BB = *IT;
2904 
2905             // join could be: explicit (join inst) or implicit (endif/while)
2906             popJoinIfMatch(BB);
2907 
2908             // Handle loop
2909             //    Loop needs to be scanned twice in order to get an accurate marking.
2910             //    In pre-scan (1st scan), it finds out divergent out-of-loop branch.
2911             //    If found,  it updates LastJoin to the target of that out-of-loop branch
2912             //    and restart the normal scan. If not, it restarts the normal scan with
2913             //    the original LastJoin unchanged.
2914 
2915             // BB could be head of several loops, and the following does pre-scan for
2916             //  every one of those loops.
2917             for (G4_BB* predBB : BB->Preds)
2918             {
2919                 if (!isBackwardBranch(predBB, BB)) {
2920                     G4_opcode t_opc = predBB->getLastOpcode();
2921                     bool isBr = (t_opc == G4_goto || t_opc == G4_jmpi);
2922                     assert((!isBr || (BB->getId() > predBB->getId())) && "backward Branch did not set correctly!");
2923                     continue;
2924                 }
2925                 assert(BB->getId() <= predBB->getId() && "Branch incorrectly set to be backward!");
2926 
2927                 BB_LIST_ITER LoopITEnd = std::find(BBs.begin(), BBs.end(), predBB);
2928                 updateLastJoinForBwdBrInBB(LoopITEnd);
2929 
2930                 // If lastJoin is already after loop end, no need to scan loop twice
2931                 // as all BBs in the loop must be divergent
2932                 if (isPriorToLastJoin(predBB))
2933                 {
2934                     continue;
2935                 }
2936 
2937                 // pre-scan loop (BB, predBB)
2938                 //
2939                 //    Observation:
2940                 //        pre-scan loop once and as a BB is scanned. Each backward
2941                 //        branch to this BB and a forward branch from this BB are processed.
2942                 //        Doing so finds out any divergent out-of-loop branch iff the
2943                 //        loop has one. In addition, if a loop has more than one divergent
2944                 //        out-of-loop branches, using any of those branches would get us
2945                 //        precise divergence during the normal scan.
2946                 //
2947                 // LastJoinBBId will be updated iff there is an out-of-loop branch that is
2948                 // is also divergent. For example,
2949                 //  a)  "goto OUT" is out-of-loop branch, but not divergent. Thus, LastJoinBB
2950                 //      will not be updated.
2951                 //
2952                 //      L :
2953                 //           B0
2954                 //           if (uniform cond) goto OUT;
2955                 //           if  (non-uniform cond)
2956                 //              B1
2957                 //           else
2958                 //              B2
2959                 //           B3
2960                 //           (p) jmpi L
2961                 //      OUT:
2962                 //
2963                 //  b)  "goto OUT" is out-of-loop branch and divergent. Thus, update LastJoinBB.
2964                 //      Note that "goto OUT" is divergent because loop L1 is divergent, which
2965                 //      makes every BB in L1 divergent. And any branch from a divergent BB
2966                 //      must be divergent.
2967                 //
2968                 //      L :
2969                 //           B0
2970                 //      L1:
2971                 //           B1
2972                 //           if (uniform cond) goto OUT;
2973                 //           if  (cond)
2974                 //              B2
2975                 //           else
2976                 //              B3
2977                 //           if (non-uniform cond) goto L1
2978                 //           B4
2979                 //           (uniform cond) while L
2980                 //      OUT:
2981                 //
2982                 int orig_LastJoinBBId = LastJoinBBId;
2983                 for (auto LoopIT = IT; LoopIT != LoopITEnd; ++LoopIT)
2984                 {
2985                     if (LoopIT != IT)
2986                     {
2987                         // Check loops that are fully inside the current loop.
2988                         G4_BB* H = *LoopIT;
2989                         for (G4_BB* T : H->Preds)
2990                         {
2991                             if (!isBackwardBranch(T, H)) {
2992                                 continue;
2993                             }
2994                             assert(H->getId() <= T->getId() && "Branch incorrectly set to be backward!");
2995                             BB_LIST_ITER TIter = std::find(BBs.begin(), BBs.end(), T);
2996                             updateLastJoinForBwdBrInBB(TIter);
2997                         }
2998                     }
2999                     updateLastJoinForFwdBrInBB(LoopIT, true);
3000                 }
3001 
3002                 // After scan, if no branch out of loop, restore the original LastJoinBBId
3003                 if (!isPriorToLastJoin(predBB))
3004                 {   // case a) above.
3005                     LastJoinBBId = orig_LastJoinBBId;
3006                 }
3007             }
3008 
3009             // normal scan of BB
3010             if (isPriorToLastJoin(BB)) {
3011                 BB->setDivergent(true);
3012             }
3013             // Temporary for debugging, toBeDelete!
3014             // if (pKernel->getIntKernelAttribute(Attributes::ATTR_Target) == VISA_CM)
3015             // {
3016             //    assert((BB->isDivergent() && BB->isInSimdFlow() ||
3017             //        (!BB->isDivergent() && !BB->isInSimdFlow())) && " isDivergent != isInSimdFlow!");
3018             // }
3019             updateLastJoinForFwdBrInBB(IT, false);
3020         }
3021     }
3022     return;
3023 }
3024 
getUniqueReturnBlock()3025 G4_BB* FlowGraph::getUniqueReturnBlock()
3026 {
3027     // Return block that has a return instruction
3028     // Return NULL if multiple return instructions found
3029     G4_BB* uniqueReturnBlock = NULL;
3030 
3031     for (G4_BB* curBB : BBs) {
3032         if (!curBB->empty())
3033         {
3034             G4_INST* last_inst = curBB->back();
3035 
3036             if (last_inst->opcode() == G4_pseudo_fret) {
3037                 if (uniqueReturnBlock == NULL) {
3038                     uniqueReturnBlock = curBB;
3039                 }
3040                 else {
3041                     uniqueReturnBlock = NULL;
3042                     break;
3043                 }
3044             }
3045         }
3046     }
3047 
3048     return uniqueReturnBlock;
3049 }
3050 
3051 /*
3052 * Insert a join at the beginning of this basic block, immediately after the label
3053 * If a join is already present, nothing will be done
3054 */
insertJoinToBB(G4_BB * bb,G4_ExecSize execSize,G4_Label * jip)3055 void FlowGraph::insertJoinToBB(G4_BB* bb, G4_ExecSize execSize, G4_Label* jip)
3056 {
3057     MUST_BE_TRUE(bb->size() > 0, "empty block");
3058     INST_LIST_ITER iter = bb->begin();
3059 
3060     // Skip label if any.
3061     if ((*iter)->isLabel())
3062     {
3063         iter++;
3064     }
3065 
3066     if (iter == bb->end())
3067     {
3068         // insert join at the end
3069         G4_INST* jInst = builder->createInternalCFInst(NULL, G4_join, execSize, jip, NULL, InstOpt_NoOpt);
3070         bb->push_back(jInst);
3071     }
3072     else
3073     {
3074         G4_INST* secondInst = *iter;
3075 
3076         if (secondInst->opcode() == G4_join)
3077         {
3078             if (execSize > secondInst->getExecSize())
3079             {
3080                 secondInst->setExecSize(execSize);
3081             }
3082         }
3083         else
3084         {
3085             G4_INST* jInst = builder->createInternalCFInst(NULL, G4_join, execSize, jip, NULL, InstOpt_NoOpt);
3086             bb->insertBefore(iter, jInst);
3087         }
3088     }
3089 }
3090 
3091 typedef std::pair<G4_BB*, G4_ExecSize> BlockSizePair;
3092 
addBBToActiveJoinList(std::list<BlockSizePair> & activeJoinBlocks,G4_BB * bb,G4_ExecSize execSize)3093 static void addBBToActiveJoinList(std::list<BlockSizePair>& activeJoinBlocks, G4_BB* bb, G4_ExecSize execSize)
3094 {
3095     // add goto target to list of active blocks that need a join
3096     std::list<BlockSizePair>::iterator listIter;
3097     for (listIter = activeJoinBlocks.begin(); listIter != activeJoinBlocks.end(); ++listIter)
3098     {
3099         G4_BB* aBB = (*listIter).first;
3100         if (aBB->getId() == bb->getId())
3101         {
3102             // block already in list, update exec size if necessary
3103             if (execSize > (*listIter).second)
3104             {
3105                 (*listIter).second = execSize;
3106             }
3107             break;
3108         }
3109         else if (aBB->getId() > bb->getId())
3110         {
3111             activeJoinBlocks.insert(listIter, BlockSizePair(bb, execSize));
3112             break;
3113         }
3114     }
3115 
3116     if (listIter == activeJoinBlocks.end())
3117     {
3118         activeJoinBlocks.push_back(BlockSizePair(bb, execSize));
3119     }
3120 }
3121 
setPhysicalPredSucc()3122 void FlowGraph::setPhysicalPredSucc()
3123 {
3124     BB_LIST_CITER it = BBs.cbegin();
3125     BB_LIST_CITER cend = BBs.cend();
3126     if (it != cend)
3127     {
3128         // first, set up head BB
3129         G4_BB* pred = *it;
3130         pred->setPhysicalPred(NULL);
3131 
3132         for (++it; it != cend; ++it)
3133         {
3134             G4_BB* bb = *it;
3135             bb->setPhysicalPred(pred);
3136             pred->setPhysicalSucc(bb);
3137             pred = bb;
3138         }
3139 
3140         // last, set up the last BB
3141         pred->setPhysicalSucc(NULL);
3142     }
3143 }
3144 
insertEndif(G4_BB * bb,G4_ExecSize execSize,bool createLabel)3145 G4_Label* FlowGraph::insertEndif(G4_BB* bb, G4_ExecSize execSize, bool createLabel)
3146 {
3147     // endif is placed immediately after the label
3148     G4_INST* endifInst = builder->createInternalCFInst(NULL, G4_endif, execSize, NULL, NULL, InstOpt_NoOpt);
3149     INST_LIST_ITER iter = bb->begin();
3150     MUST_BE_TRUE(iter != bb->end(), "empty BB");
3151     iter++;
3152     bb->insertBefore(iter, endifInst);
3153 
3154     // this block may be a target of multiple ifs, in which case we will need to insert
3155     // one endif for each if.  The innermost endif will use the BB label, while for the other
3156     // endifs a new label will be created for each of them.
3157     if (createLabel)
3158     {
3159         std::string name = "_auto" + std::to_string(autoLabelId++);
3160         G4_Label* label = builder->createLabel(name, LABEL_BLOCK);
3161         endifWithLabels.emplace(endifInst, label);
3162         return label;
3163     }
3164     else
3165     {
3166         return bb->getLabel();
3167     }
3168 }
3169 
3170 // This function set the JIP of the endif to the target instruction (either endif or while)
setJIPForEndif(G4_INST * endif,G4_INST * target,G4_BB * targetBB)3171 void FlowGraph::setJIPForEndif(G4_INST* endif, G4_INST* target, G4_BB* targetBB)
3172 {
3173     MUST_BE_TRUE(endif->opcode() == G4_endif, "must be an endif instruction");
3174     G4_Label* label = getLabelForEndif(target);
3175     if (label)
3176     {
3177         // see if there's another label before the inst that we can reuse
3178         // FIXME: we really should associate labels with inst instead of having special label inst,
3179         // so we can avoid ugly code like this
3180         G4_INST* prevInst = NULL;
3181         if (target->opcode() == G4_endif)
3182         {
3183             for (G4_INST* inst : *targetBB)
3184             {
3185                 if (inst == target)
3186                 {
3187                     if (prevInst != NULL && prevInst->isLabel())
3188                     {
3189                         label = prevInst->getLabel();
3190                     }
3191                     break;
3192                 }
3193                 prevInst = inst;
3194             }
3195         }
3196         else
3197         {
3198             MUST_BE_TRUE(target->opcode() == G4_while, "must be a while instruction");
3199             INST_LIST_RITER it = ++(targetBB->rbegin());
3200             if (it != targetBB->rend())
3201             {
3202                 G4_INST* inst = *it;
3203                 if (inst->isLabel())
3204                 {
3205                     label = inst->getLabel();
3206                 }
3207             }
3208         }
3209 
3210         if (label == NULL)
3211         {
3212             std::string name = "_auto" + std::to_string(autoLabelId++);
3213             label = builder->createLabel(name, LABEL_BLOCK);
3214             endifWithLabels.emplace(target, label);
3215         }
3216     }
3217     endif->asCFInst()->setJip(label);
3218 
3219 #ifdef DEBUG_VERBOSE_ON
3220     cout << "set JIP for: \n";
3221     endif->emit(cout);
3222     cout << "\n";
3223 #endif
3224 }
3225 
convertGotoToJmpi(G4_INST * gotoInst)3226 void FlowGraph::convertGotoToJmpi(G4_INST *gotoInst)
3227 {
3228     gotoInst->setOpcode(G4_jmpi);
3229     gotoInst->setSrc(gotoInst->asCFInst()->getUip(), 0);
3230     gotoInst->asCFInst()->setJip(NULL);
3231     gotoInst->asCFInst()->setUip(NULL);
3232     gotoInst->setExecSize(g4::SIMD1);
3233     gotoInst->setOptions(InstOpt_NoOpt | InstOpt_WriteEnable);
3234 }
3235 
3236 /*
3237 *  This function generates UCF (unstructurized control flow, that is, goto/join/jmpi)
3238 *    This function inserts the join for each goto as well as compute the JIP for the
3239 *    goto and join instructions. It additionally converts uniform (simd1 or non-predicated)
3240 *    gotos into scalar jumps. If it's a forward goto, it may be converted into a jmpi only
3241 *    if it is uniform and it does not overlap with another divergent goto.  All uniform
3242 *    backward gotos may be converted into scalar jumps.
3243 *
3244 *  - This function does *not* alter the CFG.
3245 *  - This function does *not* consider SCF (structured control flow), as a well-formed
3246 *    vISA program should not have overlapped goto and structured CF instructions.
3247 *
3248 */
processGoto(bool HasSIMDCF)3249 void FlowGraph::processGoto(bool HasSIMDCF)
3250 {
3251     // For all BBs in [StartBB, EndBB) (including StartBB, but not including EndBB) that
3252     // jump after EndBB (forward jump), return the earliest (closest to EndBB). If no such
3253     // BB, return nullptr.
3254     // Assumption:  1. BB's id is in the increasing order; and 2) physical succ has been set.
3255     auto getEarliestJmpOutBB = [](
3256         const std::list<BlockSizePair> &activeJoins,
3257         const G4_BB* StartBB,
3258         const G4_BB* EndBB) -> G4_BB*
3259     {
3260         // Existing active joins must be considered. For example,
3261         //    goto L0  (non-uniform)
3262         //    ...
3263         //  Loop:
3264         //     ...
3265         //     goto L1 (uniform goto)
3266         //     ...
3267         //  L0:
3268         //     ...
3269         //     goto Loop
3270         //     ...
3271         //  L1:
3272         // Since 'goto L0' is non-uniform, 'goto L1' must be tranlated into goto and a join must
3273         // be inserted at L1.  If goto L0 is not considered (no active join at L0) and 'goto L1' can
3274         // be converted into jmpi (thus no join will be inserted at L1, which is wrong.
3275         std::list<BlockSizePair> tmpActiveJoins(activeJoins);
3276 
3277         const G4_BB* bb = StartBB;
3278         while (bb != EndBB)
3279         {
3280             const G4_BB* currBB = bb;
3281             if (!tmpActiveJoins.empty())
3282             {
3283                 // adjust active join lists if a join is reached.
3284                 if (bb == tmpActiveJoins.front().first)
3285                 {
3286                     tmpActiveJoins.pop_front();
3287                 }
3288             }
3289 
3290             bb = bb->getPhysicalSucc();
3291             if (currBB->empty())
3292             {
3293                 continue;
3294             }
3295 
3296             const G4_INST* lastInst = currBB->back();
3297             if (lastInst->opcode() != G4_goto || lastInst->asCFInst()->isBackward())
3298             {
3299                 continue;
3300             }
3301             G4_BB* targetBB = currBB->Succs.back();
3302             assert(lastInst->asCFInst()->getUip() == targetBB->getLabel());
3303             if (lastInst->asCFInst()->isUniform() &&
3304                 (tmpActiveJoins.empty() || tmpActiveJoins.front().first->getId() >= targetBB->getId()))
3305             {
3306                 // Non-crossing uniform goto will be jmpi, and thus no join needed.
3307                 //
3308                 // For the crossing gotos, a uniform goto cannot be translated to jmpi.
3309                 // For example:
3310                 //     goto L0:        // non-uniform goto (need a join)
3311                 //      ....
3312                 //     uniform-goto L1:
3313                 //      ...
3314                 //    L0:
3315                 //      ...
3316                 //    L1:
3317                 //  Here, uniform-goto L1 is uniform. Since it crosses the non-uniform joins at L0,
3318                 //  it must be translated to goto, not jmpi.  Thus, join is needed at L1.
3319                 continue;
3320             }
3321             // Here, use SIMD1 as execsize does not matter here.
3322             addBBToActiveJoinList(tmpActiveJoins, targetBB, g4::SIMD1);
3323         }
3324 
3325         // Need to remove join at EndBB if present (looking for joins after EndBB)
3326         if (!tmpActiveJoins.empty() && tmpActiveJoins.front().first == EndBB)
3327         {
3328             tmpActiveJoins.pop_front();
3329         }
3330 
3331         if (tmpActiveJoins.empty())
3332         {
3333             // no new join
3334             return nullptr;
3335         }
3336         return tmpActiveJoins.front().first;
3337     };
3338 
3339     // list of active blocks where a join needs to be inserted, sorted in lexical order
3340     std::list<BlockSizePair> activeJoinBlocks;
3341     bool doScalarJmp = !builder->noScalarJmp();
3342 
3343     for (G4_BB* bb : BBs)
3344     {
3345         if (bb->size() == 0)
3346         {
3347             continue;
3348         }
3349 
3350         if (activeJoinBlocks.size() > 0)
3351         {
3352             if (bb == activeJoinBlocks.front().first)
3353             {
3354                 // This block is the target of one or more forward goto,
3355                 // or the fall-thru of a backward goto, needs to insert a join
3356                 G4_ExecSize execSize = activeJoinBlocks.front().second;
3357                 G4_Label* joinJIP = NULL;
3358 
3359                 activeJoinBlocks.pop_front();
3360                 if (activeJoinBlocks.size() > 0)
3361                 {
3362                     //set join JIP to the next active join
3363                     G4_BB* joinBlock = activeJoinBlocks.front().first;
3364                     joinJIP = joinBlock->getLabel();
3365                 }
3366 
3367                 insertJoinToBB(bb, execSize, joinJIP);
3368             }
3369         }
3370 
3371         // check to see if this block is the target of one (or more) backward goto
3372         // If so, we process the backward goto and push its fall-thru block to the
3373         // active join list
3374         for (std::list<G4_BB*>::iterator iter = bb->Preds.begin(), iterEnd = bb->Preds.end(); iter != iterEnd; ++iter)
3375         {
3376             G4_BB* predBB = *iter;
3377             G4_INST* lastInst = predBB->back();
3378             if (lastInst->opcode() == G4_goto && lastInst->asCFInst()->isBackward() &&
3379                 lastInst->asCFInst()->getUip() == bb->getLabel())
3380             {
3381                 // backward goto
3382                 G4_ExecSize eSize = lastInst->getExecSize() > g4::SIMD1 ?
3383                     lastInst->getExecSize() : pKernel->getSimdSize();
3384                 bool isUniform = lastInst->getExecSize() == g4::SIMD1 || lastInst->getPredicate() == NULL;
3385                 if (isUniform && doScalarJmp)
3386                 {
3387                     // can always convert a uniform backward goto into a jmp
3388                     convertGotoToJmpi(lastInst);
3389 
3390                     // we still have to add a join at the BB immediately after the back edge,
3391                     // since there may be subsequent loop breaks that are waiting there.
3392                     // example:
3393                     // L1:
3394                     // (P1) goto exit
3395                     // (P2) goto L2
3396                     // goto L1
3397                     // L2:                <-- earliest after-loop goto target
3398                     // ...
3399                     // exit:
3400                     //
3401                     // In this case, L2 shall be the 1st after-loop join (not necessarily the one immediately
3402                     // after the backward BB). The goto exit's JIP should be set to L2.
3403                     //
3404                     // Here, we will add the 1st after-loop join so that any out-loop JIP (either goto or
3405                     // join) within the loop body will has its JIP set to this join.
3406                     if (G4_BB* afterLoopJoinBB = getEarliestJmpOutBB(activeJoinBlocks, bb, predBB))
3407                     {
3408                         addBBToActiveJoinList(activeJoinBlocks, afterLoopJoinBB, eSize);
3409                     }
3410                 }
3411                 else
3412                 {
3413                     if (lastInst->getExecSize() == g4::SIMD1)
3414                     {   // For simd1 goto, convert it to a goto with the right execSize.
3415                         lastInst->setExecSize(eSize);
3416                         // This should have noMask removed if any
3417                         lastInst->setOptions(InstOpt_M0);
3418                     }
3419                     // add join to the fall-thru BB
3420                     if (G4_BB* fallThruBB = predBB->getPhysicalSucc())
3421                     {
3422                         addBBToActiveJoinList(activeJoinBlocks, fallThruBB, eSize);
3423                         lastInst->asCFInst()->setJip(fallThruBB->getLabel());
3424                     }
3425                 }
3426             }
3427         }
3428 
3429         G4_INST* lastInst = bb->back();
3430         if (lastInst->opcode() == G4_goto && !lastInst->asCFInst()->isBackward())
3431         {
3432             // forward goto
3433             // the last Succ BB is our goto target
3434             G4_BB* gotoTargetBB = bb->Succs.back();
3435             bool isUniform = lastInst->getExecSize() == g4::SIMD1 || lastInst->getPredicate() == NULL;
3436 
3437             if (isUniform && doScalarJmp &&
3438                 (activeJoinBlocks.size() == 0 || activeJoinBlocks.front().first->getId() > gotoTargetBB->getId()))
3439             {
3440                 // can convert goto into a scalar jump to UIP, if the jmp will not make us skip any joins
3441                 // CFG itself does not need to be updated
3442                 convertGotoToJmpi(lastInst);
3443             }
3444             else
3445             {
3446                 // set goto JIP to the first active block
3447                 G4_ExecSize eSize = lastInst->getExecSize() > g4::SIMD1 ?
3448                     lastInst->getExecSize() : pKernel->getSimdSize();
3449                 addBBToActiveJoinList(activeJoinBlocks, gotoTargetBB, eSize);
3450                 G4_BB* joinBlock = activeJoinBlocks.front().first;
3451                 if (lastInst->getExecSize() == g4::SIMD1)
3452                 {   // For simd1 goto, convert it to a goto with the right execSize.
3453                     lastInst->setExecSize(eSize);
3454                     lastInst->setOptions(InstOpt_M0);
3455                 }
3456                 lastInst->asCFInst()->setJip(joinBlock->getLabel());
3457 
3458                 if (!builder->gotoJumpOnTrue())
3459                 {
3460                     // For BDW/SKL goto, the false channels are the ones that actually will take the jump,
3461                     // and we thus have to flip the predicate
3462                     G4_Predicate* pred = lastInst->getPredicate();
3463                     if (pred != NULL)
3464                     {
3465                         pred->setState(pred->getState() == PredState_Plus ? PredState_Minus : PredState_Plus);
3466                     }
3467                     else
3468                     {
3469                         // if there is no predicate, generate a predicate with all 0s.
3470                         // if predicate is SIMD32, we have to use a :ud dst type for the move
3471                         G4_ExecSize execSize = lastInst->getExecSize() > g4::SIMD16 ? g4::SIMD2 : g4::SIMD1;
3472                         G4_Declare* tmpFlagDcl = builder->createTempFlag(execSize);
3473                         G4_DstRegRegion* newPredDef = builder->createDst(
3474                             tmpFlagDcl->getRegVar(), 0, 0, 1, execSize == g4::SIMD2 ? Type_UD : Type_UW);
3475                         G4_INST* predInst = builder->createMov(g4::SIMD1, newPredDef, builder->createImm(0, Type_UW),
3476                             InstOpt_WriteEnable, false);
3477                         INST_LIST_ITER iter = bb->end();
3478                         iter--;
3479                         bb->insertBefore(iter, predInst);
3480 
3481                         pred = builder->createPredicate(
3482                             PredState_Plus,
3483                             tmpFlagDcl->getRegVar(),
3484                             0);
3485                         lastInst->setPredicate(pred);
3486                     }
3487                 }
3488             }
3489         }
3490     }
3491 }
3492 
3493 // TGL NoMask WA : to identify which BB needs WA
3494 //
3495 // nestedDivergentBBs[BB] = 2;
3496 //      BB is in a nested divergent branch
3497 // nestedDivergentBBs[BB] = 1;
3498 //      BB is not in a nested divergent branch, but in a divergent branch.
findNestedDivergentBBs(std::unordered_map<G4_BB *,int> & nestedDivergentBBs)3499 void FlowGraph::findNestedDivergentBBs(std::unordered_map<G4_BB*, int>& nestedDivergentBBs)
3500 {
3501     // Control-Flow state
3502     //    Used for keeping the current state of control flow during
3503     //    traversing BBs in the layout order. Assuming the current
3504     //    BB is 'currBB', this class shows whether currBB is in any
3505     //    divergent branch/nested divergent branch.
3506     class CFState
3507     {
3508         // If currBB is the head of loop or currBB has a cf inst such as goto/break/if,
3509         // the code will diverge from currBB to the joinBB.  The following cases shows where
3510         // joinBB is:
3511         //     case 1: cf inst = goto
3512         //          <currBB>    [(p)] goto L
3513         //                           ...
3514         //          <joinBB>    L:
3515         //     case 2: cf inst = if
3516         //          <currBB>    if
3517         //                          ...
3518         //          <joinBB>    endif
3519         //
3520         //         Note that else does not increase nor decrease divergence level.
3521         //     case 3:  cf inst = break
3522         //          <currBB>   break
3523         //                     ...
3524         //                     [(p)] while
3525         //          <joinBB>
3526         //     case 4:
3527         //          <currBB>  L:
3528         //                     ....
3529         //                    [(p)] while/goto L
3530         //          <joinBB>
3531         // Case 1/2/3 will increase divergence level, while case 4 does not.
3532         //
3533         // The following are used for tracking nested divergence
3534         //    LastDivergentBBId:  Id(LastDivergentBB).
3535         //        LastDivergentBB is the fartherest joinBB of any goto/break/if/while
3536         //        so far, as described above.  That is, [currBB, LastDivergentBB) is
3537         //        a divergent code path. LastDivergentBB is not included in this
3538         //        divergent path.
3539         //    LastNestedBBId:  Id(LastNestedBB).
3540         //        LastNestedBB is the current fartherest join BB that is inside
3541         //        [currBB, LastDivergentBB). We have LastNestedBB if a goto/break/if (no
3542         //        loop) is encountered inside an already divergent code path. It is also
3543         //        called nested divergent (or just nested).
3544         //
3545         // The following is always true:
3546         //    LastDivergentBBId >= LastNestedBBId
3547         int LastDivergentBBId;
3548         int LastNestedBBId;
3549 
3550         void setLastDivergentBBId(G4_BB* toBB)
3551         {
3552             LastDivergentBBId = std::max(LastDivergentBBId, (int)toBB->getId());
3553         }
3554 
3555         void setLastNestedBBId(G4_BB* toBB)
3556         {
3557             LastNestedBBId = std::max(LastNestedBBId, (int)toBB->getId());
3558             setLastDivergentBBId(toBB);
3559         }
3560 
3561         // Increase divergence level
3562         void incDivergenceLevel(G4_BB* toBB)
3563         {
3564             if (isInDivergentBranch())
3565             {
3566                 setLastNestedBBId(toBB);
3567             }
3568             else
3569             {
3570                 setLastDivergentBBId(toBB);
3571             }
3572         }
3573 
3574     public:
3575         CFState() : LastNestedBBId(-1), LastDivergentBBId(-1) {}
3576 
3577         bool isInDivergentBranch() const { return (LastDivergentBBId > 0);  }
3578         bool isInNestedDivergentBranch() const { return LastNestedBBId > 0; }
3579 
3580         //  pushLoop: for case 4
3581         //  pushJoin: for case 1/2/3
3582         void pushLoop(G4_BB* joinBB) { setLastDivergentBBId(joinBB); }
3583         void pushJoin(G4_BB* joinBB) { incDivergenceLevel(joinBB); }
3584         void popJoinIfMatching(G4_BB* currBB)
3585         {
3586             if ((int)currBB->getId() == LastNestedBBId)
3587             {
3588                 LastNestedBBId = -1;
3589             }
3590             if ((int)currBB->getId() == LastDivergentBBId)
3591             {
3592                 LastDivergentBBId = -1;
3593             }
3594             assert(!(LastDivergentBBId == -1 && LastNestedBBId >= 0) &&
3595                    "ICE:  something wrong in setting divergent BB Id!");
3596         }
3597     };
3598 
3599     // For loop with backedge Tail->Head, if Tail is divergent, its divergence should be
3600     // propagated to the entire loop as Tail jumps to head, which could go all BBs in the loop.
3601     //
3602     // In another word, the whole loop's divergence level is the same as Tail's. Once the
3603     // entire loop has been handled and Tail's divergence is known, invoking this lambda func
3604     // to carry out propagation.
3605     //
3606     // An example to show WA is needed (nested divergence for Tail):
3607     //      Head:                   // initial fuseMask = 11;  2nd iter: fuseMask = 11 (should be 10)
3608     //         if (...) goto Tail;  // fuseMask = 11
3609     //                              // After if, fusedMask = 01 (bigEU is Off)
3610     //         ...
3611     //         goto out             // fuseMask = 01 (BigEU off, SmallEU on)
3612     //                              // after goto, fuseMask = 00, but HW remains 01
3613     //      Tail:
3614     //           goto Head          // fuseMask should 10, but HW remains 11, and jump to Head at 2nd iter
3615     //      out:
3616     auto propLoopDivergence = [&](G4_BB* LoopTail)
3617     {
3618         // LoopTail must be divergent.
3619         std::vector<G4_BB*> workset;
3620         workset.push_back(LoopTail);
3621         while (!workset.empty())
3622         {
3623             std::vector<G4_BB*> newWorkset;
3624             for (auto iter : workset)
3625             {
3626                 G4_BB* Tail = iter;
3627                 G4_InstCF* cfInst = Tail->back()->asCFInst();
3628 
3629                 assert(nestedDivergentBBs.count(Tail) > 0 &&
3630                        "Only divergent Tail shall invoke this func!");
3631 
3632                 // Find loop head
3633                 G4_BB* Head = nullptr;
3634                 for (G4_BB* succBB : Tail->Succs)
3635                 {
3636                     if ((cfInst->opcode() == G4_goto && cfInst->getUip() == succBB->getLabel()) ||
3637                         (cfInst->opcode() == G4_while && cfInst->getJip() == succBB->getLabel()) ||
3638                         (cfInst->opcode() == G4_jmpi && cfInst->getSrc(0) == succBB->getLabel()))
3639                     {
3640                         Head = succBB;
3641                         break;
3642                     }
3643                 }
3644                 assert(Head != nullptr);
3645 
3646                 // If Head's divergence level is already higher than Tail, no propagation needed
3647                 // as Head's divergence has been propagated already.
3648                 if (nestedDivergentBBs.count(Head) > 0 &&
3649                     nestedDivergentBBs[Head] >= nestedDivergentBBs[Tail])
3650                 {
3651                     continue;
3652                 }
3653 
3654                 // Follow physical succs to visit all BBs in this loop
3655                 for (G4_BB* tBB = Head; tBB != nullptr && tBB != Tail; tBB = tBB->getPhysicalSucc())
3656                 {
3657                     auto miter = nestedDivergentBBs.find(tBB);
3658                     if (miter == nestedDivergentBBs.end() || miter->second < nestedDivergentBBs[Tail])
3659                     {
3660                         // Propagate Tail's divergence. If the new BB is a tail (another loop),
3661                         // add it to workset for the further propagation.
3662                         nestedDivergentBBs[tBB] = nestedDivergentBBs[Tail];
3663                         auto lastOp = tBB->getLastOpcode();
3664                         if (lastOp == G4_while ||
3665                             ((lastOp == G4_goto || lastOp == G4_jmpi) &&
3666                              tBB->back()->asCFInst()->isBackward()))
3667                         {
3668                             newWorkset.push_back(tBB);
3669                         }
3670                     }
3671                 }
3672             }
3673 
3674             workset = newWorkset;
3675         }
3676     };
3677 
3678     if (BBs.empty())
3679     {
3680         // Sanity check
3681         return;
3682     }
3683 
3684     // Analyze subroutines in topological order. As there is no recursion
3685     // and no indirect call,  a subroutine will be analyzed only if all
3686     // its callers have been analyzed.
3687     //
3688     // If no subroutine, sortedFuncTable is empty. Here keep the entry and
3689     // subroutines in a vector first (it works with and without subroutines),
3690     // then scan subroutines in topological order.
3691     struct StartEndIter {
3692         BB_LIST_ITER StartI;
3693         BB_LIST_ITER EndI;
3694 
3695         // When a subroutine is entered, the fusedMask must not be 00, but it
3696         // could be 01. This field shows it could be 01 if set to true.
3697         bool  isInDivergentBranch;
3698     };
3699     int numFuncs = (int)sortedFuncTable.size();
3700     std::vector<StartEndIter> allFuncs;
3701     std::unordered_map<FuncInfo*, uint32_t> funcInfoIndex;
3702     if (numFuncs == 0)
3703     {
3704         numFuncs = 1;
3705         allFuncs.resize(1);
3706         allFuncs[0].StartI = BBs.begin();
3707         allFuncs[0].EndI = BBs.end();
3708         allFuncs[0].isInDivergentBranch = false;
3709     }
3710     else
3711     {
3712         allFuncs.resize(numFuncs);
3713         for (int i = numFuncs; i > 0; --i)
3714         {
3715             uint32_t ix = (uint32_t)(i - 1);
3716             FuncInfo* pFInfo = sortedFuncTable[ix];
3717             G4_BB* StartBB = pFInfo->getInitBB();
3718             G4_BB* EndBB = pFInfo->getExitBB();
3719             uint32_t ui = (uint32_t)(numFuncs - i);
3720             allFuncs[ui].StartI = std::find(BBs.begin(), BBs.end(), StartBB);
3721             auto nextI = std::find(BBs.begin(), BBs.end(), EndBB);
3722             assert(nextI != BBs.end() && "ICE: subroutine's end BB not found!");
3723             allFuncs[ui].EndI = (++nextI);
3724             allFuncs[ui].isInDivergentBranch = false;
3725 
3726             funcInfoIndex[pFInfo] = ui;
3727         }
3728     }
3729 
3730     for (int i = 0; i < numFuncs; ++i)
3731     {
3732         // each function: [IT, IE)
3733         BB_LIST_ITER& IT = allFuncs[i].StartI;
3734         BB_LIST_ITER& IE = allFuncs[i].EndI;
3735         if (IT == IE)
3736         {
3737             // Sanity check
3738             continue;
3739         }
3740 
3741         CFState cfs;
3742         if (allFuncs[i].isInDivergentBranch)
3743         {
3744             // subroutine's divergent on entry. Mark [StartBB, EndBB) as divergent
3745             auto prevI = IE;
3746             --prevI;
3747             G4_BB* EndBB = *prevI;
3748             cfs.pushJoin(EndBB);
3749         }
3750 
3751         for (; IT != IE; ++IT)
3752         {
3753             G4_BB* BB = *IT;
3754 
3755             // This handles cases in which BB has endif/while/join as well as others
3756             // so we don't need to explicitly check whether BB has endif/while/join, etc.
3757             cfs.popJoinIfMatching(BB);
3758 
3759             G4_INST* firstInst = BB->getFirstInst();
3760             if (firstInst == nullptr)
3761             {
3762                 // empty BB or BB with only label inst
3763                 continue;
3764             }
3765 
3766             // Handle loop
3767             for (G4_BB* predBB : BB->Preds)
3768             {
3769                 G4_INST* lastInst = predBB->back();
3770                 if ((lastInst->opcode() == G4_while &&
3771                       lastInst->asCFInst()->getJip() == BB->getLabel()) ||
3772                      (lastInst->opcode() == G4_goto && lastInst->asCFInst()->isBackward() &&
3773                       lastInst->asCFInst()->getUip() == BB->getLabel()))
3774                 {
3775                     // joinBB is the BB right after goto/while
3776                     BB_LIST_ITER predIter = std::find(BBs.begin(), BBs.end(), predBB);
3777                     ++predIter;
3778                     // if predBB is the last BB then joinBB is not available
3779                     if (predIter == BBs.end())
3780                         continue;
3781                     G4_BB* joinBB = *predIter;
3782                     cfs.pushLoop(joinBB);
3783                 }
3784             }
3785 
3786             if (cfs.isInNestedDivergentBranch())
3787             {
3788                 nestedDivergentBBs[BB] = 2;
3789             }
3790             else if (cfs.isInDivergentBranch())
3791             {
3792                 nestedDivergentBBs[BB] = 1;
3793             }
3794 
3795             G4_INST* lastInst = BB->back();
3796 
3797             // Need to check whether to propagate WA marking to entire loop!
3798             //
3799             // Do it for CM now, need to apply to all!
3800             // if (nestedDivergentBBs.count(BB) > 0 && nestedDivergentBBs[BB] >= 2 &&
3801             //    getKernel()->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM)
3802             if (nestedDivergentBBs.count(BB) > 0 && nestedDivergentBBs[BB] >= 2)
3803             {
3804                 if (lastInst->opcode() == G4_while ||
3805                     ((lastInst->opcode() == G4_goto || lastInst->opcode() == G4_jmpi) &&
3806                      lastInst->asCFInst()->isBackward()))
3807                 {
3808                     propLoopDivergence(BB);
3809                 }
3810             }
3811 
3812             if ((lastInst->opcode() == G4_goto && !lastInst->asCFInst()->isBackward()) ||
3813                 lastInst->opcode() == G4_break)
3814             {
3815                 // forward goto/break : the last Succ BB is our target BB
3816                 // For break, it should be the BB right after while inst.
3817                 G4_BB* joinBB = BB->Succs.back();
3818                 cfs.pushJoin(joinBB);
3819             }
3820             else if (lastInst->opcode() == G4_if)
3821             {
3822                 G4_Label* labelInst = lastInst->asCFInst()->getUip();
3823                 G4_BB* joinBB = findLabelBB(IT, IE, labelInst->getLabel());
3824                 assert(joinBB && "ICE(vISA) : missing endif label!");
3825                 cfs.pushJoin(joinBB);
3826             }
3827             else if (lastInst->opcode() == G4_call)
3828             {
3829                 // If this function is already in divergent branch, the callee
3830                 // must be in a divergent branch!.
3831                 if (allFuncs[i].isInDivergentBranch ||
3832                     cfs.isInDivergentBranch() ||
3833                     lastInst->getPredicate() != nullptr)
3834                 {
3835                     FuncInfo* calleeFunc = BB->getCalleeInfo();
3836                     if (funcInfoIndex.count(calleeFunc))
3837                     {
3838                         int ix = funcInfoIndex[calleeFunc];
3839                         allFuncs[ix].isInDivergentBranch = true;
3840                     }
3841                 }
3842             }
3843         }
3844 
3845         // Once all BBs are precessed, cfs should be clear
3846         assert((!cfs.isInDivergentBranch()) &&
3847                "ICE(vISA): there is an error in divergence tracking!");
3848     }
3849 
3850     for (auto MI : nestedDivergentBBs)
3851     {
3852         G4_BB* bb = MI.first;
3853         if (MI.second >= 2)
3854         {
3855             bb->setBBType(G4_BB_NM_WA_TYPE);
3856         }
3857     }
3858     return;
3859 }
3860 
3861 //
3862 // Add declares for the stack and frame pointers.
3863 //
addFrameSetupDeclares(IR_Builder & builder,PhyRegPool & regPool)3864 void FlowGraph::addFrameSetupDeclares(IR_Builder& builder, PhyRegPool& regPool)
3865 {
3866     if (framePtrDcl == NULL)
3867     {
3868         framePtrDcl = builder.getBEFP();
3869     }
3870     if (stackPtrDcl == NULL)
3871     {
3872         stackPtrDcl = builder.getBESP();
3873     }
3874     if (scratchRegDcl == NULL)
3875     {
3876         scratchRegDcl = builder.createDeclareNoLookup("SR", G4_GRF, numEltPerGRF<Type_UD>(), 1, Type_UD);
3877         scratchRegDcl->getRegVar()->setPhyReg(regPool.getGreg(builder.kernel.getSpillHeaderGRF()), 0);
3878     }
3879 }
3880 
3881 //
3882 // Insert pseudo dcls to represent the caller-save and callee-save registers.
3883 // This is only required when there is more than one graph cut due to the presence
3884 // of function calls using a stack calling convention.
3885 //
addSaveRestorePseudoDeclares(IR_Builder & builder)3886 void FlowGraph::addSaveRestorePseudoDeclares(IR_Builder& builder)
3887 {
3888     //
3889     // VCE_SAVE (r60.0-r125.0) [r125-127 is reserved]
3890     //
3891     if (pseudoVCEDcl == NULL)
3892     {
3893         unsigned numRowsVCE = getKernel()->getNumCalleeSaveRegs();
3894         pseudoVCEDcl = builder.createDeclareNoLookup("VCE_SAVE", G4_GRF, numEltPerGRF<Type_UD>(), static_cast<unsigned short>(numRowsVCE), Type_UD);
3895     }
3896     else
3897     {
3898         pseudoVCEDcl->getRegVar()->setPhyReg(NULL, 0);
3899     }
3900 
3901     //
3902     // Insert caller save decls for VCA/A0/Flag (one per call site)
3903     // VCA_SAVE (r1.0-r60.0) [r0 is reserved] - one required per stack call,
3904     // but will be reused across cuts.
3905     //
3906     INST_LIST callSites;
3907     for (auto bb : builder.kernel.fg)
3908     {
3909         if (bb->isEndWithFCall())
3910         {
3911             callSites.push_back(bb->back());
3912         }
3913     }
3914 
3915     unsigned i = 0;
3916     for (auto callSite : callSites)
3917     {
3918         const char* nameBase = "VCA_SAVE";  // sizeof(nameBase) = 9, including ending 0
3919         const int maxIdLen = 3;
3920         const char* name = builder.getNameString(mem, sizeof(nameBase) + maxIdLen, "%s_%d", nameBase, i);
3921         G4_Declare* VCA = builder.createDeclareNoLookup(name, G4_GRF, numEltPerGRF<Type_UD>(), builder.kernel.getCallerSaveLastGRF(), Type_UD);
3922         name = builder.getNameString(mem, 50, "SA0_%d", i);
3923         G4_Declare* saveA0 = builder.createDeclareNoLookup(name, G4_ADDRESS, (uint16_t)getNumAddrRegisters(), 1, Type_UW);
3924         name = builder.getNameString(mem, 64, "SFLAG_%d", i);
3925         G4_Declare* saveFLAG = builder.createDeclareNoLookup(name, G4_FLAG, (uint16_t) builder.getNumFlagRegisters(), 1, Type_UW);
3926         fcallToPseudoDclMap[callSite->asCFInst()] = { VCA, saveA0, saveFLAG };
3927         i++;
3928     }
3929 }
3930 
3931 //
3932 // Since we don't do SIMD augmentation in RA for CM, we have to add an edge
3933 // between the then and else block of an if branch to ensure that liveness is
3934 // computed correctly, if conservatively. This also applies to any goto BB and
3935 // its JIP join block
3936 //
addSIMDEdges()3937 void FlowGraph::addSIMDEdges()
3938 {
3939     std::map<G4_Label*, G4_BB*> joinBBMap;
3940     for (auto bb : BBs)
3941     {
3942         if (bb->size() > 0 && bb->back()->opcode() == G4_else)
3943         {
3944             addUniquePredSuccEdges(bb, bb->getPhysicalSucc());
3945         }
3946         else
3947         {
3948             // check goto case
3949             auto instIter = std::find_if(bb->begin(), bb->end(),
3950                 [](G4_INST* inst) { return !inst->isLabel(); });
3951             if (instIter != bb->end() && (*instIter)->opcode() == G4_join)
3952             {
3953                 G4_INST* firstInst = bb->front();
3954                 if (firstInst->isLabel())
3955                 {
3956                     joinBBMap[firstInst->getLabel()] = bb;
3957                 }
3958             }
3959         }
3960     }
3961 
3962     if (!joinBBMap.empty())
3963     {
3964         for (auto bb : BBs)
3965         {
3966             if (bb->isEndWithGoto())
3967             {
3968                 G4_INST* gotoInst = bb->back();
3969                 auto iter = joinBBMap.find(gotoInst->asCFInst()->getJip()->asLabel());
3970                 if (iter != joinBBMap.end())
3971                 {
3972                     addUniquePredSuccEdges(bb, iter->second);
3973                 }
3974             }
3975         }
3976     }
3977 }
3978 
3979 //
3980 // Perform DFS traversal on the flow graph (do not enter subroutine, but mark subroutine blocks
3981 // so that they will be processed independently later)
3982 //
DFSTraverse(G4_BB * startBB,unsigned & preId,unsigned & postId,FuncInfo * fn)3983 void FlowGraph::DFSTraverse(G4_BB* startBB, unsigned &preId, unsigned &postId, FuncInfo* fn)
3984 {
3985     MUST_BE_TRUE(fn, "Invalid func info");
3986     std::stack<G4_BB*> traversalStack;
3987     traversalStack.push(startBB);
3988 
3989     while (!traversalStack.empty())
3990     {
3991         G4_BB*  bb = traversalStack.top();
3992         if (bb->getPreId() != UINT_MAX)
3993         {
3994             // Pre-processed already and continue to the next one.
3995             // Before doing so, set postId if not set before.
3996             traversalStack.pop();
3997             if (bb->getRPostId() == UINT_MAX)
3998             {
3999                 // All bb's succ has been visited (PreId is set) at this time.
4000                 // if any of its succ has not been finished (RPostId not set),
4001                 // bb->succ forms a backedge.
4002                 //
4003                 // Note: originally, CALL and EXIT will not check back-edges, here
4004                 //       we skip checking for them as well. (INIT & RETURN should
4005                 //       be checked as well ?)
4006                 if (!(bb->getBBType() & (G4_BB_CALL_TYPE | G4_BB_EXIT_TYPE)))
4007                 {
4008                     for (auto succBB : bb->Succs)
4009                     {
4010                         if (succBB->getRPostId() == UINT_MAX)
4011                         {
4012                             backEdges.push_back(Edge(bb, succBB));
4013                         }
4014                     }
4015                 }
4016 
4017                 // Need to keep this after backedge checking so that self-backedge
4018                 // (single-bb loop) will not be missed.
4019                 bb->setRPostId(postId++);
4020             }
4021             continue;
4022         }
4023 
4024         fn->addBB(bb);
4025         bb->setPreId(preId++);
4026 
4027         if (bb->getBBType() & G4_BB_CALL_TYPE)
4028         {
4029             G4_BB* returnBB = bb->BBAfterCall();
4030             // If call is predicated, first item in Succs is physically consecutive BB and second (or last)
4031             // item is sub-routine entry BB.
4032             MUST_BE_TRUE(bb->Succs.back()->getBBType() & G4_BB_INIT_TYPE, ERROR_FLOWGRAPH);
4033             // bb->Succs size may be 2 if call is predicated.
4034             MUST_BE_TRUE(bb->Succs.size() == 1 || bb->Succs.size() == 2, ERROR_FLOWGRAPH);
4035 
4036             {
4037                 bool found = false;
4038                 for (auto func : fn->getCallees())
4039                 {
4040                     if (func == bb->getCalleeInfo())
4041                         found = true;
4042                 }
4043                 if (!found)
4044                 {
4045                     fn->addCallee(bb->getCalleeInfo());
4046                 }
4047             }
4048 
4049             if (returnBB->getPreId() == UINT_MAX)
4050             {
4051                 traversalStack.push(returnBB);
4052             }
4053             else
4054             {
4055                 MUST_BE_TRUE(false, ERROR_FLOWGRAPH);
4056             }
4057         }
4058         else if (bb->getBBType() & G4_BB_EXIT_TYPE)
4059         {
4060             // Skip
4061         }
4062         else
4063         {
4064             // To be consistent with previous behavior, use reverse_iter.
4065             BB_LIST_RITER RIE = bb->Succs.rend();
4066             for (BB_LIST_RITER rit = bb->Succs.rbegin(); rit != RIE; ++rit)
4067             {
4068                 G4_BB* succBB = *rit;
4069                 if (succBB->getPreId() == UINT_MAX)
4070                 {
4071                     traversalStack.push(succBB);
4072                 }
4073             }
4074         }
4075         // As the top of stack may be different than that at the
4076         // beginning of this iteration, cannot do pop here. Instead,
4077         // do pop and set RPostId at the beginning of each iteration.
4078         //
4079         // traversalStack.pop();
4080         // bb->setRPostId(postId++);
4081     }
4082 }
4083 
markStale()4084 void vISA::FlowGraph::markStale()
4085 {
4086     // invoked whenever CFG structures changes.
4087     // structural changes include addition/removal of BB or edge.
4088     // mark analysis passes as stale so getters lazily re-run
4089     // analysis when queried.
4090     dom.setStale();
4091     immDom.setStale();
4092     pDom.setStale();
4093     loops.setStale();
4094 
4095     // any other analysis that becomes stale when FlowGraph changes
4096     // should be marked as stale here.
4097 }
4098 
markRPOTraversal()4099 void FlowGraph::markRPOTraversal()
4100 {
4101     MUST_BE_TRUE(numBBId == BBs.size(), ERROR_FLOWGRAPH);
4102 
4103     unsigned postID = 0;
4104     backEdges.clear();
4105 
4106     for (auto curBB : BBs)
4107     {
4108         curBB->setRPostId(postID++);
4109 
4110         if (curBB->size() > 0)
4111         {
4112             if (curBB->getBBType() & G4_BB_CALL_TYPE)
4113             {
4114                 // skip
4115             }
4116             else if (curBB->getBBType() & G4_BB_EXIT_TYPE)
4117             {
4118                 // Skip
4119             }
4120             else
4121             {
4122                 for (auto succBB : curBB->Succs)
4123                 {
4124                     if (curBB->getId() >= succBB->getId())
4125                     {
4126                         backEdges.push_back(Edge(curBB, succBB));
4127                     }
4128                 }
4129             }
4130         }
4131     }
4132 }
4133 
4134 //
4135 // Find back-edges in the flow graph.
4136 //
findBackEdges()4137 void FlowGraph::findBackEdges()
4138 {
4139     MUST_BE_TRUE(numBBId == BBs.size(), ERROR_FLOWGRAPH);
4140 
4141     for (auto bb : BBs)
4142     {
4143         bb->setPreId(UINT_MAX);
4144         bb->setRPostId(UINT_MAX);
4145     }
4146 
4147     unsigned preId = 0;
4148     unsigned postID = 0;
4149     backEdges.clear();
4150 
4151     setPhysicalPredSucc();
4152     DFSTraverse(getEntryBB(), preId, postID, kernelInfo);
4153 
4154     for (auto fn : funcInfoTable)
4155     {
4156         DFSTraverse(fn->getInitBB(), preId, postID, fn);
4157     }
4158 }
4159 
4160 //
4161 // Find natural loops in the flow graph.
4162 // Assumption: the input FG is reducible.
4163 //
findNaturalLoops()4164 void FlowGraph::findNaturalLoops()
4165 {
4166     setPhysicalPredSucc();
4167     for (auto&& backEdge : backEdges)
4168     {
4169         G4_BB* head = backEdge.second;
4170         G4_BB* tail = backEdge.first;
4171         std::list<G4_BB*> loopBlocks;
4172         Blocks loopBody;
4173         loopBlocks.push_back(tail);
4174         loopBody.insert(tail);
4175 
4176         while (!loopBlocks.empty())
4177         {
4178             G4_BB* loopBlock = loopBlocks.front();
4179             loopBlocks.pop_front();
4180             loopBlock->setInNaturalLoop(true);
4181             loopBlock->setNestLevel();
4182 
4183             if ((loopBlock == head) || (loopBlock->getBBType() & G4_BB_INIT_TYPE))
4184             {
4185                 // Skip
4186             }
4187             else if (loopBlock->getBBType() & G4_BB_RETURN_TYPE)
4188             {
4189                 auto callBB = loopBlock->BBBeforeCall();
4190                 if (!callBB->isInNaturalLoop())
4191                 {
4192                     loopBlocks.push_front(callBB);
4193                     loopBody.insert(callBB);
4194                 }
4195             }
4196             else
4197             {
4198                 auto entryBB = getEntryBB();
4199                 for (auto predBB : loopBlock->Preds)
4200                 {
4201                     if (!predBB->isInNaturalLoop())
4202                     {
4203                         if (predBB == entryBB && head != entryBB)
4204                         {
4205                             // graph is irreducible, punt natural loop detection for entire CFG
4206                             this->reducible = false;
4207                             naturalLoops.clear();
4208                             for (auto BB : BBs)
4209                             {
4210                                 BB->setInNaturalLoop(false);
4211                                 BB->resetNestLevel();
4212                             }
4213                             return;
4214                         }
4215                         MUST_BE_TRUE(predBB != entryBB || head == entryBB, ERROR_FLOWGRAPH);
4216                         loopBlocks.push_front(predBB);
4217                         loopBody.insert(predBB);
4218                     }
4219                 }
4220             }
4221         }
4222 
4223         for (auto loopBB : loopBody)
4224         {
4225             loopBB->setInNaturalLoop(false);
4226         }
4227 
4228         naturalLoops.emplace(backEdge, loopBody);
4229     }
4230 }
4231 
traverseFunc(FuncInfo * func,unsigned * ptr)4232 void FlowGraph::traverseFunc(FuncInfo* func, unsigned *ptr)
4233 {
4234     func->setPreID((*ptr)++);
4235     func->setVisited();
4236     for (auto callee : func->getCallees())
4237     {
4238         if (!(callee->getVisited()))
4239         {
4240             traverseFunc(callee, ptr);
4241         }
4242     }
4243     sortedFuncTable.push_back(func);
4244     func->setPostID((*ptr)++);
4245 }
4246 
4247 //
4248 // Sort subroutines in topological order based on DFS
4249 // a topological order is guaranteed as recursion is not allowed for subroutine calls
4250 // results are stored in sortedFuncTable in reverse topological order
4251 //
topologicalSortCallGraph()4252 void FlowGraph::topologicalSortCallGraph()
4253 {
4254     unsigned visitID = 1;
4255     traverseFunc(kernelInfo, &visitID);
4256 }
4257 
4258 //
4259 // This should be called only after pre-/post-visit ID are set
4260 //
checkVisitID(FuncInfo * func1,FuncInfo * func2)4261 static bool checkVisitID(FuncInfo* func1, FuncInfo* func2)
4262 {
4263     if (func1->getPreID() < func2->getPreID() &&
4264         func1->getPostID() > func2->getPostID())
4265     {
4266         return true;
4267     }
4268     else
4269     {
4270         return false;
4271     }
4272 }
4273 
4274 //
4275 // Find dominators for each function
4276 //
findDominators(std::map<FuncInfo *,std::set<FuncInfo * >> & domMap)4277 void FlowGraph::findDominators(std::map<FuncInfo*, std::set<FuncInfo*>>& domMap)
4278 {
4279     std::map<FuncInfo*, std::set<FuncInfo*>> predMap;
4280 
4281     for (auto func : sortedFuncTable)
4282     {
4283         if (func == kernelInfo)
4284         {
4285             std::set<FuncInfo*> initSet;
4286             initSet.insert(kernelInfo);
4287             domMap.insert(std::make_pair(kernelInfo, initSet));
4288         }
4289         else
4290         {
4291             std::set<FuncInfo*> initSet;
4292             for (auto funcTmp : sortedFuncTable)
4293             {
4294                 initSet.insert(funcTmp);
4295             }
4296             domMap.insert(std::make_pair(func, initSet));
4297         }
4298 
4299         for (auto callee : func->getCallees())
4300         {
4301             std::map<FuncInfo*, std::set<FuncInfo*>>::iterator predMapIter = predMap.find(callee);
4302             if (predMapIter == predMap.end())
4303             {
4304                 std::set<FuncInfo*> initSet;
4305                 initSet.insert(func);
4306                 predMap.insert(std::make_pair(callee, initSet));
4307             }
4308             else
4309             {
4310                 predMapIter->second.insert(func);
4311             }
4312         }
4313     }
4314 
4315     bool changed = false;
4316     do
4317     {
4318         changed = false;
4319 
4320         unsigned funcTableSize = static_cast<unsigned> (sortedFuncTable.size());
4321         unsigned funcID = funcTableSize - 1;
4322         do
4323         {
4324             funcID--;
4325             FuncInfo* func = sortedFuncTable[funcID];
4326 
4327             std::map<FuncInfo*, std::set<FuncInfo*>>::iterator predMapIter = predMap.find(func);
4328             if (predMapIter != predMap.end())
4329             {
4330                 std::set<FuncInfo*>& domSet = (*domMap.find(func)).second;
4331                 std::set<FuncInfo*> oldDomSet = domSet;
4332                 domSet.clear();
4333                 domSet.insert(func);
4334 
4335                 std::vector<unsigned> funcVec(funcTableSize);
4336                 for (unsigned i = 0; i < funcTableSize; i++)
4337                 {
4338                     funcVec[i] = 0;
4339                 }
4340 
4341                 std::set<FuncInfo*>& predSet = (*predMapIter).second;
4342                 for (auto pred : predSet)
4343                 {
4344                     for (auto predDom : (*domMap.find(pred)).second)
4345                     {
4346                         unsigned domID = (predDom->getScopeID() == UINT_MAX) ? funcTableSize - 1 : predDom->getScopeID() - 1;
4347                         funcVec[domID]++;
4348                     }
4349                 }
4350 
4351                 unsigned predSetSize = static_cast<unsigned> (predSet.size());
4352                 for (unsigned i = 0; i < funcTableSize; i++)
4353                 {
4354                     if (funcVec[i] == predSetSize)
4355                     {
4356                         FuncInfo* newFunc = sortedFuncTable[i];
4357                         domSet.insert(newFunc);
4358                         if (oldDomSet.find(newFunc) == oldDomSet.end())
4359                             changed = true;
4360                     }
4361                 }
4362 
4363                 if (oldDomSet.size() != domSet.size())
4364                 {
4365                     changed = true;
4366                 }
4367             }
4368         } while (funcID != 0);
4369 
4370     } while (changed);
4371 }
4372 
4373 //
4374 // Check if func1 is a dominator of func2
4375 //
checkDominator(FuncInfo * func1,FuncInfo * func2,std::map<FuncInfo *,std::set<FuncInfo * >> & domMap)4376 static bool checkDominator(FuncInfo* func1, FuncInfo* func2, std::map<FuncInfo*, std::set<FuncInfo*>>& domMap)
4377 {
4378     std::map<FuncInfo*, std::set<FuncInfo*>>::iterator domMapIter = domMap.find(func2);
4379 
4380     if (domMapIter != domMap.end())
4381     {
4382         std::set<FuncInfo*> domSet = (*domMapIter).second;
4383         std::set<FuncInfo*>::iterator domSetIter = domSet.find(func1);
4384 
4385         if (domSetIter != domSet.end())
4386         {
4387             return true;
4388         }
4389     }
4390 
4391     return false;
4392 }
4393 
4394 //
4395 // Determine the scope of a varaible based on different contexts
4396 //
resolveVarScope(G4_Declare * dcl,FuncInfo * func)4397 unsigned FlowGraph::resolveVarScope(G4_Declare* dcl, FuncInfo* func)
4398 {
4399     unsigned oldID = dcl->getScopeID();
4400     unsigned newID = func->getScopeID();
4401 
4402     if (oldID == newID)
4403     {
4404         return oldID;
4405     }
4406     else if (oldID == 0)
4407     {
4408         return newID;
4409     }
4410     else if (oldID == UINT_MAX ||
4411         newID == UINT_MAX)
4412     {
4413         return UINT_MAX;
4414     }
4415     else if (builder->getOption(vISA_EnableGlobalScopeAnalysis))
4416     {
4417         // This is safe if the global variable usage is
4418         // self-contained under the calling function
4419         std::map<FuncInfo*, std::set<FuncInfo*>> domMap;
4420 
4421         findDominators(domMap);
4422 
4423         FuncInfo* oldFunc = sortedFuncTable[oldID - 1];
4424 
4425         if (checkVisitID(func, oldFunc) &&
4426             checkDominator(func, oldFunc, domMap))
4427         {
4428             return newID;
4429         }
4430         else if (checkVisitID(oldFunc, func) &&
4431             checkDominator(oldFunc, func, domMap))
4432         {
4433             return oldID;
4434         }
4435         else
4436         {
4437             unsigned start = (newID > oldID) ? newID : oldID;
4438             unsigned end = static_cast<unsigned> (sortedFuncTable.size());
4439 
4440             for (unsigned funcID = start; funcID != end; funcID++)
4441             {
4442                 FuncInfo* currFunc = sortedFuncTable[funcID];
4443                 if (checkVisitID(currFunc, func) &&
4444                     checkDominator(currFunc, func, domMap) &&
4445                     checkVisitID(currFunc, oldFunc) &&
4446                     checkDominator(currFunc, oldFunc, domMap))
4447                 {
4448                     return currFunc->getScopeID();
4449                 }
4450             }
4451         }
4452     }
4453 
4454     return UINT_MAX;
4455 }
4456 
4457 //
4458 // Visit all operands referenced in a function and update the varaible scope
4459 //
markVarScope(std::vector<G4_BB * > & BBList,FuncInfo * func)4460 void FlowGraph::markVarScope(std::vector<G4_BB*>& BBList, FuncInfo* func)
4461 {
4462     for (auto bb : BBList)
4463     {
4464         for (auto it = bb->begin(), end = bb->end(); it != end; it++)
4465         {
4466             G4_INST* inst = (*it);
4467 
4468             G4_DstRegRegion* dst = inst->getDst();
4469 
4470             if (dst &&
4471                 !dst->isAreg() &&
4472                 dst->getBase())
4473             {
4474                 G4_Declare* dcl = GetTopDclFromRegRegion(dst);
4475                 unsigned scopeID = resolveVarScope(dcl, func);
4476                 dcl->updateScopeID(scopeID);
4477             }
4478 
4479             for (int i = 0; i < G4_MAX_SRCS; i++)
4480             {
4481                 G4_Operand* src = inst->getSrc(i);
4482 
4483                 if (src && !src->isAreg())
4484                 {
4485                     if (src->isSrcRegRegion() &&
4486                         src->asSrcRegRegion()->getBase())
4487                     {
4488                         G4_Declare* dcl = GetTopDclFromRegRegion(src);
4489                         unsigned scopeID = resolveVarScope(dcl, func);
4490                         dcl->updateScopeID(scopeID);
4491                     }
4492                     else if (src->isAddrExp() &&
4493                         src->asAddrExp()->getRegVar())
4494                     {
4495                         G4_Declare* dcl = src->asAddrExp()->getRegVar()->getDeclare()->getRootDeclare();
4496                         unsigned scopeID = resolveVarScope(dcl, func);
4497                         dcl->updateScopeID(scopeID);
4498                     }
4499                 }
4500             }
4501         }
4502     }
4503 }
4504 
4505 //
4506 // Traverse the call graph and mark varaible scope
4507 //
markScope()4508 void FlowGraph::markScope()
4509 {
4510     if (funcInfoTable.size() == 0)
4511     {
4512         // no subroutines
4513         return;
4514     }
4515     unsigned id = 1;
4516     std::vector<FuncInfo *>::iterator kernelIter = sortedFuncTable.end();
4517     kernelIter--;
4518     for (std::vector<FuncInfo *>::iterator funcIter = sortedFuncTable.begin();
4519         funcIter != sortedFuncTable.end();
4520         ++funcIter)
4521     {
4522         if (funcIter == kernelIter)
4523         {
4524             id = UINT_MAX;
4525         }
4526 
4527         FuncInfo* func = (*funcIter);
4528         func->setScopeID(id);
4529 
4530         for (auto bb : func->getBBList())
4531         {
4532             bb->setScopeID(id);
4533         }
4534 
4535         id++;
4536     }
4537 
4538     for (auto func : sortedFuncTable)
4539     {
4540         markVarScope(func->getBBList(), func);
4541     }
4542 }
4543 
4544 // Return the mask for anyH or allH predicate control in the goto to be emitted.
getFlagMask(G4_Predicate_Control pCtrl)4545 static uint32_t getFlagMask(G4_Predicate_Control pCtrl)
4546 {
4547     switch (pCtrl)
4548     {
4549     case G4_Predicate_Control::PRED_ALL2H:
4550         return 0xFFFFFFFC;
4551     case G4_Predicate_Control::PRED_ALL4H:
4552         return 0xFFFFFFF0;
4553     case G4_Predicate_Control::PRED_ALL8H:
4554         return 0xFFFFFF00;
4555     case G4_Predicate_Control::PRED_ALL16H:
4556         return 0xFFFF0000;
4557     case G4_Predicate_Control::PRED_ALL32H:
4558         return 0x00000000;
4559     case G4_Predicate_Control::PRED_ANY2H:
4560         return 0x00000003;
4561     case G4_Predicate_Control::PRED_ANY4H:
4562         return 0x0000000F;
4563     case G4_Predicate_Control::PRED_ANY8H:
4564         return 0x000000FF;
4565     case G4_Predicate_Control::PRED_ANY16H:
4566         return 0x0000FFFF;
4567     case G4_Predicate_Control::PRED_ANY32H:
4568         return 0xFFFFFFFF;
4569     default:
4570         MUST_BE_TRUE(false, "only for AllH or AnyH predicate control");
4571         break;
4572     }
4573     return 0;
4574 }
4575 
4576 // Given a predicate ctrl for jmpi, return the adjusted predicate ctrl in a new
4577 // simd size.
getPredCtrl(unsigned simdSize,G4_Predicate_Control pCtrl,bool predCtrlHasWidth)4578 static G4_Predicate_Control getPredCtrl(unsigned simdSize,
4579     G4_Predicate_Control pCtrl, bool predCtrlHasWidth)
4580 {
4581     if (!predCtrlHasWidth)
4582     {
4583         return G4_Predicate::isAllH(pCtrl) ? PRED_ALL_WHOLE : PRED_ANY_WHOLE;
4584     }
4585 
4586     if (G4_Predicate::isAllH(pCtrl))
4587         return (simdSize == 8)
4588         ? PRED_ALL8H
4589         : (simdSize == 16) ? PRED_ALL16H
4590         : G4_Predicate_Control::PRED_ALL32H;
4591 
4592     // Any or default
4593     return (simdSize == 8)
4594         ? PRED_ANY8H
4595         : (simdSize == 16) ? PRED_ANY16H
4596         : G4_Predicate_Control::PRED_ANY32H;
4597 }
4598 
4599 // If BB has only one predecessor, return it;
4600 // if BB has two predecessors and one is ExcludedPred, return the other predecessor;
4601 // otherwise, return nullptr.
getSinglePredecessor(G4_BB * BB,G4_BB * ExcludedPred) const4602 G4_BB* FlowGraph::getSinglePredecessor(G4_BB* BB, G4_BB* ExcludedPred) const
4603 {
4604     if (BB->Preds.size() != 1) {
4605         if (BB->Preds.size() == 2) {
4606             if (BB->Preds.front() == ExcludedPred)
4607                 return BB->Preds.back();
4608             if (BB->Preds.back() == ExcludedPred)
4609                 return BB->Preds.front();
4610         }
4611         return nullptr;
4612     }
4613     return BB->Preds.front();
4614 }
4615 
4616 // Convert jmpi to goto. E.g.
4617 //
4618 // Case1:
4619 // .decl P1 v_type=P num_elts=2
4620 //
4621 // cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4622 // (!P1.any) jmpi (M1, 1) BB1
4623 //
4624 // ===>
4625 //
4626 // cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4627 // and (1) P1 P1 0b00000011
4628 // (!P2.any) goto (M1, 8) BB1
4629 //
4630 // Case2:
4631 // .decl P1 v_type=P num_elts=2
4632 //
4633 //  cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4634 // (!P1.all) jmpi (M1, 1) BB1
4635 //
4636 // ===>
4637 //
4638 // cmp.ne (M1, 2) P1 V33(0,0)<2;2,1> 0x0:f
4639 // or (1) P1 P1 0b11111100
4640 // (!P1.all) goto (M1, 8) BB1
4641 //
convertJmpiToGoto()4642 bool FlowGraph::convertJmpiToGoto()
4643 {
4644     bool Changed = false;
4645     for (auto bb : BBs)
4646     {
4647         for (auto I = bb->begin(), IEnd = bb->end(); I != IEnd; ++I)
4648         {
4649             G4_INST *inst = *I;
4650             if (inst->opcode() != G4_jmpi)
4651                 continue;
4652 
4653             unsigned predSize = pKernel->getSimdSize();
4654             G4_Predicate *newPred = nullptr;
4655 
4656             if (G4_Predicate *pred = inst->getPredicate())
4657             {
4658                 // The number of bool elements in vISA decl.
4659                 unsigned nElts = pred->getTopDcl()->getNumberFlagElements();
4660 
4661                 // Since we need to turn this into goto, set high bits properly.
4662                 if (nElts != predSize)
4663                 {
4664                     // The underlying dcl type is either uw or ud.
4665                     G4_Type SrcTy = pred->getTopDcl()->getElemType();
4666                     G4_Type DstTy = (predSize > 16) ? Type_UD : Type_UW;
4667 
4668                     G4_Predicate_Control pCtrl = pred->getControl();
4669                     MUST_BE_TRUE(nElts == 1 ||
4670                         G4_Predicate::isAnyH(pCtrl) ||
4671                         G4_Predicate::isAllH(pCtrl),
4672                         "predicate control not handled yet");
4673 
4674                     // Common dst and src0 operand for flag.
4675                     G4_Declare *newDcl = builder->createTempFlag(predSize > 16 ? 2 : 1);
4676                     auto pDst = builder->createDst(newDcl->getRegVar(), 0, 0, 1, DstTy);
4677                     auto pSrc0 = builder->createSrc(
4678                         pred->getBase(), 0, 0, builder->getRegionScalar(), SrcTy);
4679 
4680                     auto truncMask = [](uint32_t mask, G4_Type Ty) -> uint64_t
4681                     {
4682                         return (Ty == Type_UW) ? uint16_t(mask) : mask;
4683                     };
4684 
4685                     if (pCtrl == G4_Predicate_Control::PRED_DEFAULT)
4686                     {
4687                         // P = P & 1
4688                         auto pSrc1 = builder->createImm(1, Type_UW);
4689                         auto pInst = builder->createBinOp(
4690                             G4_and, g4::SIMD1, pDst, pSrc0, pSrc1,
4691                             InstOpt_M0 | InstOpt_WriteEnable, false);
4692                         bb->insertBefore(I, pInst);
4693                     }
4694                     else if (G4_Predicate::isAnyH(pCtrl))
4695                     {
4696                         // P = P & mask
4697                         uint32_t mask = getFlagMask(pCtrl);
4698                         auto pSrc1 = builder->createImm(truncMask(mask, DstTy), DstTy);
4699                         auto pInst = builder->createBinOp(
4700                             G4_and, g4::SIMD1, pDst, pSrc0, pSrc1,
4701                             InstOpt_M0 | InstOpt_WriteEnable, false);
4702                         bb->insertBefore(I, pInst);
4703                     }
4704                     else
4705                     {
4706                         // AllH
4707                         // P = P | mask
4708                         uint32_t mask = getFlagMask(pCtrl);
4709                         auto pSrc1 = builder->createImm(truncMask(mask, DstTy), DstTy);
4710                         auto pInst = builder->createBinOp(
4711                             G4_or, g4::SIMD1, pDst, pSrc0, pSrc1,
4712                             InstOpt_M0 | InstOpt_WriteEnable, false);
4713                         bb->insertBefore(I, pInst);
4714                     }
4715 
4716                     // Adjust pred control to the new execution size and build the
4717                     // new predicate.
4718                     pCtrl = getPredCtrl(predSize, pCtrl, builder->predCtrlHasWidth());
4719                     newPred = builder->createPredicate(
4720                         pred->getState(), newDcl->getRegVar(), 0, pCtrl);
4721                 }
4722             }
4723 
4724             // (!P) jmpi L
4725             // becomes:
4726             // P = P & MASK
4727             // (!P.anyN) goto (N) L
4728             inst->setOpcode(G4_goto);
4729             inst->setExecSize(G4_ExecSize(predSize));
4730             if (newPred)
4731                 inst->setPredicate(newPred);
4732             inst->asCFInst()->setUip(inst->getSrc(0)->asLabel());
4733             inst->setSrc(nullptr, 0);
4734             inst->setOptions(InstOpt_M0);
4735             Changed = true;
4736         }
4737     }
4738     return Changed;
4739 }
4740 
print(std::ostream & OS) const4741 void FlowGraph::print(std::ostream& OS) const
4742 {
4743     const char* kname = nullptr;
4744     if (getKernel()) {
4745         kname = getKernel()->getName();
4746     }
4747     kname = kname ? kname : "unnamed";
4748     OS  << "\n\nCFG: "
4749         << kname
4750         << (builder->getIsKernel() ? " [kernel]" : " [non-kernel function]")
4751         << "\n\n";
4752     for (auto BB : BBs) {
4753         BB->print(OS);
4754     }
4755 }
4756 
dump() const4757 void FlowGraph::dump() const
4758 {
4759     print(std::cerr);
4760 }
4761 
~FlowGraph()4762 FlowGraph::~FlowGraph()
4763 {
4764     // even though G4_BBs are allocated in a mem pool and freed in one shot,
4765     // we must call each BB's desstructor explicitly to free up the memory used
4766     // by the STL objects(list, vector, etc.) in each BB
4767     for (G4_BB* bb : BBAllocList)
4768     {
4769         bb->~G4_BB();
4770     }
4771     BBAllocList.clear();
4772     globalOpndHT.clearHashTable();
4773     for (auto funcInfo : funcInfoTable)
4774     {
4775         funcInfo->~FuncInfo();
4776     }
4777     if (kernelInfo)
4778     {
4779         kernelInfo->~FuncInfo();
4780     }
4781     for (auto summary : localRASummaries)
4782     {
4783         summary->~PhyRegSummary();
4784     }
4785 }
4786 
createRelocation(G4_Kernel & kernel,G4_INST & inst,int opndPos,const std::string & symbolName,RelocationType type)4787 RelocationEntry& RelocationEntry::createRelocation(
4788     G4_Kernel& kernel, G4_INST& inst,
4789     int opndPos, const std::string& symbolName,
4790     RelocationType type)
4791 {
4792     kernel.getRelocationTable().emplace_back(
4793         RelocationEntry(&inst, opndPos, type, symbolName));
4794     return kernel.getRelocationTable().back();
4795 }
4796 
doRelocation(const G4_Kernel & kernel,void * binary,uint32_t binarySize)4797 void RelocationEntry::doRelocation(
4798     const G4_Kernel& kernel, void* binary, uint32_t binarySize)
4799 {
4800     // FIXME: nothing to do here
4801     // we have only dynamic relocations now, which cannot be resolved at compilation time
4802 }
4803 
getTargetOffset(const IR_Builder & builder) const4804 uint32_t RelocationEntry::getTargetOffset(const IR_Builder& builder) const
4805 {
4806     // instruction being relocated must not be compacted, or the offset need to be re-adjusted
4807     // FIXME: This only check if vISA force to compact the instruction, it cannot make sure
4808     // the Binary encoder won't compact it
4809     assert(inst->isCompactedInst() == false);
4810 
4811     G4_Operand* target_operand = inst->getSrc(opndPos);
4812     assert(target_operand->isRelocImm());
4813 
4814     switch (inst->opcode()) {
4815     case G4_mov:
4816         // When src0 type is 64 bits:
4817         //  On Pre-Xe:
4818         //   Src0.imm[31:0] mapped to Instruction [95:64]
4819         //   Src0.imm[63:32] mapped to Instruction [127:96]
4820         //  On Xe+:
4821         //   Src0.imm[31:0] mapped to Instruction [127:96]
4822         //   Src0.imm[63:32] mapped to Instruction [95:64]
4823         // When src0 type is 32 bits:
4824         //   Src0.imm[31:0] mapped to instruction [127:96]
4825         assert((target_operand->getType() == Type_UD) || (target_operand->getType() == Type_D) ||
4826             (target_operand->getType() == Type_UQ) || (target_operand->getType() == Type_Q));
4827         assert(opndPos == 0);
4828         return (target_operand->getType() == Type_UD || target_operand->getType() == Type_D) ? 12 : 8;
4829 
4830     case G4_add:
4831         // add instruction cannot have 64-bit imm
4832         assert(relocType != R_SYM_ADDR && relocType != R_SYM_ADDR_32_HI);
4833         assert(opndPos == 1);
4834         // Src1.imm[31:0] mapped to Instruction [127:96]
4835         return 12;
4836     default:
4837         break;
4838     }
4839 
4840     // currently we only support relocation on mov or add instruction
4841     assert(false && "Unreachable");
4842     return 0;
4843 }
4844 
getTypeString() const4845 const char * RelocationEntry::getTypeString() const
4846 {
4847     switch (relocType)
4848     {
4849     case RelocationType::R_SYM_ADDR:
4850         return "R_SYM_ADDR";
4851     case RelocationType::R_SYM_ADDR_32:
4852         return "R_SYM_ADDR_32";
4853     case RelocationType::R_SYM_ADDR_32_HI:
4854         return "R_SYM_ADDR_32_HI";
4855     case RelocationType::R_PER_THREAD_PAYLOAD_OFFSET_32:
4856         return "R_PER_THREAD_PAYLOAD_OFFSET_32";
4857     case RelocationType::R_GLOBAL_IMM_32:
4858         return "R_GLOBAL_IMM_32";
4859     default:
4860         assert(false && "unhandled relocation type");
4861         return "";
4862     }
4863 }
4864 
dump(std::ostream & os) const4865 void RelocationEntry::dump(std::ostream &os) const
4866 {
4867     os << "Relocation entry: " << getTypeString() << "\n";
4868     os << "\t";
4869     inst->dump();
4870     switch (relocType)
4871     {
4872         case RelocationType::R_NONE:
4873             os << "R_NONE: symbol name = " << symName;
4874             break;
4875         case RelocationType::R_SYM_ADDR:
4876             os << "R_SYM_ADDR: symbol name = " << symName;
4877             break;
4878         case RelocationType::R_SYM_ADDR_32:
4879             os << "R_SYM_ADDR_32: symbol name = " << symName;
4880             break;
4881         case RelocationType::R_SYM_ADDR_32_HI:
4882             os << "R_SYM_ADDR_32_HI: symbol name = " << symName;
4883             break;
4884         case RelocationType::R_PER_THREAD_PAYLOAD_OFFSET_32:
4885             os << "R_PER_THREAD_PAYLOAD_OFFSET_32: symbol name = " << symName;
4886             break;
4887         case RelocationType::R_GLOBAL_IMM_32:
4888             os << "R_GLOBAL_IMM_32: symbol name = " << symName;
4889             break;
4890     }
4891     os << "\n";
4892 }
4893 
dump() const4894 void FuncInfo::dump() const
4895 {
4896     std::cerr << "subroutine " << getId() << "(" << getInitBB()->front()->getLabelStr() << ")\n";
4897     std::cerr << "\tentryBB=" << getInitBB()->getId() << ", exitBB=" << getExitBB()->getId() << "\n";
4898     std::cerr << "\tCallees: ";
4899     for (auto callee : callees)
4900     {
4901         std::cerr << callee->getId() << " ";
4902     }
4903     std::cerr << "\n\tBB list: ";
4904     for (auto bb : BBList)
4905     {
4906         std::cerr << bb->getId() << " ";
4907     }
4908     std::cerr << "\n";
4909 }
4910