1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "G4_BB.hpp"
10 #include "BuildIR.h"
11 #include "LocalRA.h" // SECOND_HALF_BANK_START_GRF
12 
13 #include <ostream>
14 
15 using namespace vISA;
16 
17 
isSuccBB(G4_BB * succ) const18 bool G4_BB::isSuccBB(G4_BB* succ) const
19 {
20     for (auto it = Succs.begin(), bbEnd = Succs.end(); it != bbEnd; ++it)
21     {
22         if ((*it) == succ)  return true;
23     }
24     return false;
25 }
26 
getKernel() const27 G4_Kernel& G4_BB::getKernel() const
28 {
29     return *getParent().getKernel();
30 }
31 
32 //
33 // to check if the last instruction in list is EOT
34 //
isLastInstEOT() const35 bool G4_BB::isLastInstEOT() const
36 {
37     if (instList.size() == 0)
38     {
39         return false;
40     }
41 
42     G4_INST *i = instList.back();
43 
44     if (parent->builder->hasSendShootdown())
45     {
46         // due to send shootdown, a predicated send may not actually be an EOT
47         return i->isEOT() && i->getPredicate() == NULL;
48     }
49     else
50     {
51         return i->isEOT();
52     }
53 }
54 
getLastOpcode() const55 G4_opcode G4_BB::getLastOpcode() const
56 {
57     const G4_INST *i = instList.empty() ? nullptr : instList.back();
58     if (i)
59     {
60         return i->opcode();
61     }
62     else
63     {
64         return G4_illegal;
65     }
66 }
67 
setId(unsigned i)68 void G4_BB::setId(unsigned i)
69 {
70     // some analysis passes rely on G4_BB id
71     if (id != i)
72         getParent().markStale();
73     id = i;
74 }
75 
76 
removePredEdge(G4_BB * pred)77 void G4_BB::removePredEdge(G4_BB* pred)
78 {
79     for (std::list<G4_BB*>::iterator it = Preds.begin(), bbEnd = Preds.end();
80         it != bbEnd; ++it)
81     {
82         if (*it != pred) continue;
83         // found
84         Preds.erase(it);
85         getParent().markStale();
86         return;
87     }
88     MUST_BE_TRUE(false, ERROR_FLOWGRAPH); // edge is not found
89 }
90 
removeSuccEdge(G4_BB * succ)91 void G4_BB::removeSuccEdge(G4_BB* succ)
92 {
93     for (std::list<G4_BB*>::iterator it = Succs.begin(), bbEnd = Succs.end(); it != bbEnd; ++it)
94     {
95         if (*it != succ) continue;
96         // found
97         Succs.erase(it);
98         getParent().markStale();
99         return;
100     }
101     MUST_BE_TRUE(false, ERROR_FLOWGRAPH); // edge is not found
102 }
103 
104 //
105 // find the fall-through BB of the current block.
106 // if the last inst is a unconditional jump, then the target is not considered a fall-through BB
107 // NOTE: Pay attention this function is only works after the handleReturn() duo the the conditional CALL
108 //
fallThroughBB()109 G4_BB * G4_BB::fallThroughBB()
110 {
111     G4_INST* last = (!instList.empty()) ? instList.back() : NULL;
112 
113     if (last)
114     {
115         if (last->opcode() == G4_goto || last->opcode() == G4_join)
116         {
117             return nullptr;
118         }
119         if (last->isFlowControl())
120         {
121             // if No successor, return NULL;
122             if (Succs.empty())
123             {
124                 return nullptr;
125             }
126 
127             //
128             // Instructions    Predicate-On    Predicate-Off    Num of Succ
129             // Jmpi               Front                None               >=1
130             // CALL               Front                None               >=2     considered the conditional call here
131             // while              Front                Front              2
132             // if, else           Front                Front              2
133             // break, cont        Front                None               1,2
134             // return             Front                None               >=1
135             // do                 Front                Front              1
136             // endif              Front                Front              1
137             if (last->isCall())
138             {
139                 return BBAfterCall();
140             }
141             else if (!last->getPredicate() &&
142                 // G4_while considered to fall trhu even without pred, since break jumps to while
143                 (last->opcode() == G4_jmpi || last->opcode() == G4_break || last->opcode() == G4_cont || last->isReturn()))
144             {
145                 return nullptr;
146             }
147             else
148             {
149                 return Succs.front();
150             }
151         }
152     }
153 
154     //
155     // process other cases
156     //
157     if (Succs.size() == 0) // exit BB
158         return NULL; // no fall-through BB
159     else
160         return Succs.front();
161 }
162 
BBBeforeCall() const163 G4_BB * G4_BB::BBBeforeCall() const
164 {
165     assert((getBBType() & G4_BB_RETURN_TYPE) && "this must be a subroutine return BB");
166     return physicalPred;
167 }
168 
BBAfterCall() const169 G4_BB * G4_BB::BBAfterCall() const
170 {
171     assert((getBBType() & G4_BB_CALL_TYPE) && "this must be a subroutine call BB");
172     return physicalSucc;
173 }
174 
isAllLaneActive() const175 bool G4_BB::isAllLaneActive() const
176 {
177     G4_Kernel* pK = parent->getKernel();
178     if (pK->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM && !isDivergent())
179     {
180         // CM: if BB isn't divergent, all lanes (32) must be active (dmask = 0xFFFFFFFF)
181         return true;
182     }
183     return false;
184 }
185 
186 
emit(std::ostream & output)187 void G4_BB::emit(std::ostream& output)
188 {
189     for (INST_LIST_ITER it = instList.begin(); it != instList.end(); ++it)
190     {
191         emitInstruction(output, it);
192     }
193 }
emitInstruction(std::ostream & output,INST_LIST_ITER & it)194 void G4_BB::emitInstruction(std::ostream& output, INST_LIST_ITER &it)
195 {
196     // prints out instruction line
197     if (!parent->getKernel()->getOptions()->getOption(vISA_disableInstDebugInfo))
198     {
199         emitInstructionSourceLineMapping(output, it);
200     }
201 
202     emitBasicInstruction(output, it);
203 
204     output << "\n";
205 }
emitBasicInstruction(std::ostream & output,INST_LIST_ITER & it)206 void G4_BB::emitBasicInstruction(std::ostream& output, INST_LIST_ITER &it)
207 {
208     if ((*it)->isSend())
209     {
210         //
211         // emit send instruction
212         //
213         G4_InstSend* SendInst = (*it)->asSendInst();
214         if( SendInst )
215         {
216             SendInst->emit_send(output);
217             SendInst->emit_send_desc(output);
218         }
219     }
220     else
221     {
222         //
223         // emit label and instruction
224         //
225         G4_INST *inst = *it;
226         inst->emit(output, parent->builder->getOption(vISA_SymbolReg));
227         if ((*it)->isLabel() == false)
228         {
229             emitBankConflict(output, inst);
230         }
231     }
232 
233 }
emitBasicInstructionComment(std::ostream & output,INST_LIST_ITER & it,int * suppressRegs,int * lastRegs)234 void G4_BB::emitBasicInstructionComment(
235     std::ostream& output,
236     INST_LIST_ITER &it,
237     int *suppressRegs, int *lastRegs)
238 {
239     const G4_INST* inst = *it;
240 
241     auto platform = inst->getPlatform();
242 
243     if (!inst->isLabel() && inst->opcode() < G4_NUM_OPCODE)
244     {
245         output << " // ";
246 
247         auto comments = inst->getComments();
248         if (!comments.empty()) {
249             output << " " << comments << "; ";
250         }
251         int vISAId = inst->getCISAOff();
252         if (vISAId != -1) {
253             output << "$" << vISAId;
254         }
255 
256         if (getParent().getKernel()->getOption(vISA_DumpSBID))
257         {
258             int lexicalId = inst->getLexicalId();
259             if (lexicalId != -1) {
260                 output << "&" << lexicalId;
261             }
262         }
263 
264         if (getParent().getKernel()->getOption(vISA_DumpGenOffset) &&
265             inst->getBinInst())
266         {
267             output << ":%" << inst->getGenOffset();
268         }
269 
270         if (getPlatformGeneration(platform) < PlatformGen::XE)
271         {
272             emitBankConflict(output, inst);
273         }
274         else
275         {
276             int sameBankConflicts = 0;
277             int twoSrcConflicts = 0;
278             int simd16SuppressionConflicts = 0;
279             unsigned BCNum = 0;
280             if (parent->builder->hasEarlyGRFRead())
281             {
282                 BCNum = emitBankConflictXeLP(
283                     output, inst, suppressRegs, lastRegs,
284                     sameBankConflicts, twoSrcConflicts, simd16SuppressionConflicts);
285             }
286             else
287             {
288                 BCNum = emitBankConflictXe(
289                     output, inst, suppressRegs,
290                     sameBankConflicts, twoSrcConflicts, simd16SuppressionConflicts,
291                     parent->builder->hasOneGRFBank16Bundles(),
292                     platform == GENX_TGLLP, parent->builder->has64bundleSize());
293             }
294             parent->XeBCStats.addBC(BCNum);
295             parent->XeBCStats.addSameBankBC(sameBankConflicts);
296             parent->XeBCStats.add2SrcBC(twoSrcConflicts);
297             parent->XeBCStats.addSimd16RSBC(simd16SuppressionConflicts);
298             parent->numRMWs += countReadModifyWrite(output, inst);
299         }
300     }
301 
302 }
303 
// State shared across calls of G4_BB::emitInstructionSourceLineMapping(): the source file
// name and source line number most recently written to the output, so that an unchanged
// file or line is not emitted again.
// NOTE(review): _THREAD is presumably a thread-local storage specifier - confirm against its definition.
_THREAD const char* g4_prevFilename;
_THREAD int g4_prevSrcLineNo;
306 
emitInstructionSourceLineMapping(std::ostream & output,INST_LIST_ITER & it)307 void G4_BB::emitInstructionSourceLineMapping(std::ostream& output, INST_LIST_ITER &it)
308 {
309     bool emitFile = false, emitLineNo = false;
310     const char* curFilename = (*it)->getSrcFilename();
311     int curSrcLineNo = (*it)->getLineNo();
312 
313     if ((*it)->isLabel())
314     {
315         return;
316     }
317 
318     if (curFilename && (g4_prevFilename == nullptr || strcmp(g4_prevFilename, curFilename) != 0))
319     {
320         emitFile = true;
321     }
322 
323     if (g4_prevSrcLineNo != curSrcLineNo && curSrcLineNo != 0)
324     {
325         emitLineNo = true;
326     }
327 
328     if (emitFile)
329     {
330         output << "\n// File: " << curFilename << "\n";
331     }
332 
333     if (emitLineNo)
334     {
335         output << "\n// Line " << curSrcLineNo;
336         if (curFilename)
337         {
338             std::string curLine = parent->getKernel()->getDebugSrcLine(curFilename, curSrcLineNo);
339             if (!curLine.empty()) {
340                 auto isNotSpace = [](int ch) { return !std::isspace(ch); };
341                 curLine.erase(curLine.begin(), std::find_if(curLine.begin(), curLine.end(), isNotSpace));
342                 curLine.erase(std::find_if(curLine.rbegin(), curLine.rend(), isNotSpace).base(), curLine.end());
343                 output << ":  " << curLine;
344             }
345         }
346         output << "\n";
347     }
348 
349     if (emitFile)
350     {
351         g4_prevFilename = curFilename;
352     }
353 
354     if (emitLineNo)
355     {
356         g4_prevSrcLineNo = curSrcLineNo;
357     }
358 }
359 
emitBankConflict(std::ostream & output,const G4_INST * inst)360 void G4_BB::emitBankConflict(std::ostream& output, const G4_INST *inst)
361 {
362     int regNum[2][G4_MAX_SRCS];
363     int execSize[G4_MAX_SRCS];
364     int regSrcNum = 0;
365 
366     if (inst->isDpas()) {
367         return;
368     }
369 
370     if (inst->getNumSrc() == 3 && !inst->isSend())
371     {
372         for (unsigned i = 0; i < 3; i++)
373         {
374             G4_Operand * srcOpnd = inst->getSrc(i);
375             regNum[1][i] = -1;
376             if (srcOpnd)
377             {
378                 if (srcOpnd->isSrcRegRegion() &&
379                     srcOpnd->asSrcRegRegion()->getBase() &&
380                     srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
381                 {
382                     G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
383                     if (baseVar->isGreg()) {
384                         uint32_t byteAddress = srcOpnd->getLinearizedStart();
385                         if (byteAddress != 0) {
386                             regNum[0][i] = byteAddress / numEltPerGRF<Type_UB>();
387                         }
388                         else {
389                             // before RA, use the value in Greg directly
390                             regNum[0][i] = baseVar->getPhyReg()->asGreg()->getRegNum();
391                         }
392                         regNum[1][i] = regNum[0][i];
393                         regSrcNum++;
394                     }
395                     execSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart();
396                 }
397             }
398         }
399     }
400 
401     if (regSrcNum == 3)
402     {
403         int maxGRFNum = 0;
404         output << " {";
405         if (parent->builder->oneGRFBankDivision())
406         {//EVEN/ODD
407             for (int i = 0; i < 3; i++)
408             {
409                 output << i << "=";
410                 if (!(regNum[0][i] % 2) && regNum[0][i] < SECOND_HALF_BANK_START_GRF)
411                 {
412                     output << "EL, ";
413                 }
414                 if (regNum[0][i] % 2 && regNum[0][i] < SECOND_HALF_BANK_START_GRF)
415                 {
416                     output << "OL, ";
417                 }
418                 if (!(regNum[0][i] % 2) && regNum[0][i] >= SECOND_HALF_BANK_START_GRF)
419                 {
420                     output << "EH, ";
421                 }
422                 if (regNum[0][i] % 2 && regNum[0][i] >= SECOND_HALF_BANK_START_GRF)
423                 {
424                     output << "OH, ";
425                 }
426             }
427         }
428         else
429         { //EVEN EVEN/ODD ODD
430             for (int i = 0; i < 3; i++)
431             {
432                 output << i << "=";
433                 for (int j = 0; j < (execSize[i] + (int)numEltPerGRF<Type_UB>() - 1) / (int)numEltPerGRF<Type_UB>(); j++)
434                 {
435                     int reg_num = regNum[0][i] + j;
436                     if (!(reg_num & 0x02) && reg_num < SECOND_HALF_BANK_START_GRF)
437                     {
438                         output << "EL, ";
439                     }
440                     if ((reg_num & 0x02) && reg_num < SECOND_HALF_BANK_START_GRF)
441                     {
442                         output << "OL, ";
443                     }
444                     if (!(reg_num & 0x02) && reg_num >= SECOND_HALF_BANK_START_GRF)
445                     {
446                         output << "EH, ";
447                     }
448                     if ((reg_num & 0x02) && reg_num >= SECOND_HALF_BANK_START_GRF)
449                     {
450                         output << "OH, ";
451                     }
452                     if (j > 1)
453                     {
454                         regNum[1][i] = reg_num;
455                     }
456                 }
457                 maxGRFNum = ((execSize[i] + (int)numEltPerGRF<Type_UB>() - 1) / (int)numEltPerGRF<Type_UB>()) > maxGRFNum ?
458                     ((execSize[i] + (int)numEltPerGRF<Type_UB>() - 1) / (int)numEltPerGRF<Type_UB>()) : maxGRFNum;
459             }
460         }
461         output << "BC=";
462         if (!parent->builder->twoSourcesCollision())
463         {
464             if (!parent->builder->oneGRFBankDivision())
465             { //EVEN EVEN/ODD ODD
466                 ASSERT_USER(maxGRFNum < 3, "Not supporting register size > 2");
467                 if (maxGRFNum == 2)
468                 {
469                     for (int i = 0; i < maxGRFNum; i++)
470                     {
471                         if ((regNum[i][1] & 0x02) == (regNum[i][2] & 0x02))
472                         {
473                             if ((regNum[i][1] < SECOND_HALF_BANK_START_GRF &&
474                                 regNum[i][2] < SECOND_HALF_BANK_START_GRF) ||
475                                 (regNum[i][1] >= SECOND_HALF_BANK_START_GRF &&
476                                     regNum[i][2] >= SECOND_HALF_BANK_START_GRF))
477                             {
478                                 parent->BCStats.addBad();
479                                 output << "BAD,";
480                             }
481                             else
482                             {
483                                 parent->BCStats.addOK();
484                                 output << "OK,";
485                             }
486                         }
487                         else
488                         {
489                             parent->BCStats.addGood();
490                             output << "GOOD,";
491                         }
492                     }
493                 }
494                 else
495                 {
496                     for (int i = 0; i < maxGRFNum; i++)
497                     {
498                         if (((regNum[i][1] & 0x02) == (regNum[i][2] & 0x02)) &&
499                             ((regNum[i][0] & 0x02) == (regNum[i][1] & 0x02)))
500                         {
501                             if ((regNum[i][0] < SECOND_HALF_BANK_START_GRF &&
502                                 regNum[i][1] < SECOND_HALF_BANK_START_GRF &&
503                                 regNum[i][2] < SECOND_HALF_BANK_START_GRF) ||
504                                 (regNum[i][0] >= SECOND_HALF_BANK_START_GRF &&
505                                     regNum[i][1] >= SECOND_HALF_BANK_START_GRF &&
506                                     regNum[i][2] >= SECOND_HALF_BANK_START_GRF))
507                             {
508                                 parent->BCStats.addBad();
509                                 output << "BAD,";
510                             }
511                             else
512                             {
513                                 parent->BCStats.addOK();
514                                 output << "OK,";
515                             }
516                         }
517                         else
518                         {
519                             parent->BCStats.addGood();
520                             output << "GOOD,";
521                         }
522                     }
523                 }
524             }
525             else
526             {  //EVEN/ODD
527                 if ((regNum[0][1] % 2) != (regNum[0][2] % 2) ||
528                     (regNum[0][0] % 2) != (regNum[0][1] % 2) ||
529                     (regNum[0][1] == regNum[0][2]))
530                 {
531                     parent->BCStats.addGood();
532                     output << "GOOD";
533                 }
534                 else
535                 {
536                     if ((regNum[0][0] < SECOND_HALF_BANK_START_GRF &&
537                         regNum[0][1] < SECOND_HALF_BANK_START_GRF &&
538                         regNum[0][2] < SECOND_HALF_BANK_START_GRF) ||
539                         (regNum[0][0] >= SECOND_HALF_BANK_START_GRF &&
540                             regNum[0][1] >= SECOND_HALF_BANK_START_GRF &&
541                             regNum[0][2] >= SECOND_HALF_BANK_START_GRF))
542                     {
543                         parent->BCStats.addBad();
544                         output << "BAD";
545                     }
546                     else
547                     {
548                         parent->BCStats.addOK();
549                         output << "OK";
550                     }
551                 }
552             }
553         }
554         else  //Two source
555         {  //   EVEN/ODD
556             if ((regNum[0][1] != regNum[0][2]) &&
557                 ((regNum[0][1] % 2) == (regNum[0][2] % 2)))
558             {
559                 if ((regNum[0][1] < SECOND_HALF_BANK_START_GRF &&
560                     regNum[0][2] < SECOND_HALF_BANK_START_GRF) ||
561                     (regNum[0][1] >= SECOND_HALF_BANK_START_GRF &&
562                         regNum[0][2] >= SECOND_HALF_BANK_START_GRF))
563                 {
564                     parent->BCStats.addBad();
565                     output << "BAD";
566                 }
567                 else
568                 {
569                     parent->BCStats.addOK();
570                     output << "OK";
571                 }
572             }
573             else
574             {
575                 parent->BCStats.addGood();
576                 output << "GOOD";
577             }
578         }
579         output << "}";
580     }
581 }
582 
// Returns true unless reg holds the -1 marker written by setInValidReg().
static bool isValidReg(int reg)
{
    const bool isInvalidMarker = (reg == -1);
    return !isInvalidMarker;
}
587 
// Overwrites reg with -1, the value isValidReg() treats as "no register".
static void setInValidReg(int &reg)
{
    constexpr int invalidRegMarker = -1;
    reg = invalidRegMarker;
}
592 
getConflictTimesForTGLLP(std::ostream & output,int * firstRegCandidate,int & sameBankConflicts)593 static int getConflictTimesForTGLLP(
594     std::ostream& output, int *firstRegCandidate, int &sameBankConflicts)
595 {
596     int conflictTimes = 0;
597     int bundles[2][8];
598     int bankSrcs[2];
599 
600     for (int i = 0; i < 2; i++)
601     {
602         for (int j = 0; j < 8; j++)
603         {
604             bundles[i][j] = -1;
605         }
606         bankSrcs[i] = 0;
607     }
608 
609     output << "{";
610     for (int i = 0; i < G4_MAX_SRCS; i++)
611     {
612         if (isValidReg(firstRegCandidate[i]))
613         {
614             int bundleID = (firstRegCandidate[i] % 16) / 2;
615             int bankID = firstRegCandidate[i] % 2;
616 
617             //Same bank and same bundle
618             if (bundles[bankID][bundleID] != -1)
619             {
620                 conflictTimes++;
621             }
622 
623             bundles[bankID][bundleID] = i;
624             bankSrcs[bankID]++;
625             if (bankID == 0)
626             {
627                 output << "E:";
628             }
629             else
630             {
631                 output << "O:";
632             }
633             output << bundleID << ",";
634         }
635     }
636 
637     //Same bank but different bundles
638     if (conflictTimes == 0 &&
639         (bankSrcs[0] > 2 ||
640             bankSrcs[1] > 2))
641     {
642         conflictTimes++;
643         sameBankConflicts ++;
644     }
645     else if  (bankSrcs[0] > 2 ||
646         bankSrcs[1] > 2)
647     {
648         sameBankConflicts ++;
649     }
650 
651     output << "}, ";
652 
653     return conflictTimes;
654 }
655 
getConflictTimesForTGL(std::ostream & output,int * firstRegCandidate,int & sameBankConflicts,bool zeroOne,bool isTGLLP,bool reducedBundles)656 int G4_BB::getConflictTimesForTGL(
657     std::ostream& output, int *firstRegCandidate,
658     int &sameBankConflicts, bool zeroOne, bool isTGLLP, bool reducedBundles)
659 {
660     int conflictTimes = 0;
661     int bundles[2][16];
662     int bankSrcs[2];
663 
664     for (int i = 0; i < 2; i++)
665     {
666         for (int j = 0; j < 16; j++)
667         {
668             bundles[i][j] = -1;
669         }
670         bankSrcs[i] = 0;
671     }
672 
673     output << "{";
674     for (int i = 0; i < G4_MAX_SRCS; i++)
675     {
676         bool same_register = false;
677 
678         if (isValidReg(firstRegCandidate[i]))
679         {
680             for (int j = 0; j < i; j++)
681             {
682                 if (isValidReg(firstRegCandidate[j]) && j != i)
683                 {
684                     if (firstRegCandidate[j] == firstRegCandidate[i])
685                     {
686                         same_register = true;
687                         break;
688                     }
689                 }
690             }
691 
692             if (same_register)
693             {
694                 continue;
695             }
696 
697             int bundleID = (firstRegCandidate[i] % 64) / 4;
698             int bankID = (firstRegCandidate[i] % 4) / 2;
699             if (isTGLLP)
700             {
701                 bankID = (firstRegCandidate[i]) % 2;
702                 bundleID = (firstRegCandidate[i] % 16) / 2;
703             }
704             else if (zeroOne)
705             {
706                 bankID = (firstRegCandidate[i]) % 2;
707                 bundleID = (firstRegCandidate[i] % 32) / 2;
708             }
709 
710             if(reducedBundles)
711             {
712                 bundleID = (firstRegCandidate[i] % 16) / 2;
713             }
714             //Same bank and same bundle
715             if (bundles[bankID][bundleID] != -1)  //Same bank and same bundle
716             {
717                 conflictTimes++;
718             }
719 
720             bundles[bankID][bundleID] = i;
721             bankSrcs[bankID]++;
722             if (bankID == 0)
723             {
724                 output << "E:";
725             }
726             else
727             {
728                 output << "O:";
729             }
730             output << bundleID << ",";
731         }
732     }
733 
734     //Same bank but different bundles
735     if (conflictTimes == 0 && (bankSrcs[0] > 2 || bankSrcs[1] > 2))
736     {
737         conflictTimes++;
738         sameBankConflicts++;
739     }
740     else if (bankSrcs[0] > 2 || bankSrcs[1] > 2)
741     {
742         sameBankConflicts++;
743     }
744 
745     output << "}, ";
746 
747     return conflictTimes;
748 }
749 
750 /*
751 * Xe BC evaluation
752 * All read suppression is GRF granularity based.
753 * Read suppression only happens between or within a physical instruction not compressed one. Compressed one will be split into physical instructions.
754 * Read suppression between instructions:
755 *     The read suppression mechanism is used to save the GRF register reading operations with a register cache in HW. The suppression we talked here
756 *     is the suppression between instructions. For each source operand slot, HW provide a GRF cache. With the cache, if the same GRF will be read in
757 *     the instruction, the read will not happen, the cached value will be used directly.
758 *     Note that:
759 *     1. Inter read suppression is the suppression cache based.
*     2. For a compressed instruction, read suppression covers 2 GRFs for src1 with DF and F type operands, and 1 GRF for src0 and src2.
761 *     3. The slot cache will be flushed if the buffered register is used as destination operand.
762 *
763 * Read suppression within a instruction:
764 *     1. Works for all source operands.
765 *     2. intra suppression is the GRF read operation based(no read no suppression).
766 */
emitBankConflictXe(std::ostream & os_output,const G4_INST * inst,int * suppressRegs,int & sameConflictTimes,int & twoSrcConflicts,int & simd16RS,bool zeroOne,bool isTGLLP,bool hasReducedBundles)767 uint32_t G4_BB::emitBankConflictXe(
768     std::ostream& os_output, const G4_INST *inst,
769     int *suppressRegs,
770     int &sameConflictTimes, int &twoSrcConflicts,
771     int &simd16RS, bool zeroOne, bool isTGLLP, bool hasReducedBundles)
772 {
773     std::stringstream output;
774 
775     parent->XeBCStats.addSIMD8();
776 
777     if (inst->isSend() || inst->isMath() ||
778         inst->isSWSBSync() ||
779         inst->isWait() ||
780         inst->isReturn() || inst->isCall())
781     { //Flush
782         for (int i = 0; i < 4; i++)
783         {
784             setInValidReg(suppressRegs[i]);
785         }
786         return 0;
787     }
788 
789     int currInstRegs[2][G4_MAX_SRCS];
790     int readRegs[2][G4_MAX_SRCS];
791     int currInstExecSize[G4_MAX_SRCS] = {0};
792     int firstRegCandidate[G4_MAX_SRCS];
793     int secondRegCandidate[G4_MAX_SRCS];
794     int candidateNum = 0;
795     int dstExecSize = 0;
796     int dstRegs[2];
797 
798     for (int i = 0; i < G4_MAX_SRCS; i++)
799     {
800         setInValidReg(firstRegCandidate[i]);
801         setInValidReg(secondRegCandidate[i]);
802         setInValidReg(currInstRegs[0][i]);
803         setInValidReg(currInstRegs[1][i]);
804         setInValidReg(readRegs[0][i]);
805         setInValidReg(readRegs[1][i]);
806     }
807     setInValidReg(dstRegs[0]);
808     setInValidReg(dstRegs[1]);
809 
810     bool isCompressedInst = false;
811     bool isLastInstCompressed = suppressRegs[4] == 1;
812     bool isFDFSrc1 = false;
813     bool isSrc1Suppressed = false;
814 
815     //Get Dst
816     G4_DstRegRegion* dstOpnd = inst->getDst();
817     if (dstOpnd &&
818         !dstOpnd->isIndirect() &&
819         dstOpnd->isGreg())
820     {
821         dstExecSize = dstOpnd->getLinearizedEnd() - dstOpnd->getLinearizedStart() + 1;
822         uint32_t byteAddress = dstOpnd->getLinearizedStart();
823         dstRegs[0] = byteAddress / numEltPerGRF<Type_UB>();
824         if (dstExecSize > getGRFSize())
825         {
826             dstRegs[1] = dstRegs[0] + (dstExecSize + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
827             isCompressedInst = true;
828         }
829     }
830 
831     //Get src
832     for (int i = 0; i < inst->getNumSrc(); i++)
833     {
834         setInValidReg(currInstRegs[0][i]);
835         setInValidReg(currInstRegs[1][i]);
836         G4_Operand * srcOpnd = inst->getSrc(i);
837         if (srcOpnd)
838         {
839             if (srcOpnd->isSrcRegRegion() &&
840                 srcOpnd->asSrcRegRegion()->getBase() &&
841                 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
842             {
843                 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
844                 currInstExecSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart() + 1;
845                 if (baseVar->isGreg()) {
846                     uint32_t byteAddress = srcOpnd->getLinearizedStart();
847                     currInstRegs[0][i] = byteAddress / numEltPerGRF<Type_UB>();
848                     if (i == 1)
849                     {
850                         isFDFSrc1 = IS_TYPE_F32_F64(srcOpnd->getType());
851                     }
852                     if (currInstExecSize[i] > getGRFSize())
853                     {
854                         currInstRegs[1][i] = currInstRegs[0][i] + 1;
855                         isCompressedInst = true;
856                     }
857                     else //Read suppression will be handled later
858                     {
859                         currInstRegs[1][i] = currInstRegs[0][i];
860                     }
861                 }
862             }
863         }
864     }
865 
866     if (isCompressedInst)
867     {
868         parent->XeBCStats.addSIMD8();
869     }
870 
871     //Kill previous read suppression candiadte if it wrote in DST
872     if (isValidReg(dstRegs[0]))
873     {
874         for (int i = 0; i < 4; i++)
875         {
876             if (suppressRegs[i] == dstRegs[0])
877             {
878                 setInValidReg(suppressRegs[i]);
879             }
880         }
881     }
882 
883     //Read Suppression from previous instruction
884     //Keep suppressRegs, if suppression happen
885     //Update suppression, and registers to be read.
886     // inst1: mad(8)   r10, r20, r20, r40
887     // inst2: mad(8)   r10, r30, r20, r50
888     // the suppression of r20 inst2 will happen
889     output << " R{";
890     for (int i = 0; i < 3; i++)
891     {
892         //Read suppression for src0, src1 and src2
893         if (isValidReg(suppressRegs[i]) &&
894             currInstRegs[0][i] == suppressRegs[i])
895         {
896             setInValidReg(currInstRegs[0][i]);
897             if (isCompressedInst &&
898                 isLastInstCompressed && //Two GRF operand instructions
899                 isFDFSrc1 &&
900                 i == 1)
901             {
902                 setInValidReg(currInstRegs[1][i]);
903                 isSrc1Suppressed = true;
904             }
905             output << "r" << suppressRegs[i] << ",";
906         }
907         else
908         {
909             suppressRegs[i] = currInstRegs[0][i];
910         }
911     }
912     output << "}";
913 
914     //Intra suppression for the first GRF
915     //Inter and intra will happen only once, if inter happen, intra wouldn't read
916     //Such as in following case, the src1 r20 of inst2 need be read because src0 r20 of inst2 is suppressed
917     // inst1: mad(8)   r10, r20, r30, r40
918     // inst2: mad(8)   r10, r20, r20, r50
919     // for this case, currInstRegs[0][j] is updated to invalid in this case because inter is handled first.
920     output << " IR{";
921     for (int i = 0; i < inst->getNumSrc(); i++)
922     {
923         if (isValidReg(currInstRegs[0][i]))
924         {
925             for (int k = 0; k < G4_MAX_SRCS; k++)
926             {
927                 if (isValidReg(readRegs[0][k]) && readRegs[0][k] == currInstRegs[0][i])
928                 {
929                     setInValidReg(currInstRegs[0][i]);
930                     output << "r" << readRegs[0][k] << ",";
931                 }
932             }
933             readRegs[0][i] = currInstRegs[0][i];
934         }
935     }
936     output << "}";
937 
938     suppressRegs[4] = isCompressedInst ? 1 : 0;
939 
940     int conflictTimes = 0;
941     for (int i = 0; i < 3; i++)
942     {
943         if (isValidReg(currInstRegs[0][i]))
944         {
945             firstRegCandidate[candidateNum] = currInstRegs[0][i];
946             candidateNum++;
947         }
948     }
949 
950     //Get the bank conflict for the first GRF instruction.
951     if (candidateNum > 1)
952     {
953         conflictTimes = getConflictTimesForTGL(output, firstRegCandidate, sameConflictTimes, zeroOne, isTGLLP, hasReducedBundles);
954         if (candidateNum == 2)
955         {
956             twoSrcConflicts += conflictTimes;
957         }
958     }
959 
960     if (isCompressedInst)
961     {
962         if (isValidReg(dstRegs[1]))
963         {
964             for (int i = 0; i < 4; i++)
965             {
966                 if (suppressRegs[i] == dstRegs[1])
967                 {
968                     //Should be no real overlap, only GRF level overlap may happen
969                     setInValidReg(suppressRegs[i]);
970                 }
971             }
972         }
973 
974         output << " R{";
975         //Inter for the second instruction
976         for (int i = 0; i < 3; i++)
977         {
978             if (isSrc1Suppressed)
979             {
980                 continue;
981             }
982             //Read suppression for src0, src1 and src2
983             if (isValidReg(suppressRegs[i]) &&
984                 currInstRegs[1][i] == suppressRegs[i])
985             {
986                 setInValidReg(currInstRegs[1][i]);
987                 output << "r" << suppressRegs[i] << ",";
988             }
989             else
990             {
991                 suppressRegs[i] = currInstRegs[1][i];
992             }
993         }
994         output << "}";
995 
996         output << " IR{";
997         //Intra suppression for the second instruction
998         for (int i = 0; i < inst->getNumSrc(); i++)
999         {
1000             if (isValidReg(currInstRegs[1][i]))
1001             {
1002                 for (int k = 0; k < G4_MAX_SRCS; k++)
1003                 {
1004                     if (isValidReg(readRegs[1][k]) && readRegs[1][k] == currInstRegs[1][i])
1005                     {
1006                         setInValidReg(currInstRegs[1][i]);
1007                         output << "r" << readRegs[1][k] << ",";
1008                     }
1009                 }
1010                 readRegs[1][i] = currInstRegs[1][i];
1011             }
1012         }
1013         output << "}";
1014 
1015         candidateNum = 0;
1016         //For SIMD8, if any GRF0 of src1 or src2 of inst1 is GRF register
1017         for (int i = 0; i < 3; i++)
1018         {
1019             if (isValidReg(currInstRegs[1][i]))
1020             {
1021                 secondRegCandidate[candidateNum] = currInstRegs[1][i];
1022                 candidateNum++;
1023             }
1024         }
1025 
1026         if (candidateNum > 1)
1027         {
1028             int c = 0;
1029             c = getConflictTimesForTGL(output, secondRegCandidate, sameConflictTimes, zeroOne, isTGLLP, false);
1030             conflictTimes += c;
1031             if (candidateNum == 2)
1032             {
1033                 twoSrcConflicts += c;
1034             }
1035             if (currInstExecSize[0] <= 16 || currInstExecSize[1] <= 16 || currInstExecSize[2] <= 16)
1036             {
1037                 simd16RS += c;
1038             }
1039         }
1040     }
1041 
1042     if (conflictTimes != 0 || parent->builder->getOption(vISA_DumpAllBCInfo))
1043     {
1044         output << " {";
1045         output << "BC=";
1046         output << conflictTimes;
1047         output << "}";
1048         os_output  << output.str();
1049     }
1050 
1051     return conflictTimes;
1052 } // emitBankConflictXe
1053 
hasInternalConflict(IR_Builder * builder,int reg1,int reg2)1054 static bool hasInternalConflict(IR_Builder *builder, int reg1, int reg2)
1055 {
1056     int bundleID1 = (reg1 % 16) / 2;
1057     int bankID1 = reg1 % 2;
1058     int bundleID2 = (reg2 % 16) / 2;
1059     int bankID2 = reg2 % 2;
1060 
1061     if (builder->hasTwoGRFBank16Bundles())
1062     {
1063         bundleID1 = (reg1 % 64) / 4;
1064         bankID1 = (reg1 % 4) / 2;
1065         bundleID2 = (reg2 % 64) / 4;
1066         bankID2 = (reg2 % 4) / 2;
1067     }
1068 
1069     if (builder->hasOneGRFBank16Bundles())
1070     {
1071         bundleID1 = (reg1 % 64) / 4;
1072         bankID1 = reg1 % 2;
1073         bundleID2 = (reg2 % 64) / 4;
1074         bankID2 = reg2 % 2;
1075     }
1076 
1077     return ((bankID1 == bankID2) && (bundleID1 == bundleID2));
1078 }
1079 
/*
* In XeLP, there are 8 bundles and 2 banks per HW thread.
* Banks are divided according to EVEN / ODD of register index: 0101010101010101
* There are 8 bundles per 16 registers : 0011223344556677
* For two adjacent instructions : inst1 and inst2, inst1_src1(, inst1_src2) and inst2_src0 will be read in the same cycle
* The HW swap and read suppression mechanisms are taken into account.
* HW swap :
* The original GRF register reading sequence for a three source instruction is : src0 in cycle0 and src1 and src2 in cycle2.
* The HW swap mechanism detects the conflict between src1 and src2; if there is a conflict, HW will read src1 in cycle0 and src0 and src2 in cycle1.
* Note that :
* 1. for SIMD16, HW swap only happens when detecting conflicts in the first simd8's registers. A conflict in the second simd8 will not trigger a swap.
* 2. for SIMD16, when swapping happens, the src1 and src0 of both simd8 instructions will be swapped.
*/
// Counts the GRF read bank conflicts of `inst` for XeLP and, when at least one
// conflict is found, appends a textual annotation to os_output.
//   os_output         - receives the annotation (" R{..}", " S{..}", " {BC=n}");
//                       written only when the returned count is non-zero.
//   inst              - instruction being analyzed.
//   suppressRegs      - in/out: read-suppression candidates carried across calls;
//                       entries 1 and 2 (src1/src2) are compared and updated here,
//                       entries 0..2 are invalidated on a flush.
//   lastRegs          - in/out: src1/src2 registers (entries 1 and 2) of the
//                       previously analyzed instruction; entries 0..2 are
//                       invalidated on a flush.
//   sameConflictTimes - forwarded to getConflictTimesForTGLLP.
//   twoSrcConflicts   - incremented by the conflicts found when exactly two
//                       candidate registers were compared.
//   simd16RS          - incremented by the second-half conflicts when the size
//                       check at the end of the function passes.
// Returns the number of conflicts counted for this instruction (0 after a flush).
uint32_t G4_BB::emitBankConflictXeLP(
    std::ostream& os_output, const G4_INST *inst,
    int *suppressRegs, int *lastRegs,
    int &sameConflictTimes, int &twoSrcConflicts, int &simd16RS)
{
    // Annotation text is accumulated here and only copied to os_output at the
    // end if a conflict was actually counted.
    std::stringstream output;

    // Every instruction is counted once, including the ones flushed below.
    parent->XeBCStats.addSIMD8();

    if (inst->isSend() ||
        inst->isMath() ||
        inst->isSWSBSync() ||
        inst->isWait() ||
        inst->isReturn() ||
        inst->isCall())
    { //Flush: these instructions reset the tracked state and report no conflict
        for (int i = 0; i < 3; i++)
        {
            setInValidReg(suppressRegs[i]);
            setInValidReg(lastRegs[i]);
        }
        return 0;
    }

    // currInstRegs[0][i] / currInstRegs[1][i]: first / last GRF touched by source i.
    int currInstRegs[2][G4_MAX_SRCS];
    // Linearized size (end - start + 1) of each source operand.
    int currInstExecSize[G4_MAX_SRCS] = {0};
    // Registers checked against each other for the first / second half.
    int firstRegCandidate[G4_MAX_SRCS];
    int secondRegCandidate[G4_MAX_SRCS];
    int candidateNum = 0;
    int dstExecSize = 0;
    // First / last GRF written by the destination.
    int dstRegs[2];

    for (int i = 0; i < G4_MAX_SRCS; i++)
    {
        setInValidReg(firstRegCandidate[i]);
        setInValidReg(secondRegCandidate[i]);
        setInValidReg(currInstRegs[0][i]);
        setInValidReg(currInstRegs[1][i]);
    }
    setInValidReg(dstRegs[0]);
    setInValidReg(dstRegs[1]);

    // A conflict with the previous instruction is only possible if it left
    // a valid src1 or src2 register behind.
    bool conflictWithPrevInst = true;
    if (!isValidReg(lastRegs[1]) && !isValidReg(lastRegs[2]))
    {
        conflictWithPrevInst = false;
    }

    //Get the registers of the previous instruction
    //if there is potential to conflict with it
    if (conflictWithPrevInst)
    {
        if (isValidReg(lastRegs[1]))
        {
            firstRegCandidate[candidateNum] = lastRegs[1];
            candidateNum++;
        }
        if (isValidReg(lastRegs[2]))
        {
            firstRegCandidate[candidateNum] = lastRegs[2];
            candidateNum++;
        }
    }

    // Set when the destination or any source spans more than one GRF.
    bool instSplit = false;

    //Get Dst
    G4_DstRegRegion* dstOpnd = inst->getDst();
    if (dstOpnd &&
        !dstOpnd->isIndirect() &&
        dstOpnd->isGreg())
    {
        dstExecSize = dstOpnd->getLinearizedEnd() - dstOpnd->getLinearizedStart() + 1;
        uint32_t byteAddress = dstOpnd->getLinearizedStart();
        dstRegs[0] = byteAddress / numEltPerGRF<Type_UB>();
        if (dstExecSize > getGRFSize())
        {
            // Last GRF covered: first GRF + ceil(size / GRF size) - 1.
            dstRegs[1] = dstRegs[0] + (dstExecSize + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
            instSplit = true;

        }
    }

    // Collect the first (and, when present, second-half) GRF of every source
    // that is a GRF register region.
    for (int i = 0; i < inst->getNumSrc(); i++)
    {
        setInValidReg(currInstRegs[0][i]);
        setInValidReg(currInstRegs[1][i]);
        G4_Operand * srcOpnd = inst->getSrc(i);
        if (srcOpnd)
        {
            if (srcOpnd->isSrcRegRegion() &&
                srcOpnd->asSrcRegRegion()->getBase() &&
                srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
            {
                G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
                currInstExecSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart() + 1;
                if (baseVar->isGreg()) {
                    uint32_t byteAddress = srcOpnd->getLinearizedStart();
                    currInstRegs[0][i] = byteAddress / numEltPerGRF<Type_UB>();

                    if (currInstExecSize[i] > getGRFSize())
                    {
                        // Source crosses a GRF boundary: record the last GRF it covers.
                        currInstRegs[1][i] = currInstRegs[0][i] + (currInstExecSize[i] + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
                        instSplit = true;
                    }
                    else if (srcOpnd->asSrcRegRegion()->isScalar()) //No Read suppression for SIMD 16/scalar src
                    {
                        // A scalar source uses the same GRF for the second half.
                        currInstRegs[1][i] = currInstRegs[0][i];
                    }
                    else
                    {
                        setInValidReg(currInstRegs[1][i]);
                    }
                }
            }
        }
    }

    // A split instruction is counted a second time in the statistics.
    if (instSplit)
    {
        parent->XeBCStats.addSIMD8();
    }

    //Read Suppression for current instruction
    // A src1/src2 register equal to its suppression candidate is dropped from
    // the conflict check; otherwise it becomes the new candidate.
    output << " R{";
    for (int i = 1; i < 3; i++)
    {
        if (isValidReg(suppressRegs[i]) &&
            currInstRegs[0][i] == suppressRegs[i])
        {
            setInValidReg(currInstRegs[0][i]);
            output << "r" << suppressRegs[i] << ",";
        }
        else
        {
            suppressRegs[i] = currInstRegs[0][i];
        }
    }
    output << "}";

    //Kill all previous read suppression candidates if DST wrote to them
    if (isValidReg(dstRegs[0]))
    {
        for (int i = 1; i < 3; i++)
        {
            if (suppressRegs[i] == dstRegs[0])
            {
                setInValidReg(suppressRegs[i]);
            }
        }
    }

    bool swap = false;
    // SWAP: has lower priority than read suppression
    // For SIMD16, the SWAP is triggered by first register, but the second one will be swapped as well
    if (isValidReg(currInstRegs[0][0]) && isValidReg(currInstRegs[0][1]) && isValidReg(currInstRegs[0][2]) &&
        hasInternalConflict(parent->builder, currInstRegs[0][1], currInstRegs[0][2]))
    {
        // src1 and src2 collide: exchange the src0 and src1 registers.
        int tmpReg = currInstRegs[0][1];
        currInstRegs[0][1] = currInstRegs[0][0];
        currInstRegs[0][0] = tmpReg;
        output << " S{r" << currInstRegs[0][1] << ", r" << currInstRegs[0][0] << "} ";
        swap = true;
    }


    // No suppression, update the suppressRegs[0] for XeLP
    // suppressRegs[1], suppressRegs[2] will be updated with next instruction

    // src1 and src2 will be read with src0 of next instruction
    // (the previous instruction's values were already copied into
    // firstRegCandidate above, so overwriting lastRegs here is safe)
    lastRegs[1] = currInstRegs[0][1];
    lastRegs[2] = currInstRegs[0][2];
    //Conflict with previous instruction
    int conflictTimes = 0;
    if (conflictWithPrevInst)
    {

        if (isValidReg(currInstRegs[0][0]))
        {
            firstRegCandidate[candidateNum] = currInstRegs[0][0];
            candidateNum++;
        }
        if (candidateNum > 1)
        {
            conflictTimes = getConflictTimesForTGLLP(output, firstRegCandidate, sameConflictTimes);
            if (candidateNum == 2)
            {
                twoSrcConflicts += conflictTimes;
            }
        }
    }

    // Second half of a split instruction: repeat suppression, swap and the
    // conflict check using the second-half registers.
    if (instSplit)
    {
        output << " R{";
        for (int i = 1; i < 3; i++)
        {
            if (isValidReg(suppressRegs[i]) &&
                currInstRegs[1][i] == suppressRegs[i])
            {
                setInValidReg(currInstRegs[1][i]);
                output << "r" << suppressRegs[i] << ",";
            }
            else
            {
                suppressRegs[i] = currInstRegs[1][i];
            }
        }
        output << "}";

        // Suppression candidates overwritten by the second destination GRF are killed.
        if (isValidReg(dstRegs[1]))
        {
            for (int i = 1; i < 3; i++)
            {
                if (suppressRegs[i] == dstRegs[1])
                {
                    setInValidReg(suppressRegs[i]);
                }
            }
        }

        // The second half is swapped only if the first half triggered a swap.
        if (swap && isValidReg(currInstRegs[1][0]) && isValidReg(currInstRegs[1][1]) && isValidReg(currInstRegs[1][2]))
        {
            int tmpReg = currInstRegs[1][0];
            currInstRegs[1][0] = currInstRegs[1][1];
            currInstRegs[1][1] = tmpReg;
            output << " S{r" << currInstRegs[1][1] << ", r" << currInstRegs[1][0] << "} ";
        }

        candidateNum = 0;
        //For SIMD8, if any GRF0 of src1 or src2 of inst1 is GRF register
        // lastRegs now holds the first-half src1/src2 of this instruction.
        if (isValidReg(lastRegs[1])) // && lastRegs[1] != suppressRegs[1])
        {
            secondRegCandidate[candidateNum] = lastRegs[1];
            candidateNum++;
        }
        if (isValidReg(lastRegs[2])) // && lastRegs[2] != suppressRegs[2])
        {
            secondRegCandidate[candidateNum] = lastRegs[2];
            candidateNum++;
        }

        if (isValidReg(currInstRegs[1][0]))
        {
            secondRegCandidate[candidateNum] = currInstRegs[1][0];
            candidateNum++;
        }

        // Hand the second-half src1/src2 over to the next instruction.
        lastRegs[1] = currInstRegs[1][1];
        lastRegs[2] = currInstRegs[1][2];


        if (candidateNum > 1)
        {
            int c = 0;
            c = getConflictTimesForTGLLP(output, secondRegCandidate, sameConflictTimes);
            conflictTimes += c;
            if (candidateNum == 2)
            {
                twoSrcConflicts += c;
            }
            // NOTE(review): entries of currInstExecSize that were never set stay 0
            // and therefore also satisfy `<= 16` - confirm this is intended.
            if (currInstExecSize[0] <= 16 || currInstExecSize[1] <= 16 || currInstExecSize[2] <= 16)
            {
                simd16RS += c;
            }
        }
    }

    // The annotation is emitted only when a conflict was counted.
    if (conflictTimes != 0)
    {
        output << " {";
        output << "BC=";
        output << conflictTimes;
        output << "}";
        os_output << output.str();
    }

    return conflictTimes;
} // emitBankConflictXeLP
1372 
countReadModifyWrite(std::ostream & output,const G4_INST * inst)1373 uint32_t G4_BB::countReadModifyWrite(std::ostream& output, const G4_INST *inst)
1374 {
1375     if (!inst->getDst() || inst->getDst()->isNullReg() ||
1376         inst->isSend() || inst->isDpas())
1377     {
1378         return 0;
1379     }
1380     auto dst = inst->getDst();
1381     auto dstTy = dst->getType();
1382     if (TypeSize(dstTy) == 1 && dst->getHorzStride() > 1)
1383     {
1384         return 1;
1385     }
1386     return 0;
1387 }
1388 
getLabel()1389 G4_Label * G4_BB::getLabel()
1390 {
1391     //FIXME: For now not all BBs will start with a label (e.g.,
1392     //a block that follows a call).  We should fix it by getting rid
1393     //of the g4_label instruction and associate each label with a BB
1394     if (instList.size() > 0 && instList.front()->isLabel())
1395     {
1396         return instList.front()->getLabel();
1397     }
1398     return NULL;
1399 }
1400 
getFirstInst()1401 G4_INST * G4_BB::getFirstInst()
1402 {
1403     G4_INST *firstInst = nullptr;
1404     if (instList.size() > 0)
1405     {
1406         INST_LIST_ITER I = instList.begin();
1407         firstInst = *I;
1408         if (firstInst->isLabel())
1409         {
1410             // Only first inst can be label.
1411             ++I;
1412             firstInst = (I != instList.end()) ? *I : nullptr;
1413         }
1414     }
1415     return firstInst;
1416 }
1417 
getFirstInsertPos()1418 INST_LIST_ITER G4_BB::getFirstInsertPos()
1419 {
1420     INST_LIST_ITER II = begin();
1421     for(INST_LIST_ITER IB = end(); II != IB; ++II)
1422     {
1423         G4_INST* tI = (*II);
1424         if (tI->isLabel()
1425             || tI->opcode() == G4_join
1426             || tI->opcode() == G4_endif
1427             || tI->opcode() == G4_while)
1428         {
1429             continue;
1430         }
1431         break;
1432     }
1433     return II;
1434 }
1435 
1436 //
1437 //  Add an EOT send to the end of this BB.
1438 //
addEOTSend(G4_INST * lastInst)1439 void G4_BB::addEOTSend(G4_INST* lastInst)
1440 {
1441     // mov (8) r1.0<1>:ud r0.0<8;8,1>:ud {NoMask}
1442     // send (8) null r1 0x27 desc
1443     IR_Builder* builder = parent->builder;
1444     G4_Declare *dcl = builder->createSendPayloadDcl(numEltPerGRF<Type_UD>(), Type_UD);
1445     G4_DstRegRegion* movDst = builder->createDstRegRegion(dcl, 1);
1446     G4_SrcRegRegion* r0Src = builder->createSrcRegRegion(
1447         builder->getBuiltinR0(), builder->getRegionStride1());
1448     G4_INST *movInst = builder->createMov(
1449         G4_ExecSize(numEltPerGRF<Type_UD>()), movDst, r0Src, InstOpt_WriteEnable, false);
1450     if (lastInst)
1451     {
1452         movInst->inheritDIFrom(lastInst);
1453     }
1454     instList.push_back(movInst);
1455 
1456     auto EOT_SFID = builder->getEOTSFID();
1457 
1458     int exdesc = (0x1 << 5) + SFIDtoInt(EOT_SFID);
1459     // response len = 0, msg len = 1
1460     int desc = (0x1 << 25) + (0x1 << 4);
1461 
1462     G4_SrcRegRegion* sendSrc = builder->createSrcRegRegion(
1463         dcl, builder->getRegionStride1());
1464 
1465     G4_DstRegRegion *sendDst = builder->createNullDst(Type_UD);
1466 
1467     auto msgDesc = builder->createGeneralMsgDesc(desc, exdesc, SendAccess::WRITE_ONLY);
1468     G4_INST* sendInst = builder->createSendInst(
1469         NULL,
1470         G4_send,
1471         g4::SIMD8,
1472         sendDst,
1473         sendSrc,
1474         builder->createImm(desc, Type_UD),
1475         InstOpt_WriteEnable,
1476         msgDesc,
1477         false);
1478     sendInst->inheritDIFrom(movInst);
1479     instList.push_back(sendInst);
1480 
1481     if (builder->getHasNullReturnSampler() && VISA_WA_CHECK(builder->getPWaTable(), Wa_1607871015))
1482     {
1483         addSamplerFlushBeforeEOT();
1484     }
1485 }
1486 
getBBTypeStr() const1487 const char* G4_BB::getBBTypeStr() const
1488 {
1489     switch (getBBType()) {
1490     default:
1491         break;
1492     case G4_BB_CALL_TYPE:
1493         return "CALL";
1494     case G4_BB_RETURN_TYPE:
1495         return "RETURN";
1496     case G4_BB_INIT_TYPE:
1497         return "INIT";
1498     case G4_BB_EXIT_TYPE:
1499         return "EXIT";
1500     case G4_BB_NM_WA_TYPE:
1501         return "NoMaskWA";
1502     case G4_BB_FCALL_TYPE:
1503         return "FCALL";
1504     }
1505     return " ";
1506 }
1507 
dump() const1508 void G4_BB::dump() const
1509 {
1510     print(std::cerr);
1511 }
1512 
emitBbInfo(std::ostream & os) const1513 void G4_BB::emitBbInfo(std::ostream& os) const {
1514     // mustn't exceed a single line because it could be in asm output
1515     auto fmtBbId = [&](int bb) {
1516         std::stringstream ss;
1517         ss << "B" << std::setw(3) << std::setfill('0') << bb;
1518         return ss.str();
1519     };
1520     os << fmtBbId(getId()) << ":";
1521     bool first = true;
1522     auto maybeComma = [&]() {
1523         if (first)
1524             first = false;
1525         else
1526             os << ", ";
1527     };
1528     if (getBBType())
1529     {
1530         maybeComma();
1531         os << " [" << getBBTypeStr() << "]";
1532     }
1533     if (isDivergent())
1534     {
1535         maybeComma();
1536         os << " [inDivergent]";
1537     }
1538     auto emitBbSet = [&](const char *name, const BB_LIST &bbl) {
1539         maybeComma();
1540         os << " " << name << ":{";
1541         bool first = true;
1542         for (const auto &bb : bbl) {
1543             if (first) first = false; else os << ", ";
1544             os << fmtBbId(bb->getId());
1545         }
1546         os << "}";
1547     };
1548     emitBbSet("Preds", Preds);
1549     emitBbSet("Succs", Succs);
1550 }
1551 
print(std::ostream & OS) const1552 void G4_BB::print(std::ostream& OS) const
1553 {
1554     emitBbInfo(OS);
1555     OS << "\n";
1556     for (auto& x : instList)
1557         x->print(OS);
1558     OS << "\n";
1559 }
1560 
dumpDefUse(std::ostream & os) const1561 void G4_BB::dumpDefUse(std::ostream& os) const
1562 {
1563     for (auto& x : instList)
1564     {
1565         x->dump();
1566         if (x->def_size() > 0 || x->use_size() > 0)
1567         {
1568             x->dumpDefUse(os);
1569             os << "\n\n\n";
1570         }
1571     }
1572 }
1573 
resetLocalIds()1574 void G4_BB::resetLocalIds()
1575 {
1576     int i = 0;
1577 
1578     for (INST_LIST_ITER iter = instList.begin(), end = instList.end();
1579         iter != end;
1580         ++iter, ++i)
1581     {
1582         (*iter)->setLocalId(i);
1583     }
1584 }
1585 
removeIntrinsics(Intrinsic intrinId)1586 void G4_BB::removeIntrinsics(Intrinsic intrinId) {
1587     instList.remove_if([=](G4_INST* inst) {
1588         return inst->isIntrinsic() &&
1589             inst->asIntrinsicInst()->getIntrinsicId() == intrinId;
1590         });
1591 }
1592 
1593 
1594 // Add two sampler cache flushes before the EOT send.
1595 // sampler cache flush 1 must have null return
1596 // sampler cache flush 2 must have valid return
1597 // bb must end with an EOT send
addSamplerFlushBeforeEOT()1598 void G4_BB::addSamplerFlushBeforeEOT()
1599 {
1600     assert(isLastInstEOT() && "last instruction must be EOT");
1601     auto builder = parent->builder;
1602     int samplerFlushOpcode = 0x1F;
1603     int samplerFlushFC = (SamplerSIMDMode::SIMD32 << 17) +
1604         (samplerFlushOpcode << 12);
1605     // null return version
1606     {
1607         int desc = G4_SendDescRaw::createDesc(samplerFlushFC, true, 1, 0);
1608         G4_SrcRegRegion* sendMsgOpnd = builder->createSrcRegRegion(
1609             builder->getBuiltinR0(),
1610             builder->getRegionStride1());
1611 
1612         auto msgDesc = builder->createSyncMsgDesc(SFID::SAMPLER, desc);
1613         G4_INST* samplerFlushInst = builder->createSendInst(
1614             nullptr, G4_send, g4::SIMD8,
1615             builder->createNullDst(Type_UD), sendMsgOpnd,
1616             builder->createImm(desc, Type_UD),
1617             0, msgDesc, true);
1618         auto iter = std::prev(end());
1619         insert(iter, samplerFlushInst);
1620     }
1621 
1622     // valid return version
1623     {
1624         int desc = G4_SendDescRaw::createDesc(samplerFlushFC, true, 1, 1);
1625         G4_SrcRegRegion* sendMsgOpnd = builder->createSrcRegRegion(
1626             builder->getBuiltinR0(),
1627             builder->getRegionStride1());
1628         G4_Declare *tmpDest = builder->createTempVar(g4::SIMD8, Type_UD, GRFALIGN);
1629         tmpDest->setDoNotSpill();
1630         G4_DstRegRegion* sendMsgDst = builder->createDstRegRegion(tmpDest, 1);
1631         auto msgDesc = builder->createSyncMsgDesc(SFID::SAMPLER, desc);
1632         G4_INST* samplerFlushInst = builder->createSendInst(
1633             nullptr, G4_send, g4::SIMD8,
1634             sendMsgDst, sendMsgOpnd,
1635             builder->createImm(desc, Type_UD),
1636             0, msgDesc, true);
1637         auto iter = std::prev(end());
1638         insert(iter, samplerFlushInst);
1639     }
1640 }
1641 
dominates(G4_BB * other)1642 bool G4_BB::dominates(G4_BB* other)
1643 {
1644     return getParent().getDominator().dominates(this, other);
1645 }
1646