1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "G4_BB.hpp"
10 #include "BuildIR.h"
11 #include "LocalRA.h" // SECOND_HALF_BANK_START_GRF
12
13 #include <ostream>
14
15 using namespace vISA;
16
17
isSuccBB(G4_BB * succ) const18 bool G4_BB::isSuccBB(G4_BB* succ) const
19 {
20 for (auto it = Succs.begin(), bbEnd = Succs.end(); it != bbEnd; ++it)
21 {
22 if ((*it) == succ) return true;
23 }
24 return false;
25 }
26
getKernel() const27 G4_Kernel& G4_BB::getKernel() const
28 {
29 return *getParent().getKernel();
30 }
31
32 //
33 // to check if the last instruction in list is EOT
34 //
isLastInstEOT() const35 bool G4_BB::isLastInstEOT() const
36 {
37 if (instList.size() == 0)
38 {
39 return false;
40 }
41
42 G4_INST *i = instList.back();
43
44 if (parent->builder->hasSendShootdown())
45 {
46 // due to send shootdown, a predicated send may not actually be an EOT
47 return i->isEOT() && i->getPredicate() == NULL;
48 }
49 else
50 {
51 return i->isEOT();
52 }
53 }
54
getLastOpcode() const55 G4_opcode G4_BB::getLastOpcode() const
56 {
57 const G4_INST *i = instList.empty() ? nullptr : instList.back();
58 if (i)
59 {
60 return i->opcode();
61 }
62 else
63 {
64 return G4_illegal;
65 }
66 }
67
setId(unsigned i)68 void G4_BB::setId(unsigned i)
69 {
70 // some analysis passes rely on G4_BB id
71 if (id != i)
72 getParent().markStale();
73 id = i;
74 }
75
76
removePredEdge(G4_BB * pred)77 void G4_BB::removePredEdge(G4_BB* pred)
78 {
79 for (std::list<G4_BB*>::iterator it = Preds.begin(), bbEnd = Preds.end();
80 it != bbEnd; ++it)
81 {
82 if (*it != pred) continue;
83 // found
84 Preds.erase(it);
85 getParent().markStale();
86 return;
87 }
88 MUST_BE_TRUE(false, ERROR_FLOWGRAPH); // edge is not found
89 }
90
removeSuccEdge(G4_BB * succ)91 void G4_BB::removeSuccEdge(G4_BB* succ)
92 {
93 for (std::list<G4_BB*>::iterator it = Succs.begin(), bbEnd = Succs.end(); it != bbEnd; ++it)
94 {
95 if (*it != succ) continue;
96 // found
97 Succs.erase(it);
98 getParent().markStale();
99 return;
100 }
101 MUST_BE_TRUE(false, ERROR_FLOWGRAPH); // edge is not found
102 }
103
//
// Find the fall-through BB of the current block.
// If the last inst is an unconditional jump, then the target is not considered a fall-through BB.
// NOTE: Pay attention: this function only works after handleReturn(), due to the conditional CALL.
//
fallThroughBB()109 G4_BB * G4_BB::fallThroughBB()
110 {
111 G4_INST* last = (!instList.empty()) ? instList.back() : NULL;
112
113 if (last)
114 {
115 if (last->opcode() == G4_goto || last->opcode() == G4_join)
116 {
117 return nullptr;
118 }
119 if (last->isFlowControl())
120 {
121 // if No successor, return NULL;
122 if (Succs.empty())
123 {
124 return nullptr;
125 }
126
127 //
128 // Instructions Predicate-On Predicate-Off Num of Succ
129 // Jmpi Front None >=1
130 // CALL Front None >=2 considered the conditional call here
131 // while Front Front 2
132 // if, else Front Front 2
133 // break, cont Front None 1,2
134 // return Front None >=1
135 // do Front Front 1
136 // endif Front Front 1
137 if (last->isCall())
138 {
139 return BBAfterCall();
140 }
141 else if (!last->getPredicate() &&
142 // G4_while considered to fall trhu even without pred, since break jumps to while
143 (last->opcode() == G4_jmpi || last->opcode() == G4_break || last->opcode() == G4_cont || last->isReturn()))
144 {
145 return nullptr;
146 }
147 else
148 {
149 return Succs.front();
150 }
151 }
152 }
153
154 //
155 // process other cases
156 //
157 if (Succs.size() == 0) // exit BB
158 return NULL; // no fall-through BB
159 else
160 return Succs.front();
161 }
162
BBBeforeCall() const163 G4_BB * G4_BB::BBBeforeCall() const
164 {
165 assert((getBBType() & G4_BB_RETURN_TYPE) && "this must be a subroutine return BB");
166 return physicalPred;
167 }
168
BBAfterCall() const169 G4_BB * G4_BB::BBAfterCall() const
170 {
171 assert((getBBType() & G4_BB_CALL_TYPE) && "this must be a subroutine call BB");
172 return physicalSucc;
173 }
174
isAllLaneActive() const175 bool G4_BB::isAllLaneActive() const
176 {
177 G4_Kernel* pK = parent->getKernel();
178 if (pK->getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM && !isDivergent())
179 {
180 // CM: if BB isn't divergent, all lanes (32) must be active (dmask = 0xFFFFFFFF)
181 return true;
182 }
183 return false;
184 }
185
186
emit(std::ostream & output)187 void G4_BB::emit(std::ostream& output)
188 {
189 for (INST_LIST_ITER it = instList.begin(); it != instList.end(); ++it)
190 {
191 emitInstruction(output, it);
192 }
193 }
emitInstruction(std::ostream & output,INST_LIST_ITER & it)194 void G4_BB::emitInstruction(std::ostream& output, INST_LIST_ITER &it)
195 {
196 // prints out instruction line
197 if (!parent->getKernel()->getOptions()->getOption(vISA_disableInstDebugInfo))
198 {
199 emitInstructionSourceLineMapping(output, it);
200 }
201
202 emitBasicInstruction(output, it);
203
204 output << "\n";
205 }
emitBasicInstruction(std::ostream & output,INST_LIST_ITER & it)206 void G4_BB::emitBasicInstruction(std::ostream& output, INST_LIST_ITER &it)
207 {
208 if ((*it)->isSend())
209 {
210 //
211 // emit send instruction
212 //
213 G4_InstSend* SendInst = (*it)->asSendInst();
214 if( SendInst )
215 {
216 SendInst->emit_send(output);
217 SendInst->emit_send_desc(output);
218 }
219 }
220 else
221 {
222 //
223 // emit label and instruction
224 //
225 G4_INST *inst = *it;
226 inst->emit(output, parent->builder->getOption(vISA_SymbolReg));
227 if ((*it)->isLabel() == false)
228 {
229 emitBankConflict(output, inst);
230 }
231 }
232
233 }
emitBasicInstructionComment(std::ostream & output,INST_LIST_ITER & it,int * suppressRegs,int * lastRegs)234 void G4_BB::emitBasicInstructionComment(
235 std::ostream& output,
236 INST_LIST_ITER &it,
237 int *suppressRegs, int *lastRegs)
238 {
239 const G4_INST* inst = *it;
240
241 auto platform = inst->getPlatform();
242
243 if (!inst->isLabel() && inst->opcode() < G4_NUM_OPCODE)
244 {
245 output << " // ";
246
247 auto comments = inst->getComments();
248 if (!comments.empty()) {
249 output << " " << comments << "; ";
250 }
251 int vISAId = inst->getCISAOff();
252 if (vISAId != -1) {
253 output << "$" << vISAId;
254 }
255
256 if (getParent().getKernel()->getOption(vISA_DumpSBID))
257 {
258 int lexicalId = inst->getLexicalId();
259 if (lexicalId != -1) {
260 output << "&" << lexicalId;
261 }
262 }
263
264 if (getParent().getKernel()->getOption(vISA_DumpGenOffset) &&
265 inst->getBinInst())
266 {
267 output << ":%" << inst->getGenOffset();
268 }
269
270 if (getPlatformGeneration(platform) < PlatformGen::XE)
271 {
272 emitBankConflict(output, inst);
273 }
274 else
275 {
276 int sameBankConflicts = 0;
277 int twoSrcConflicts = 0;
278 int simd16SuppressionConflicts = 0;
279 unsigned BCNum = 0;
280 if (parent->builder->hasEarlyGRFRead())
281 {
282 BCNum = emitBankConflictXeLP(
283 output, inst, suppressRegs, lastRegs,
284 sameBankConflicts, twoSrcConflicts, simd16SuppressionConflicts);
285 }
286 else
287 {
288 BCNum = emitBankConflictXe(
289 output, inst, suppressRegs,
290 sameBankConflicts, twoSrcConflicts, simd16SuppressionConflicts,
291 parent->builder->hasOneGRFBank16Bundles(),
292 platform == GENX_TGLLP, parent->builder->has64bundleSize());
293 }
294 parent->XeBCStats.addBC(BCNum);
295 parent->XeBCStats.addSameBankBC(sameBankConflicts);
296 parent->XeBCStats.add2SrcBC(twoSrcConflicts);
297 parent->XeBCStats.addSimd16RSBC(simd16SuppressionConflicts);
298 parent->numRMWs += countReadModifyWrite(output, inst);
299 }
300 }
301
302 }
303
304 _THREAD const char* g4_prevFilename;
305 _THREAD int g4_prevSrcLineNo;
306
emitInstructionSourceLineMapping(std::ostream & output,INST_LIST_ITER & it)307 void G4_BB::emitInstructionSourceLineMapping(std::ostream& output, INST_LIST_ITER &it)
308 {
309 bool emitFile = false, emitLineNo = false;
310 const char* curFilename = (*it)->getSrcFilename();
311 int curSrcLineNo = (*it)->getLineNo();
312
313 if ((*it)->isLabel())
314 {
315 return;
316 }
317
318 if (curFilename && (g4_prevFilename == nullptr || strcmp(g4_prevFilename, curFilename) != 0))
319 {
320 emitFile = true;
321 }
322
323 if (g4_prevSrcLineNo != curSrcLineNo && curSrcLineNo != 0)
324 {
325 emitLineNo = true;
326 }
327
328 if (emitFile)
329 {
330 output << "\n// File: " << curFilename << "\n";
331 }
332
333 if (emitLineNo)
334 {
335 output << "\n// Line " << curSrcLineNo;
336 if (curFilename)
337 {
338 std::string curLine = parent->getKernel()->getDebugSrcLine(curFilename, curSrcLineNo);
339 if (!curLine.empty()) {
340 auto isNotSpace = [](int ch) { return !std::isspace(ch); };
341 curLine.erase(curLine.begin(), std::find_if(curLine.begin(), curLine.end(), isNotSpace));
342 curLine.erase(std::find_if(curLine.rbegin(), curLine.rend(), isNotSpace).base(), curLine.end());
343 output << ": " << curLine;
344 }
345 }
346 output << "\n";
347 }
348
349 if (emitFile)
350 {
351 g4_prevFilename = curFilename;
352 }
353
354 if (emitLineNo)
355 {
356 g4_prevSrcLineNo = curSrcLineNo;
357 }
358 }
359
emitBankConflict(std::ostream & output,const G4_INST * inst)360 void G4_BB::emitBankConflict(std::ostream& output, const G4_INST *inst)
361 {
362 int regNum[2][G4_MAX_SRCS];
363 int execSize[G4_MAX_SRCS];
364 int regSrcNum = 0;
365
366 if (inst->isDpas()) {
367 return;
368 }
369
370 if (inst->getNumSrc() == 3 && !inst->isSend())
371 {
372 for (unsigned i = 0; i < 3; i++)
373 {
374 G4_Operand * srcOpnd = inst->getSrc(i);
375 regNum[1][i] = -1;
376 if (srcOpnd)
377 {
378 if (srcOpnd->isSrcRegRegion() &&
379 srcOpnd->asSrcRegRegion()->getBase() &&
380 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
381 {
382 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
383 if (baseVar->isGreg()) {
384 uint32_t byteAddress = srcOpnd->getLinearizedStart();
385 if (byteAddress != 0) {
386 regNum[0][i] = byteAddress / numEltPerGRF<Type_UB>();
387 }
388 else {
389 // before RA, use the value in Greg directly
390 regNum[0][i] = baseVar->getPhyReg()->asGreg()->getRegNum();
391 }
392 regNum[1][i] = regNum[0][i];
393 regSrcNum++;
394 }
395 execSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart();
396 }
397 }
398 }
399 }
400
401 if (regSrcNum == 3)
402 {
403 int maxGRFNum = 0;
404 output << " {";
405 if (parent->builder->oneGRFBankDivision())
406 {//EVEN/ODD
407 for (int i = 0; i < 3; i++)
408 {
409 output << i << "=";
410 if (!(regNum[0][i] % 2) && regNum[0][i] < SECOND_HALF_BANK_START_GRF)
411 {
412 output << "EL, ";
413 }
414 if (regNum[0][i] % 2 && regNum[0][i] < SECOND_HALF_BANK_START_GRF)
415 {
416 output << "OL, ";
417 }
418 if (!(regNum[0][i] % 2) && regNum[0][i] >= SECOND_HALF_BANK_START_GRF)
419 {
420 output << "EH, ";
421 }
422 if (regNum[0][i] % 2 && regNum[0][i] >= SECOND_HALF_BANK_START_GRF)
423 {
424 output << "OH, ";
425 }
426 }
427 }
428 else
429 { //EVEN EVEN/ODD ODD
430 for (int i = 0; i < 3; i++)
431 {
432 output << i << "=";
433 for (int j = 0; j < (execSize[i] + (int)numEltPerGRF<Type_UB>() - 1) / (int)numEltPerGRF<Type_UB>(); j++)
434 {
435 int reg_num = regNum[0][i] + j;
436 if (!(reg_num & 0x02) && reg_num < SECOND_HALF_BANK_START_GRF)
437 {
438 output << "EL, ";
439 }
440 if ((reg_num & 0x02) && reg_num < SECOND_HALF_BANK_START_GRF)
441 {
442 output << "OL, ";
443 }
444 if (!(reg_num & 0x02) && reg_num >= SECOND_HALF_BANK_START_GRF)
445 {
446 output << "EH, ";
447 }
448 if ((reg_num & 0x02) && reg_num >= SECOND_HALF_BANK_START_GRF)
449 {
450 output << "OH, ";
451 }
452 if (j > 1)
453 {
454 regNum[1][i] = reg_num;
455 }
456 }
457 maxGRFNum = ((execSize[i] + (int)numEltPerGRF<Type_UB>() - 1) / (int)numEltPerGRF<Type_UB>()) > maxGRFNum ?
458 ((execSize[i] + (int)numEltPerGRF<Type_UB>() - 1) / (int)numEltPerGRF<Type_UB>()) : maxGRFNum;
459 }
460 }
461 output << "BC=";
462 if (!parent->builder->twoSourcesCollision())
463 {
464 if (!parent->builder->oneGRFBankDivision())
465 { //EVEN EVEN/ODD ODD
466 ASSERT_USER(maxGRFNum < 3, "Not supporting register size > 2");
467 if (maxGRFNum == 2)
468 {
469 for (int i = 0; i < maxGRFNum; i++)
470 {
471 if ((regNum[i][1] & 0x02) == (regNum[i][2] & 0x02))
472 {
473 if ((regNum[i][1] < SECOND_HALF_BANK_START_GRF &&
474 regNum[i][2] < SECOND_HALF_BANK_START_GRF) ||
475 (regNum[i][1] >= SECOND_HALF_BANK_START_GRF &&
476 regNum[i][2] >= SECOND_HALF_BANK_START_GRF))
477 {
478 parent->BCStats.addBad();
479 output << "BAD,";
480 }
481 else
482 {
483 parent->BCStats.addOK();
484 output << "OK,";
485 }
486 }
487 else
488 {
489 parent->BCStats.addGood();
490 output << "GOOD,";
491 }
492 }
493 }
494 else
495 {
496 for (int i = 0; i < maxGRFNum; i++)
497 {
498 if (((regNum[i][1] & 0x02) == (regNum[i][2] & 0x02)) &&
499 ((regNum[i][0] & 0x02) == (regNum[i][1] & 0x02)))
500 {
501 if ((regNum[i][0] < SECOND_HALF_BANK_START_GRF &&
502 regNum[i][1] < SECOND_HALF_BANK_START_GRF &&
503 regNum[i][2] < SECOND_HALF_BANK_START_GRF) ||
504 (regNum[i][0] >= SECOND_HALF_BANK_START_GRF &&
505 regNum[i][1] >= SECOND_HALF_BANK_START_GRF &&
506 regNum[i][2] >= SECOND_HALF_BANK_START_GRF))
507 {
508 parent->BCStats.addBad();
509 output << "BAD,";
510 }
511 else
512 {
513 parent->BCStats.addOK();
514 output << "OK,";
515 }
516 }
517 else
518 {
519 parent->BCStats.addGood();
520 output << "GOOD,";
521 }
522 }
523 }
524 }
525 else
526 { //EVEN/ODD
527 if ((regNum[0][1] % 2) != (regNum[0][2] % 2) ||
528 (regNum[0][0] % 2) != (regNum[0][1] % 2) ||
529 (regNum[0][1] == regNum[0][2]))
530 {
531 parent->BCStats.addGood();
532 output << "GOOD";
533 }
534 else
535 {
536 if ((regNum[0][0] < SECOND_HALF_BANK_START_GRF &&
537 regNum[0][1] < SECOND_HALF_BANK_START_GRF &&
538 regNum[0][2] < SECOND_HALF_BANK_START_GRF) ||
539 (regNum[0][0] >= SECOND_HALF_BANK_START_GRF &&
540 regNum[0][1] >= SECOND_HALF_BANK_START_GRF &&
541 regNum[0][2] >= SECOND_HALF_BANK_START_GRF))
542 {
543 parent->BCStats.addBad();
544 output << "BAD";
545 }
546 else
547 {
548 parent->BCStats.addOK();
549 output << "OK";
550 }
551 }
552 }
553 }
554 else //Two source
555 { // EVEN/ODD
556 if ((regNum[0][1] != regNum[0][2]) &&
557 ((regNum[0][1] % 2) == (regNum[0][2] % 2)))
558 {
559 if ((regNum[0][1] < SECOND_HALF_BANK_START_GRF &&
560 regNum[0][2] < SECOND_HALF_BANK_START_GRF) ||
561 (regNum[0][1] >= SECOND_HALF_BANK_START_GRF &&
562 regNum[0][2] >= SECOND_HALF_BANK_START_GRF))
563 {
564 parent->BCStats.addBad();
565 output << "BAD";
566 }
567 else
568 {
569 parent->BCStats.addOK();
570 output << "OK";
571 }
572 }
573 else
574 {
575 parent->BCStats.addGood();
576 output << "GOOD";
577 }
578 }
579 output << "}";
580 }
581 }
582
// A register slot holding -1 is "invalid" (no register); any other value is
// treated as a valid register number.
static bool isValidReg(int reg)
{
    const bool isInvalidMarker = (reg == -1);
    return !isInvalidMarker;
}
587
// Marks a register slot as invalid by storing the -1 sentinel that
// isValidReg() tests for. The slot is passed by reference and overwritten.
static void setInValidReg(int &reg)
{
    reg = -1;
}
592
getConflictTimesForTGLLP(std::ostream & output,int * firstRegCandidate,int & sameBankConflicts)593 static int getConflictTimesForTGLLP(
594 std::ostream& output, int *firstRegCandidate, int &sameBankConflicts)
595 {
596 int conflictTimes = 0;
597 int bundles[2][8];
598 int bankSrcs[2];
599
600 for (int i = 0; i < 2; i++)
601 {
602 for (int j = 0; j < 8; j++)
603 {
604 bundles[i][j] = -1;
605 }
606 bankSrcs[i] = 0;
607 }
608
609 output << "{";
610 for (int i = 0; i < G4_MAX_SRCS; i++)
611 {
612 if (isValidReg(firstRegCandidate[i]))
613 {
614 int bundleID = (firstRegCandidate[i] % 16) / 2;
615 int bankID = firstRegCandidate[i] % 2;
616
617 //Same bank and same bundle
618 if (bundles[bankID][bundleID] != -1)
619 {
620 conflictTimes++;
621 }
622
623 bundles[bankID][bundleID] = i;
624 bankSrcs[bankID]++;
625 if (bankID == 0)
626 {
627 output << "E:";
628 }
629 else
630 {
631 output << "O:";
632 }
633 output << bundleID << ",";
634 }
635 }
636
637 //Same bank but different bundles
638 if (conflictTimes == 0 &&
639 (bankSrcs[0] > 2 ||
640 bankSrcs[1] > 2))
641 {
642 conflictTimes++;
643 sameBankConflicts ++;
644 }
645 else if (bankSrcs[0] > 2 ||
646 bankSrcs[1] > 2)
647 {
648 sameBankConflicts ++;
649 }
650
651 output << "}, ";
652
653 return conflictTimes;
654 }
655
getConflictTimesForTGL(std::ostream & output,int * firstRegCandidate,int & sameBankConflicts,bool zeroOne,bool isTGLLP,bool reducedBundles)656 int G4_BB::getConflictTimesForTGL(
657 std::ostream& output, int *firstRegCandidate,
658 int &sameBankConflicts, bool zeroOne, bool isTGLLP, bool reducedBundles)
659 {
660 int conflictTimes = 0;
661 int bundles[2][16];
662 int bankSrcs[2];
663
664 for (int i = 0; i < 2; i++)
665 {
666 for (int j = 0; j < 16; j++)
667 {
668 bundles[i][j] = -1;
669 }
670 bankSrcs[i] = 0;
671 }
672
673 output << "{";
674 for (int i = 0; i < G4_MAX_SRCS; i++)
675 {
676 bool same_register = false;
677
678 if (isValidReg(firstRegCandidate[i]))
679 {
680 for (int j = 0; j < i; j++)
681 {
682 if (isValidReg(firstRegCandidate[j]) && j != i)
683 {
684 if (firstRegCandidate[j] == firstRegCandidate[i])
685 {
686 same_register = true;
687 break;
688 }
689 }
690 }
691
692 if (same_register)
693 {
694 continue;
695 }
696
697 int bundleID = (firstRegCandidate[i] % 64) / 4;
698 int bankID = (firstRegCandidate[i] % 4) / 2;
699 if (isTGLLP)
700 {
701 bankID = (firstRegCandidate[i]) % 2;
702 bundleID = (firstRegCandidate[i] % 16) / 2;
703 }
704 else if (zeroOne)
705 {
706 bankID = (firstRegCandidate[i]) % 2;
707 bundleID = (firstRegCandidate[i] % 32) / 2;
708 }
709
710 if(reducedBundles)
711 {
712 bundleID = (firstRegCandidate[i] % 16) / 2;
713 }
714 //Same bank and same bundle
715 if (bundles[bankID][bundleID] != -1) //Same bank and same bundle
716 {
717 conflictTimes++;
718 }
719
720 bundles[bankID][bundleID] = i;
721 bankSrcs[bankID]++;
722 if (bankID == 0)
723 {
724 output << "E:";
725 }
726 else
727 {
728 output << "O:";
729 }
730 output << bundleID << ",";
731 }
732 }
733
734 //Same bank but different bundles
735 if (conflictTimes == 0 && (bankSrcs[0] > 2 || bankSrcs[1] > 2))
736 {
737 conflictTimes++;
738 sameBankConflicts++;
739 }
740 else if (bankSrcs[0] > 2 || bankSrcs[1] > 2)
741 {
742 sameBankConflicts++;
743 }
744
745 output << "}, ";
746
747 return conflictTimes;
748 }
749
/*
 * Xe BC evaluation
 * All read suppression is GRF granularity based.
 * Read suppression only happens between or within physical instructions, not compressed ones. A compressed instruction will be split into physical instructions.
 * Read suppression between instructions:
 * The read suppression mechanism is used to save GRF register read operations by means of a register cache in HW. The suppression discussed here
 * is the suppression between instructions. For each source operand slot, HW provides a GRF cache. With the cache, if the same GRF is to be read by
 * the instruction, the read will not happen; the cached value will be used directly.
 * Note that:
 * 1. Inter read suppression is based on the suppression cache.
 * 2. For a compressed instruction, there is 2-GRF read suppression for src1 for DF and F type operands and 1-GRF read suppression for src0 and src2.
 * 3. The slot cache will be flushed if the buffered register is used as a destination operand.
 *
 * Read suppression within an instruction:
 * 1. Works for all source operands.
 * 2. Intra suppression is based on the GRF read operation (no read, no suppression).
 */
// Evaluates read suppression and bank conflicts for one instruction (the path
// used when the builder has no early GRF read) and returns the number of bank
// conflicts found.
//
// os_output         - receives the annotation text; it is written only when at
//                     least one conflict was found or vISA_DumpAllBCInfo is set.
// inst              - instruction being analysed.
// suppressRegs      - in/out state carried between calls. Entries 0..2 hold the
//                     per-source registers used for inter-instruction
//                     suppression, entries 0..3 are invalidated on a flush, and
//                     entry 4 records (1/0) whether this instruction was
//                     compressed. The array must hold at least 5 ints.
// sameConflictTimes - passed to getConflictTimesForTGL, which increments it when
//                     one bank serves more than two sources.
// twoSrcConflicts   - incremented by the conflicts of a half that has exactly
//                     two register candidates.
// simd16RS          - incremented by the second half's conflicts when any of
//                     the first three source extents is <= 16.
// zeroOne, isTGLLP, hasReducedBundles
//                   - select the bank/bundle mapping in getConflictTimesForTGL.
//
// Side effects: parent->XeBCStats.addSIMD8() is called once per call and once
// more when the instruction is compressed.
emitBankConflictXe(std::ostream & os_output,const G4_INST * inst,int * suppressRegs,int & sameConflictTimes,int & twoSrcConflicts,int & simd16RS,bool zeroOne,bool isTGLLP,bool hasReducedBundles)767 uint32_t G4_BB::emitBankConflictXe(
768 std::ostream& os_output, const G4_INST *inst,
769 int *suppressRegs,
770 int &sameConflictTimes, int &twoSrcConflicts,
771 int &simd16RS, bool zeroOne, bool isTGLLP, bool hasReducedBundles)
772 {
// Text is buffered here and only copied to os_output at the end.
773 std::stringstream output;
774
775 parent->XeBCStats.addSIMD8();
776
// These instruction kinds drop all suppression state and report no conflicts.
777 if (inst->isSend() || inst->isMath() ||
778 inst->isSWSBSync() ||
779 inst->isWait() ||
780 inst->isReturn() || inst->isCall())
781 { //Flush
782 for (int i = 0; i < 4; i++)
783 {
784 setInValidReg(suppressRegs[i]);
785 }
786 return 0;
787 }
788
// Index [0][i] is the first GRF of source i, [1][i] the GRF used for the
// second half of a compressed instruction.
789 int currInstRegs[2][G4_MAX_SRCS];
// Registers already read by earlier sources of this same instruction
// (used for intra-instruction suppression).
790 int readRegs[2][G4_MAX_SRCS];
791 int currInstExecSize[G4_MAX_SRCS] = {0};
792 int firstRegCandidate[G4_MAX_SRCS];
793 int secondRegCandidate[G4_MAX_SRCS];
794 int candidateNum = 0;
795 int dstExecSize = 0;
796 int dstRegs[2];
797
798 for (int i = 0; i < G4_MAX_SRCS; i++)
799 {
800 setInValidReg(firstRegCandidate[i]);
801 setInValidReg(secondRegCandidate[i]);
802 setInValidReg(currInstRegs[0][i]);
803 setInValidReg(currInstRegs[1][i]);
804 setInValidReg(readRegs[0][i]);
805 setInValidReg(readRegs[1][i]);
806 }
807 setInValidReg(dstRegs[0]);
808 setInValidReg(dstRegs[1]);
809
810 bool isCompressedInst = false;
// suppressRegs[4] was written by the previous call (see the assignment below).
811 bool isLastInstCompressed = suppressRegs[4] == 1;
812 bool isFDFSrc1 = false;
813 bool isSrc1Suppressed = false;
814
815 //Get Dst
816 G4_DstRegRegion* dstOpnd = inst->getDst();
817 if (dstOpnd &&
818 !dstOpnd->isIndirect() &&
819 dstOpnd->isGreg())
820 {
821 dstExecSize = dstOpnd->getLinearizedEnd() - dstOpnd->getLinearizedStart() + 1;
822 uint32_t byteAddress = dstOpnd->getLinearizedStart();
823 dstRegs[0] = byteAddress / numEltPerGRF<Type_UB>();
// A destination wider than one GRF makes the instruction compressed;
// dstRegs[1] is then the last GRF the destination touches.
824 if (dstExecSize > getGRFSize())
825 {
826 dstRegs[1] = dstRegs[0] + (dstExecSize + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
827 isCompressedInst = true;
828 }
829 }
830
831 //Get src
832 for (int i = 0; i < inst->getNumSrc(); i++)
833 {
834 setInValidReg(currInstRegs[0][i]);
835 setInValidReg(currInstRegs[1][i]);
836 G4_Operand * srcOpnd = inst->getSrc(i);
837 if (srcOpnd)
838 {
839 if (srcOpnd->isSrcRegRegion() &&
840 srcOpnd->asSrcRegRegion()->getBase() &&
841 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
842 {
843 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
844 currInstExecSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart() + 1;
845 if (baseVar->isGreg()) {
846 uint32_t byteAddress = srcOpnd->getLinearizedStart();
847 currInstRegs[0][i] = byteAddress / numEltPerGRF<Type_UB>();
848 if (i == 1)
849 {
850 isFDFSrc1 = IS_TYPE_F32_F64(srcOpnd->getType());
851 }
852 if (currInstExecSize[i] > getGRFSize())
853 {
854 currInstRegs[1][i] = currInstRegs[0][i] + 1;
855 isCompressedInst = true;
856 }
857 else //Read suppression will be handled later
858 {
859 currInstRegs[1][i] = currInstRegs[0][i];
860 }
861 }
862 }
863 }
864 }
865
// A compressed instruction is counted as a second SIMD8 instruction.
866 if (isCompressedInst)
867 {
868 parent->XeBCStats.addSIMD8();
869 }
870
871 //Kill previous read suppression candiadte if it wrote in DST
872 if (isValidReg(dstRegs[0]))
873 {
874 for (int i = 0; i < 4; i++)
875 {
876 if (suppressRegs[i] == dstRegs[0])
877 {
878 setInValidReg(suppressRegs[i]);
879 }
880 }
881 }
882
883 //Read Suppression from previous instruction
884 //Keep suppressRegs, if suppression happen
885 //Update suppression, and registers to be read.
886 // inst1: mad(8) r10, r20, r20, r40
887 // inst2: mad(8) r10, r30, r20, r50
888 // the suppression of r20 inst2 will happen
889 output << " R{";
890 for (int i = 0; i < 3; i++)
891 {
892 //Read suppression for src0, src1 and src2
893 if (isValidReg(suppressRegs[i]) &&
894 currInstRegs[0][i] == suppressRegs[i])
895 {
896 setInValidReg(currInstRegs[0][i]);
897 if (isCompressedInst &&
898 isLastInstCompressed && //Two GRF operand instructions
899 isFDFSrc1 &&
900 i == 1)
901 {
902 setInValidReg(currInstRegs[1][i]);
903 isSrc1Suppressed = true;
904 }
905 output << "r" << suppressRegs[i] << ",";
906 }
907 else
908 {
909 suppressRegs[i] = currInstRegs[0][i];
910 }
911 }
912 output << "}";
913
914 //Intra suppression for the first GRF
915 //Inter and intra will happen only once, if inter happen, intra wouldn't read
916 //Such as in following case, the src1 r20 of inst2 need be read because src0 r20 of inst2 is suppressed
917 // inst1: mad(8) r10, r20, r30, r40
918 // inst2: mad(8) r10, r20, r20, r50
919 // for this case, currInstRegs[0][j] is updated to invalid in this case because inter is handled first.
920 output << " IR{";
921 for (int i = 0; i < inst->getNumSrc(); i++)
922 {
923 if (isValidReg(currInstRegs[0][i]))
924 {
925 for (int k = 0; k < G4_MAX_SRCS; k++)
926 {
927 if (isValidReg(readRegs[0][k]) && readRegs[0][k] == currInstRegs[0][i])
928 {
929 setInValidReg(currInstRegs[0][i]);
930 output << "r" << readRegs[0][k] << ",";
931 }
932 }
// Stores -1 when the source was suppressed just above.
933 readRegs[0][i] = currInstRegs[0][i];
934 }
935 }
936 output << "}";
937
// Remember for the next call whether this instruction was compressed.
938 suppressRegs[4] = isCompressedInst ? 1 : 0;
939
940 int conflictTimes = 0;
// Sources still valid here were neither inter- nor intra-suppressed.
941 for (int i = 0; i < 3; i++)
942 {
943 if (isValidReg(currInstRegs[0][i]))
944 {
945 firstRegCandidate[candidateNum] = currInstRegs[0][i];
946 candidateNum++;
947 }
948 }
949
950 //Get the bank conflict for the first GRF instruction.
951 if (candidateNum > 1)
952 {
953 conflictTimes = getConflictTimesForTGL(output, firstRegCandidate, sameConflictTimes, zeroOne, isTGLLP, hasReducedBundles);
954 if (candidateNum == 2)
955 {
956 twoSrcConflicts += conflictTimes;
957 }
958 }
959
// Second half of a compressed instruction: repeat the analysis on [1][i].
960 if (isCompressedInst)
961 {
962 if (isValidReg(dstRegs[1]))
963 {
964 for (int i = 0; i < 4; i++)
965 {
966 if (suppressRegs[i] == dstRegs[1])
967 {
968 //Should be no real overlap, only GRF level overlap may happen
969 setInValidReg(suppressRegs[i]);
970 }
971 }
972 }
973
974 output << " R{";
975 //Inter for the second instruction
976 for (int i = 0; i < 3; i++)
977 {
// When src1 was suppressed for both GRFs above, this loop does nothing
// for any source (the flag does not depend on i).
978 if (isSrc1Suppressed)
979 {
980 continue;
981 }
982 //Read suppression for src0, src1 and src2
983 if (isValidReg(suppressRegs[i]) &&
984 currInstRegs[1][i] == suppressRegs[i])
985 {
986 setInValidReg(currInstRegs[1][i]);
987 output << "r" << suppressRegs[i] << ",";
988 }
989 else
990 {
991 suppressRegs[i] = currInstRegs[1][i];
992 }
993 }
994 output << "}";
995
996 output << " IR{";
997 //Intra suppression for the second instruction
998 for (int i = 0; i < inst->getNumSrc(); i++)
999 {
1000 if (isValidReg(currInstRegs[1][i]))
1001 {
1002 for (int k = 0; k < G4_MAX_SRCS; k++)
1003 {
1004 if (isValidReg(readRegs[1][k]) && readRegs[1][k] == currInstRegs[1][i])
1005 {
1006 setInValidReg(currInstRegs[1][i]);
1007 output << "r" << readRegs[1][k] << ",";
1008 }
1009 }
1010 readRegs[1][i] = currInstRegs[1][i];
1011 }
1012 }
1013 output << "}";
1014
1015 candidateNum = 0;
1016 //For SIMD8, if any GRF0 of src1 or src2 of inst1 is GRF register
1017 for (int i = 0; i < 3; i++)
1018 {
1019 if (isValidReg(currInstRegs[1][i]))
1020 {
1021 secondRegCandidate[candidateNum] = currInstRegs[1][i];
1022 candidateNum++;
1023 }
1024 }
1025
1026 if (candidateNum > 1)
1027 {
1028 int c = 0;
// NOTE(review): reducedBundles is passed as false here, whereas the first
// half passes hasReducedBundles -- confirm this is intentional.
1029 c = getConflictTimesForTGL(output, secondRegCandidate, sameConflictTimes, zeroOne, isTGLLP, false);
1030 conflictTimes += c;
1031 if (candidateNum == 2)
1032 {
1033 twoSrcConflicts += c;
1034 }
1035 if (currInstExecSize[0] <= 16 || currInstExecSize[1] <= 16 || currInstExecSize[2] <= 16)
1036 {
1037 simd16RS += c;
1038 }
1039 }
1040 }
1041
// The buffered annotation is dropped unless a conflict exists or a full dump
// was requested.
1042 if (conflictTimes != 0 || parent->builder->getOption(vISA_DumpAllBCInfo))
1043 {
1044 output << " {";
1045 output << "BC=";
1046 output << conflictTimes;
1047 output << "}";
1048 os_output << output.str();
1049 }
1050
1051 return conflictTimes;
1052 } // emitBankConflictXe
1053
hasInternalConflict(IR_Builder * builder,int reg1,int reg2)1054 static bool hasInternalConflict(IR_Builder *builder, int reg1, int reg2)
1055 {
1056 int bundleID1 = (reg1 % 16) / 2;
1057 int bankID1 = reg1 % 2;
1058 int bundleID2 = (reg2 % 16) / 2;
1059 int bankID2 = reg2 % 2;
1060
1061 if (builder->hasTwoGRFBank16Bundles())
1062 {
1063 bundleID1 = (reg1 % 64) / 4;
1064 bankID1 = (reg1 % 4) / 2;
1065 bundleID2 = (reg2 % 64) / 4;
1066 bankID2 = (reg2 % 4) / 2;
1067 }
1068
1069 if (builder->hasOneGRFBank16Bundles())
1070 {
1071 bundleID1 = (reg1 % 64) / 4;
1072 bankID1 = reg1 % 2;
1073 bundleID2 = (reg2 % 64) / 4;
1074 bankID2 = reg2 % 2;
1075 }
1076
1077 return ((bankID1 == bankID2) && (bundleID1 == bundleID2));
1078 }
1079
/*
 * In XeLP, there are 8 bundles and 2 banks per HW thread.
 * Banks are divided according to EVEN / ODD of register index: 0101010101010101
 * There are 8 bundles per 16 registers : 0011223344556677
 * For two adjacent instructions : inst1 and inst2, inst1_src1(, inst1_src2) and inst2_src0 will be read in the same cycle.
 * The HW swap and read suppression mechanisms are taken into account.
 * HW swap :
 * The original GRF register reading sequence for a three source instruction is : src0 in cycle0 and src1 and src2 in cycle2.
 * The HW swap mechanism detects the conflict between src1 and src2; if there is a conflict, HW will read src1 in cycle0 and src0 and src2 in cycle1.
 * Note that :
 * 1. for SIMD16, HW swap only happens when detecting conflicts in the first simd8's registers. A conflict in the second simd8 will not trigger a swap.
 * 2. for SIMD16, when swapping happens, the src1 and src0 of both simd8 instructions will be swapped.
 */
emitBankConflictXeLP(std::ostream & os_output,const G4_INST * inst,int * suppressRegs,int * lastRegs,int & sameConflictTimes,int & twoSrcConflicts,int & simd16RS)1093 uint32_t G4_BB::emitBankConflictXeLP(
1094 std::ostream& os_output, const G4_INST *inst,
1095 int *suppressRegs, int *lastRegs,
1096 int &sameConflictTimes, int &twoSrcConflicts, int &simd16RS)
1097 {
1098 std::stringstream output;
1099
1100 parent->XeBCStats.addSIMD8();
1101
1102 if (inst->isSend() ||
1103 inst->isMath() ||
1104 inst->isSWSBSync() ||
1105 inst->isWait() ||
1106 inst->isReturn() ||
1107 inst->isCall())
1108 { //Flush
1109 for (int i = 0; i < 3; i++)
1110 {
1111 setInValidReg(suppressRegs[i]);
1112 setInValidReg(lastRegs[i]);
1113 }
1114 return 0;
1115 }
1116
1117 int currInstRegs[2][G4_MAX_SRCS];
1118 int currInstExecSize[G4_MAX_SRCS] = {0};
1119 int firstRegCandidate[G4_MAX_SRCS];
1120 int secondRegCandidate[G4_MAX_SRCS];
1121 int candidateNum = 0;
1122 int dstExecSize = 0;
1123 int dstRegs[2];
1124
1125 for (int i = 0; i < G4_MAX_SRCS; i++)
1126 {
1127 setInValidReg(firstRegCandidate[i]);
1128 setInValidReg(secondRegCandidate[i]);
1129 setInValidReg(currInstRegs[0][i]);
1130 setInValidReg(currInstRegs[1][i]);
1131 }
1132 setInValidReg(dstRegs[0]);
1133 setInValidReg(dstRegs[1]);
1134
1135 bool conflictWithPrevInst = true;
1136 if (!isValidReg(lastRegs[1]) && !isValidReg(lastRegs[2]))
1137 {
1138 conflictWithPrevInst = false;
1139 }
1140
1141 //Get the regsiters of previous instruction
1142 //If there is potentail to conflict with it
1143 if (conflictWithPrevInst)
1144 {
1145 if (isValidReg(lastRegs[1]))
1146 {
1147 firstRegCandidate[candidateNum] = lastRegs[1];
1148 candidateNum++;
1149 }
1150 if (isValidReg(lastRegs[2]))
1151 {
1152 firstRegCandidate[candidateNum] = lastRegs[2];
1153 candidateNum++;
1154 }
1155 }
1156
1157 bool instSplit = false;
1158
1159 //Get Dst
1160 G4_DstRegRegion* dstOpnd = inst->getDst();
1161 if (dstOpnd &&
1162 !dstOpnd->isIndirect() &&
1163 dstOpnd->isGreg())
1164 {
1165 dstExecSize = dstOpnd->getLinearizedEnd() - dstOpnd->getLinearizedStart() + 1;
1166 uint32_t byteAddress = dstOpnd->getLinearizedStart();
1167 dstRegs[0] = byteAddress / numEltPerGRF<Type_UB>();
1168 if (dstExecSize > getGRFSize())
1169 {
1170 dstRegs[1] = dstRegs[0] + (dstExecSize + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
1171 instSplit = true;
1172
1173 }
1174 }
1175
1176 for (int i = 0; i < inst->getNumSrc(); i++)
1177 {
1178 setInValidReg(currInstRegs[0][i]);
1179 setInValidReg(currInstRegs[1][i]);
1180 G4_Operand * srcOpnd = inst->getSrc(i);
1181 if (srcOpnd)
1182 {
1183 if (srcOpnd->isSrcRegRegion() &&
1184 srcOpnd->asSrcRegRegion()->getBase() &&
1185 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
1186 {
1187 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
1188 currInstExecSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart() + 1;
1189 if (baseVar->isGreg()) {
1190 uint32_t byteAddress = srcOpnd->getLinearizedStart();
1191 currInstRegs[0][i] = byteAddress / numEltPerGRF<Type_UB>();
1192
1193 if (currInstExecSize[i] > getGRFSize())
1194 {
1195 currInstRegs[1][i] = currInstRegs[0][i] + (currInstExecSize[i] + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
1196 instSplit = true;
1197 }
1198 else if (srcOpnd->asSrcRegRegion()->isScalar()) //No Read suppression for SIMD 16/scalar src
1199 {
1200 currInstRegs[1][i] = currInstRegs[0][i];
1201 }
1202 else
1203 {
1204 setInValidReg(currInstRegs[1][i]);
1205 }
1206 }
1207 }
1208 }
1209 }
1210
1211 if (instSplit)
1212 {
1213 parent->XeBCStats.addSIMD8();
1214 }
1215
1216 //Read Suppression for current instruction
1217 output << " R{";
1218 for (int i = 1; i < 3; i++)
1219 {
1220 if (isValidReg(suppressRegs[i]) &&
1221 currInstRegs[0][i] == suppressRegs[i])
1222 {
1223 setInValidReg(currInstRegs[0][i]);
1224 output << "r" << suppressRegs[i] << ",";
1225 }
1226 else
1227 {
1228 suppressRegs[i] = currInstRegs[0][i];
1229 }
1230 }
1231 output << "}";
1232
1233 //Kill all previous read suppression candiadte if it wrote in DST
1234 if (isValidReg(dstRegs[0]))
1235 {
1236 for (int i = 1; i < 3; i++)
1237 {
1238 if (suppressRegs[i] == dstRegs[0])
1239 {
1240 setInValidReg(suppressRegs[i]);
1241 }
1242 }
1243 }
1244
1245 bool swap = false;
1246 // SWAP: has lower proirity than read suppression
1247 // For SIMD16, the SWAP is triggered by first register, but the second one will be swapped as well
1248 if (isValidReg(currInstRegs[0][0]) && isValidReg(currInstRegs[0][1]) && isValidReg(currInstRegs[0][2]) &&
1249 hasInternalConflict(parent->builder, currInstRegs[0][1], currInstRegs[0][2]))
1250 {
1251 int tmpReg = currInstRegs[0][1];
1252 currInstRegs[0][1] = currInstRegs[0][0];
1253 currInstRegs[0][0] = tmpReg;
1254 output << " S{r" << currInstRegs[0][1] << ", r" << currInstRegs[0][0] << "} ";
1255 swap = true;
1256 }
1257
1258
1259 // No suppression, update the suppressRegs[0] for XeLP
1260 // suppressRegs[1], suppressRegs[2] will be updated with next instruction
1261
1262 // src1 and src2 will be read with src0 of next instruction
1263 lastRegs[1] = currInstRegs[0][1];
1264 lastRegs[2] = currInstRegs[0][2];
1265 //Conflict with previous instruction
1266 int conflictTimes = 0;
1267 if (conflictWithPrevInst)
1268 {
1269
1270 if (isValidReg(currInstRegs[0][0]))
1271 {
1272 firstRegCandidate[candidateNum] = currInstRegs[0][0];
1273 candidateNum++;
1274 }
1275 if (candidateNum > 1)
1276 {
1277 conflictTimes = getConflictTimesForTGLLP(output, firstRegCandidate, sameConflictTimes);
1278 if (candidateNum == 2)
1279 {
1280 twoSrcConflicts += conflictTimes;
1281 }
1282 }
1283 }
1284
1285 if (instSplit)
1286 {
1287 output << " R{";
1288 for (int i = 1; i < 3; i++)
1289 {
1290 if (isValidReg(suppressRegs[i]) &&
1291 currInstRegs[1][i] == suppressRegs[i])
1292 {
1293 setInValidReg(currInstRegs[1][i]);
1294 output << "r" << suppressRegs[i] << ",";
1295 }
1296 else
1297 {
1298 suppressRegs[i] = currInstRegs[1][i];
1299 }
1300 }
1301 output << "}";
1302
1303 if (isValidReg(dstRegs[1]))
1304 {
1305 for (int i = 1; i < 3; i++)
1306 {
1307 if (suppressRegs[i] == dstRegs[1])
1308 {
1309 setInValidReg(suppressRegs[i]);
1310 }
1311 }
1312 }
1313
1314 if (swap && isValidReg(currInstRegs[1][0]) && isValidReg(currInstRegs[1][1]) && isValidReg(currInstRegs[1][2]))
1315 {
1316 int tmpReg = currInstRegs[1][0];
1317 currInstRegs[1][0] = currInstRegs[1][1];
1318 currInstRegs[1][1] = tmpReg;
1319 output << " S{r" << currInstRegs[1][1] << ", r" << currInstRegs[1][0] << "} ";
1320 }
1321
1322 candidateNum = 0;
1323 //For SIMD8, if any GRF0 of src1 or src2 of inst1 is GRF register
1324 if (isValidReg(lastRegs[1])) // && lastRegs[1] != suppressRegs[1])
1325 {
1326 secondRegCandidate[candidateNum] = lastRegs[1];
1327 candidateNum++;
1328 }
1329 if (isValidReg(lastRegs[2])) // && lastRegs[2] != suppressRegs[2])
1330 {
1331 secondRegCandidate[candidateNum] = lastRegs[2];
1332 candidateNum++;
1333 }
1334
1335 if (isValidReg(currInstRegs[1][0]))
1336 {
1337 secondRegCandidate[candidateNum] = currInstRegs[1][0];
1338 candidateNum++;
1339 }
1340
1341 lastRegs[1] = currInstRegs[1][1];
1342 lastRegs[2] = currInstRegs[1][2];
1343
1344
1345 if (candidateNum > 1)
1346 {
1347 int c = 0;
1348 c = getConflictTimesForTGLLP(output, secondRegCandidate, sameConflictTimes);
1349 conflictTimes += c;
1350 if (candidateNum == 2)
1351 {
1352 twoSrcConflicts += c;
1353 }
1354 if (currInstExecSize[0] <= 16 || currInstExecSize[1] <= 16 || currInstExecSize[2] <= 16)
1355 {
1356 simd16RS += c;
1357 }
1358 }
1359 }
1360
1361 if (conflictTimes != 0)
1362 {
1363 output << " {";
1364 output << "BC=";
1365 output << conflictTimes;
1366 output << "}";
1367 os_output << output.str();
1368 }
1369
1370 return conflictTimes;
1371 } // emitBankConflictXeLP
1372
countReadModifyWrite(std::ostream & output,const G4_INST * inst)1373 uint32_t G4_BB::countReadModifyWrite(std::ostream& output, const G4_INST *inst)
1374 {
1375 if (!inst->getDst() || inst->getDst()->isNullReg() ||
1376 inst->isSend() || inst->isDpas())
1377 {
1378 return 0;
1379 }
1380 auto dst = inst->getDst();
1381 auto dstTy = dst->getType();
1382 if (TypeSize(dstTy) == 1 && dst->getHorzStride() > 1)
1383 {
1384 return 1;
1385 }
1386 return 0;
1387 }
1388
getLabel()1389 G4_Label * G4_BB::getLabel()
1390 {
1391 //FIXME: For now not all BBs will start with a label (e.g.,
1392 //a block that follows a call). We should fix it by getting rid
1393 //of the g4_label instruction and associate each label with a BB
1394 if (instList.size() > 0 && instList.front()->isLabel())
1395 {
1396 return instList.front()->getLabel();
1397 }
1398 return NULL;
1399 }
1400
getFirstInst()1401 G4_INST * G4_BB::getFirstInst()
1402 {
1403 G4_INST *firstInst = nullptr;
1404 if (instList.size() > 0)
1405 {
1406 INST_LIST_ITER I = instList.begin();
1407 firstInst = *I;
1408 if (firstInst->isLabel())
1409 {
1410 // Only first inst can be label.
1411 ++I;
1412 firstInst = (I != instList.end()) ? *I : nullptr;
1413 }
1414 }
1415 return firstInst;
1416 }
1417
getFirstInsertPos()1418 INST_LIST_ITER G4_BB::getFirstInsertPos()
1419 {
1420 INST_LIST_ITER II = begin();
1421 for(INST_LIST_ITER IB = end(); II != IB; ++II)
1422 {
1423 G4_INST* tI = (*II);
1424 if (tI->isLabel()
1425 || tI->opcode() == G4_join
1426 || tI->opcode() == G4_endif
1427 || tI->opcode() == G4_while)
1428 {
1429 continue;
1430 }
1431 break;
1432 }
1433 return II;
1434 }
1435
1436 //
1437 // Add an EOT send to the end of this BB.
1438 //
addEOTSend(G4_INST * lastInst)1439 void G4_BB::addEOTSend(G4_INST* lastInst)
1440 {
1441 // mov (8) r1.0<1>:ud r0.0<8;8,1>:ud {NoMask}
1442 // send (8) null r1 0x27 desc
1443 IR_Builder* builder = parent->builder;
1444 G4_Declare *dcl = builder->createSendPayloadDcl(numEltPerGRF<Type_UD>(), Type_UD);
1445 G4_DstRegRegion* movDst = builder->createDstRegRegion(dcl, 1);
1446 G4_SrcRegRegion* r0Src = builder->createSrcRegRegion(
1447 builder->getBuiltinR0(), builder->getRegionStride1());
1448 G4_INST *movInst = builder->createMov(
1449 G4_ExecSize(numEltPerGRF<Type_UD>()), movDst, r0Src, InstOpt_WriteEnable, false);
1450 if (lastInst)
1451 {
1452 movInst->inheritDIFrom(lastInst);
1453 }
1454 instList.push_back(movInst);
1455
1456 auto EOT_SFID = builder->getEOTSFID();
1457
1458 int exdesc = (0x1 << 5) + SFIDtoInt(EOT_SFID);
1459 // response len = 0, msg len = 1
1460 int desc = (0x1 << 25) + (0x1 << 4);
1461
1462 G4_SrcRegRegion* sendSrc = builder->createSrcRegRegion(
1463 dcl, builder->getRegionStride1());
1464
1465 G4_DstRegRegion *sendDst = builder->createNullDst(Type_UD);
1466
1467 auto msgDesc = builder->createGeneralMsgDesc(desc, exdesc, SendAccess::WRITE_ONLY);
1468 G4_INST* sendInst = builder->createSendInst(
1469 NULL,
1470 G4_send,
1471 g4::SIMD8,
1472 sendDst,
1473 sendSrc,
1474 builder->createImm(desc, Type_UD),
1475 InstOpt_WriteEnable,
1476 msgDesc,
1477 false);
1478 sendInst->inheritDIFrom(movInst);
1479 instList.push_back(sendInst);
1480
1481 if (builder->getHasNullReturnSampler() && VISA_WA_CHECK(builder->getPWaTable(), Wa_1607871015))
1482 {
1483 addSamplerFlushBeforeEOT();
1484 }
1485 }
1486
getBBTypeStr() const1487 const char* G4_BB::getBBTypeStr() const
1488 {
1489 switch (getBBType()) {
1490 default:
1491 break;
1492 case G4_BB_CALL_TYPE:
1493 return "CALL";
1494 case G4_BB_RETURN_TYPE:
1495 return "RETURN";
1496 case G4_BB_INIT_TYPE:
1497 return "INIT";
1498 case G4_BB_EXIT_TYPE:
1499 return "EXIT";
1500 case G4_BB_NM_WA_TYPE:
1501 return "NoMaskWA";
1502 case G4_BB_FCALL_TYPE:
1503 return "FCALL";
1504 }
1505 return " ";
1506 }
1507
dump() const1508 void G4_BB::dump() const
1509 {
1510 print(std::cerr);
1511 }
1512
emitBbInfo(std::ostream & os) const1513 void G4_BB::emitBbInfo(std::ostream& os) const {
1514 // mustn't exceed a single line because it could be in asm output
1515 auto fmtBbId = [&](int bb) {
1516 std::stringstream ss;
1517 ss << "B" << std::setw(3) << std::setfill('0') << bb;
1518 return ss.str();
1519 };
1520 os << fmtBbId(getId()) << ":";
1521 bool first = true;
1522 auto maybeComma = [&]() {
1523 if (first)
1524 first = false;
1525 else
1526 os << ", ";
1527 };
1528 if (getBBType())
1529 {
1530 maybeComma();
1531 os << " [" << getBBTypeStr() << "]";
1532 }
1533 if (isDivergent())
1534 {
1535 maybeComma();
1536 os << " [inDivergent]";
1537 }
1538 auto emitBbSet = [&](const char *name, const BB_LIST &bbl) {
1539 maybeComma();
1540 os << " " << name << ":{";
1541 bool first = true;
1542 for (const auto &bb : bbl) {
1543 if (first) first = false; else os << ", ";
1544 os << fmtBbId(bb->getId());
1545 }
1546 os << "}";
1547 };
1548 emitBbSet("Preds", Preds);
1549 emitBbSet("Succs", Succs);
1550 }
1551
print(std::ostream & OS) const1552 void G4_BB::print(std::ostream& OS) const
1553 {
1554 emitBbInfo(OS);
1555 OS << "\n";
1556 for (auto& x : instList)
1557 x->print(OS);
1558 OS << "\n";
1559 }
1560
dumpDefUse(std::ostream & os) const1561 void G4_BB::dumpDefUse(std::ostream& os) const
1562 {
1563 for (auto& x : instList)
1564 {
1565 x->dump();
1566 if (x->def_size() > 0 || x->use_size() > 0)
1567 {
1568 x->dumpDefUse(os);
1569 os << "\n\n\n";
1570 }
1571 }
1572 }
1573
resetLocalIds()1574 void G4_BB::resetLocalIds()
1575 {
1576 int i = 0;
1577
1578 for (INST_LIST_ITER iter = instList.begin(), end = instList.end();
1579 iter != end;
1580 ++iter, ++i)
1581 {
1582 (*iter)->setLocalId(i);
1583 }
1584 }
1585
removeIntrinsics(Intrinsic intrinId)1586 void G4_BB::removeIntrinsics(Intrinsic intrinId) {
1587 instList.remove_if([=](G4_INST* inst) {
1588 return inst->isIntrinsic() &&
1589 inst->asIntrinsicInst()->getIntrinsicId() == intrinId;
1590 });
1591 }
1592
1593
1594 // Add two sampler cache flushes before the EOT send.
1595 // sampler cache flush 1 must have null return
1596 // sampler cache flush 2 must have valid return
1597 // bb must end with an EOT send
addSamplerFlushBeforeEOT()1598 void G4_BB::addSamplerFlushBeforeEOT()
1599 {
1600 assert(isLastInstEOT() && "last instruction must be EOT");
1601 auto builder = parent->builder;
1602 int samplerFlushOpcode = 0x1F;
1603 int samplerFlushFC = (SamplerSIMDMode::SIMD32 << 17) +
1604 (samplerFlushOpcode << 12);
1605 // null return version
1606 {
1607 int desc = G4_SendDescRaw::createDesc(samplerFlushFC, true, 1, 0);
1608 G4_SrcRegRegion* sendMsgOpnd = builder->createSrcRegRegion(
1609 builder->getBuiltinR0(),
1610 builder->getRegionStride1());
1611
1612 auto msgDesc = builder->createSyncMsgDesc(SFID::SAMPLER, desc);
1613 G4_INST* samplerFlushInst = builder->createSendInst(
1614 nullptr, G4_send, g4::SIMD8,
1615 builder->createNullDst(Type_UD), sendMsgOpnd,
1616 builder->createImm(desc, Type_UD),
1617 0, msgDesc, true);
1618 auto iter = std::prev(end());
1619 insert(iter, samplerFlushInst);
1620 }
1621
1622 // valid return version
1623 {
1624 int desc = G4_SendDescRaw::createDesc(samplerFlushFC, true, 1, 1);
1625 G4_SrcRegRegion* sendMsgOpnd = builder->createSrcRegRegion(
1626 builder->getBuiltinR0(),
1627 builder->getRegionStride1());
1628 G4_Declare *tmpDest = builder->createTempVar(g4::SIMD8, Type_UD, GRFALIGN);
1629 tmpDest->setDoNotSpill();
1630 G4_DstRegRegion* sendMsgDst = builder->createDstRegRegion(tmpDest, 1);
1631 auto msgDesc = builder->createSyncMsgDesc(SFID::SAMPLER, desc);
1632 G4_INST* samplerFlushInst = builder->createSendInst(
1633 nullptr, G4_send, g4::SIMD8,
1634 sendMsgDst, sendMsgOpnd,
1635 builder->createImm(desc, Type_UD),
1636 0, msgDesc, true);
1637 auto iter = std::prev(end());
1638 insert(iter, samplerFlushInst);
1639 }
1640 }
1641
dominates(G4_BB * other)1642 bool G4_BB::dominates(G4_BB* other)
1643 {
1644 return getParent().getDominator().dominates(this, other);
1645 }
1646