1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "SWSB_G4IR.h"
10 #include "Dependencies_G4IR.h"
11 #include "../G4_Opcode.h"
12 #include "../Timer.h"
13 #include "../RegAlloc.h"
14 #include "visa_wa.h"
15 
16 #include <algorithm>
17 #include <fstream>
18 #include <functional>
19 #include <sstream>
20 #include <queue>
21 
22 using namespace vISA;
23 
getDPASPipelineCycle(uint8_t repc)24 static uint8_t getDPASPipelineCycle(uint8_t repc)
25 {
26     switch (repc)
27     {
28     case REP_1:
29         return DPAS_8x1_CYCLE;
30     case REP_2:
31         return DPAS_8x2_CYCLE;
32     case REP_4:
33         return DPAS_8x4_CYCLE;
34     case REP_8:
35         return DPAS_8x8_CYCLE;
36     default:
37         assert(0 && "Unexpected DPAS repeat count");
38     }
39 
40     return 0;
41 }
42 
getDPASGRFReadCycle(uint8_t repc)43 static uint8_t getDPASGRFReadCycle(uint8_t repc)
44 {
45     switch (repc)
46     {
47     case REP_1:
48         return DPAS_8x1_GRFREAD_CYCLE;
49     case REP_2:
50         return DPAS_8x2_GRFREAD_CYCLE;
51     case REP_4:
52         return DPAS_8x4_GRFREAD_CYCLE;
53     case REP_8:
54         return DPAS_8x8_GRFREAD_CYCLE;
55     default:
56         assert(0 && "Unexpected DPAS repeat count");
57     }
58 
59     return 0;
60 }
61 
hasSameFunctionID(const G4_INST * inst1,const G4_INST * inst2)62 static bool hasSameFunctionID(const G4_INST* inst1, const G4_INST* inst2)
63 {
64     if (inst1->isSend() && inst2->isSend())
65     {
66         G4_SendDesc* msgDesc1 = inst1->getMsgDesc();
67         G4_SendDesc* msgDesc2 = inst2->getMsgDesc();
68 
69         if (msgDesc1->isSLM() && msgDesc2->isSLM())
70         {
71             return (msgDesc1->getSFID() == msgDesc2->getSFID());
72         }
73         else if (msgDesc1->isSLM() || msgDesc2->isSLM())
74         {
75             return false;
76         }
77 
78         return (msgDesc1->getSFID() == msgDesc2->getSFID());
79     }
80     else if (inst1->isSend() || inst2->isSend())
81     {
82         return false;
83     }
84     else if (inst1->isMathPipeInst() && inst2->isMathPipeInst())
85     {
86         return true;
87     }
88     else if (inst1->isDpas() && inst2->isDpas())
89     {
90         return true;
91     }
92     else if (inst1->isDpas() || inst2->isDpas())
93     {
94         return false;
95     }
96     else if (inst1->isMathPipeInst() || inst2->isMathPipeInst())
97     {
98         return false;
99     }
100     else
101     {
102         return true;
103     }
104 }
105 
isSLMMsg(const G4_INST * inst)106 static bool isSLMMsg(const G4_INST* inst)
107 {
108     assert(inst->isSend());
109     const G4_SendDesc* msgDesc = inst->getMsgDesc();
110     if (msgDesc->isSLM())
111     {
112         return true;
113     }
114     return false;
115 }
116 
isPrefetch(const G4_INST * inst)117 static bool isPrefetch(const G4_INST* inst)
118 {
119     if(!inst->isSend())
120     {
121         return false;
122     }
123 
124     const G4_SendDesc* msgDesc = inst->getMsgDesc();
125     if (msgDesc->isRead() && (inst->getDst() == nullptr || inst->getDst()->isNullReg()))
126     {
127         return true;
128     }
129     return false;
130 }
131 
isFence(const G4_INST * inst)132 static bool isFence(const G4_INST* inst)
133 {
134     assert(inst->isSend());
135     const G4_SendDesc* msgDesc = inst->getMsgDesc();
136     if (msgDesc->isFence())
137     {
138         return true;
139     }
140     return false;
141 }
142 
hasSamePredicator(const G4_INST * inst1,const G4_INST * inst2)143 static bool hasSamePredicator(const G4_INST* inst1, const G4_INST* inst2)
144 {
145     G4_Predicate* pred1 = inst1->getPredicate();
146     G4_Predicate* pred2 = inst2->getPredicate();
147 
148     if (pred1 && pred2)
149     {
150         bool flagRegNumValid = true;
151         unsigned short refOff1 = pred1->getBase()->ExRegNum(flagRegNumValid);
152         unsigned short subRefOff1 = pred1->getBase()->asRegVar()->getPhyRegOff();;
153         unsigned short refOff2 = pred2->getBase()->ExRegNum(flagRegNumValid);
154         unsigned short subRefOff2 = pred2->getBase()->asRegVar()->getPhyRegOff();;
155 
156         if (refOff1 == refOff2 &&
157             subRefOff1 == subRefOff2)
158         {
159             return true;
160         }
161         return false;
162     }
163 
164     if (pred1 || pred2)
165     {
166         return false;
167     }
168 
169     if (inst1->isWriteEnableInst() || inst2->isWriteEnableInst())
170     {
171         return false;
172     }
173 
174     return true;
175 }
176 
hasSameExecMask(const G4_INST * inst1,const G4_INST * inst2)177 static bool hasSameExecMask(const G4_INST* inst1, const G4_INST* inst2)
178 {
179     uint16_t mask1 = inst1->getMaskOffset();
180     uint16_t mask2 = inst2->getMaskOffset();
181     if (mask1 != mask2)
182     {
183         return false;
184     }
185 
186     unsigned char execSize1 = inst1->getExecSize();
187     unsigned char execSize2 = inst2->getExecSize();
188     if (execSize1 != execSize2)
189     {
190         return false;
191     }
192 
193     return true;
194 }
195 
WARDepRequired(const G4_INST * inst1,const G4_INST * inst2)196 static bool WARDepRequired(const G4_INST* inst1, const G4_INST* inst2)
197 {
198     if (!hasSameFunctionID(inst1, inst2) ||
199         (hasSameFunctionID(inst1, inst2) &&
200         (!hasSamePredicator(inst1, inst2) ||
201             !hasSameExecMask(inst1, inst2))))
202     {
203         return true;
204     }
205 
206     return false;
207 }
208 
209 // check if two operands occupy overlapping GRFs
210 // we put them here instead of inside G4_Operand since this is only valid till after RA
211 // It's the caller's responsibility to ensure that opnd1 and opnd2 are both GRF allocated
operandOverlap(G4_Operand * opnd1,G4_Operand * opnd2)212 static bool operandOverlap(G4_Operand* opnd1, G4_Operand* opnd2)
213 {
214     return (opnd1->getLinearizedStart() <= opnd2->getLinearizedStart() &&
215         opnd1->getLinearizedEnd() > opnd2->getLinearizedStart()) ||
216         (opnd2->getLinearizedStart() <= opnd1->getLinearizedStart() &&
217             opnd2->getLinearizedEnd() > opnd1->getLinearizedStart());
218 }
219 
getDPASDataType(GenPrecision p)220 static G4_Type getDPASDataType(GenPrecision p)
221 {
222     switch (p)
223     {
224     case GenPrecision::U1:
225     case GenPrecision::U2:
226     case GenPrecision::U4:
227     case GenPrecision::U8:   return Type_UB;
228     case GenPrecision::S2:
229     case GenPrecision::S4:
230     case GenPrecision::S8:   return Type_B;
231     case GenPrecision::FP16: return Type_HF;
232     case GenPrecision::BF16: return Type_BF;
233     case GenPrecision::BF8:  return Type_UNDEF;
234     case GenPrecision::TF32: return Type_UNDEF;
235     default:
236         assert(false && "illegal Operand Precision");
237         return Type_UD;
238     }
239 }
240 
// Compute the range of registers touched by OPND.
//
// Builds an SBFootprint of kind GRF_T describing the byte range [LB, RB]
// (RB inclusive) that operand `opnd` of `inst` occupies.
//   opnd            - operand to describe; its linearized start/end seed LB/RB.
//   opnd_num        - which operand slot this is; only src0..src3 and dst are accepted.
//   inst            - owning instruction; its opcode/send/DPAS properties adjust
//                     the recorded type and the right bound.
//   startingBucket  - when >= totalGRFNum, LB/RB are shifted by this many GRFs.
//   mustBeWholeGRF  - when true, LB is rounded down and RB rounded up to GRF boundaries.
// The footprint is placement-constructed in memory obtained from `mem`.
SBFootprint* G4_BB_SB::getFootprintForGRF(
    G4_Operand* opnd,
    Gen4_Operand_Number opnd_num,
    G4_INST* inst,
    int startingBucket,
    bool mustBeWholeGRF)
{
    unsigned short LB = 0;
    unsigned short RB = 0;
    int aregOffset = totalGRFNum;
    G4_Type type = opnd->getType();
    // fcvt with a UB operand (or UD when the platform has partial int64
    // support) is recorded with type F instead of the operand's declared type.
    if (inst->opcode() == G4_fcvt &&
        (type == Type_UB ||
        (type == Type_UD && builder.hasPartialInt64Support())))
    {
        type = Type_F;
    }
    if (inst->opcode() == G4_srnd)
    {   // srnd ub  hf  hf | srnd hf f f
        // srnd: every operand is recorded with the type of src0.
        type = inst->getSrc(0)->getType();
    }

    // DPAS src1/src2: the recorded type comes from the instruction's precision
    // fields rather than from the operand itself.
    if (inst->isDpas() && (opnd_num == Opnd_src1 || opnd_num == Opnd_src2))
    {
        if (opnd_num == Opnd_src1)
        {
            type = getDPASDataType(inst->asDpasInst()->getSrc1Precision());
        }
        if (opnd_num == Opnd_src2)
        {
            type = getDPASDataType(inst->asDpasInst()->getSrc2Precision());
        }
    }

    switch (opnd_num) {
    case Opnd_src0:
    case Opnd_src1:
    case Opnd_src2:
    case Opnd_src3:
    case Opnd_dst:
        // Start from the operand's own linearized byte range (truncated to 16 bits).
        LB = (unsigned short)opnd->getLinearizedStart();
        RB = (unsigned short)opnd->getLinearizedEnd();
        if (inst->isSend())
        {
            // Send operands are expected to start on a GRF boundary.
            assert((LB % numEltPerGRF<Type_UB>()) == 0);
            // For the operands of send instructions, the right bound is derived
            // from the message lengths in the descriptor, to avoid inconsistency
            // with the HW requirement.
            //
            if (opnd_num == Opnd_src0)
            {
                RB = LB + numEltPerGRF<Type_UB>() * inst->getMsgDesc()->getSrc0LenRegs() - 1;
            }

            if (inst->isSplitSend() &&
                opnd_num == Opnd_src1)
            {
                RB = LB + numEltPerGRF<Type_UB>() * inst->getMsgDesc()->getSrc1LenRegs() - 1;
            }

            if (opnd_num == Opnd_dst)
            {
                int dstSize = inst->getMsgDesc()->getDstLenRegs();
                // DG2 A0 W/A to treat SIMD8 SLM load with single GRF return as two GRF return
                if (VISA_WA_CHECK(builder.getPWaTable(), Wa_14012562260) &&
                    inst->getExecSize() <= 8 && isSLMMsg(inst) && dstSize == 1)
                {
                    // NOTE(review): 127 is hard-coded here while the check below
                    // uses totalGRFNum - 1 - confirm whether this should follow
                    // totalGRFNum as well.
                    if ((LB / numEltPerGRF<Type_UB>()) < 127)
                    {
                        dstSize = 2;
                    }
                }

                // Only widen RB from the descriptor length when the destination
                // does not start in the last GRF; otherwise the operand's own
                // linearized end is kept.
                if ((LB / numEltPerGRF<Type_UB>()) < (unsigned short)(totalGRFNum - 1))
                {
                    RB = LB + numEltPerGRF<Type_UB>() * dstSize - 1;
                }
            }

            assert(RB < (numEltPerGRF<Type_UB>() * aregOffset) && "Out of register bound");
        }
        //HW WA for DPAS src2, treat all source 2 as 8x8 source 2 to avoid the read suppression issue
        if (builder.hasDPASSrc2ReadSuppressionDepIssue() &&
            inst->opcode() == G4_dpas && opnd_num == Opnd_src2)
        {
            const G4_InstDpas* dpasInst = inst->asDpasInst();
            uint32_t bytesPerLane = dpasInst->getSrc2SizePerLaneInByte();
            uint32_t bytes = bytesPerLane * 8* 8;
            RB = LB + bytes - 1;
        }

        //HW WA for DPAS src1, treat all source 1 8GRF size
        if (VISA_WA_CHECK(builder.getPWaTable(), Wa_14013341720) &&
            inst->opcode() == G4_dpas && opnd_num == Opnd_src1)
        {
            uint32_t bytes = getGRFSize() * 8;
            RB = LB + bytes - 1;
        }
        break;
    default:
        assert(0 && "Bad opnd");
    }

    void* allocedMem = mem.alloc(sizeof(SBFootprint));
    // Buckets at or beyond totalGRFNum: rebase the byte range onto that bucket.
    // Presumably these are non-GRF register files laid out after the GRFs - TODO confirm.
    if (startingBucket >= aregOffset)
    {
        LB = startingBucket * numEltPerGRF<Type_UB>() + LB;
        RB = startingBucket * numEltPerGRF<Type_UB>() + RB;
    }

    //This is a WA which assumes the whole GRF will be touched by a send instruction, no matter how much of it holds real valid values.
    //FIXME: But this is not true in media block read/write, which can specify the byte level size in descriptor, no GRF align required.
    if (mustBeWholeGRF)
    {
        LB = (LB / numEltPerGRF<Type_UB>()) * numEltPerGRF<Type_UB>();
        RB = ((RB / numEltPerGRF<Type_UB>()) + 1) * numEltPerGRF<Type_UB>() - 1;
    }

    SBFootprint* footprint = new (allocedMem)SBFootprint(GRF_T, type, LB, RB, inst);

    return footprint;
}
363 
needBothAcc(IR_Builder & builder,G4_INST * inst,G4_Operand * opnd)364 bool needBothAcc(IR_Builder& builder, G4_INST* inst, G4_Operand * opnd)
365 {
366     switch (opnd->getType())
367     {
368     case Type_F:
369         return inst->getExecSize() == G4_ExecSize(builder.getNativeExecSize() * 2);
370     case Type_HF:
371     case Type_BF:
372         return false;
373     case Type_DF:
374         return inst->getExecSize() > G4_ExecSize(builder.getNativeExecSize() / 2);
375     default:
376         return true;
377     }
378 }
379 
380 
381 // Compute the range of registers touched by OPND.
getFootprintForACC(G4_Operand * opnd,Gen4_Operand_Number opnd_num,G4_INST * inst)382 SBFootprint* G4_BB_SB::getFootprintForACC(G4_Operand* opnd,
383     Gen4_Operand_Number opnd_num,
384     G4_INST* inst)
385 {
386     unsigned short LB = 0;
387     unsigned short RB = 0;
388     G4_Type type = opnd->getType();
389 
390     switch (opnd_num) {
391     case Opnd_src0:
392     case Opnd_src1:
393     case Opnd_src2:
394     case Opnd_src3:
395     case Opnd_dst:
396     case Opnd_implAccSrc:
397     case Opnd_implAccDst:
398         LB = (unsigned short)opnd->getLinearizedStart();
399         RB = (unsigned short)opnd->getLinearizedEnd();
400         break;
401     default:
402         assert(0 && "Bad opnd");
403     }
404 
405     if (needBothAcc(builder, inst, opnd))
406     {
407         if (((RB - LB + 1) / numEltPerGRF<Type_UB>()) < 2)
408         {
409             RB = LB + numEltPerGRF<Type_UB>() * 2 - 1;
410         }
411     }
412     int regNum = 0;
413     if (opnd->isDstRegRegion())
414         regNum += opnd->asDstRegRegion()->getRegOff();
415     else if (opnd->isSrcRegRegion())
416         regNum += opnd->asSrcRegRegion()->getRegOff();
417 
418     LB += regNum * numEltPerGRF<Type_UB>();
419     RB += regNum * numEltPerGRF<Type_UB>();
420 
421     void* allocedMem = mem.alloc(sizeof(SBFootprint));
422     SBFootprint* footprint = nullptr;
423 
424     footprint = new (allocedMem)SBFootprint(ACC_T, type, LB, RB, inst);
425 
426     return footprint;
427 }
428 
429 // Compute the range of flag registers touched by OPND.
430 // Treat each 16 bit of the flag register as a bucket unit, GRF size
431 // 64 bytes GRF: each bit means 8 bytes
432 // 32 bytes GRF: each bit means 4 bytes
getFootprintForFlag(G4_Operand * opnd,Gen4_Operand_Number opnd_num,G4_INST * inst)433 SBFootprint* G4_BB_SB::getFootprintForFlag(G4_Operand* opnd,
434     Gen4_Operand_Number opnd_num,
435     G4_INST* inst)
436 {
437     unsigned short LB = 0;
438     unsigned short RB = 0;
439     G4_Type type = opnd->getType();
440     bool valid = true;
441     unsigned subRegOff = opnd->getBase()->ExSubRegNum(valid);
442     LB = (unsigned short)(opnd->getLeftBound() + subRegOff * 16) * FLAG_TO_GRF_MAP;
443     RB = (unsigned short)(opnd->getRightBound() + subRegOff * 16) * FLAG_TO_GRF_MAP;
444 
445     LB += (builder.kernel.getNumRegTotal() + builder.getNumScalarRegisters() + builder.kernel.getNumAcc()) * numEltPerGRF<Type_UB>();
446     RB += (builder.kernel.getNumRegTotal() + builder.getNumScalarRegisters() + builder.kernel.getNumAcc()) * numEltPerGRF<Type_UB>();
447 
448     void* allocedMem = mem.alloc(sizeof(SBFootprint));
449     SBFootprint* footprint = nullptr;
450 
451     footprint = new (allocedMem)SBFootprint(FLAG_T, type, LB, RB, inst);
452 
453     return footprint;
454 }
455 
456 
compareInterval(SBNode * n1,SBNode * n2)457 static bool compareInterval(SBNode* n1, SBNode* n2)
458 {
459     return n1->getLiveStartID() < n2->getLiveStartID();
460 }
461 
compareBBStart(G4_BB_SB * b1,G4_BB_SB * b2)462 static bool compareBBStart(G4_BB_SB* b1, G4_BB_SB* b2)
463 {
464     return b1->first_node < b2->first_node;
465 }
466 
nodeSortCompare(SBDEP_ITEM dep1,SBDEP_ITEM dep2)467 static bool nodeSortCompare(SBDEP_ITEM dep1, SBDEP_ITEM dep2)
468 {
469     if (dep1.node->getBBID() < dep2.node->getBBID())
470     {
471         return true;
472     }
473     else if (dep1.node->getBBID() == dep2.node->getBBID())
474     {
475         return (dep1.node->getNodeID() < dep2.node->getNodeID());
476     }
477 
478     return false;
479 }
480 
481 // Return TRUE if opnd corresponding to opndNum has indirect access.
hasIndirection(const G4_Operand * opnd,Gen4_Operand_Number opndNum)482 static inline bool hasIndirection(const G4_Operand* opnd, Gen4_Operand_Number opndNum) {
483     switch (opndNum) {
484     case Opnd_dst:
485         return opnd->asDstRegRegion()->isIndirect();
486     case Opnd_src0:
487     case Opnd_src1:
488     case Opnd_src2:
489         return opnd->asSrcRegRegion()->isIndirect();
490     case Opnd_src3:
491     case Opnd_pred:
492     case Opnd_condMod:
493     case Opnd_implAccSrc:
494     case Opnd_implAccDst:
495         return false;
496     default:
497         assert(0 && "Bad opndNum");
498         return false;           // Unreachable
499     }
500 }
501 
distanceHonourInstruction(const G4_INST * inst)502 static inline bool distanceHonourInstruction(const G4_INST* inst)
503 {
504     return !inst->tokenHonourInstruction() && !inst->isWait() && inst->opcode() != G4_nop && inst->opcode() != G4_halt;
505 }
506 
tokenHonourInstruction(const G4_INST * inst)507 static inline bool tokenHonourInstruction(const G4_INST* inst)
508 {
509     return inst->tokenHonourInstruction();
510 }
511 
512 //Generate the dependence distance
setDefaultDistanceAtFirstInstruction()513 void SWSB::setDefaultDistanceAtFirstInstruction()
514 {
515     for (auto bb : fg)
516     {
517         for (auto it = bb->begin();
518             it != bb->end();
519             it++)
520         {
521             if (!(*it)->isLabel())
522             {
523                 (*it)->setDistance(1);
524                 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
525                 {
526                     (*it)->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
527                 }
528                 if (fg.builder->getFCPatchInfo()->getFCComposableKernel() && fg.builder->hasFourALUPipes())
529                 {
530                     insertSyncAllWRInstruction(bb, 0, it, (*it)->getCISAOff(), (*it)->getLineNo());
531                     insertSyncAllRDInstruction(bb, 0, it, (*it)->getCISAOff(), (*it)->getLineNo());
532                 }
533 
534                 return;
535             }
536         }
537     }
538 }
539 
addSIMDEdge(G4_BB_SB * pred,G4_BB_SB * succ)540 void SWSB::addSIMDEdge(G4_BB_SB* pred, G4_BB_SB* succ)
541 {
542     pred->Succs.push_back(succ);
543     succ->Preds.push_back(pred);
544 }
545 
// Build the SIMD CFG used for the global WAR dependence tracking.
// 1. When building the CFG, all edges except backedges follow the JIP branch target.
// 2. Join and endif instructions are not split into their own BB and sit at the head of a BB,
//    so edge propagation is done for them.
//    Example: for BBs a, b, c, d, with a join in BB b whose JIP is d and an edge from a to b,
//    an edge from a to d is added instead of one from b to d.
//
// Edges are recorded through addSIMDEdge() into the Preds/Succs lists of the
// G4_BB_SB objects in BBVector; BBVector[i + 1] is used as the fall-through block.
void SWSB::SWSBBuildSIMDCFG()
{
    //Build parallel control flow graph
    for (size_t i = 0; i < BBVector.size(); i++)
    {
        G4_BB_SB* currBB = BBVector[i];
        const G4_INST* lastInst = currBB->getBB()->back();
        // Inspect only the first non-label instruction of the block (the loop
        // breaks right after it).
        for (const G4_INST* firstInst : *currBB->getBB())
        {
            if (firstInst->isLabel())
                continue;

            // A flow instruction at the head of the block that is not also the
            // block terminator: propagate its JIP target to the block's predecessors.
            if (firstInst != lastInst &&
                G4_Inst_Table[firstInst->opcode()].instType == InstTypeFlow)
            {
                if (firstInst->asCFInst()->getJip())
                {
                    G4_Operand* jip = firstInst->asCFInst()->getJip();
                    // NOTE(review): labelToBlockMap is indexed with operator[] here and below;
                    // if it is a std::map-like container a missing label would yield a
                    // default (null) entry - confirm every label is registered.
                    G4_BB_SB* targetBB = labelToBlockMap[jip->asLabel()];

                    //Do we need to propagate edge for fall through preds?
                    // NOTE(review): addSIMDEdge appends to targetBB->Preds; if targetBB could
                    // ever be currBB this would modify the list being iterated - confirm
                    // that cannot happen.
                    for (G4_BB_SB* predBB : currBB->Preds)
                    {
                        addSIMDEdge(predBB, targetBB);
                    }
                }
            }
            break;
        }

        // A block ending in EOT gets no outgoing edges.
        if (lastInst->isEOT())
        {
            continue;
        }

        if (G4_Inst_Table[lastInst->opcode()].instType == InstTypeFlow)
        {
            G4_opcode op = lastInst->opcode();

            if (op == G4_jmpi)
            {
                // jmpi: edge to the target label (src0); a predicated jmpi may
                // also fall through to the next block.
                G4_Operand* jip = lastInst->getSrc(0);
                G4_BB_SB* targetBB = labelToBlockMap[jip->asLabel()];
                addSIMDEdge(currBB, targetBB);
                if (lastInst->getPredicate())
                {
                    if (i + 1 != BBVector.size())
                    {
                        addSIMDEdge(currBB, BBVector[i + 1]);
                    }
                }
            }
            else if (lastInst->isReturn() || lastInst->isCall() ||
                lastInst->isFReturn() || lastInst->isFCall())
            {
                // Calls and returns: mirror the successors of the underlying G4_BB,
                // looking each one up in BBVector by its BB id.
                for (const G4_BB* bb : currBB->getBB()->Succs)
                {
                    unsigned bbID = bb->getId();
                    addSIMDEdge(currBB, BBVector[bbID]);
                }
            }
            else if (lastInst->asCFInst()->getJip())
            {
                if (op == G4_goto)
                {
                    G4_Operand* jip = lastInst->asCFInst()->getJip();
                    G4_Operand* uip = lastInst->asCFInst()->getUip();
                    G4_BB_SB* jipBB = labelToBlockMap[jip->asLabel()];
                    G4_BB_SB* uipBB = labelToBlockMap[uip->asLabel()];
                    // UIP block starts before the JIP block: treated as a backward
                    // goto and the UIP target is used.
                    if (jipBB != uipBB && jipBB->first_node > uipBB->first_node)
                    {//backedge, goto uip
                        addSIMDEdge(currBB, uipBB);
                    }
                    else //goto jip
                    {
                        addSIMDEdge(currBB, jipBB);
                    }

                    // A predicated goto may also fall through.
                    if (lastInst->getPredicate())
                    {
                        if (i + 1 != BBVector.size())
                        {
                            addSIMDEdge(currBB, BBVector[i + 1]);
                        }
                    }
                }
                else if (op == G4_break)
                {
                    G4_Operand* jip = lastInst->asCFInst()->getJip();
                    G4_Operand* uip = lastInst->asCFInst()->getUip();
                    G4_BB_SB* jipBB = labelToBlockMap[jip->asLabel()];
                    G4_BB_SB* uipBB = labelToBlockMap[uip->asLabel()];
                    if (jipBB == uipBB)
                    {
                        // JIP and UIP agree: the edge goes to the block that follows
                        // the target block (by BB id), which must exist.
                        G4_BB* bb = jipBB->getBB();
                        unsigned bbID = bb->getId();
                        assert(bbID + 1 != BBVector.size());
                        addSIMDEdge(currBB, BBVector[bbID + 1]);
                    }
                    else  //Add the jip edge to the CFG
                    {
                        addSIMDEdge(currBB, jipBB);
                    }
                    // break always gets a fall-through edge as well.
                    if (i + 1 != BBVector.size())
                    {
                        addSIMDEdge(currBB, BBVector[i + 1]);
                    }
                }
                else
                {
                    // Any other flow instruction with a JIP: edge to the JIP target
                    // plus the fall-through edge.
                    G4_Operand* jip = lastInst->asCFInst()->getJip();
                    G4_BB_SB* targetBB = labelToBlockMap[jip->asLabel()];
                    addSIMDEdge(currBB, targetBB);
                    if (i + 1 != BBVector.size())
                    {
                        addSIMDEdge(currBB, BBVector[i + 1]);
                    }
                }
            }
            else
            {
                // Flow instruction without a JIP: fall-through edge only.
                if (i + 1 != BBVector.size())
                {
                    addSIMDEdge(currBB, BBVector[i + 1]);
                }
            }
        }
        else
        {
            // Block does not end in a flow instruction: fall-through edge only.
            if (i + 1 != BBVector.size())
            {
                addSIMDEdge(currBB, BBVector[i + 1]);
            }
        }
    }
}
686 
687 //Generate the dependence distance
SWSBDepDistanceGenerator(PointsToAnalysis & p,LiveGRFBuckets & LB,LiveGRFBuckets & globalSendsLB)688 void SWSB::SWSBDepDistanceGenerator(PointsToAnalysis& p, LiveGRFBuckets& LB, LiveGRFBuckets& globalSendsLB)
689 {
690     BB_LIST_ITER ib(fg.begin()), bend(fg.end());
691 
692     //Initialize global data
693     BBVector.resize(fg.size());
694 
695     //Set distance 1 at the first instruction in case there are runtime inserted instructions at prolog
696     if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D ||
697         fg.builder->getOptions()->getOption(vISA_SWSBStitch) )
698     {
699         setDefaultDistanceAtFirstInstruction();
700     }
701 
702     unsigned nestLoopLevel = 0;
703     //Local dependence analysis
704     for (; ib != bend; ++ib)
705     {
706         BBVector[(*ib)->getId()] = new (mem)G4_BB_SB(
707             *this,
708             *(fg.builder),
709             mem,
710             *ib,
711             &SBNodes,
712             &SBSendNodes,
713             &globalSendOpndList,
714             &indexes,
715             globalSendNum,
716             &LB,
717             &globalSendsLB,
718             p,
719             &labelToBlockMap,
720             tokenAfterDPASCycle);
721         if ((*ib)->getNestLevel())
722         {
723             nestLoopLevel = nestLoopLevel < (*ib)->getNestLevel() ? (*ib)->getNestLevel() : nestLoopLevel;
724         }
725     }
726 }
727 
handleFuncCall()728 void SWSB::handleFuncCall()
729 {
730     for (G4_BB_SB *bb : BBVector)
731     {
732         if (bb->last_node == -1)
733         {
734             continue;
735         }
736 
737         SBNode* node = SBNodes[bb->last_node];
738 
739         if ((node->GetInstruction()->isCall() || node->GetInstruction()->isFCall()) ||
740             (node->GetInstruction()->isReturn() || node->GetInstruction()->isFReturn()))
741         {
742             LiveGRFBuckets send_use_out(mem, kernel.getNumRegTotal(), *fg.getKernel());
743             for (const SBBucketNode* sBucketNode : globalSendOpndList)
744             {
745                 SBNode* sNode = sBucketNode->node;
746                 if (bb->send_live_out.isSrcSet(sNode->globalID) &&
747                     (sBucketNode->opndNum == Opnd_src0 ||
748                     sBucketNode->opndNum == Opnd_src1 ||
749                     sBucketNode->opndNum == Opnd_src2 ||
750                     sBucketNode->opndNum == Opnd_src3))
751                 {
752                     bb->createAddGRFEdge(sNode, node, WAR, DEP_EXPLICT);
753                 }
754                 if (bb->send_live_out.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
755                 {
756                     bb->createAddGRFEdge(sNode, node, RAW, DEP_EXPLICT);
757                 }
758             }
759         }
760         if (node->GetInstruction()->isReturn() ||
761             node->GetInstruction()->isFReturn())
762         {
763             node->GetInstruction()->setDistance(1);
764             if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
765             {
766                 node->GetInstruction()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
767             }
768         }
769     }
770 }
771 
SWSBGlobalTokenGenerator(PointsToAnalysis & p,LiveGRFBuckets & LB,LiveGRFBuckets & globalSendsLB)772 void SWSB::SWSBGlobalTokenGenerator(PointsToAnalysis& p, LiveGRFBuckets& LB, LiveGRFBuckets& globalSendsLB)
773 {
774     allTokenNodesMap.resize(totalTokenNum);
775     for (TokenAllocation& nodeMap : allTokenNodesMap)
776     {
777         nodeMap.bitset = BitSet(SBSendNodes.size(), false);
778     }
779 
780     const bool enableGlobalTokenAllocation = fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation);
781     const bool enableDistPropTokenAllocation = fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation);
782     // Get the live out, may kill bit sets
783     for (G4_BB_SB *bb : BBVector)
784     {
785         bb->send_live_in = SBBitSets(globalSendNum);
786         bb->send_live_out = SBBitSets(globalSendNum);
787         bb->send_def_out = SBBitSets(globalSendNum);
788 
789         bb->send_live_in_scalar = SBBitSets(globalSendNum);
790         bb->send_live_out_scalar = SBBitSets(globalSendNum);
791         bb->send_kill_scalar = SBBitSets(globalSendNum);
792         bb->liveInTokenNodes = BitSet(SBSendNodes.size(), false);
793         bb->liveOutTokenNodes = BitSet(SBSendNodes.size(), false);
794         bb->killedTokens = BitSet(totalTokenNum, false);
795 
796         if (enableGlobalTokenAllocation || enableDistPropTokenAllocation)
797         {
798             bb->tokenLiveInDist = (unsigned*)mem.alloc(sizeof(unsigned) * globalSendNum);
799             bb->tokenLiveOutDist = (unsigned*)mem.alloc(sizeof(unsigned) * globalSendNum);
800             for (unsigned k = 0; k < globalSendNum; k++)
801             {
802                 bb->tokenLiveInDist[k] = -1;
803                 bb->tokenLiveOutDist[k] = -1;
804             }
805         }
806         if (bb->send_start != -1)
807         {
808             for (int k = bb->send_start; k <= bb->send_end; k++)
809             {
810                 if (globalSendOpndList[k]->opndNum == Opnd_dst)
811                 {
812                     bb->send_def_out.setDst(globalSendOpndList[k]->node->globalID, true);
813                     bb->send_live_out.setDst(globalSendOpndList[k]->node->globalID, true);
814                 }
815                 if (globalSendOpndList[k]->opndNum == Opnd_src0 ||
816                     globalSendOpndList[k]->opndNum == Opnd_src1 ||
817                     globalSendOpndList[k]->opndNum == Opnd_src2 ||
818                     globalSendOpndList[k]->opndNum == Opnd_src3)
819                 {
820                     bb->send_def_out.setSrc(globalSendOpndList[k]->node->globalID, true);
821                     bb->send_live_out.setSrc(globalSendOpndList[k]->node->globalID, true);
822                 }
823             }
824         }
825 
826         bb->send_may_kill = SBBitSets(globalSendNum);
827         bb->send_WAW_may_kill = BitSet(globalSendNum, false);
828         bb->setSendOpndMayKilled(&globalSendsLB, &SBNodes, p);
829 
830 #ifdef DEBUG_VERBOSE_ON
831         bb->dumpLiveInfo(&globalSendOpndList, globalSendNum, nullptr);
832 #endif
833     }
834 
835     /*
836     Loop info is used to reduce the token required for certain instructions, or count the delay of the backedge for token reuse
837     We do the token reduction and count delay of backedge only for the nature loops, i.e with the backedge,
838     if the instruction distance is far enough, there is no need to set dependence.
839     For the irreducible flow graph, those optimizations wouldn't be kicked in.
840     */
841     for (G4_BB_SB *bb : BBVector)
842     {
843         for (auto&& be : kernel.fg.backEdges)
844         {
845             auto loopIt = kernel.fg.naturalLoops.find(be);
846 
847             if (loopIt != kernel.fg.naturalLoops.end())
848             {
849                 auto&& bbsInLoop = loopIt->second;
850 
851                 auto bb1InLoop = bbsInLoop.find(bb->getBB());
852                 if (bb1InLoop != bbsInLoop.end())
853                 {
854                     if (bb->getLoopStartBBID() != -1)
855                     {
856                         //Innermost loop only
857                         if (bb->getLoopStartBBID() <= be.second->getId() &&
858                             bb->getLoopEndBBID() >= be.first->getId())
859                         {
860                             bb->setLoopStartBBID(be.second->getId());
861                             bb->setLoopEndBBID(be.first->getId());
862                         }
863                     }
864                     else
865                     {
866                         bb->setLoopStartBBID(be.second->getId());
867                         bb->setLoopEndBBID(be.first->getId());
868                     }
869                 }
870             }
871         }
872     }
873 
874     //Global analysis until no live in change
875     SWSBGlobalScalarCFGReachAnalysis();
876 
877     //Add dependence according to analysis result
878     if (enableGlobalTokenAllocation || enableDistPropTokenAllocation)
879     {
880         addGlobalDependenceWithReachingDef(globalSendNum, &globalSendOpndList, &SBNodes, p, true);
881     }
882     else
883     {
884         addGlobalDependence(globalSendNum, &globalSendOpndList, &SBNodes, p, true);
885     }
886 
887     handleFuncCall();
888 
889     for (G4_BB_SB *bb : BBVector)
890     {
891         bb->send_live_in_scalar = bb->send_live_in;
892         bb->send_live_out_scalar = bb->send_live_out;
893     }
894 
895     SWSBBuildSIMDCFG();
896 
897     SWSBGlobalSIMDCFGReachAnalysis();
898 
899     //Add dependence according to analysis result
900     addGlobalDependence(globalSendNum, &globalSendOpndList, &SBNodes, p, false);
901 
902     //SWSB token allocation with linear scan algorithm.
903     if (enableGlobalTokenAllocation)
904     {
905         tokenAllocationGlobal();
906     }
907     else if (enableDistPropTokenAllocation)
908     {
909         tokenAllocationGlobalWithPropogation();
910     }
911     else if (fg.builder->getOptions()->getOption(vISA_QuickTokenAllocation))
912     {
913         quickTokenAllocation();
914     }
915     else
916     {
917         tokenAllocation();
918     }
919 
920     //Insert test instruction in case the dependences are more than token field in the instruction.
921     insertTest();
922 }
923 
924 static FCPatchingInfo::RegAccessType
getRegAccessType(Gen4_Operand_Number OpndNo)925 getRegAccessType(Gen4_Operand_Number OpndNo) {
926     if (OpndNo == Opnd_dst)
927         return FCPatchingInfo::Fully_Def;
928     return FCPatchingInfo::Fully_Use;
929 }
930 
getRegAccessPipe(G4_INST * Inst)931 static unsigned getRegAccessPipe(G4_INST* Inst) {
932     FCPatchingInfo::RegAccessPipe Pipe = FCPatchingInfo::Pipe_ALU;
933     unsigned SFID = 0;
934 
935     if (Inst->isSend())
936     {
937         Pipe = FCPatchingInfo::Pipe_Send;
938         SFID = SFIDtoInt(Inst->getMsgDesc()->getSFID()) & 0xF; // 4-bit SFID
939     }
940     else if (Inst->isMathPipeInst())
941     {
942         Pipe = FCPatchingInfo::Pipe_Math;
943     }
944     else if (Inst->isDpas())
945     {
946         Pipe = FCPatchingInfo::Pipe_Dpas;
947     }
948 
949     // Pipe ID is encoded as (SFID[3:0] | P[3:0]), where P is ALU, Math, or Send.
950     return unsigned(Pipe) | (SFID << 4);
951 }
952 
updateRegAccess(FCPatchingInfo * FCPI,SBNode * Node,Gen4_Operand_Number OpndNo,unsigned NumRegs)953 static void updateRegAccess(FCPatchingInfo* FCPI, SBNode* Node,
954     Gen4_Operand_Number OpndNo, unsigned NumRegs) {
955     for (auto F = Node->getFirstFootprint(OpndNo); F != nullptr; F = F->next) {
956         unsigned L = F->LeftB / numEltPerGRF<Type_UB>();
957         unsigned R = F->RightB / numEltPerGRF<Type_UB>();
958         if (F->fType != GRF_T)
959         {
960             continue;
961         }
962         ASSERT_USER(L < NumRegs, "Invalid register left bound!");
963         ASSERT_USER(R < NumRegs, "Invalid register right bound!");
964         for (unsigned n = L; n <= R; ++n) {
965             FCPatchingInfo::RegAccess Acc;
966             Acc.Type = getRegAccessType(OpndNo);
967             Acc.RegNo = n;
968             Acc.Pipe = getRegAccessPipe(Node->GetInstruction());
969             Acc.Inst = Node->GetInstruction();
970             Acc.Token = Acc.Inst->getSetToken();
971             // Update the first access list & map.
972             if (!FCPI->RegFirstAccessMap.count(n)) {
973                 FCPI->RegFirstAccessList.push_back(Acc);
974                 FCPI->RegFirstAccessMap[n] = &FCPI->RegFirstAccessList.back();
975             }
976             // Update the last access list & map.
977             if (FCPI->RegLastAccessMap.count(n)) {
978                 if (Acc.Type == FCPatchingInfo::Fully_Def) {
979                     // Remove previous accesses.
980                     auto PrevAcc = FCPI->RegLastAccessMap[n];
981                     while (PrevAcc) {
982                         auto Next = PrevAcc->Next;
983                         auto PrevAccInst = PrevAcc->Inst;
984                         auto PrevAccRegNo = PrevAcc->RegNo;
985                         // Remove all previous accesses on the same GRF.
986                         FCPI->RegLastAccessList.remove_if(
987                             [=](const FCPatchingInfo::RegAccess& A) {
988                             return (A.Inst == PrevAccInst) &&
989                                 (A.RegNo == PrevAccRegNo); });
990                         PrevAcc = Next;
991                     }
992                 }
993                 else {
994                     // Remove previous accesses with the same pipe.
995                     auto PrevAcc = FCPI->RegLastAccessMap[n];
996                     while (PrevAcc) {
997                         if (PrevAcc->Type == FCPatchingInfo::Fully_Use &&
998                             PrevAcc->Pipe != Acc.Pipe) {
999                             // Not the same, re-link them.
1000                             std::swap(Acc.Next, PrevAcc->Next);
1001                             std::swap(Acc.Next, PrevAcc);
1002                             continue;
1003                         }
1004                         auto Next = PrevAcc->Next;
1005                         auto PrevAccInst = PrevAcc->Inst;
1006                         auto PrevAccRegNo = PrevAcc->RegNo;
1007                         // Remove all previous accesses on the same GRF and the same pipe.
1008                         FCPI->RegLastAccessList.remove_if(
1009                             [=](const FCPatchingInfo::RegAccess& A) {
1010                             return (A.Inst == PrevAccInst) &&
1011                                 (A.RegNo == PrevAccRegNo); });
1012                         PrevAcc = Next;
1013                     }
1014                 }
1015             }
1016             FCPI->RegLastAccessList.push_back(Acc);
1017             FCPI->RegLastAccessMap[n] = &FCPI->RegLastAccessList.back();
1018         }
1019     }
1020 }
1021 
insertSyncBarrier(FCPatchingInfo * FCPI,SBNode * Node,unsigned NumRegs)1022 static void insertSyncBarrier(FCPatchingInfo* FCPI, SBNode* Node,
1023     unsigned NumRegs) {
1024     // Skip if sync barrier is already inserted.
1025     if (FCPI->RegFirstAccessList.size() == 0 || FCPI->RegFirstAccessList.back().RegNo == unsigned(-1))
1026         return;
1027 
1028     // Sync barrier is a special relocation where all registers are forced to be
1029     // synchronized.
1030     FCPatchingInfo::RegAccess Acc;
1031     Acc.Type = FCPatchingInfo::Fully_Use;
1032     Acc.RegNo = unsigned(-1); // A special register.
1033     // Sync barrier is inserted just before this instruction.
1034     Acc.Inst = Node->GetInstruction();
1035 
1036     // Append this access into the first access list.
1037     FCPI->RegFirstAccessList.push_back(Acc);
1038     // Update the first access map.
1039     for (unsigned n = 0; n < NumRegs; ++n) {
1040         if (FCPI->RegFirstAccessMap.count(n))
1041             continue;
1042         FCPI->RegFirstAccessMap[n] = &FCPI->RegFirstAccessList.back();
1043     }
1044     // Invalidate the last access list & map.
1045     FCPI->RegLastAccessMap.clear();
1046     FCPI->RegLastAccessList.clear();
1047 }
1048 
isBranch(SBNode * N)1049 static bool isBranch(SBNode* N) {
1050     auto Inst = N->GetInstruction();
1051     if (!Inst->isFlowControl())
1052         return false;
1053     // Skip function call/ret.
1054     if (Inst->isCall() || Inst->isReturn() ||
1055         Inst->opcode() == G4_pseudo_fc_call ||
1056         Inst->opcode() == G4_pseudo_fc_ret)
1057         return false;
1058     return true;
1059 }
1060 
updatePatchInfo(FCPatchingInfo * FCPI,SBNode * Node,unsigned NumRegs,unsigned NumTokens)1061 static void updatePatchInfo(FCPatchingInfo* FCPI, SBNode* Node,
1062     unsigned NumRegs, unsigned NumTokens) {
1063     // TODO: Branch is not supported in the current FC patch info as it
1064     // involves complicated handling. Issue a sync barrier just before the
1065     // first flow control instruction.
1066     if (isBranch(Node)) {
1067         insertSyncBarrier(FCPI, Node, NumRegs);
1068         return;
1069     }
1070     // Update access maps.
1071     updateRegAccess(FCPI, Node, Opnd_src0, NumRegs);
1072     updateRegAccess(FCPI, Node, Opnd_src1, NumRegs);
1073     updateRegAccess(FCPI, Node, Opnd_src2, NumRegs);
1074     // Per inst, 'use' access always happens before 'def' access.
1075     updateRegAccess(FCPI, Node, Opnd_dst, NumRegs);
1076 }
1077 
updateTokenSet(FCPatchingInfo * FCPI,SBNODE_VECT & Nodes,unsigned NumTokens)1078 static void updateTokenSet(FCPatchingInfo* FCPI, SBNODE_VECT& Nodes,
1079     unsigned NumTokens) {
1080     std::set<G4_INST*> LastAccInsts;
1081     // Collect last access instructions.
1082     for (auto I = FCPI->RegLastAccessList.begin(),
1083         E = FCPI->RegLastAccessList.end(); I != E; ++I) {
1084         LastAccInsts.insert(I->Inst);
1085     }
1086     // Scan node for tokens used in non-last access instructions.
1087     for (auto NI = Nodes.begin(), NE = Nodes.end(); NI != NE; ++NI) {
1088         auto Inst = (*NI)->GetInstruction();
1089         if (LastAccInsts.count(Inst))
1090             continue;
1091         auto T = Inst->getSetToken();
1092         // Skip if token is not allocated.
1093         if (T == (unsigned short)(-1))
1094             return;
1095         ASSERT_USER(T < NumTokens, "Invalid token number!");
1096         FCPI->AllocatedToken.insert(T);
1097     }
1098 }
1099 
genSWSBPatchInfo()1100 void SWSB::genSWSBPatchInfo() {
1101     unsigned NumRegs = kernel.getNumRegTotal();
1102     auto FCPI = fg.builder->getFCPatchInfo();
1103     for (auto Node : SBNodes) {
1104         updatePatchInfo(FCPI, Node, NumRegs, totalTokenNum);
1105     }
1106 
1107 #if 1
1108     //Update the live out tokens according to the live out of the exit BB of the kernel.
1109     for (G4_BB* bb : fg)
1110     {
1111         if (bb->Succs.size() == 0 &&
1112             BBVector[bb->getId()]->Succs.size() == 0)
1113         {
1114             LiveGRFBuckets send_use_out(mem, kernel.getNumRegTotal(), *fg.getKernel());
1115             for (size_t i = 0; i < globalSendOpndList.size(); i++)
1116             {
1117                 SBBucketNode* sBucketNode = globalSendOpndList[i];
1118                 SBNode* sNode = sBucketNode->node;
1119                 if (BBVector[bb->getId()]->send_live_out.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
1120                     sBucketNode->opndNum == Opnd_src1 ||
1121                     sBucketNode->opndNum == Opnd_src2 ||
1122                     sBucketNode->opndNum == Opnd_src3))
1123                 {
1124                     BBVector[bb->getId()]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_out);
1125                 }
1126                 if (BBVector[bb->getId()]->send_live_out.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
1127                 {
1128                     BBVector[bb->getId()]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_out);
1129                 }
1130             }
1131 
1132             for (unsigned curBucket = 0; curBucket < kernel.getNumRegTotal(); curBucket++)
1133             {
1134                 for (LiveGRFBuckets::BN_iterator bn_it = send_use_out.begin(curBucket);
1135                     bn_it != send_use_out.end(curBucket); ++bn_it)
1136                 {
1137                     SBBucketNode* liveBN = (*bn_it);
1138                     SBNode* curLiveNode = liveBN->node;
1139                     Gen4_Operand_Number liveOpnd = liveBN->opndNum;
1140 
1141                     FCPatchingInfo::RegAccess Acc;
1142                     Acc.Type = getRegAccessType(liveOpnd);
1143                     Acc.RegNo = curBucket;
1144                     Acc.Pipe = getRegAccessPipe(curLiveNode->GetInstruction());
1145                     Acc.Inst = curLiveNode->GetInstruction();
1146                     Acc.Token = Acc.Inst->getSetToken();
1147                     FCPI->RegLastAccessList.push_back(Acc);
1148                     FCPI->RegLastAccessMap[curBucket] = &FCPI->RegLastAccessList.back();
1149                 }
1150             }
1151         }
1152     }
1153 #endif
1154 
1155     updateTokenSet(FCPI, SBNodes, totalTokenNum);
1156 
1157 #if defined(DEBUG_VERBOSE_ON)
1158     // First access.
1159     std::cerr << "FirstAccess:\n";
1160     auto& FirstAccess = FCPI->RegFirstAccessList;
1161     for (auto& Access : FirstAccess) {
1162         fprintf(stderr, "r%03u.%s", Access.RegNo,
1163             (Access.Type == FCPatchingInfo::Fully_Def ? "def" : "use"));
1164         fprintf(stderr, ", P%04x", Access.Pipe);
1165         if (Access.Token != (unsigned short)(-1))
1166             fprintf(stderr, ", $%u", Access.Token);
1167         fprintf(stderr, ":");
1168         Access.Inst->dump();
1169     }
1170     // Last access.
1171     std::cerr << "LastAccess:\n";
1172     auto& LastAccess = FCPI->RegLastAccessList;
1173     for (auto& Access : LastAccess) {
1174         fprintf(stderr, "r%03u.%s", Access.RegNo,
1175             (Access.Type == FCPatchingInfo::Fully_Def ? "def" : "use"));
1176         fprintf(stderr, ", P%04x", Access.Pipe);
1177         if (Access.Token != (unsigned short)(-1))
1178             fprintf(stderr, ", $%u", Access.Token);
1179         fprintf(stderr, ":");
1180         Access.Inst->dump();
1181     }
1182     // Allocated token.
1183     std::cerr << "AllocatedToken:\n";
1184     for (unsigned t = 0; t != NumTokens; ++t) {
1185         if (!FCPI->AllocatedToken.count(t))
1186             continue;
1187         if (t != 0)
1188             fprintf(stderr, ", ");
1189         fprintf(stderr, "$%u", t);
1190     }
1191     fprintf(stderr, "\n");
1192 #endif
1193 }
1194 
getDominators(ImmDominator * dom)1195 void SWSB::getDominators(ImmDominator* dom)
1196 {
1197     //BBVector[bb->getId()]->tokenAssigned = true;
1198     bool changed = true;
1199 
1200     while (changed)
1201     {
1202         changed = false;
1203 
1204         for (size_t i = 0; i < BBVector.size(); i++)
1205         {
1206             BitSet currDoms = BBVector[i]->dominators;
1207             if (dom->getIDoms()[i] != BBVector[i]->getBB())
1208             {
1209                 currDoms |= BBVector[dom->getIDoms()[i]->getId()]->dominators;
1210             }
1211 
1212             if (currDoms != BBVector[i]->dominators)
1213             {
1214                 changed = true;
1215                 BBVector[i]->dominators = currDoms;
1216             }
1217         }
1218     }
1219 }
1220 
1221 //
1222 //Entry to the software scoreboard generator
1223 //
SWSBGenerator()1224 void SWSB::SWSBGenerator()
1225 {
1226     DEBUG_VERBOSE("[SWSB]: Starting...");
1227     PointsToAnalysis p(kernel.Declares, kernel.fg.getNumBB());
1228     p.doPointsToAnalysis(kernel.fg);
1229 
1230     kernel.fg.reassignBlockIDs();
1231     kernel.fg.findBackEdges();
1232     kernel.fg.findNaturalLoops();
1233 
1234     //Note that getNumFlagRegisters() treat each 16 bits as a flag register
1235     LiveGRFBuckets LB(mem, kernel.getNumRegTotal() + fg.builder->getNumScalarRegisters() + kernel.getNumAcc() + fg.builder->getNumFlagRegisters(), kernel);
1236     LiveGRFBuckets globalSendsLB(mem, kernel.getNumRegTotal() + fg.builder->getNumScalarRegisters() + kernel.getNumAcc() + fg.builder->getNumFlagRegisters(), kernel);
1237 
1238     SWSBDepDistanceGenerator(p, LB, globalSendsLB);
1239 
1240 #ifdef DEBUG_VERBOSE_ON
1241     dumpDepInfo();
1242 #endif
1243 
1244     if (fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
1245         fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation))
1246     {
1247         auto& dom = fg.getImmDominator();
1248 
1249         //Build dom tree
1250         for (size_t i = 0; i < BBVector.size(); i++)
1251         {
1252             G4_BB* bb = BBVector[i]->getBB();
1253             BBVector[i]->dominators = BitSet(BBVector.size(), false);
1254             BBVector[i]->dominators.set(i, true);
1255 
1256             if (dom.getIDoms()[bb->getId()] != bb)
1257             {
1258                 BBVector[dom.getIDoms()[bb->getId()]->getId()]->domSuccs.push_back(BBVector[i]);
1259                 BBVector[i]->domPreds.push_back(BBVector[dom.getIDoms()[bb->getId()]->getId()]);
1260             }
1261         }
1262 
1263         for (size_t i = 0; i < BBVector.size(); i++)
1264         {
1265             if (BBVector[i]->domSuccs.size())
1266             {
1267                 BBVector[i]->domSuccs.sort(compareBBStart);
1268             }
1269         }
1270 
1271         getDominators(&dom);
1272 #ifdef DEBUG_VERBOSE_ON
1273         dumpImmDom(&dom);
1274 #endif
1275     }
1276 
1277     if (SBSendNodes.size())
1278     {
1279         SWSBGlobalTokenGenerator(p, LB, globalSendsLB);
1280     }
1281     else
1282     {
1283         handleFuncCall();
1284         insertTest();
1285     }
1286 
1287     if (fg.builder->getFCPatchInfo()->getFCComposableKernel())
1288     {
1289         genSWSBPatchInfo();
1290     }
1291 
1292 #ifdef DEBUG_VERBOSE_ON
1293     std::cerr << "\n" << "Dependence Graph:" << "\n";
1294 
1295     for (const SBNode* node : SBNodes)
1296     {
1297         G4_INST* inst = node->GetInstruction();
1298         std::cerr << node->getNodeID() << ":\t";
1299         inst->dump();
1300         std::cerr << "Succs:";
1301         for (const SBDEP_ITEM& curSucc : node->succs)
1302         {
1303             std::cerr << curSucc.node->getNodeID() << ",";
1304         }
1305         std::cerr << "\n";
1306         std::cerr << "Preds:";
1307         for (const SBDEP_ITEM& curPred : node->preds)
1308         {
1309             std::cerr << curPred.node->getNodeID() << ",";
1310         }
1311         std::cerr << "\n\n";
1312     }
1313 #endif
1314 
1315     return;
1316 }
1317 
calcDepDelayForNode(const SBNode * curNode) const1318 unsigned SWSB::calcDepDelayForNode(const SBNode* curNode) const
1319 {
1320     const G4_INST* inst = curNode->GetInstruction();
1321     int reuseDelay = 0;
1322 
1323     if (inst->isSend())
1324     {
1325         if (inst->getDst() == nullptr ||
1326             inst->getDst()->isNullReg())
1327         {
1328             return TOKEN_AFTER_READ_CYCLE;
1329         }
1330 
1331         const G4_SendDesc* msgDesc = inst->getMsgDesc();
1332         if (msgDesc->isSLM())
1333         {
1334             reuseDelay = tokenAfterWriteSendSlmCycle;
1335         }
1336         else if (msgDesc->isSampler())
1337         {
1338             reuseDelay = tokenAfterWriteSendSamplerCycle;
1339         }
1340         else
1341         {
1342             reuseDelay = tokenAfterWriteSendMemoryCycle;
1343         }
1344     }
1345     else if (inst->isMathPipeInst())
1346     {
1347         if (fg.builder->hasFixedCycleMathPipe())
1348         {
1349             assert(0 && "Math instruction is assigned token which is not supported in fixed mach cycle platform");
1350         }
1351 
1352         reuseDelay = tokenAfterWriteMathCycle;
1353     }
1354     else if (inst->isDpas())
1355     {
1356         reuseDelay = tokenAfterDPASCycle;
1357     }
1358     else
1359     {
1360         assert(0 && "unexpected token reuse instruction");
1361     }
1362 
1363     return reuseDelay;
1364 }
1365 
examineNodeForTokenReuse(unsigned nodeID,unsigned nodeDelay,const SBNode * reuseNode,unsigned char nestLoopLevel,unsigned curLoopStartBB,unsigned curLoopEndBB) const1366 std::pair<int, int> SWSB::examineNodeForTokenReuse(
1367     unsigned nodeID,
1368     unsigned nodeDelay,
1369     const SBNode *reuseNode,
1370     unsigned char nestLoopLevel,
1371     unsigned curLoopStartBB,
1372     unsigned curLoopEndBB) const
1373 {
1374     int reuseDelay = 0;
1375     int curDistance = 0;
1376     //The reuse node is before current node.
1377     if (nodeID > reuseNode->getNodeID())
1378     {
1379         unsigned curNodeDelay = reuseNode->getDepDelay();
1380 
1381         //reuse Delay is not accurate in different loop level
1382         reuseDelay = curNodeDelay - (nodeID - reuseNode->getNodeID());
1383 
1384         //If too far, count distance
1385         if (reuseDelay < 0)
1386         {
1387             curDistance = nodeID - reuseNode->getNodeID();
1388         }
1389     }
1390     else //The reuse node is after current node
1391     {
1392         reuseDelay = nodeDelay - (reuseNode->getNodeID() - nodeID);
1393         if (reuseDelay < 0)
1394         {
1395             curDistance = reuseNode->getNodeID() - nodeID;
1396         }
1397     }
1398 
1399     const G4_BB_SB *bb = BBVector[reuseNode->getBBID()];
1400     unsigned char curNodeNestLoopLevel = bb->getBB()->getNestLevel();
1401     unsigned loopLevelDiff = std::abs(curNodeNestLoopLevel - nestLoopLevel);
1402     constexpr unsigned loopFactorForTokenReuse = 5;
1403     if (reuseDelay > 0)
1404     {
1405         reuseDelay /= loopFactorForTokenReuse * loopLevelDiff + 1;
1406     }
1407     else
1408     {
1409         curDistance *= loopFactorForTokenReuse * loopLevelDiff + 1;
1410         if (nestLoopLevel && loopLevelDiff == 0)
1411         {
1412             if (curLoopStartBB == -1 || curLoopEndBB == -1)
1413             {
1414                 curLoopStartBB = bb->getLoopStartBBID();
1415                 curLoopEndBB = bb->getLoopEndBBID();
1416             }
1417             //Count the backedge, if the backedge distance is short, take it
1418             if (curLoopStartBB != -1 && curLoopEndBB != -1)
1419             {
1420                 unsigned loopStartID = BBVector[curLoopStartBB]->first_node;
1421                 unsigned loopEndID = BBVector[curLoopEndBB]->last_node;
1422 
1423                 // The reused node may in same loop as current node.
1424                 int backEdgeDistance = loopEndID - loopStartID - curDistance;
1425 
1426                 if (reuseNode->getNodeID() < loopStartID || reuseNode->getNodeID() > loopEndID)
1427                 {
1428                     // Or it may in another loop with same nest loop level
1429                     // Current back edge cannot cover the distance
1430                     // loop1 {
1431                     //    node1
1432                     // }
1433                     //
1434                     // loop2 {
1435                     //    node2
1436                     // }
1437                     curDistance = curDistance * (nestLoopLevel * loopFactorForTokenReuse + 1);
1438                 }
1439                 else
1440                 {
1441                     curDistance = std::min(curDistance, backEdgeDistance);
1442                 }
1443             }
1444         }
1445     }
1446     return std::make_pair(reuseDelay, curDistance);
1447 }
1448 
1449 //The algorithm for reuse selection: The live range which causes the least stall delay of current live range.
1450 //FIXME: for global variable, it's not accurate. Because the AFTER_SOURCE and AFTER_WRITE may in different branches.
1451 //Try not reuse the tokens set in adjacent instructions.
reuseTokenSelection(const SBNode * node) const1452 SBNode * SWSB::reuseTokenSelection(const SBNode * node) const
1453 {
1454     int delay = tokenAfterWriteSendSamplerCycle; //Assume the longest one
1455     int distance = 0; //Distance between the node
1456     const unsigned nodeID = node->getNodeID();
1457     const unsigned nodeDelay = node->getDepDelay(); // The longest delay the node may cause.
1458     const unsigned char nestLoopLevel = BBVector[node->getBBID()]->getBB()->getNestLevel();
1459     const unsigned loopStartBB = BBVector[node->getBBID()]->getLoopStartBBID();
1460     const unsigned loopEndBB = BBVector[node->getBBID()]->getLoopEndBBID();
1461 
1462     assert(linearScanLiveNodes.size() <= totalTokenNum);
1463 
1464     //The live nodes whose dependencies are not resolved in current node.
1465     SBNode* candidateNode = linearScanLiveNodes.front();
1466     for (SBNode* curNode : linearScanLiveNodes)
1467     {
1468         int maxTokenDelay = std::numeric_limits<int>::min(); //The delay may cause if reuse
1469         int minTokenDistance = std::numeric_limits<int>::max(); //The distance from the reused node
1470         // The token may be reused already, so check the 2 nodes that are
1471         // closest to the node using the same token. In most cases the
1472         // token allocation is done in ascending order. So, searching backward
1473         // should be fast. As for searching forward, only do that if there's
1474         // indeed a such node.
1475         const unsigned short token = curNode->getLastInstruction()->getSetToken();
1476         const unsigned lastBefore = allTokenNodesMap[token].bitset.findLastIn(0, node->getSendID());
1477         unsigned firstAfter = -1;
1478         if (node->getSendID() < allTokenNodesMap[token].maxSendID)
1479         {
1480             firstAfter = allTokenNodesMap[token].bitset.findFirstIn(node->getSendID() + 1,
1481                                                                     allTokenNodesMap[token].maxSendID + 1);
1482         }
1483         if (lastBefore != -1)
1484         {
1485             assert(allTokenNodesMap[token].bitset.isSet(lastBefore));
1486             const SBNode* n = SBSendNodes[lastBefore];
1487             auto res = examineNodeForTokenReuse(nodeID, nodeDelay, n, nestLoopLevel, loopStartBB, loopEndBB);
1488             //Largest reuse delay
1489             maxTokenDelay = std::max(maxTokenDelay, res.first);
1490             //Closest distance
1491             minTokenDistance = std::min(minTokenDistance, res.second);
1492         }
1493         if (firstAfter != -1)
1494         {
1495             assert(allTokenNodesMap[token].bitset.isSet(firstAfter));
1496             const SBNode* n = SBSendNodes[firstAfter];
1497             auto res = examineNodeForTokenReuse(nodeID, nodeDelay, n, nestLoopLevel, loopStartBB, loopEndBB);
1498             //Largest reuse delay
1499             maxTokenDelay = std::max(maxTokenDelay, res.first);
1500             //Closest distance
1501             minTokenDistance = std::min(minTokenDistance, res.second);
1502         }
1503 
1504         // Smallest one is the best one
1505         // if Distance is not 0, count the distance, otherwise, use the delay.
1506         // Distance is not 0 means there are candidate whose distance is larger than the delay
1507         if (!distance && maxTokenDelay > 0)
1508         {
1509             if (maxTokenDelay < delay)
1510             {
1511                 delay = maxTokenDelay;
1512                 candidateNode = curNode;
1513             }
1514         }
1515         else if (minTokenDistance > distance)
1516         {
1517             distance = minTokenDistance;
1518             candidateNode = curNode;
1519         }
1520     }
1521 
1522     return candidateNode;
1523 }
1524 
1525 /*
1526  * If the cycles of the instruction which occupied
1527 */
cycleExpired(const SBNode * node,int currentID) const1528 bool SWSB::cycleExpired(const SBNode* node, int currentID) const
1529 {
1530     if (node->GetInstruction()->isSend())
1531     {
1532         const G4_SendDesc* msgDesc = node->GetInstruction()->getMsgDesc();
1533 
1534         if (msgDesc->isSLM())
1535         {
1536             return tokenAfterWriteSendSlmCycle <= (currentID - node->getLiveStartID());
1537         }
1538         else if (msgDesc->isSampler())
1539         {
1540             return tokenAfterWriteSendSamplerCycle <= (currentID - node->getLiveStartID());
1541         }
1542         else
1543         {
1544             return tokenAfterWriteSendMemoryCycle <= (currentID - node->getLiveStartID());
1545         }
1546     }
1547     else if (node->GetInstruction()->isMathPipeInst())
1548     {
1549         if (fg.builder->hasFixedCycleMathPipe())
1550         {
1551             assert(0 && "Math instruction is assigned token which is not supported in fixed mach cycle platform");
1552         }
1553         return tokenAfterWriteMathCycle <= (currentID - node->getLiveStartID());
1554     }
1555     else if (node->GetInstruction()->isDpas())
1556     {
1557         return tokenAfterDPASCycle <= (int)(currentID - node->getLiveStartID());
1558     }
1559     else
1560     {
1561         assert(0 && "unexpected token reuse instruction");
1562     }
1563 
1564     return true;
1565 }
1566 
//
// Token dependence reduction tries to remove unnecessary dependences when token reuse happens.
// For example, in the following case
//  1.  send r20,...           { $0 }
//      ...
//  20. send r30, ...         { $0 }
//  21. add  r40 r20  r60    { $0.dst }
// there is no need to set the dependence for instruction 21,
// because the reuse guarantees that the dependency on instruction 1 is resolved before token 0 can be reused.
// FIXME: Dominator info is required for global reduction
//
tokenDepReduction(SBNode * n1,SBNode * n2)1578 void SWSB::tokenDepReduction(SBNode* n1, SBNode* n2)
1579 {
1580     SBNode* node1 = n1;
1581     SBNode* node2 = n2;
1582 
1583     assert(node1 != node2);
1584     if (n1->getNodeID() > n2->getNodeID())
1585     {
1586         node1 = n2;
1587         node2 = n1;
1588     }
1589 
1590     if (!fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
1591     {
1592         unsigned node1BBID = node1->getBBID();
1593         unsigned node2BBID = node2->getBBID();
1594 
1595         for (auto node_it = node1->succs.begin();
1596             node_it != node1->succs.end();
1597             )
1598         {
1599             SBDEP_ITEM& curSucc1 = (*node_it);
1600             SBNode* succ1 = curSucc1.node;
1601             unsigned bbID1 = succ1->getBBID();
1602 
1603             //node1(previous) and node2(current) are in same BB: kill all live out of node1
1604             // BB:
1605             //     node1
1606             //     node2
1607             //
1608             //Or the succ of node1 and node 2 are in same BB: kill all succ of node1 which after node 2
1609             // FIXME: will this one conflict with global dependence reduction?
1610             //BB:
1611             //     node2
1612             //     succ(node1)
1613             //if ((node1BBID == node2BBID && bbID1 != node2BBID) ||
1614             //    (node1BBID != node2BBID && bbID1 == node2BBID && succ1->getNodeID() > node2->getNodeID()))
1615             //{
1616             //    node_it = node1->succs.erase(node_it);//FIXME, if the succ is the token instruction, do we need free the tokens assigned to the instruction because of the dependence
1617             //    continue;
1618             //}
1619 
1620             //When two successors are in same BB, previous one kill the following one
1621             // FIXME: This may not be good, because the policy is trying to keep the longest dependence and move the short one
1622             // Of course, if the two predecessors are lived in from different branch, we can only kill the longer one
1623             bool killed = false;
1624             for (auto node2_it = node2->succs.begin();
1625                 node2_it != node2->succs.end();
1626                 )
1627             {
1628                 SBDEP_ITEM& curSucc2 = (*node2_it);
1629                 const SBNode* succ2 = curSucc2.node;
1630                 unsigned bbID2 = succ2->getBBID();
1631 
1632                 if (bbID1 == bbID2 &&
1633                     bbID1 != node1BBID &&
1634                     bbID2 != node2BBID &&
1635                     succ2 != succ1)
1636                 {
1637                     //succ2 is ahead
1638                     if (succ1->getNodeID() > succ2->getNodeID())
1639                     {
1640                         if (curSucc2.attr == DEP_EXPLICT &&
1641                             (curSucc1.type == curSucc2.type ||
1642                                 curSucc2.type == RAW ||
1643                                 curSucc2.type == WAW))
1644                         {
1645                             //succ1 killed
1646                             killed = true;
1647                             break;
1648                         }
1649                     }
1650                     else
1651                     {
1652                         if (curSucc1.attr == DEP_EXPLICT &&
1653                             (curSucc1.type == curSucc2.type ||
1654                                 curSucc1.type == RAW ||
1655                                 curSucc1.type == WAW))
1656                         {
1657                             node2_it = node2->succs.erase(node2_it);
1658                             continue;
1659                         }
1660                     }
1661                 }
1662                 node2_it++;
1663             }
1664 
1665             if (killed)
1666             {
1667                 node_it = node1->succs.erase(node_it);
1668                 continue;
1669             }
1670 
1671             node_it++;
1672         }
1673 
1674         //The succs of node2 in same BB as node1 and is behind node1
1675         for (auto node_it = node2->succs.begin();
1676             node_it != node2->succs.end();
1677             )
1678         {
1679             const SBNode* succ2 = node_it->node;
1680             unsigned bbID2 = succ2->getBBID();
1681 
1682             if ((node1BBID != node2BBID && bbID2 == node1BBID && succ2->getNodeID() > node1->getNodeID()))
1683             {
1684                 node_it = node2->succs.erase(node_it);
1685                 continue;
1686             }
1687 
1688             node_it++;
1689         }
1690     }
1691 
1692     n2->setLiveLatestID(n1->getLiveEndID(), n1->getLiveEndBBID());
1693     linearScanLiveNodes.remove(n1);
1694 
1695 #ifdef DEBUG_VERBOSE_ON
1696     printf("remove token 1: %d\n", n1->getLastInstruction()->getSetToken());
1697 #endif
1698     return;
1699 }
1700 
1701 /*
1702 *
1703 *  We need cycle based expiration because for the case like
1704 *  send   null, r2...      {$0}
1705 *  add  r2                 {$0.src}
1706 *  send   r20   r9...      {$0}
1707 *  The second send should not be assigned with $0.
1708 *  In compiler, if the live range of the r2 is end in the second instruction, token $0 is treated as free.
1709 *  However, the SBID $0 will cleared only when the instruction finished the execution.
1710 *  Assigned the same token to the third instruction will cause a long latency.
1711 *  We delay the end of the lives of the intervals until the cycles are all consumed, so that the token will not be assigned immediately.
1712 *  But if the dependence is .dst dependence, the live range is over. The stall will be going until the finish of the instruction.
1713 *
1714 */
expireIntervals(unsigned startID)1715 void SWSB::expireIntervals(unsigned startID)
1716 {
1717     for (SBNODE_LIST_ITER node_it = linearScanLiveNodes.begin();
1718         node_it != linearScanLiveNodes.end();
1719         )
1720     {
1721         SBNode* curNode = (*node_it);
1722         if (curNode->getLiveEndID() <= startID)
1723         {
1724             const SBNode* node = linearScanLiveNodes.front();
1725             if (node->hasAWDep() || cycleExpired(node, startID))
1726             {
1727                 unsigned short token = node->getLastInstruction()->getSetToken();
1728 
1729                 assert(token != (unsigned short)-1);
1730                 node_it = linearScanLiveNodes.erase(node_it);
1731 #ifdef DEBUG_VERBOSE_ON
1732                 printf("remove token %d:\n", token);
1733 #endif
1734                 //Remove token to free list
1735                 freeTokenList[token] = nullptr;
1736                 if (topIndex == -1)
1737                 {
1738                     topIndex = token;
1739                 }
1740                 continue;
1741             }
1742         }
1743         else
1744         {
1745             break;
1746         }
1747         node_it++;
1748     }
1749 }
1750 
1751 //GraphColoring can provide a more accurate version.
1752 //For linear scan, only if the instruction is not assigned before can be used for this OPT.
1753 //This is to avoid the false token sharing.
1754 //What's the impact on the token reduction?
1755 //Token reduction will remove the succ, i.e remove the dependence.
1756 //NOTE THAT: Token reduction happens only when run out of token.
shareToken(const SBNode * node,const SBNode * succ,unsigned short token)1757 void SWSB::shareToken(const SBNode* node, const SBNode* succ, unsigned short token)
1758 {
1759     if (node->getBBID() == succ->getBBID())
1760     {
1761         return;
1762     }
1763 
1764     for (const SBDEP_ITEM& curPred : succ->preds)
1765     {
1766         const SBNode* succPred = curPred.node;
1767 
1768         if (node->getBBID() != succPred->getBBID() &&
1769             succPred->getLastInstruction()->getTokenType() == G4_INST::SWSBTokenType::TOKEN_NONE &&
1770             tokenHonourInstruction(succPred->getLastInstruction()))
1771         {
1772             G4_BB_SB* curBB = BBVector[node->getBBID()];
1773             G4_BB_SB* succPredBB = BBVector[succPred->getBBID()];
1774             //FIXME: Only define BBs comparison is not enough. It may cause extra delay?
1775             if (!(curBB->send_live_in.isDstSet((unsigned)succPred->globalID) ||
1776                 curBB->send_live_in.isSrcSet((unsigned)succPred->globalID) ||
1777                 succPredBB->send_live_in.isDstSet((unsigned)node->globalID) ||
1778                 succPredBB->send_live_in.isSrcSet((unsigned)node->globalID)
1779                 ))
1780             {
1781                 succPred->getLastInstruction()->setSetToken(token);
1782             }
1783         }
1784     }
1785 
1786     return;
1787 }
1788 
assignDepToken(SBNode * node)1789 void SWSB::assignDepToken(SBNode* node)
1790 {
1791     unsigned short token = node->getLastInstruction()->getSetToken();
1792     assert(token != (unsigned short)-1 && "Failed to add token dependence to the node without token");
1793 
1794     //Set the dependent tokens for successors of current send
1795     //Remove the unnecessary dependent tokens in same BB, this work can be done when adding the edge,
1796     //However, since that's the bucket based, and is harder to do sorting for different GRF dependence
1797     //
1798     //1. Send r2-r5, r8, ....    $1
1799     //   ...
1800     //7. Add  r8, r16, r10   test $1S
1801     //8. Add  r12, r4, r14   test $1D
1802     //If WAR first as shown in instruction 7, we still need keep dependence for 8.
1803     //
1804     //1. Send r2-r5, r8, ....    $1
1805     //   ...
1806     //7. Add  r12, r4, r14   test $1D
1807     //8. Add  r8, r16, r10
1808     //Instead, if RAW happens first as shown in instruction 7, there is NO need for 8.
1809 
1810     for (const SBDEP_ITEM& curSucc : node->succs)
1811     {
1812         SBNode* succ = curSucc.node;
1813         DepType type = curSucc.type;
1814         SBDependenceAttr attr = curSucc.attr;
1815 
1816         if (attr == DEP_IMPLICIT)
1817         {
1818             continue;
1819         }
1820 
1821         //Same token,reuse happened, no need to set dep token
1822         if (tokenHonourInstruction(succ->getLastInstruction()) &&
1823             succ->getLastInstruction()->getSetToken() == token && (succ->instVec.size() <= 1)) //If the node size, the token reuse cannot guard the last instruction.
1824         {
1825             continue;
1826         }
1827 
1828         //set dependence token if live
1829         SWSBTokenType tokenType = type == WAR ? SWSBTokenType::AFTER_READ : SWSBTokenType::AFTER_WRITE;
1830         succ->setDepToken(token, tokenType, node);
1831 #ifdef DEBUG_VERBOSE_ON
1832         dumpSync(node, succ, token, tokenType);
1833 #endif
1834     }
1835 }
1836 
assignDepTokens()1837 void SWSB::assignDepTokens()
1838 {
1839     for (SBNode* node : SBSendNodes)
1840     {
1841         G4_INST* inst = node->getLastInstruction();
1842 
1843         if (inst->isEOT())
1844         {
1845             continue;
1846         }
1847 
1848         unsigned short token = inst->getSetToken();
1849         if (token != (unsigned short)-1)
1850         {
1851             assignDepToken(node);
1852         }
1853     }
1854 }
1855 
assignToken(SBNode * node,unsigned short assignedToken,uint32_t & AWtokenReuseCount,uint32_t & ARtokenReuseCount,uint32_t & AAtokenReuseCount)1856 void SWSB::assignToken(SBNode* node,
1857     unsigned short assignedToken,
1858     uint32_t& AWtokenReuseCount,
1859     uint32_t& ARtokenReuseCount,
1860     uint32_t& AAtokenReuseCount)
1861 {
1862     unsigned short token = (unsigned short)UNKNOWN_TOKEN;
1863 
1864     if (assignedToken == (unsigned short)UNKNOWN_TOKEN)
1865     {
1866         //Get token
1867         if (topIndex != -1)
1868         {
1869             //Have free token
1870             token = topIndex;
1871             freeTokenList[token] = node; //Cannot be moved after setTopTokenIndex();
1872             setTopTokenIndex();
1873 #ifdef DEBUG_VERBOSE_ON
1874             printf("Use free token: %d, QUEUE SIZE: %d\n", token, linearScanLiveNodes.size());
1875 #endif
1876         }
1877         else
1878         {
1879             //Have no free, use the oldest
1880             SBNode* oldNode = reuseTokenSelection(node);
1881             token = oldNode->getLastInstruction()->getSetToken();
1882             tokenDepReduction(oldNode, node);
1883             freeTokenList[token] = node;
1884 #ifdef DEBUG_VERBOSE_ON
1885             printf("Reuse token: %d,  current: %d  %d, reuse: %d  %d, QUEUE SIZE: %d\n", token, node->getSendID(), node->getNodeID(), oldNode->getSendID(), oldNode->getNodeID(), linearScanLiveNodes.size());
1886 #endif
1887             tokenReuseCount++;
1888             if (oldNode->hasAWDep())
1889             {
1890                 AWtokenReuseCount++;
1891             }
1892             else if (oldNode->hasARDep())
1893             {
1894                 ARtokenReuseCount++;
1895             }
1896             else
1897             {
1898                 AAtokenReuseCount++;
1899             }
1900             node->setTokenReuseNode(oldNode);
1901         }
1902     }
1903     else
1904     {
1905         //This reuse pred node may have been reused already
1906         //When it is in short of free SBID. So, it's may not in the active list.
1907         token = assignedToken;
1908         if (freeTokenList[token] != nullptr)
1909         { //If the end of predecessor node is current node, the pred node may have expired already. Otherwise do reduction
1910             SBNode* pred = freeTokenList[token];
1911             tokenDepReduction(pred, node);
1912         }
1913         freeTokenList[token] = node;
1914         if (topIndex == token)
1915         {
1916             setTopTokenIndex();
1917         }
1918 #ifdef DEBUG_VERBOSE_ON
1919         printf("Reuse token: %d,  QUEUE SIZE: %d\n", token, linearScanLiveNodes.size());
1920 #endif
1921     }
1922 #ifdef DEBUG_VERBOSE_ON
1923     printf("Assigned token: %d,  node: %d, send: %d,  QUEUE SIZE: %d\n", token, node->getNodeID(), node->getSendID(), linearScanLiveNodes.size());
1924 #endif
1925 
1926     //Set token to send
1927     node->getLastInstruction()->setSetToken(token);
1928     //For token reduction
1929     allTokenNodesMap[token].set(node->sendID);
1930 
1931     //Sort succs according to the BBID and node ID.
1932     std::sort(node->succs.begin(), node->succs.end(), nodeSortCompare);
1933     for (auto node_it = node->succs.begin();
1934         node_it != node->succs.end();
1935         )
1936     {
1937         const SBDEP_ITEM& curSucc = (*node_it);
1938         SBNode* succ = curSucc.node;
1939         SBDependenceAttr attr = curSucc.attr;
1940 
1941         if (attr == DEP_IMPLICIT)
1942         {
1943             node_it++;
1944             continue;
1945         }
1946 
1947         // In the case like following
1948         //  1. math.rsqrt   r20 r10           { $1 }
1949         //  2. math.in      r50  r20          { $1 }
1950         //  3. mul          r60 r50 r40       { $1.dst }
1951         if (tokenHonourInstruction(succ->getLastInstruction()))
1952         {
1953             unsigned distance = succ->getSendID() > node->getSendID() ? succ->getSendID() - node->getSendID() : node->getSendID() - succ->getSendID();
1954             if ((fg.builder->getOptions()->getOption(vISA_EnableISBIDBUNDLE) ||
1955                 distance < totalTokenNum))
1956             {
1957                 if ((curSucc.type == RAW || curSucc.type == WAW) &&
1958                     succ->getLastInstruction()->getSetToken() == (unsigned short)UNKNOWN_TOKEN)
1959                 {
1960                     if (fg.builder->getOptions()->getOption(vISA_EnableDPASTokenReduction))
1961                     {
1962                         //  If no instruction depends on DPAS, no SBID
1963                         if (!(succ->GetInstruction()->isDpas() && succ->succs.size() == 0))
1964                         {
1965                             succ->getLastInstruction()->setSetToken(token);
1966                             node->setLiveLatestID(succ->getLiveEndID(), succ->getLiveEndBBID());
1967                             allTokenNodesMap[token].set(succ->sendID);
1968                             succ->setTokenReuseNode(node);
1969                             continue;
1970                         }
1971                     }
1972                     else
1973                     {
1974                         succ->getLastInstruction()->setSetToken(token);
1975                         node->setLiveLatestID(succ->getLiveEndID(), succ->getLiveEndBBID());
1976                         allTokenNodesMap[token].set(succ->sendID);
1977                         succ->setTokenReuseNode(node);
1978                         continue;
1979                     }
1980                 }
1981             }
1982         }
1983 
1984         node_it++;
1985     }
1986 
1987     return;
1988 }
1989 
addToLiveList(SBNode * node)1990 void SWSB::addToLiveList(SBNode* node)
1991 {
1992     bool insert = false;
1993     assert(linearScanLiveNodes.size() < totalTokenNum);
1994     for (SBNODE_LIST_ITER node_it = linearScanLiveNodes.begin();
1995         node_it != linearScanLiveNodes.end();
1996         node_it++)
1997     {
1998         const SBNode* curNode = (*node_it);
1999 
2000         //Sort according to the ascending of the end ID.
2001         if (curNode->getLiveEndID() > node->getLiveEndID())
2002         {
2003             linearScanLiveNodes.insert(node_it, node);
2004             insert = true;
2005             break;
2006         }
2007         else if (curNode->getLiveEndID() == node->getLiveEndID())
2008         {
2009             if (curNode->getLiveStartID() > node->getLiveStartID())
2010             {
2011                 linearScanLiveNodes.insert(node_it, node);
2012                 insert = true;
2013                 break;
2014             }
2015             else if (curNode->getLiveStartID() == node->getLiveStartID())
2016             {
2017                 if (curNode->getNodeID() > node->getNodeID())
2018                 {
2019                     linearScanLiveNodes.insert(node_it, node);
2020                     insert = true;
2021                     break;
2022                 }
2023             }
2024         }
2025     }
2026 
2027     if (!insert)
2028     {
2029         linearScanLiveNodes.push_back(node);
2030     }
2031 
2032     unsigned usedToken = 0;
2033     for (const SBNode *node : freeTokenList)
2034     {
2035         if (node != nullptr)
2036         {
2037             usedToken++;
2038         }
2039     }
2040     assert(usedToken == linearScanLiveNodes.size());
2041 
2042 #ifdef DEBUG_VERBOSE_ON
2043     printf("Add token: %d\n", node->getLastInstruction()->getSetToken());
2044 #endif
2045     return;
2046 }
2047 
2048 //
2049 //  Global reaching define analysis for tokens
2050 //
globalTokenReachAnalysis(G4_BB * bb)2051 bool SWSB::globalTokenReachAnalysis(G4_BB* bb)
2052 {
2053     bool changed = false;
2054     unsigned bbID = bb->getId();
2055 
2056     // Do nothing for the entry BB
2057     // Because it has no live in
2058     if (bb->Preds.empty())
2059     {
2060         return false;
2061     }
2062 
2063     assert(BBVector[bbID]->liveInTokenNodes.getSize() != 0);
2064 
2065     BitSet temp_live_in(unsigned(SBSendNodes.size()), false);
2066     temp_live_in = BBVector[bbID]->liveInTokenNodes;
2067 
2068     //Union all of out of SIMDCF predecessor BB to the live in of current BB.
2069     for (const G4_BB_SB* predBB : BBVector[bbID]->Preds)
2070     {
2071         unsigned predID = predBB->getBB()->getId();
2072         temp_live_in |= BBVector[predID]->liveOutTokenNodes;
2073     }
2074 
2075     //Union all of out of scalar predecessor BB to the live in of current BB.
2076     for (const G4_BB* predBB : bb->Preds)
2077     {
2078         unsigned predID = predBB->getId();
2079         temp_live_in |= BBVector[predID]->liveOutTokenNodes;
2080     }
2081 
2082     //Changed? Yes, get the new live in, other wise do nothing
2083     if (temp_live_in != BBVector[bbID]->liveInTokenNodes)
2084     {
2085         changed = true;
2086         BBVector[bbID]->liveInTokenNodes = temp_live_in;
2087     }
2088 
2089     //Calculate the live out according to the live in and killed tokens in current BB
2090     for (uint32_t token = 0; token < totalTokenNum; token++)
2091     {
2092         if (BBVector[bbID]->killedTokens.isSet(token))
2093         {
2094             temp_live_in -= allTokenNodesMap[token].bitset;
2095         }
2096     }
2097 
2098     //Get the new live out,
2099     //FIXME: is it right? the live out is always assigned in increasing.
2100     //Original, we only have local live out.
2101     //should we separate the local live out vs total live out?
2102     //Not necessary, can live out, will always be live out.
2103     BBVector[bbID]->liveOutTokenNodes |= temp_live_in;
2104 
2105     return changed;
2106 }
2107 
SWSBGlobalTokenAnalysis()2108 void SWSB::SWSBGlobalTokenAnalysis()
2109 {
2110     bool change = true;
2111     while (change)
2112     {
2113         change = false;
2114         for (G4_BB* bb : fg)
2115         {
2116             if (globalTokenReachAnalysis(bb))
2117             {
2118                 change = true;
2119             }
2120         }
2121     }
2122 }
2123 
SWSBGlobalScalarCFGReachAnalysis()2124 void SWSB::SWSBGlobalScalarCFGReachAnalysis()
2125 {
2126     bool change = true;
2127     while (change)
2128     {
2129         change = false;
2130         for (G4_BB* bb : fg)
2131         {
2132             if (globalDependenceDefReachAnalysis(bb))
2133             {
2134                 change = true;
2135             }
2136         }
2137     }
2138 }
2139 
SWSBGlobalSIMDCFGReachAnalysis()2140 void SWSB::SWSBGlobalSIMDCFGReachAnalysis()
2141 {
2142     bool change = true;
2143     while (change)
2144     {
2145         change = false;
2146         for (G4_BB* bb : fg)
2147         {
2148             if (globalDependenceUseReachAnalysis(bb))
2149             {
2150                 change = true;
2151             }
2152         }
2153     }
2154 }
2155 
setTopTokenIndex()2156 void SWSB::setTopTokenIndex()
2157 {
2158     int startIndex = topIndex;
2159     if (topIndex == -1)
2160     {
2161         startIndex = 0;
2162     }
2163     for (int i = startIndex; i < (int)totalTokenNum; i++)
2164     {
2165         if (freeTokenList[i] == nullptr)
2166         {
2167             topIndex = i;
2168             return;
2169         }
2170     }
2171     for (int i = 0; i < startIndex; i++)
2172     {
2173         if (freeTokenList[i] == nullptr)
2174         {
2175             topIndex = i;
2176             return;
2177         }
2178     }
2179 
2180     topIndex = -1;
2181 }
2182 
propogateDist(G4_BB * bb)2183 bool SWSB::propogateDist(G4_BB* bb)
2184 {
2185     bool changed = false;
2186     unsigned bbID = bb->getId();
2187 
2188     if (bb->Preds.empty())
2189     {
2190         return false;
2191     }
2192 
2193     assert(BBVector[bbID]->send_live_in.getSize() != 0);
2194 
2195     SBBitSets temp_live_in(globalSendNum);
2196     temp_live_in = BBVector[bbID]->send_live_in;
2197     std::vector<unsigned> tokenLiveInDist;
2198     tokenLiveInDist.resize(globalSendNum);
2199 
2200     for (unsigned i = 0; i < globalSendNum; i++)
2201     {
2202         tokenLiveInDist[i] = BBVector[bbID]->tokenLiveInDist[i];
2203     }
2204 
2205     //Get the live out from all predicator BBs
2206     for (const G4_BB* predBB : bb->Preds)
2207     {
2208         unsigned predID = predBB->getId();
2209 
2210         for (unsigned i = 0; i < globalSendNum; i++)
2211         {
2212             if (BBVector[predID]->send_live_out.isDstSet(i) &&
2213                 BBVector[predID]->tokenLiveOutDist[i] != -1 &&
2214                 BBVector[predID]->tokenLiveOutDist[i] < tokenLiveInDist[i])
2215             {
2216                 tokenLiveInDist[i] = BBVector[predID]->tokenLiveOutDist[i];
2217             }
2218         }
2219     }
2220 
2221     //Update the live in
2222     for (unsigned i = 0; i < globalSendNum; i++)
2223     {
2224         if (tokenLiveInDist[i] != BBVector[bbID]->tokenLiveInDist[i] &&
2225             tokenLiveInDist[i] != -1)
2226         {
2227             changed = true;
2228             BBVector[bbID]->tokenLiveInDist[i] = tokenLiveInDist[i];
2229         }
2230     }
2231 
2232     //Update the live out
2233     if (changed)
2234     {
2235         for (unsigned i = 0; i < globalSendNum; i++)
2236         {
2237             if (BBVector[bbID]->send_live_in.isDstSet(i) &&
2238                 BBVector[bbID]->send_live_out.isDstSet(i) &&
2239                 !BBVector[bbID]->send_may_kill.isDstSet(i))
2240             {
2241                 BBVector[bbID]->tokenLiveOutDist[i] = BBVector[bbID]->tokenLiveInDist[i] + bb->size();
2242             }
2243         }
2244     }
2245 
2246     return changed;
2247 }
2248 
calculateDist()2249 void SWSB::calculateDist()
2250 {
2251 #ifdef DEBUG_VERBOSE_ON
2252     globalSBNodes.resize(globalSendNum);
2253 #endif
2254     //Initial all live out distance
2255     for (SBNode* node : SBSendNodes)
2256     {
2257         if (BBVector[node->getBBID()]->send_live_out.isDstSet(node->globalID))
2258         {
2259             BBVector[node->getBBID()]->tokenLiveOutDist[node->globalID] = BBVector[node->getBBID()]->last_node - node->getNodeID();
2260 #ifdef DEBUG_VERBOSE_ON
2261             globalSBNodes[node->globalID] = node;
2262 #endif
2263         }
2264     }
2265 
2266     bool change = true;
2267     while (change)
2268     {
2269         change = false;
2270         for (G4_BB* bb : fg)
2271         {
2272             if (propogateDist(bb))
2273             {
2274                 change = true;
2275             }
2276         }
2277     }
2278 
2279 #ifdef DEBUG_VERBOSE_ON
2280     for (size_t i = 0; i < BBVector.size(); i++)
2281     {
2282         std::cerr << "BB" << i << ": " << BBVector[i]->first_node << "-" << BBVector[i]->last_node << ", succ<";
2283         for (std::list<G4_BB*>::iterator sit = BBVector[i]->getBB()->Succs.begin(); sit != BBVector[i]->getBB()->Succs.end(); ++sit)
2284         {
2285             std::cerr << (*sit)->getId() << ",";
2286         }
2287         std::cerr << "> pred<";
2288         for (std::list<G4_BB*>::iterator pit = BBVector[i]->getBB()->Preds.begin(); pit != BBVector[i]->getBB()->Preds.end(); ++pit)
2289         {
2290             std::cerr << (*pit)->getId() << ",";
2291         }
2292 
2293         std::cerr << ">\n liveIn:";
2294         for (unsigned k = 0; k < globalSendNum; k++)
2295         {
2296             if (BBVector[i]->tokenLiveInDist[k] != -1)
2297             {
2298                 std::cerr << "  n" << globalSBNodes[k]->getNodeID() << ":" << BBVector[i]->tokenLiveInDist[k];
2299             }
2300         }
2301         std::cerr << "\n liveout:";
2302         for (unsigned k = 0; k < globalSendNum; k++)
2303         {
2304             if (BBVector[i]->tokenLiveOutDist[k] != -1)
2305             {
2306                 std::cerr << "  n" << globalSBNodes[k]->getNodeID() << ":" << BBVector[i]->tokenLiveOutDist[k];
2307             }
2308         }
2309         std::cerr << "\n\n";
2310     }
2311 #endif
2312 
2313 }
2314 
2315 
2316 /* Quick token allocation, allocate the token in round robin.
2317  */
quickTokenAllocation()2318 void SWSB::quickTokenAllocation()
2319 {
2320     uint32_t token = 0;
2321 
2322     //Linear scan
2323     for (SBNode* node : SBSendNodes)
2324     {
2325         if (node->getLastInstruction()->isEOT())
2326         {
2327             continue;
2328         }
2329 
2330         assert(node->getLastInstruction()->getSetToken() == (unsigned short)UNKNOWN_TOKEN);
2331         node->getLastInstruction()->setSetToken(token);
2332         if (token >= totalTokenNum - 1)
2333         {
2334             token = 0;
2335         }
2336         else
2337         {
2338             token ++;
2339         }
2340     }
2341 
2342     assignDepTokens();
2343 }
2344 
2345 /* Linear scan algorithm is used for the token allocation.
2346  * Based on the assumption that instruction scheduling has scheduled the instruction to the best.
2347  * FIXME: instruction scheduling doesn't consider the token pressure issue.
2348  */
tokenAllocation()2349 void SWSB::tokenAllocation()
2350 {
2351     //build live intervals
2352     buildLiveIntervals();
2353 
2354     //Initial free token list
2355     freeTokenList.resize(totalTokenNum);
2356     topIndex = 0;
2357 
2358     tokenProfile.setTokenInstructionCount((int)SBSendNodes.size());
2359     uint32_t AWTokenReuseCount = 0;
2360     uint32_t ARTokenReuseCount = 0;
2361     uint32_t AATokenReuseCount = 0;
2362     uint32_t mathInstCount = 0;
2363     //Linear scan
2364     //Assign tokens to nodes in the order of liveness. Here we only need to
2365     //iterate SB nodes in that order, and don't actually need to sort
2366     //SBSendNodes as it might be referenced through allTokenNodesMap.
2367     auto sortInLivenessOrder = [](const SBNODE_VECT& vec) {
2368         SBNODE_VECT sorted(vec.size());
2369         std::partial_sort_copy(vec.begin(), vec.end(), sorted.begin(), sorted.end(), compareInterval);
2370         return sorted;
2371     };
2372     const bool enableSendTokenReduction = fg.builder->getOptions()->getOption(vISA_EnableSendTokenReduction);
2373     const bool enableDPASTokenReduction = fg.builder->getOptions()->getOption(vISA_EnableDPASTokenReduction);
2374     for (SBNode* node : sortInLivenessOrder(SBSendNodes))
2375     {
2376         unsigned startID = node->getLiveStartID();
2377         G4_INST* inst = node->getLastInstruction();
2378 #ifdef DEBUG_VERBOSE_ON
2379         printf("\n=======nodeID: %d, startID: %d, endID: %d\n", node->getNodeID(), node->getLiveStartID(), node->getLiveEndID());
2380 #endif
2381         if (inst->isEOT())
2382         {
2383             continue;
2384         }
2385 
2386         if (enableSendTokenReduction && node->succs.size() == 0)
2387         {
2388             continue;
2389         }
2390 
2391         if (enableDPASTokenReduction)
2392         {
2393             //If there is no instruction depends on a DPAS instruction, no SBID
2394             if (inst->isDpas() && node->succs.size() == 0)
2395             {
2396                 continue;
2397             }
2398         }
2399 
2400         if (inst->isMathPipeInst())
2401         {
2402             mathInstCount++;
2403         }
2404 
2405         expireIntervals(startID);
2406 
2407         unsigned short assignedToken = node->getLastInstruction()->getSetToken();
2408         //If token reuse happened, and the live range of old node is longer than current one,
2409         //we will keep the old one in the active list.
2410         assignToken(node, assignedToken,
2411             AWTokenReuseCount,
2412             ARTokenReuseCount,
2413             AATokenReuseCount);
2414 
2415         addToLiveList(node);
2416     }
2417 
2418 #ifdef DEBUG_VERBOSE_ON
2419     dumpTokeAssignResult();
2420 #endif
2421 
2422     if (fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
2423     {
2424         for (G4_BB_SB* sb_bb : BBVector)
2425         {
2426             sb_bb->getLiveOutToken(unsigned(SBSendNodes.size()), &SBNodes);
2427         }
2428 #ifdef DEBUG_VERBOSE_ON
2429         dumpTokenLiveInfo();
2430 #endif
2431         SWSBGlobalTokenAnalysis();
2432 
2433 #ifdef DEBUG_VERBOSE_ON
2434         dumpTokenLiveInfo();
2435 #endif
2436 
2437         unsigned prunedEdgeNum = 0;
2438         unsigned prunedGlobalEdgeNum = 0;
2439         unsigned prunedDiffBBEdgeNum = 0;
2440         unsigned prunedDiffBBSameTokenEdgeNum = 0;
2441         tokenEdgePrune(prunedEdgeNum, prunedGlobalEdgeNum, prunedDiffBBEdgeNum, prunedDiffBBSameTokenEdgeNum);
2442         tokenProfile.setPrunedEdgeNum(prunedEdgeNum);
2443         tokenProfile.setPrunedGlobalEdgeNum(prunedGlobalEdgeNum);
2444         tokenProfile.setPrunedDiffBBEdgeNum(prunedDiffBBEdgeNum);
2445         tokenProfile.setPrunedDiffBBSameTokenEdgeNum(prunedDiffBBSameTokenEdgeNum);
2446     }
2447 
2448     assignDepTokens();
2449 
2450     tokenProfile.setAWTokenReuseCount(AWTokenReuseCount);
2451     tokenProfile.setARTokenReuseCount(ARTokenReuseCount);
2452     tokenProfile.setAATokenReuseCount(AATokenReuseCount);
2453     tokenProfile.setMathInstCount(mathInstCount);
2454 }
2455 
reuseTokenSelectionGlobal(SBNode * node,G4_BB * bb,SBNode * & candidateNode,bool & fromSibling)2456 unsigned short SWSB::reuseTokenSelectionGlobal(SBNode* node, G4_BB* bb, SBNode*& candidateNode, bool& fromSibling)
2457 {
2458     SBBitSets temp_live_in(globalSendNum);
2459     temp_live_in = BBVector[bb->getId()]->send_live_in;
2460     unsigned short reuseToken = (unsigned short)UNKNOWN_TOKEN;
2461     unsigned nodeReuseOverhead = -1;
2462 
2463     tokenReuseCount++;
2464     for (unsigned int i = 0; i < totalTokenNum; i++)
2465     {
2466         unsigned nodeDist = -1;
2467         unsigned tokenReuseOverhead = 0;
2468         SBNode* candidateTokenNode = nullptr;
2469         unsigned short curToken = (unsigned short)UNKNOWN_TOKEN;
2470         bool fromUse = false;
2471 
2472         for (SBNode* liveNode : *reachTokenArray[i])
2473         {
2474             unsigned liveNodeDelay = liveNode->getDepDelay();
2475             unsigned liveNodeOverhead = 0;
2476 
2477             //What about the global send come back to current BB?
2478             //Shouldn't be assigned
2479             if ((liveNode->globalID != -1) &&
2480                 (BBVector[bb->getId()]->tokenLiveInDist[liveNode->globalID] != -1) &&
2481                 (liveNode->getBBID() != bb->getId() || liveNode->getNodeID() > node->getNodeID()) )
2482             {
2483                 nodeDist = BBVector[bb->getId()]->tokenLiveInDist[liveNode->globalID] + (node->getNodeID() - BBVector[bb->getId()]->first_node);
2484             }
2485             else
2486             {
2487                 if (liveNode->getBBID() == bb->getId())
2488                 {
2489                     nodeDist = node->getNodeID() - liveNode->getNodeID();
2490                 }
2491                 else //Not dst live out global, which is not calculated, use the node distance
2492                 {
2493                     nodeDist = node->getNodeID() > liveNode->getNodeID() ? node->getNodeID() - liveNode->getNodeID() : liveNode->getNodeID() - node->getNodeID();
2494                 }
2495             }
2496 
2497             liveNodeOverhead = (liveNodeDelay > nodeDist ? (liveNodeDelay - nodeDist) : 0);
2498             liveNodeOverhead += liveNode->reuseOverhead;
2499 
2500             if ((candidateTokenNode == nullptr) || (liveNodeOverhead > tokenReuseOverhead))
2501             {
2502                 tokenReuseOverhead = liveNodeOverhead;
2503                 candidateTokenNode = liveNode;
2504                 curToken = i;
2505                 fromUse = false;
2506             }
2507         }
2508 
2509         if (fromSibling)
2510         {
2511             for (SBNode* useNode : *reachUseArray[i])
2512             {
2513                 unsigned nodeDelay = node->getDepDelay();
2514                 unsigned nodeOverhead = 0;
2515 
2516                 //What about the global send come back to current BB?
2517                 //Shouldn't be assigned
2518                 if ((node->globalID != -1) &&
2519                     (BBVector[useNode->getBBID()]->tokenLiveInDist[node->globalID] != -1) &&
2520                     (useNode->getBBID() != bb->getId() || useNode->getNodeID() > node->getNodeID()))
2521                 {
2522                     nodeDist = BBVector[useNode->getBBID()]->tokenLiveInDist[node->globalID] + (useNode->getNodeID() - BBVector[useNode->getBBID()]->first_node);
2523                 }
2524                 else
2525                 {
2526                     assert(useNode->getBBID() == bb->getId());
2527                     nodeDist = node->getNodeID() - useNode->getNodeID();
2528                 }
2529 
2530                 nodeOverhead = (nodeDelay > nodeDist ? (nodeDelay - nodeDist) : 0);
2531                 nodeOverhead += node->reuseOverhead;
2532 
2533                 if ((candidateTokenNode == nullptr) || (nodeOverhead > tokenReuseOverhead))
2534                 {
2535                     tokenReuseOverhead = nodeOverhead;
2536                     candidateTokenNode = useNode;
2537                     curToken = i;
2538                     fromUse = true;
2539                 }
2540             }
2541         }
2542 
2543         if (candidateTokenNode && (tokenReuseOverhead < nodeReuseOverhead))
2544         {
2545             nodeReuseOverhead = tokenReuseOverhead;
2546             candidateNode = candidateTokenNode;
2547             reuseToken = curToken;
2548             fromSibling = fromUse;
2549         }
2550     }
2551 
2552     assert(candidateNode != nullptr);
2553     if (!fromSibling)
2554     {
2555         node->reuseOverhead += nodeReuseOverhead;
2556     }
2557 
2558     return reuseToken;
2559 }
2560 
expireLocalIntervals(unsigned startID,unsigned BBID)2561 void SWSB::expireLocalIntervals(unsigned startID, unsigned BBID)
2562 {
2563     for (SBNODE_VECT_ITER it = localTokenUsage.begin(); it != localTokenUsage.end();)
2564     {
2565         SBNode* node = (*it);
2566 
2567         if (node->getLiveEndID() < startID)
2568         {
2569             it = localTokenUsage.erase(it);
2570             BBVector[BBID]->localReachingSends.setDst(node->sendID, false);
2571             continue;
2572         }
2573         it++;
2574     }
2575 }
2576 
assignTokenToPred(SBNode * node,SBNode * pred,G4_BB * bb)2577 void SWSB::assignTokenToPred(SBNode* node, SBNode* pred, G4_BB* bb)
2578 {
2579     unsigned predDist = -1;
2580     SBNode* canidateNode = nullptr;
2581 
2582     assert(pred->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN);
2583 
2584     for (auto node_it = node->preds.begin();
2585         node_it != node->preds.end(); node_it++)
2586     {
2587         SBDEP_ITEM& curPred = (*node_it);
2588         SBNode* otherPred = curPred.node;
2589         DepType type = curPred.type;
2590         unsigned dist = 0;
2591 
2592         if (otherPred == pred)
2593         {
2594             continue;
2595         }
2596 
2597         if (tokenHonourInstruction(otherPred->getLastInstruction()) &&
2598             (otherPred->getLastInstruction()->getSetToken() == (unsigned short)UNKNOWN_TOKEN) &&
2599             (type == RAW || type == WAW || otherPred->getLastInstruction()->getDst() == nullptr))
2600         {
2601             if ((!otherPred->reachingSends.isDstSet(pred->sendID)) &&
2602                 (!pred->reachingSends.isDstSet(otherPred->sendID)))
2603             {
2604                 if (otherPred->globalID != -1 &&
2605                     BBVector[node->getBBID()]->tokenLiveInDist[otherPred->globalID] != -1)
2606                 {
2607                     dist = BBVector[node->getBBID()]->tokenLiveInDist[otherPred->globalID] + (node->getNodeID() - BBVector[node->getBBID()]->first_node);
2608                 }
2609                 else
2610                 {
2611                     assert(otherPred->getBBID() == bb->getId());
2612                     dist = node->getNodeID() - otherPred->getNodeID();
2613                 }
2614                 if (dist < predDist)
2615                 {
2616                     canidateNode = otherPred;
2617                     predDist = dist;
2618                 }
2619             }
2620         }
2621     }
2622 
2623     if (canidateNode != nullptr)
2624     {
2625         canidateNode->getLastInstruction()->setSetToken(pred->getLastInstruction()->getSetToken());
2626 #ifdef DEBUG_VERBOSE_ON
2627         printf("Node: %d, PRED assign: %d, token: %d\n", node->getNodeID(), canidateNode->getNodeID(), canidateNode->getLastInstruction()->getSetToken());
2628 #endif
2629     }
2630 }
2631 
assignTokenWithPred(SBNode * node,G4_BB * bb)2632 bool SWSB::assignTokenWithPred(SBNode* node, G4_BB* bb)
2633 {
2634     unsigned predDist = -1;
2635     SBNode* canidateNode = nullptr;
2636     for (auto node_it = node->preds.begin();
2637         node_it != node->preds.end(); node_it++)
2638     {
2639         SBDEP_ITEM& curPred = (*node_it);
2640         SBNode* pred = curPred.node;
2641         DepType type = curPred.type;
2642         unsigned dist = 0;
2643 
2644         if (tokenHonourInstruction(pred->getLastInstruction()) &&
2645             (pred->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN) &&
2646             ((type == RAW) ||(type == WAW) || (pred->getLastInstruction()->getDst() == nullptr)))
2647         {
2648             if ((pred->globalID != -1) &&
2649                 (BBVector[bb->getId()]->tokenLiveInDist[pred->globalID] != -1))
2650             {
2651                 dist = BBVector[bb->getId()]->tokenLiveInDist[pred->globalID] + (node->getNodeID() - BBVector[bb->getId()]->first_node);
2652             }
2653             else
2654             {
2655                 if (fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation))
2656                 {
2657                     if (pred->getBBID() == bb->getId())
2658                     {
2659                         dist = node->getNodeID() - pred->getNodeID();
2660                     }
2661                     else
2662                     {
2663 #ifdef DEBUG_VERBOSE_ON
2664                         printf("Untracked distance: pred: BB%d:%d -- succ: BB%d:%d\n", pred->getBBID(), pred->getNodeID(), node->getBBID(), node->getNodeID());
2665 #endif
2666                         dist = node->getNodeID() - BBVector[bb->getId()]->first_node;
2667                     }
2668                 }
2669                 else
2670                 {
2671                     assert(pred->getBBID() == bb->getId());
2672                     dist = node->getNodeID() - pred->getNodeID();
2673                 }
2674             }
2675             if (dist < predDist)
2676             {
2677                 canidateNode = pred;
2678                 predDist = dist;
2679             }
2680         }
2681     }
2682 
2683     if (canidateNode != nullptr)
2684     {
2685         node->getLastInstruction()->setSetToken(canidateNode->getLastInstruction()->getSetToken());
2686         allTokenNodesMap[canidateNode->getLastInstruction()->getSetToken()].set(node->sendID);
2687 #ifdef DEBUG_VERBOSE_ON
2688         printf("Node: %d, pred reuse assign: %d, token: %d\n", node->getNodeID(), canidateNode->getNodeID(), node->getLastInstruction()->getSetToken());
2689 #endif
2690         return true;
2691     }
2692 
2693     return false;
2694 }
2695 
allocateToken(G4_BB * bb)2696 void SWSB::allocateToken(G4_BB* bb)
2697 {
2698     if ((BBVector[bb->getId()]->first_send_node == -1) ||
2699         BBVector[bb->getId()]->tokenAssigned)
2700     {
2701         return;
2702     }
2703 
2704     BBVector[bb->getId()]->localReachingSends = SBBitSets(SBSendNodes.size());
2705 
2706     assert((BBVector[bb->getId()]->last_send_node != -1) &&
2707         (BBVector[bb->getId()]->first_send_node <= BBVector[bb->getId()]->last_send_node));
2708 
2709     SBBitSets send_live(SBSendNodes.size());
2710     SBBitSets send_use(SBSendUses.size());
2711 
2712     for (int i = BBVector[bb->getId()]->first_send_node; i <= BBVector[bb->getId()]->last_send_node; i++)
2713     {
2714         SBNode* node = SBSendNodes[i];
2715 
2716         if (node->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
2717         {
2718             continue;
2719         }
2720 
2721         if (node->getLastInstruction()->isDpas() && node->succs.size() == 0 &&
2722             fg.builder->getOptions()->getOption(vISA_EnableDPASTokenReduction))
2723         {
2724             continue;
2725         }
2726 
2727         send_live = node->reachingSends; //The tokens will reach current node
2728 
2729         for (unsigned k = 0; k < totalTokenNum; k++)
2730         {
2731             reachTokenArray[k]->clear();
2732             reachUseArray[k]->clear();
2733         }
2734 
2735         for (size_t k = 0; k < SBSendNodes.size(); k++)
2736         {
2737             SBNode* liveNode = SBSendNodes[k];
2738             if ((liveNode->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN) &&
2739                 (send_live.isDstSet(k) ||
2740                 (send_live.isSrcSet(k) &&
2741                  isPrefetch(liveNode->getLastInstruction()))))
2742             {
2743                 reachTokenArray[liveNode->getLastInstruction()->getSetToken()]->push_back(liveNode);
2744             }
2745         }
2746 
2747         if (!fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation) && (node->reachedUses.getSize() != 0))
2748         {
2749             send_use = node->reachedUses;    //The uses of other sends can be reached by current node.
2750             for (size_t k = 0; k < SBSendUses.size(); k++)
2751             {
2752                 SBNode* liveNode = SBSendUses[k];
2753                 if (send_use.isDstSet(k))
2754                 {
2755                     for (size_t m = 0; m < liveNode->preds.size(); m++)
2756                     {
2757                         SBDEP_ITEM& curPred = liveNode->preds[m];
2758                         SBNode* pred = curPred.node;
2759                         if (pred->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
2760                         {
2761                             reachUseArray[pred->getLastInstruction()->getSetToken()]->push_back(liveNode);
2762                         }
2763                     }
2764                 }
2765             }
2766         }
2767 
2768         if (!assignTokenWithPred(node, bb))
2769         {
2770             bool assigned = false;
2771 
2772             //Assigned with coalescing
2773             if (!fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation) && (node->reachedUses.getSize() != 0))
2774             {
2775                 for (size_t i = 0; i < node->succs.size(); i++)
2776                 {
2777                     SBDEP_ITEM& curSucc = node->succs[i];
2778 
2779                     if (!curSucc.exclusiveNodes.size())
2780                     {
2781                         continue;
2782                     }
2783 
2784                     for (size_t j = 0; j < curSucc.exclusiveNodes.size(); j++)
2785                     {
2786                         SBNode* exclusiveNode = curSucc.exclusiveNodes[j];
2787                         unsigned short exToken = exclusiveNode->getLastInstruction()->getSetToken();
2788                         if (exToken != (unsigned short)UNKNOWN_TOKEN)
2789                         {
2790                             if (reachTokenArray[exToken]->size() == 0 &&
2791                                 reachUseArray[exToken]->size() == 0)
2792                             {
2793                                 node->getLastInstruction()->setSetToken(exToken);
2794                                 allTokenNodesMap[exToken].set(node->sendID);
2795 #ifdef DEBUG_VERBOSE_ON
2796                                 printf("node: %d :: Use exclusive token: %d\n", node->getNodeID(), exToken);
2797 #endif
2798                                 assigned = true;
2799                                 break;
2800                             }
2801                         }
2802                     }
2803                 }
2804             }
2805 
2806             if (!assigned)
2807             {
2808                 //Assigned with first free token
2809                 for (unsigned k = 0; k < totalTokenNum; k++)
2810                 {
2811                     if ((reachTokenArray[k]->size() == 0) &&
2812                         (reachUseArray[k]->size() == 0))
2813                     {
2814                         node->getLastInstruction()->setSetToken(k);
2815                         allTokenNodesMap[k].set(node->sendID);
2816                         assigned = true;
2817 #ifdef DEBUG_VERBOSE_ON
2818                         printf("node: %d :: Use free token: %d\n", node->getNodeID(), k);
2819 #endif
2820                         break;
2821                     }
2822                 }
2823             }
2824 
2825             //All tokens are assigned
2826             if (!assigned)
2827             {
2828                 SBNode* reuseNode = nullptr;
2829                 bool reuseSibling = !fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation) && (node->reachedUses.getSize() != 0);
2830                 unsigned short reuseToken = reuseTokenSelectionGlobal(node, bb, reuseNode, reuseSibling);
2831 
2832 #ifdef DEBUG_VERBOSE_ON
2833                 if (!reuseSibling)
2834                 {
2835                     printf("node: %d :: Reuse token: %d, from node: %d\n", node->getNodeID(), reuseToken, reuseNode->getNodeID());
2836                 }
2837                 else
2838                 {
2839                     printf("node: %d :: Reuse token: %d, from use node: %d\n", node->getNodeID(), reuseToken, reuseNode->getNodeID());
2840                 }
2841 #endif
2842 
2843                 node->getLastInstruction()->setSetToken(reuseToken);
2844                 allTokenNodesMap[reuseToken].set(node->sendID);
2845             }
2846         }
2847     }
2848 }
2849 
tokenAllocationBB(G4_BB * bb)2850 void SWSB::tokenAllocationBB(G4_BB* bb)
2851 {
2852     //Token allocation
2853     allocateToken(bb);
2854     BBVector[bb->getId()]->tokenAssigned = true;
2855 
2856     //Deep first allocation.
2857     for (const G4_BB_SB *succ : BBVector[bb->getId()]->domSuccs)
2858     {
2859         if (!succ->tokenAssigned)
2860         {
2861             tokenAllocationBB(succ->getBB());
2862         }
2863     }
2864 }
2865 
tokenAllocationWithDistPropogationPerBB(G4_BB * bb)2866 void SWSB::tokenAllocationWithDistPropogationPerBB(G4_BB * bb)
2867 {
2868     propogateDist(bb);
2869     allocateToken(bb);
2870     BBVector[bb->getId()]->tokenAssigned = true;
2871 
2872     for (const G4_BB_SB *succ : BBVector[bb->getId()]->domSuccs)
2873     {
2874         if (!succ->tokenAssigned)
2875         {
2876             tokenAllocationWithDistPropogationPerBB(succ->getBB());
2877         }
2878     }
2879 }
2880 
tokenAllocationWithDistPropogation()2881 void SWSB::tokenAllocationWithDistPropogation()
2882 {
2883 #ifdef DEBUG_VERBOSE_ON
2884     globalSBNodes.resize(globalSendNum);
2885 #endif
2886     //Initial all live out distance
2887     for (const SBNode* node : SBSendNodes)
2888     {
2889         if (BBVector[node->getBBID()]->send_live_out.isDstSet(node->globalID))
2890         {
2891             BBVector[node->getBBID()]->tokenLiveOutDist[node->globalID] = BBVector[node->getBBID()]->last_node - node->getNodeID();
2892 #ifdef DEBUG_VERBOSE_ON
2893             globalSBNodes[node->globalID] = node;
2894 #endif
2895         }
2896     }
2897 
2898     tokenAllocationWithDistPropogationPerBB(*fg.begin());
2899 
2900 #ifdef DEBUG_VERBOSE_ON
2901     for (size_t i = 0; i < BBVector.size(); i++)
2902     {
2903         const G4_BB_SB *bb = BBVector[i];
2904         std::cerr << "BB" << i << ": " << bb->first_node << "-" << bb->last_node << ", succ<";
2905         for (const G4_BB* succ : bb->getBB()->Succs)
2906         {
2907             std::cerr << succ->getId() << ",";
2908         }
2909         std::cerr << "> pred<";
2910         for (const G4_BB* pred : bb->getBB()->Preds)
2911         {
2912             std::cerr << pred->getId() << ",";
2913         }
2914 
2915         std::cerr << ">\n liveIn:";
2916         for (unsigned k = 0; k < globalSendNum; k++)
2917         {
2918             if (bb->tokenLiveInDist[k] != -1)
2919             {
2920                 std::cerr << "  n" << globalSBNodes[k]->getNodeID() << ":" << bb->tokenLiveInDist[k];
2921             }
2922         }
2923         std::cerr << "\n liveout:";
2924         for (unsigned k = 0; k < globalSendNum; k++)
2925         {
2926             if (bb->tokenLiveOutDist[k] != -1)
2927             {
2928                 std::cerr << "  n" << globalSBNodes[k]->getNodeID() << ":" << bb->tokenLiveOutDist[k];
2929             }
2930         }
2931         std::cerr << "\n\n";
2932     }
2933 #endif
2934 
2935 }
2936 
buildExclusiveForCoalescing()2937 void SWSB::buildExclusiveForCoalescing()
2938 {
2939     for (SBNode* node : SBSendNodes)
2940     {
2941         G4_INST* inst = node->getLastInstruction();
2942 
2943         if (inst->isEOT())
2944         {
2945             continue;
2946         }
2947 
2948         //If current one is a node with local live range, reuse cannot happen, because other nodes definitely can reach it.
2949         if (node->globalID == -1)
2950         {
2951             continue;
2952         }
2953 
2954         SBBitSets send_live(SBSendNodes.size());
2955 
2956         for (SBDEP_ITEM& curSucc : node->succs)
2957         {
2958             SBNode* succ = curSucc.node;
2959             DepType type = curSucc.type;
2960             if (((type == RAW) || (type == WAW)) && (succ->reachingSends.getSize() != 0))
2961             {
2962                 send_live = succ->reachingSends;
2963                 //FIXME, the complexity may be a little big high, n*n*succSize
2964                 for (size_t k = 0; k < SBSendNodes.size(); k++)
2965                 {
2966                     SBNode* liveNode = SBSendNodes[k];
2967                     if (send_live.isDstSet(k) &&
2968                         (liveNode != node) &&
2969                         (!(liveNode->reachingSends.isDstSet(node->sendID) ||
2970                             node->reachingSends.isDstSet(liveNode->sendID)) ||
2971                             tokenHonourInstruction(succ->GetInstruction())))
2972                         //If the use is token honour instruction and be assigned with same token as pred,
2973                         //it will cause dependence any way, cannot be removed.
2974                         //FIXME: But one send can depends on multiple previous send.
2975                         //Only the one set to the send will cause non-removable dependence.
2976                     {
2977                         addReachingUseSet(liveNode, succ);
2978                     }
2979                 }
2980             }
2981 
2982             if ((succ->preds.size() <= 1) ||( curSucc.exclusiveNodes.size()))
2983             {
2984                 continue;
2985             }
2986 
2987             if (!((succ->getBBID() == node->getBBID() && succ->getNodeID() > node->getNodeID()) ||
2988                 (succ->getBBID() != node->getBBID())))
2989             {
2990                 continue;
2991             }
2992 
2993             for (const SBDEP_ITEM& curPred : succ->preds)
2994             {
2995                 DepType type = curPred.type;
2996                 SBNode* pred = curPred.node;
2997 
2998                 if (pred == node)
2999                 {
3000                     continue;
3001                 }
3002 
3003                 if (type == WAW || type == RAW)
3004                 {
3005                     if (!((succ->getBBID() == pred->getBBID() && succ->getNodeID() > pred->getNodeID()) ||
3006                         (succ->getBBID() != pred->getBBID())))
3007                     {
3008                         continue;
3009                     }
3010 
3011                     curSucc.exclusiveNodes.push_back(pred);
3012                 }
3013             }
3014         }
3015     }
3016 
3017     return;
3018 }
3019 
tokenAllocationGlobalWithPropogation()3020 void SWSB::tokenAllocationGlobalWithPropogation()
3021 {
3022 #ifdef DEBUG_VERBOSE_ON
3023     dumpDepInfo();
3024 #endif
3025 
3026     buildExclusiveForCoalescing();
3027 
3028     reachTokenArray.resize(totalTokenNum);
3029     reachUseArray.resize(totalTokenNum);
3030 
3031     for (int bucket_i = 0; bucket_i != (int)totalTokenNum; ++bucket_i)
3032     {
3033         void* allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3034         reachTokenArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3035 
3036         allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3037         reachUseArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3038     }
3039 
3040     tokenAllocationWithDistPropogation();
3041 
3042     if (fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
3043     {
3044         for (G4_BB_SB *bb : BBVector)
3045         {
3046             bb->getLiveOutToken(unsigned(SBSendNodes.size()), &SBNodes);
3047         }
3048 #ifdef DEBUG_VERBOSE_ON
3049         dumpTokenLiveInfo();
3050 #endif
3051 
3052         SWSBGlobalTokenAnalysis();
3053 
3054 #ifdef DEBUG_VERBOSE_ON
3055         dumpTokenLiveInfo();
3056 #endif
3057 
3058 
3059         unsigned prunedEdgeNum = 0;
3060         unsigned prunedGlobalEdgeNum = 0;
3061         unsigned prunedDiffBBEdgeNum = 0;
3062         unsigned prunedDiffBBSameTokenEdgeNum = 0;
3063         tokenEdgePrune(prunedEdgeNum, prunedGlobalEdgeNum, prunedDiffBBEdgeNum, prunedDiffBBSameTokenEdgeNum);
3064         tokenProfile.setPrunedEdgeNum(prunedEdgeNum);
3065         tokenProfile.setPrunedGlobalEdgeNum(prunedGlobalEdgeNum);
3066         tokenProfile.setPrunedDiffBBEdgeNum(prunedDiffBBEdgeNum);
3067         tokenProfile.setPrunedDiffBBSameTokenEdgeNum(prunedDiffBBSameTokenEdgeNum);
3068     }
3069 
3070     assignDepTokens();
3071 }
3072 
tokenAllocationGlobal()3073 void SWSB::tokenAllocationGlobal()
3074 {
3075     G4_BB* bb = *fg.begin();
3076 
3077 #ifdef DEBUG_VERBOSE_ON
3078     dumpDepInfo();
3079 #endif
3080 
3081     calculateDist();
3082 
3083     buildExclusiveForCoalescing();
3084 
3085     reachTokenArray.resize(totalTokenNum);
3086     reachUseArray.resize(totalTokenNum);
3087 
3088     for (int bucket_i = 0; bucket_i != (int)totalTokenNum; ++bucket_i)
3089     {
3090         void* allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3091         reachTokenArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3092 
3093         allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3094         reachUseArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3095     }
3096 
3097     tokenAllocationBB(bb);
3098 
3099     if (fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
3100     {
3101         for (G4_BB_SB *bb : BBVector)
3102         {
3103             bb->getLiveOutToken(unsigned(SBSendNodes.size()), &SBNodes);
3104         }
3105 #ifdef DEBUG_VERBOSE_ON
3106         dumpTokenLiveInfo();
3107 #endif
3108 
3109         SWSBGlobalTokenAnalysis();
3110 
3111 #ifdef DEBUG_VERBOSE_ON
3112         dumpTokenLiveInfo();
3113 #endif
3114 
3115 
3116         unsigned prunedEdgeNum = 0;
3117         unsigned prunedGlobalEdgeNum = 0;
3118         unsigned prunedDiffBBEdgeNum = 0;
3119         unsigned prunedDiffBBSameTokenEdgeNum = 0;
3120         tokenEdgePrune(prunedEdgeNum, prunedGlobalEdgeNum, prunedDiffBBEdgeNum, prunedDiffBBSameTokenEdgeNum);
3121         tokenProfile.setPrunedEdgeNum(prunedEdgeNum);
3122         tokenProfile.setPrunedGlobalEdgeNum(prunedGlobalEdgeNum);
3123         tokenProfile.setPrunedDiffBBEdgeNum(prunedDiffBBEdgeNum);
3124         tokenProfile.setPrunedDiffBBSameTokenEdgeNum(prunedDiffBBSameTokenEdgeNum);
3125     }
3126 
3127     assignDepTokens();
3128 }
3129 
insertSyncInstruction(G4_BB * bb,INST_LIST_ITER nextIter,int CISAOff,int lineNo)3130 G4_INST* SWSB::insertSyncInstruction(G4_BB* bb, INST_LIST_ITER nextIter, int CISAOff, int lineNo)
3131 {
3132     G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3133     G4_INST* syncInst = fg.builder->createSync(G4_sync_nop, src0);
3134     bb->insertBefore(nextIter, syncInst);
3135     syncInstCount++;
3136 
3137     return syncInst;
3138 }
3139 
insertSyncInstructionAfter(G4_BB * bb,INST_LIST_ITER iter,int CISAOff,int lineNo)3140 G4_INST* SWSB::insertSyncInstructionAfter(G4_BB* bb, INST_LIST_ITER iter, int CISAOff, int lineNo)
3141 {
3142     INST_LIST_ITER nextIter = iter;
3143     nextIter++;
3144     G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3145     G4_INST* syncInst = fg.builder->createSync(G4_sync_nop, src0);
3146     bb->insertBefore(nextIter, syncInst);
3147     syncInstCount++;
3148 
3149     return syncInst;
3150 }
3151 
insertTestInstruction(G4_BB * bb,INST_LIST_ITER nextIter,int CISAOff,int lineNo,bool countSync)3152 G4_INST* SWSB::insertTestInstruction(G4_BB* bb, INST_LIST_ITER nextIter, int CISAOff, int lineNo, bool countSync)
3153 {
3154     G4_INST* nopInst = fg.builder->createNop(InstOpt_NoOpt);
3155     bb->insertBefore(nextIter, nopInst);
3156     if (countSync)
3157     {
3158         syncInstCount++;
3159     }
3160 
3161     return nopInst;
3162 }
3163 
insertSyncAllRDInstruction(G4_BB * bb,unsigned int SBIDs,INST_LIST_ITER nextIter,int CISAOff,int lineNo)3164 G4_INST* SWSB::insertSyncAllRDInstruction(G4_BB* bb, unsigned int SBIDs, INST_LIST_ITER nextIter, int CISAOff, int lineNo)
3165 {
3166     G4_INST* syncInst;
3167     if (SBIDs)
3168     {
3169         G4_Imm* src0 = fg.builder->createImm(SBIDs, Type_UD);
3170         syncInst = fg.builder->createSync(G4_sync_allrd, src0);
3171         ARSyncInstCount++;
3172     }
3173     else
3174     {
3175         G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3176         syncInst = fg.builder->createSync(G4_sync_allrd, src0);
3177         ARSyncAllCount++;
3178     }
3179     bb->insertBefore(nextIter, syncInst);
3180 
3181     return syncInst;
3182 }
3183 
insertSyncAllWRInstruction(G4_BB * bb,unsigned int SBIDs,INST_LIST_ITER nextIter,int CISAOff,int lineNo)3184 G4_INST* SWSB::insertSyncAllWRInstruction(G4_BB* bb, unsigned int SBIDs, INST_LIST_ITER nextIter, int CISAOff, int lineNo)
3185 {
3186     G4_INST* syncInst;
3187     if (SBIDs)
3188     {
3189         G4_Imm* src0 = fg.builder->createImm(SBIDs, Type_UD);
3190         syncInst = fg.builder->createSync(G4_sync_allwr, src0);
3191         AWSyncInstCount++;
3192     }
3193     else
3194     {
3195         G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3196         syncInst = fg.builder->createSync(G4_sync_allwr, src0);
3197         AWSyncAllCount++;
3198     }
3199     bb->insertBefore(nextIter, syncInst);
3200 
3201     return syncInst;
3202 }
3203 
insertSyncToken(G4_BB * bb,SBNode * node,G4_INST * inst,INST_LIST_ITER inst_it,int newInstID,BitSet * dstTokens,BitSet * srcTokens,bool & keepDst,bool removeAllToken)3204 bool SWSB::insertSyncToken(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens, bool& keepDst, bool removeAllToken)
3205 {
3206     //Non-test instruction can only have
3207     // 1. non-send: one Dst Token with distance, or
3208     // 2. send: distance only, or
3209     // 2. one Dst token, or
3210     // 3. one Src token
3211     unsigned short dst = 0;
3212     unsigned short src = 0;
3213     std::vector<std::pair<unsigned short, unsigned>> dst_loc;
3214     std::vector<std::pair<unsigned short, unsigned>> src_loc;
3215 
3216     bool multipleDst = false;
3217     bool multipleSrc = false;
3218     unsigned short token = (unsigned short)-1;
3219     unsigned short dstToken = (unsigned short)-1;
3220     unsigned short srcToken = (unsigned short)-1;
3221     SWSBTokenType type = G4_INST::SWSBTokenType::TOKEN_NONE;
3222     bool insertedSync = false;
3223 
3224     for (unsigned int i = 0; i < node->getDepTokenNum();)
3225     {
3226         G4_INST* synAllInst = nullptr;
3227         token = node->getDepToken(i, type);
3228         unsigned depNodeID = node->getDepTokenNodeID(i);
3229         unsigned short bitToken = (unsigned short)(1 << token);
3230         assert(token != (unsigned short)UNKNOWN_TOKEN);
3231 
3232         switch (type)
3233         {
3234         case SWSBTokenType::AFTER_WRITE:
3235         case SWSBTokenType::AFTER_READ:
3236         {
3237             if (dstTokens->isSet(token) || (type == SWSBTokenType::AFTER_READ && srcTokens->isSet(token)))
3238             {
3239                 //Do BB level clean up
3240                 //So that there will be no case like following redundant sync
3241                 //     sync.nop {$1.src}
3242                 //     sync.nop {$1.src}
3243                 // or
3244                 //     sync.nop {$1.dst}
3245                 //     sync.nop {$1.src}
3246                 // or
3247                 //     mov        {$1.dst}
3248                 //     add        {$1.src}
3249                 node->eraseDepToken(i);
3250                 continue;
3251             }
3252             else
3253             {
3254                 if (!tokenHonourInstruction(inst) &&          //For send and math, no dependent token
3255                     !removeAllToken &&
3256                     !keepDst &&                                //No one kept yet
3257                     (!inst->getDistance() ||  //Only Dst can be kept
3258                         type == SWSBTokenType::AFTER_WRITE)) //Or there is no distance dependence
3259                         //FIXME: for tokenhonour instruction, we didn't support memdst only or memsrc only modes.
3260                         //       To support these two modes, the pre-condition is that current instruction has no SBID.
3261                 {
3262                     //Token is kept in original instruction
3263                     keepDst = true;
3264                     inst->setToken(token);
3265                     inst->setTokenType(type);
3266                     inst->setTokenLoc(token, depNodeID);
3267                     token = (unsigned short)UNKNOWN_TOKEN;
3268                     i++;
3269                     continue;
3270                 }
3271 
3272                 if (type == SWSBTokenType::AFTER_READ)
3273                 {
3274                     src |= bitToken;
3275                     src_loc.push_back(std::make_pair(token, depNodeID));
3276                     if (!multipleSrc && (src & ~bitToken))
3277                     {
3278                         multipleSrc = true;
3279                     }
3280                     srcToken = token;
3281                     srcTokens->set(token, true);
3282                 }
3283                 else
3284                 {
3285                     assert(type == SWSBTokenType::AFTER_WRITE);
3286                     dst |= bitToken;
3287                     dst_loc.push_back(std::make_pair(token, depNodeID));
3288                     if (!multipleDst && (dst & ~bitToken))
3289                     {
3290                         multipleDst = true;
3291                     }
3292                     dstToken = token;
3293                     dstTokens->set(token, true);
3294                 }
3295 
3296                 node->eraseDepToken(i);
3297                 continue;
3298             }
3299         }
3300         break;
3301         case SWSBTokenType::READ_ALL:
3302         {
3303             assert(token == (unsigned short)UNKNOWN_TOKEN);
3304             node->eraseDepToken(i);
3305             synAllInst = insertSyncAllRDInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3306             synAllInst->setLexicalId(newInstID);
3307             i++;
3308             continue;
3309         }
3310         break;
3311         case SWSBTokenType::WRITE_ALL:
3312         {
3313             assert(token == (unsigned short)UNKNOWN_TOKEN);
3314             node->eraseDepToken(i);
3315             synAllInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3316             synAllInst->setLexicalId(newInstID);
3317             i++;
3318             continue;
3319         }
3320         break;
3321         default:
3322             assert(0);
3323             break;
3324         }
3325         i++;
3326     }
3327 
3328     G4_INST* synInst;
3329     if (dst)
3330     {
3331         if (dst == 0xFFFF)
3332         {
3333             synInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3334         }
3335         else if (multipleDst)
3336         {
3337             synInst = insertSyncAllWRInstruction(bb, dst, inst_it, inst->getCISAOff(), inst->getLineNo());
3338         }
3339         else
3340         {
3341             synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3342             synInst->setToken(dstToken);
3343             synInst->setTokenType(SWSBTokenType::AFTER_WRITE);
3344         }
3345         synInst->setLexicalId(newInstID);
3346         insertedSync = true;
3347         for (auto loc:dst_loc)
3348         {
3349             synInst->setTokenLoc(loc.first, loc.second);
3350         }
3351     }
3352 
3353     if (src)
3354     {
3355         if (src == 0xFFFF)
3356         {
3357             synInst = insertSyncAllRDInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3358         }
3359         else if (multipleSrc)
3360         {
3361             synInst = insertSyncAllRDInstruction(bb, src, inst_it, inst->getCISAOff(), inst->getLineNo());
3362         }
3363         else
3364         {
3365             synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3366             synInst->setToken(srcToken);
3367             synInst->setTokenType(SWSBTokenType::AFTER_READ);
3368         }
3369         synInst->setLexicalId(newInstID);
3370         insertedSync = true;
3371         for (auto loc:src_loc)
3372         {
3373             synInst->setTokenLoc(loc.first, loc.second);
3374         }
3375     }
3376 
3377     return insertedSync;
3378 }
3379 
3380 /*
3381  *  For Xe, sync can be used for distance and token at the same time.
3382  *  The encoding limitations for instruction attached dependence info
 *  a. has to be attached to the instruction
3384  *      1. memSet
3385  *  b. Others
3386  *      1. Only regDst can be used when there is memSet for DPAS/math
3387  *      2. Only regDstAll can be used when there is memSet for send
3388  *      3. Only regDist can be used when there is mem.dst for ALU instructions
3389  *  c. To be consistent with the previous version (TGLLP)
3390  *      1. Tried to attach the distance with the original instructions.
3391  *      2. The only exception is the memSet for out-of-order instructions
3392  *
3393  *   SWSB format - non DPAS/send/math (in-order)
3394  *   7    6    5    4    3    2    1    0
3395  *   0    0    0    0    0    0    0    0
3396  *   0    0    0    0    0    regDist
3397  *   0    0    0    0    1    regDistAll
3398  *   0    0    0    1    0    regDistFloat
3399  *   0    0    0    1    1    regDistInt
3400  *   0    0    1    0    memSBid dst
3401  *   0    0    1    1    memSBid src
3402  *   0    1    0    0    R    R    R    R
3403  *   0    1    0    1    R    regDistLong
3404  *   0    1    1    R    R    R    R    R
3405  *   1    regDist           memSBid dst
3406  *
3407  *   SWSB format - DPAS/math (out-of-order)
3408  *   0    0    0    0    0    0    0    0
3409  *   0    0    0    0    0    regDist
3410  *   0    0    0    0    1    regDistAll
3411  *   0    0    0    1    0    regDistFloat
3412  *   0    0    0    1    1    regDistInt
3413  *   0    0    1    0    memSBid dst
3414  *   0    0    1    1    memSBid src
3415  *   0    1    0    0    memSBid set
3416  *   0    1    0    1    R    regDistLong
3417  *   0    1    1    R    R    R    R    R
3418  *   1    regDist           memSBid set
3419  *
3420  *   SWSB format -send (out-of-order)
3421  *   0    0    0    0    0    0    0    0
3422  *   0    0    0    0    0    regDist
3423  *   0    0    0    0    1    regDistAll
3424  *   0    0    0    1    0    regDistFloat
3425  *   0    0    0    1    1    regDistInt
3426  *   0    0    1    0    memSBid dst
3427  *   0    0    1    1    memSBid src
3428  *   0    1    0    0    memSBid set
3429  *   0    1    0    1    R    regDistLong
3430  *   0    1    1    R    R    R    R    R
3431  *   1    regDistAll           memSBid set
3432  */
insertSyncXe(G4_BB * bb,SBNode * node,G4_INST * inst,INST_LIST_ITER inst_it,int newInstID,BitSet * dstTokens,BitSet * srcTokens)3433 bool SWSB::insertSyncXe(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens)
3434 {
3435     G4_INST::DistanceType distType = node->GetInstruction()->getDistanceTypeXe();
3436     bool insertedSync = false;
3437     bool keepDst = false;
3438     bool isCloseALUType = node->GetInstruction()->isClosestALUType();
3439 
3440     if (tokenHonourInstruction(inst))
3441     {
3442         //regDist $.set
3443         if (inst->isDpas())
3444         {
3445             if (distType != G4_INST::DistanceType::DIST_NONE &&
3446                 distType != G4_INST::DistanceType::DIST)
3447             {
3448                 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3449                 synInst->setDistance(inst->getDistance());
3450                 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3451                 inst->setDistance(0);
3452                 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3453                 insertedSync = true;
3454             }
3455         }
3456 
3457         if (inst->isMathPipeInst())
3458         {
3459             if (isCloseALUType && distType != G4_INST::DistanceType::DIST_NONE)
3460             {
3461                 node->GetInstruction()->setDistanceTypeXe(G4_INST::DistanceType::DIST);
3462                 distType = G4_INST::DistanceType::DIST;
3463             }
3464             if (distType != G4_INST::DistanceType::DIST_NONE &&
3465                 distType != G4_INST::DistanceType::DIST)
3466             {
3467                 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3468                 synInst->setDistance(inst->getDistance());
3469                 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3470                 inst->setDistance(0);
3471                 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3472                 insertedSync = true;
3473             }
3474         }
3475 
3476         // regDistAll $.set
3477         if (inst->isSend())
3478         {
3479             if (isCloseALUType && distType != G4_INST::DistanceType::DIST_NONE && (*inst_it) == inst)
3480             {
3481                 node->GetInstruction()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
3482                 distType = G4_INST::DistanceType::DISTALL;
3483             }
3484             if ((distType != G4_INST::DistanceType::DIST_NONE &&
3485                 distType != G4_INST::DistanceType::DISTALL) || ((*inst_it) != inst))
3486             {
3487                 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3488                 synInst->setDistance(inst->getDistance());
3489                 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3490                 inst->setDistance(0);
3491                 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3492                 insertedSync = true;
3493             }
3494         }
3495         //For out-of-order instruction, all dependence token will be moved out to sync
3496         insertedSync |= insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, true);
3497     }
3498     else
3499     {
3500         // regDist $.dst
3501         //For in-order instruction, trying to keep distance in the original instruction
3502         if (distType == G4_INST::DistanceType::DIST ||
3503             distType == G4_INST::DistanceType::DIST_NONE)
3504         {
3505             insertedSync = insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, false);
3506         }
3507         else
3508         {
3509             //Move all token dependence out
3510             insertedSync = insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, true);
3511         }
3512     }
3513 
3514     return insertedSync;
3515 }
3516 
3517 //For dpas/dpasw instructions
3518 //      RegDist         SBID.set
3519 //    RegDist         SBID.src
3520 //    RegDist         SBID.dst
3521 //For send instruction
3522 //    RegDistAll     SBID.set
3523 //    RegDistFloat   SBID.set
3524 //    RegDistInt     SBID.set
3525 //For non-send / non-dpas/dpasw instructions
3526 //    RegDist        SBID.dst
3527 //    RegDist        SBID.src
3528 //    RegDistAll     SBID.dst
insertSyncTokenPVC(G4_BB * bb,SBNode * node,G4_INST * inst,INST_LIST_ITER inst_it,int newInstID,BitSet * dstTokens,BitSet * srcTokens,bool removeAllToken)3529 bool SWSB::insertSyncTokenPVC(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens, bool removeAllToken)
3530 {
3531     //SBID.set > SBID.dst > SBID.src
3532     unsigned int dst = 0;
3533     unsigned int src = 0;
3534     bool keepDst = false;
3535     bool multipleDst = false;
3536     bool multipleSrc = false;
3537     unsigned short token = (unsigned short)-1;
3538     unsigned short dstToken = (unsigned short)-1;
3539     unsigned short srcToken = (unsigned short)-1;
3540     std::vector<std::pair<unsigned short, unsigned>> dst_loc;
3541     std::vector<std::pair<unsigned short, unsigned>> src_loc;
3542     SWSBTokenType type = G4_INST::SWSBTokenType::TOKEN_NONE;
3543     bool insertedSync = false;
3544 
3545     for (unsigned int i = 0; i < node->getDepTokenNum();)
3546     {
3547         token = node->getDepToken(i, type);
3548         unsigned depNodeID = node->getDepTokenNodeID(i);
3549         unsigned int bitToken = (unsigned int)(1 << token);
3550         assert(token != (unsigned short)UNKNOWN_TOKEN);
3551 
3552         switch (type)
3553         {
3554         case SWSBTokenType::AFTER_WRITE:
3555         {
3556             if (dstTokens->isSet(token))
3557             {
3558                 //Do BB level clean up
3559                 //So that there will be no case like following redundant sync
3560                 //     sync.nop {$1.src}
3561                 //     sync.nop {$1.src}
3562                 // or
3563                 //     sync.nop {$1.dst}
3564                 //     sync.nop {$1.src}
3565                 // or
3566                 //     mov        {$1.dst}
3567                 //     add        {$1.src}
3568                 node->eraseDepToken(i);
3569                 continue;
3570             }
3571             else
3572             {
3573                 if (!removeAllToken &&                  //No set one marked.
3574                     !keepDst)                            //No dst one kept yet
3575                 {
3576                     //Token is kept in original instruction
3577                     keepDst = true;
3578                     inst->setToken(token);
3579                     inst->setTokenType(SWSBTokenType::AFTER_WRITE);
3580                     inst->setTokenLoc(token, depNodeID);
3581                     token = (unsigned short)UNKNOWN_TOKEN;
3582                     i++;
3583                     continue;
3584                 }
3585 
3586                 dst |= bitToken;
3587                 dst_loc.push_back(std::make_pair(token, depNodeID));
3588                 if (!multipleDst && (dst & ~bitToken))
3589                 {
3590                     multipleDst = true;
3591                 }
3592                 dstToken = token;
3593                 dstTokens->set(token, true);
3594 
3595                 node->eraseDepToken(i);
3596                 continue;
3597             }
3598         }
3599         break;
3600         default:
3601             assert(type == SWSBTokenType::AFTER_READ && "Wrong dependence type");
3602             break;
3603         }
3604         i++;
3605     }
3606 
3607     bool keepSrc = false;
3608     for (unsigned int i = 0; i < node->getDepTokenNum();)
3609     {
3610         token = node->getDepToken(i, type);
3611         unsigned depNodeID = node->getDepTokenNodeID(i);
3612         unsigned int bitToken = (unsigned int)(1 << token);
3613         assert(token != (unsigned short)UNKNOWN_TOKEN);
3614 
3615         switch (type)
3616         {
3617         case SWSBTokenType::AFTER_READ:
3618         {
3619             if (dstTokens->isSet(token) || (type == SWSBTokenType::AFTER_READ && srcTokens->isSet(token)))
3620             {
3621                 node->eraseDepToken(i);
3622                 continue;
3623             }
3624             else
3625             {
3626                 if (!removeAllToken &&
3627                     !keepDst &&
3628                     !keepSrc)
3629                 {
3630                     //Token is kept in original instruction
3631                     keepSrc = true;
3632                     inst->setToken(token);
3633                     inst->setTokenType(SWSBTokenType::AFTER_READ);
3634                     inst->setTokenLoc(token, depNodeID);
3635                     token = (unsigned short)UNKNOWN_TOKEN;
3636                     i++;
3637                     continue;
3638                 }
3639                 src |= bitToken;
3640                 src_loc.push_back(std::make_pair(token, depNodeID));
3641                 if (!multipleSrc && (src & ~bitToken))
3642                 {
3643                     multipleSrc = true;
3644                 }
3645                 srcToken = token;
3646                 srcTokens->set(token, true);
3647 
3648                 node->eraseDepToken(i);
3649                 continue;
3650             }
3651         }
3652         break;
3653         default:
3654             assert(type == SWSBTokenType::AFTER_WRITE && "Wrong dependence type");
3655             break;
3656         }
3657         i++;
3658     }
3659 
3660     G4_INST* synInst;
3661 
3662     if (dst)
3663     {
3664         if (dst == 0xFFFFFFFF)
3665         {
3666             synInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3667         }
3668         else if (multipleDst)
3669         {
3670             synInst = insertSyncAllWRInstruction(bb, dst, inst_it, inst->getCISAOff(), inst->getLineNo());
3671         }
3672         else
3673         {
3674             synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3675             synInst->setToken(dstToken);
3676             synInst->setTokenType(SWSBTokenType::AFTER_WRITE);
3677         }
3678         synInst->setLexicalId(newInstID);
3679         for (auto loc:dst_loc)
3680         {
3681             synInst->setTokenLoc(loc.first, loc.second);
3682         }
3683         insertedSync = true;
3684     }
3685 
3686     if (src)
3687     {
3688         if (src == 0xFFFFFFFF)
3689         {
3690             synInst = insertSyncAllRDInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3691         }
3692         else if (multipleSrc)
3693         {
3694             synInst = insertSyncAllRDInstruction(bb, src, inst_it, inst->getCISAOff(), inst->getLineNo());
3695         }
3696         else
3697         {
3698             synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3699             synInst->setToken(srcToken);
3700             synInst->setTokenType(SWSBTokenType::AFTER_READ);
3701         }
3702         synInst->setLexicalId(newInstID);
3703         for (auto loc:src_loc)
3704         {
3705             synInst->setTokenLoc(loc.first, loc.second);
3706         }
3707         insertedSync = true;
3708     }
3709 
3710     return insertedSync;
3711 }
3712 
3713 //If depends on multiple different ALU pipelines
3714 //    If all operands type matching the ALU pipelines --> regDist
3715 //    otherwise --> regDistAll
3716 //If depends on single different ALU pipeline and other same ALU pipelines.
3717 //    If all operands type matching the ALU pipelines --> regDist
3718 //    otherwise --> regDistAll
//If depends on multiple same ALU pipelines
//    If all operands type matching the ALU pipeline --> accurate/regDist
//    otherwise--> accurate
//If depends on single ALU pipeline
//    If operands type matching the ALU pipeline --> accurate/regDist
//    otherwise--> accurate
3725 //
3726 //Note that:
3727 // 1. one instruction can have multiple operands.
3728 // 2. instruction belongs to single pipeline
3729 //Combo:
3730 //For dpas/dpasw instructions
3731 //      RegDist         SBID.set
3732 //    RegDist         SBID.src
3733 //    RegDist         SBID.dst
3734 //For send instruction
3735 //    RegDistAll     SBID.set
3736 //    RegDistFloat   SBID.set
3737 //    RegDistInt     SBID.set
3738 //For non-send / non-dpas/dpasw instructions
3739 //    RegDist        SBID.dst
3740 //    RegDist        SBID.src
3741 //    RegDistAll     SBID.dst
insertSyncPVC(G4_BB * bb,SBNode * node,G4_INST * inst,INST_LIST_ITER inst_it,int newInstID,BitSet * dstTokens,BitSet * srcTokens)3742 bool SWSB::insertSyncPVC(G4_BB * bb, SBNode * node, G4_INST * inst, INST_LIST_ITER inst_it, int newInstID, BitSet * dstTokens, BitSet * srcTokens)
3743 {
3744     G4_INST::DistanceType distType = node->GetInstruction()->getDistanceTypeXe();
3745     bool operandTypeIndicated = node->GetInstruction()->isOperandTypeIndicated();
3746     bool insertedSync = false;
3747 
3748     if (tokenHonourInstruction(inst))
3749     {
3750         if (inst->getDistance())
3751         {
3752             //For dpas/dpasw instructions
3753             //      RegDist         SBID.set
3754             //    RegDist         SBID.src
3755             //    RegDist         SBID.dst
3756             if (inst->isDpas() ||
3757                 inst->isMathPipeInst()) //math Will be filtered out by tokenHonourInstruction in PVC
3758             {
3759                 if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN ||
3760                     node->getDepTokenNum())
3761                 {
3762                     if (!operandTypeIndicated)
3763                     {
3764                         G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3765                         synInst->setDistance(inst->getDistance());
3766                         synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3767                         inst->setDistance(0);
3768                         inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3769                         insertedSync = true;
3770                     }
3771                     else if (inst->getDistanceTypeXe() != G4_INST::DistanceType::DIST &&
3772                         inst->getDistanceTypeXe() != G4_INST::DistanceType::DISTALL)
3773                     {
3774                         inst->setDistanceTypeXe(G4_INST::DistanceType::DIST);
3775                     }
3776                 }
3777             }
3778 
3779             //For send instruction
3780             //    RegDistAll     SBID.set
3781             //    RegDistFloat   SBID.set
3782             //    RegDistInt     SBID.set
3783             if (inst->isSend())
3784             {
3785                 if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
3786                 {  //SBID.set > SBID.dst > SBID.src > distance
3787                     if (!(distType == G4_INST::DistanceType::DISTALL ||
3788                         distType == G4_INST::DistanceType::DISTINT ||
3789                         distType == G4_INST::DistanceType::DISTFLOAT) || (inst != (*inst_it)))
3790                     {
3791                         G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3792                         synInst->setDistance(inst->getDistance());
3793                         synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3794                         inst->setDistance(0);
3795                         inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3796                         insertedSync = true;
3797                     }
3798                 }
3799                 else if (node->getDepTokenNum())  //Keep only the SBID deps in the instruction
3800                 {
3801                     G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3802                     synInst->setDistance(inst->getDistance());
3803                     synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3804                     inst->setDistance(0);
3805                     inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3806                     insertedSync = true;
3807                 }
3808             }
3809         }
3810     }
3811     else
3812     {
3813         //For non-send / non-dpas/dpasw instructions
3814         //    RegDist        SBID.dst
3815         //    RegDist        SBID.src
3816         //    RegDistAll     SBID.dst
3817         if (inst->getDistance())
3818         {
3819             if (inst->opcode() == G4_mad && inst->hasNoACCSBSet())
3820             {
3821                 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3822                 synInst->setDistance(inst->getDistance());
3823                 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3824                 inst->setDistance(0);
3825                 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3826                 insertedSync = true;
3827             }
3828             else if (node->getDepTokenNum())  //Keep only the SBID deps in the instruction
3829             {
3830                 if (!operandTypeIndicated && distType != G4_INST::DistanceType::DISTALL)
3831                 {
3832                     G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3833                     synInst->setDistance(inst->getDistance());
3834                     synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3835                     inst->setDistance(0);
3836                     inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3837                     insertedSync = true;
3838                 }
3839 
3840                 if (operandTypeIndicated && distType != G4_INST::DistanceType::DIST && distType != G4_INST::DistanceType::DISTALL)
3841                 {
3842                     inst->setDistanceTypeXe(G4_INST::DistanceType::DIST);
3843                 }
3844 
3845                 if (distType == G4_INST::DistanceType::DISTALL)
3846                 {
3847                     bool hasAfterWrite = false;
3848                     for (int i = 0; i < (int)node->getDepTokenNum(); i++)
3849                     {
3850                         unsigned short token = (unsigned short)-1;
3851                         SWSBTokenType type = SWSBTokenType::TOKEN_NONE;
3852                         token = node->getDepToken(i, type);
3853                         if (type == SWSBTokenType::AFTER_WRITE)
3854                         {
3855                             hasAfterWrite = true;
3856                         }
3857                     }
3858                     if (!hasAfterWrite)
3859                     {
3860                         G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3861                         synInst->setDistance(inst->getDistance());
3862                         synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3863                         inst->setDistance(0);
3864                         inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3865                         insertedSync = true;
3866                     }
3867                 }
3868             }
3869         }
3870     }
3871 
3872     bool removeAllTokenDep = (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN);
3873         removeAllTokenDep = removeAllTokenDep || (inst->opcode() == G4_mad && inst->hasNoACCSBSet());
3874     //For out-of-order instruction, all dependence token will be moved out to sync
3875     insertedSync |= insertSyncTokenPVC(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, removeAllTokenDep);
3876 
3877     return insertedSync;
3878 }
3879 
insertSync(G4_BB * bb,SBNode * node,G4_INST * inst,INST_LIST_ITER inst_it,int newInstID,BitSet * dstTokens,BitSet * srcTokens)3880 void SWSB::insertSync(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens)
3881 {
3882     //The inst after arch register instruction.
3883     bool insertedSync = false;
3884     bool keepDst = false;
3885     INST_LIST_ITER prevIt = inst_it;
3886     if (node->followDistOneAreg())
3887     {
3888         prevIt--;
3889     }
3890 
3891     //Architecture register instruction
3892     bool hasValidNextInst = false;
3893     if (node->hasDistOneAreg())
3894     {
3895         INST_LIST_ITER nextIt = inst_it;
3896         nextIt++;
3897         if (nextIt != bb->end())
3898         {
3899             G4_INST *nextInst = *nextIt;
3900             if (tokenHonourInstruction(nextInst) ||
3901                 distanceHonourInstruction(nextInst))
3902             {
3903                 hasValidNextInst = true;
3904             }
3905         }
3906     }
3907 
3908     if (fg.builder->hasFourALUPipes()) //PVC
3909     {
3910         insertedSync = insertSyncPVC(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens);
3911     }
3912     else if (fg.builder->hasThreeALUPipes()) //XeHP_SDV
3913     {
3914         insertedSync = insertSyncXe(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens);
3915     }
3916     else //TGLLP
3917     {
3918         insertedSync = insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, false);
3919     }
3920 
3921     if (node->followDistOneAreg() && insertedSync)
3922     {
3923         G4_INST* syncInst = insertSyncInstructionAfter(bb, prevIt, inst->getCISAOff(), inst->getLineNo());
3924         syncInst->setDistance(1);
3925         if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
3926         {
3927             syncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
3928         }
3929     }
3930 
3931     if (node->hasDistOneAreg() && !hasValidNextInst)
3932     {
3933         G4_INST* syncInst = insertSyncInstructionAfter(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3934         syncInst->setDistance(1);
3935         if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
3936         {
3937             syncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
3938         }
3939     }
3940 }
3941 
3942 //
3943 // Insert the test instruction according to token assignment result. Re-assign the node id.
3944 // Except the test instruction, one instruction can have at most one token.
3945 // SWSB format - non send
3946 // 7    6    5    4    3    2    1    0
3947 // 0    0    0    0    0    0    0    0    No dependency
3948 // 0    0    0    0    regDist dst                      Reg only dep (1-15)
3949 // 0    0    0    1    R    R    R    R    Reserved
3950 // 0    0    1    0    R    memSBid dst        Memory dst only dep (0-7)
3951 // 0    0    1    1    R    memSBid src         Memory src only dep (0-7)
3952 // 0    1    R    R    R    R    R    R    Reserved for Future extensions
3953 // 1    memSBid dst        regDist dst            Reg and Memory dst dep
3954 //
3955 // SWSB format - send
3956 // 0    0    0    0    0    0    0    0    No dependency
3957 // 0    0    0    0    regDist dst            Reg only dep (1-15)
3958 // 0    0    0    1    R    memSBid set        SBid allocation only (0-7)
3959 // 0    0    1    0    R    memSBid dst        Memory dst only dep (0-7)
3960 // 0    0    1    1    R    memSBid src         Memory src only dep (0-7)
3961 // 0    1    R    R    R    R    R    R    Reserved for Future extensions
3962 // 1    memSBid set        regDist dst            SBid allocation and Reg only dep (1-15)
3963 //
3964 // 8bits [7:0]    8bits [15:8]    4bits [27:24]        1 bit  [29]     1bit [30]    16bits [47:32]
3965 // test = 0x70    SWSB            subOpcode            CmptCtrl = 1 DebugCtrl
3966 //                               0000 - Only SWSB check
3967 //                                 0001 - Check Send status                      2bits x 8 Sbid
3968 //                                                                              00 - SBid not checked
3969 //                                                                              01 - reserved
3970 //                                                                              10 - Check for data sent out
3971 //                                                                               11 - Check for data received
3972 //                              0010 - Check Address Register Dep            1bits x 16 address registers
3973 //                                                                                0 - Not checked
3974 //                                                                               1 - Check for Register dependency
3975 //                                  others - Reserved
3976 //
// Walks every basic block of the flow graph in order, pairing each non-label
// instruction with the next SBNode of SBNodes, and calls insertSync() for it.
// Two special cases are handled before that call:
//   * an atomic send immediately followed by another send (fused sends), and
//   * sends whose destination is a caller-restore declare in stack-call code.
// Along the way the lexical ID of every visited instruction is reassigned from
// a running counter, and at the end the accumulated sync/token counters are
// copied into tokenProfile.
void SWSB::insertTest()
{
    // node_it advances in lock-step with the instruction walk; the assert
    // below checks that node and instruction stay paired.
    SBNODE_VECT_ITER node_it = SBNodes.begin();
    int newInstID = 0;

    for (G4_BB* bb : fg)
    {
        // Per-BB token bit sets handed to insertSync().
        // NOTE(review): presumably they record tokens already synchronized in
        // this BB; the exact meaning is defined by insertSync - confirm there.
        BitSet dstTokens(totalTokenNum, false);
        BitSet srcTokens(totalTokenNum, false);

        // inst_it is the instruction being processed, iInstNext the one to
        // process next. iInstNext is advanced up front so that instructions
        // inserted around inst_it are not visited by this loop.
        std::list<G4_INST*>::iterator inst_it(bb->begin()), iInstNext(bb->begin());
        while (iInstNext != bb->end())
        {
            inst_it = iInstNext;
            iInstNext++;
            G4_INST* inst = *inst_it;

            // Labels are skipped without advancing node_it.
            if (inst->isLabel())
            {
                continue;
            }

            SBNode* node = *node_it;
            assert(node->GetInstruction() == inst);

            bool fusedSync = false;
            // HW W/A
            // For fused URB sends, or typed writes, the HW cannot decode the dependence info of the second send instruction.
            // Software checks for them and promotes them before the first instruction.
            // If the second one is an EOT instruction, syncAll is required.
            if (inst->isSend() &&
                inst->isAtomicInst())
            {
                INST_LIST_ITER tmp_it = inst_it;
                tmp_it++;
                if (tmp_it != bb->end())
                {
                    const G4_INST* nextInst = *tmp_it;

                    if (nextInst->isSend())
                    {
                        G4_INST* synInst = nullptr;
                        if (nextInst->isEOT())
                        {
                            // If the second is EOT, sync all can be inserted directly, because EOT has no token info
                            synInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), nextInst->getLineNo());
                            synInst->setLexicalId(newInstID);
                        }
                        else
                        {
                            // Defer to the fused-send path below; clear the first
                            // send's own token bit in both sets beforehand.
                            fusedSync = true;
                            if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
                            {
                                dstTokens.set(inst->getSetToken(), false);
                                srcTokens.set(inst->getSetToken(), false);
                            }
                        }
                    }
                }
            }
            else if ((kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()) && inst->isSend() && inst->getDst())
            {
                // Stack call is using the NOMASK save and restore.
                // This means there will be RAW dependence generated along the SIMD control flow.
                // Such as in the following case, {$1.dst} is required.
                // if()
                // {
                //     ...
                //     R1 --> save();
                //     Fcall_0
                //     R1 <-- restore(); {$1}
                //     ...
                // }
                // else
                // {
                //     ...
                //     R1 --> save() {$1.dst}
                //     Fcall_1
                //     R1 <-- restore();
                //     ...
                // }
                // RAW dependence tracking in SWSB is scalar control flow based, because traditional RA will not generate this kind of dependence.
                // At the same time, SWSB for stack calls is handled conservatively, so this dependence can be handled specially.
                G4_Declare *dstDcl = GetTopDclFromRegRegion((G4_DstRegRegion *)inst->getDst());
                if (std::find(kernel.callerRestoreDecls.begin(), kernel.callerRestoreDecls.end(), dstDcl) != kernel.callerRestoreDecls.end())
                {
                    // The destination is a caller-restore declare: place a sync
                    // right after the send that waits on the send's own token
                    // with AFTER_WRITE type.
                    G4_INST* syncInst = insertSyncInstructionAfter(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
                    unsigned short dstToken = (unsigned short)-1;
                    dstToken = node->getLastInstruction()->getSetToken();
                    syncInst->setToken(dstToken);
                    syncInst->setTokenType(SWSBTokenType::AFTER_WRITE);
                }
            }
            if (fusedSync)
            {
                // First send of the fused pair.
                insertSync(bb, node, inst, inst_it, newInstID, &dstTokens, &srcTokens);
                inst->setLexicalId(newInstID);
                newInstID++;

                // Step every cursor to the second send of the pair; tmp_it keeps
                // pointing at the first send.
                INST_LIST_ITER tmp_it = inst_it;
                inst_it++;
                iInstNext++;
                node_it++;
                inst = *inst_it;
                node = *node_it;
                if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
                {
                    dstTokens.set(inst->getSetToken(), false);
                    srcTokens.set(inst->getSetToken(), false);
                }
                // tmp_it keeps the position at which newly generated instructions are inserted.
                insertSync(bb, node, inst, tmp_it, newInstID, &dstTokens, &srcTokens);
                unsigned short token = inst->getSetToken();
                if (token != (unsigned short)UNKNOWN_TOKEN)
                {
                    // Additionally sync on the second send's own token
                    // (AFTER_WRITE), inserted at the first send's position.
                    G4_INST* synInst = insertSyncInstruction(bb, tmp_it, inst->getCISAOff(), inst->getLineNo());
                    synInst->setToken(token);
                    synInst->setTokenType(SWSBTokenType::AFTER_WRITE);
                    synInst->setLexicalId(newInstID);
                }
            }
            else
            {
                insertSync(bb, node, inst, inst_it, newInstID, &dstTokens, &srcTokens);
            }

            // Clear the bit of the token this instruction sets in both sets.
            if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
            {
                dstTokens.set(inst->getSetToken(), false);
                srcTokens.set(inst->getSetToken(), false);
            }

            // All instructions grouped in the same node (instVec) receive the
            // same lexical ID; iInstNext is moved past them so the outer loop
            // does not visit them individually. After this loop `inst` is the
            // last instruction of the node.
            inst->setLexicalId(newInstID);
            for (unsigned i = 1; i < node->instVec.size(); i++)
            {
                inst = *iInstNext;
                inst->setLexicalId(newInstID);
                iInstNext++;
            }

            // Same clearing as above, applied to the node's last instruction.
            if (tokenHonourInstruction(inst) && inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
            {
                dstTokens.set(inst->getSetToken(), false);
                srcTokens.set(inst->getSetToken(), false);
            }

            newInstID++;
            node_it++;
        }
    }

    // Publish the counters accumulated during the pass.
    tokenProfile.setSyncInstCount(syncInstCount);
    tokenProfile.setMathReuseCount(mathReuseCount);
    tokenProfile.setAWSyncInstCount(AWSyncInstCount);
    tokenProfile.setARSyncInstCount(ARSyncInstCount);
    tokenProfile.setAWSyncAllCount(AWSyncAllCount);
    tokenProfile.setARSyncAllCount(ARSyncAllCount);
    tokenProfile.setTokenReuseCount(tokenReuseCount);
}
4136 
dumpDepInfo() const4137 void SWSB::dumpDepInfo() const
4138 {
4139     for (const SBNode* node : SBNodes)
4140     {
4141         if (node->GetInstruction()->isEOT())
4142         {
4143             continue;
4144         }
4145 
4146         const G4_INST* inst = node->GetInstruction();
4147         std::cerr << node->getNodeID() << ":\t";
4148         inst->dump();
4149         std::cerr << "Succs:";
4150         for (const SBDEP_ITEM& curSucc : node->succs)
4151         {
4152             std::cerr << curSucc.node->getNodeID() << ":" << ((curSucc.attr == DEP_EXPLICT) ? "E" : "I") << ", ";
4153             if (curSucc.type == RAW || curSucc.type == WAW)
4154             {
4155                 std::cerr << "AW;";
4156             }
4157             else
4158             {
4159                 std::cerr << "AR;";
4160             }
4161         }
4162         std::cerr << "\n";
4163         std::cerr << "Preds:";
4164         for (const SBDEP_ITEM& curPred : node->preds)
4165         {
4166             std::cerr << curPred.node->getNodeID() << ":" << ((curPred.attr == DEP_EXPLICT) ? "E" : "I") << ", ";
4167         }
4168         std::cerr << "\n\n";
4169     }
4170 }
4171 
dumpLiveIntervals() const4172 void SWSB::dumpLiveIntervals() const
4173 {
4174     std::cerr << "Internal:" << "\n";
4175     for (const SBNode* node : SBSendNodes)
4176     {
4177         if (node->GetInstruction()->isEOT())
4178         {
4179             continue;
4180         }
4181         node->dumpInterval();
4182     }
4183 }
4184 
dumpTokeAssignResult() const4185 void SWSB::dumpTokeAssignResult() const
4186 {
4187     std::cerr << "Internal:" << "\n";
4188     for (const SBNode* node : SBSendNodes)
4189     {
4190         if (node->GetInstruction()->isEOT())
4191         {
4192             continue;
4193         }
4194         node->dumpAssignedTokens();
4195     }
4196 }
4197 
dumpSync(const SBNode * tokenNode,const SBNode * syncNode,unsigned short token,SWSBTokenType type) const4198 void SWSB::dumpSync(const SBNode* tokenNode, const SBNode* syncNode, unsigned short token, SWSBTokenType type) const
4199 {
4200     std::cerr << "#" << syncNode->getNodeID() << "(" << token << ",";
4201     std::cerr << ((type == SWSBTokenType::AFTER_READ) ? "AR" : "AW") << ")";
4202     std::cerr << ": " << "#" << tokenNode->getNodeID() << "(" << tokenNode->getLiveStartID() << "-" << tokenNode->getLiveEndID() << ")\n";
4203 }
4204 
buildLiveIntervals()4205 void SWSB::buildLiveIntervals()
4206 {
4207     // For all send nodes
4208     // Set the live ranges according to dependence edges
4209     const bool trueDepOnly = fg.builder->getOptions()->getOption(vISA_TrueDepOnly);
4210     for (SBNode* node : SBSendNodes)
4211     {
4212         node->setLiveEarliestID(node->getNodeID(), node->getBBID());
4213         node->setLiveLatestID(node->getNodeID(), node->getBBID());
4214         for (SBDEP_ITEM& curSucc : node->succs)
4215         {
4216             const SBNode* succ = curSucc.node;
4217             if (trueDepOnly && node->GetInstruction()->isDpas() && node->getBBID() != succ->getBBID())
4218             {
4219                 node->setLiveLatestID(BBVector[node->getBBID()]->last_node, node->getBBID());
4220             }
4221             else
4222             {
4223                 node->setLiveLatestID(succ->getNodeID(), succ->getBBID());
4224             }
4225         }
4226     }
4227 
4228 #ifdef DEBUG_VERBOSE_ON
4229     dumpLiveIntervals();
4230     dumpDepInfo();
4231 #endif
4232 
4233     //For global send nodes
4234     //According to layout, extend the live range of each send operand to
4235     //the start of the first live in BB and end of last live out BB
4236     for (BB_LIST_ITER ib(fg.begin()), bend(fg.end()); ib != bend; ++ib)
4237     {
4238         unsigned bbID = (*ib)->getId();
4239         G4_BB_SB* sb_bb = BBVector[bbID];
4240         SBBitSets& send_live_in = sb_bb->send_live_in;
4241         SBBitSets& send_live_out = sb_bb->send_live_out;
4242         SBBitSets& send_live_in_scalar = sb_bb->send_live_in_scalar;
4243         SBBitSets& send_live_out_scalar = sb_bb->send_live_out_scalar;
4244 
4245         if (send_live_in.isEmpty())
4246         {
4247             continue;
4248         }
4249 
4250         for (SBBucketNode* bucketNode : globalSendOpndList)
4251         {
4252             SBNode* node = bucketNode->node;
4253             int globalID = node->globalID;
4254 
4255             if (trueDepOnly && node->GetInstruction()->isDpas())
4256             {
4257                 continue;
4258             }
4259 
4260             if (bucketNode->opndNum == Opnd_dst)
4261             {
4262                 if (sb_bb->first_node != -1 &&
4263                     send_live_in_scalar.isDstSet((unsigned)globalID))
4264                 {
4265                     if (!(*ib)->Preds.empty() || !(sb_bb->Preds.empty()))
4266                     {
4267                         node->setLiveEarliestID(sb_bb->first_node, bbID);
4268                     }
4269                 }
4270                 //FIXME: implicit dependence still have issue.
4271                 //the live range of implicit dependence may not counted. But that's ok? This may cause the delay. ...
4272                 if (sb_bb->first_node != -1 &&
4273                     send_live_out_scalar.isDstSet((unsigned)globalID))
4274                 {
4275                     if (!(*ib)->Succs.empty() || !(sb_bb->Succs.empty()))
4276                     {
4277                         node->setLiveLatestID(sb_bb->last_node, bbID);
4278                     }
4279                 }
4280             }
4281             else if (!trueDepOnly)
4282             {
4283                 if (sb_bb->first_node != -1 &&
4284                     send_live_in.isSrcSet((unsigned)globalID))
4285                 {
4286                     if (!(*ib)->Preds.empty() || !(sb_bb->Preds.empty()))
4287                     {
4288                         node->setLiveEarliestID(sb_bb->first_node, bbID);
4289                     }
4290                 }
4291                 //FIXME: implicit dependence still have issue.
4292                 //the live range of implicit dependence may not counted. But that's ok? This may cause the delay. ...
4293                 if (sb_bb->first_node != -1 &&
4294                     send_live_out.isSrcSet((unsigned)globalID))
4295                 {
4296                     if (!(*ib)->Succs.empty() || !(sb_bb->Succs.empty()))
4297                     {
4298                         node->setLiveLatestID(sb_bb->last_node, bbID);
4299                     }
4300                 }
4301             }
4302         }
4303     }
4304 #ifdef DEBUG_VERBOSE_ON
4305     dumpLiveIntervals();
4306 #endif
4307     return;
4308 }
4309 
4310 //
4311 // live_in(BBi) = Union(def_out(BBj)) // BBj is predecessor of BBi
4312 // live_out(BBi) += live_in(BBi) - may_kill(BBi)
4313 //
globalDependenceDefReachAnalysis(G4_BB * bb)4314 bool SWSB::globalDependenceDefReachAnalysis(G4_BB* bb)
4315 {
4316     bool changed = false;
4317     unsigned bbID = bb->getId();
4318 
4319     if (bb->Preds.empty())
4320     {
4321         return false;
4322     }
4323 
4324     SBBitSets temp_live_in(globalSendNum);
4325     temp_live_in = BBVector[bbID]->send_live_in;
4326 
4327     for (const G4_BB* predBB : bb->Preds)
4328     {
4329         unsigned predID = predBB->getId();
4330         temp_live_in |= BBVector[predID]->send_live_out;
4331     }
4332 
4333     if (temp_live_in != BBVector[bbID]->send_live_in)
4334     {
4335         changed = true;
4336         BBVector[bbID]->send_live_in = temp_live_in;
4337     }
4338 
4339     //Record the killed dst and src in scalar CF iterating
4340     SBBitSets temp_kill(globalSendNum);
4341     temp_kill = temp_live_in;
4342     temp_kill &= BBVector[bbID]->send_may_kill;
4343     BBVector[bbID]->send_kill_scalar |= temp_kill;
4344 
4345     temp_kill = temp_live_in;
4346     temp_kill.src &= BBVector[bbID]->send_may_kill.dst;
4347     BBVector[bbID]->send_kill_scalar.src |= temp_kill.src;
4348 
4349     //Kill nodes
4350     //once dst is killed, src definitely is killed
4351     temp_live_in -= BBVector[bbID]->send_may_kill;
4352     temp_live_in.src -= BBVector[bbID]->send_may_kill.dst;
4353 
4354     BBVector[bbID]->send_live_out |= temp_live_in;
4355 
4356     return changed;
4357 }
4358 
4359 //
4360 // live_in(BBi) = Union(def_out(BBj)) // BBj is predecessor of BBi
4361 // live_out(BBi) += live_in(BBi) - may_kill(BBi)
4362 //
globalDependenceUseReachAnalysis(G4_BB * bb)4363 bool SWSB::globalDependenceUseReachAnalysis(G4_BB* bb)
4364 {
4365     bool changed = false;
4366     unsigned bbID = bb->getId();
4367 
4368     if (bb->Preds.empty())
4369     {
4370         return false;
4371     }
4372 
4373     SBBitSets temp_live_in(globalSendNum);
4374     temp_live_in = BBVector[bbID]->send_live_in;
4375 
4376     for (BB_SWSB_LIST_ITER it = BBVector[bbID]->Preds.begin(); it != BBVector[bbID]->Preds.end(); it++)
4377     {
4378         G4_BB* predBB = (*it)->getBB();
4379         unsigned predID = predBB->getId();
4380         temp_live_in |= BBVector[predID]->send_live_out;
4381     }
4382 
4383     if (temp_live_in != BBVector[bbID]->send_live_in)
4384     {
4385         changed = true;
4386         BBVector[bbID]->send_live_in = temp_live_in;
4387     }
4388 
4389     //Kill scalar kills
4390     temp_live_in -= BBVector[bbID]->send_kill_scalar;
4391     temp_live_in.src -= BBVector[bbID]->send_may_kill.src;
4392     temp_live_in.dst -= BBVector[bbID]->send_WAW_may_kill;
4393 
4394     BBVector[bbID]->send_live_out |= temp_live_in;
4395 
4396     return changed;
4397 }
4398 
4399 
tokenEdgePrune(unsigned & prunedEdgeNum,unsigned & prunedGlobalEdgeNum,unsigned & prunedDiffBBEdgeNum,unsigned & prunedDiffBBSameTokenEdgeNum)4400 void SWSB::tokenEdgePrune(unsigned& prunedEdgeNum,
4401     unsigned& prunedGlobalEdgeNum,
4402     unsigned& prunedDiffBBEdgeNum,
4403     unsigned& prunedDiffBBSameTokenEdgeNum)
4404 {
4405     for (size_t i = 0; i < BBVector.size(); i++)
4406     {
4407         if (BBVector[i]->first_node == -1)
4408         {
4409             continue;
4410         }
4411 
4412         BitSet activateLiveIn(SBSendNodes.size(), false);
4413         activateLiveIn |= BBVector[i]->liveInTokenNodes;
4414 
4415         //Scan the instruction nodes of current BB
4416         for (int j = BBVector[i]->first_node; j <= BBVector[i]->last_node; j++)
4417         {
4418             SBNode* node = SBNodes[j];
4419             BitSet killedToken(totalTokenNum, false); //Track the token killed by current instruction.
4420 
4421             //scan the incoming dependence edges of current node
4422             for (auto node_it = node->preds.begin();
4423                 node_it != node->preds.end();
4424                 node_it++)
4425             {
4426                 SBDEP_ITEM& curPred = (*node_it);
4427                 DepType type = curPred.type;
4428                 SBNode* predNode = curPred.node;
4429 
4430                 //If the predecessor node is a token instruction node.
4431                 if (tokenHonourInstruction(predNode->GetInstruction()))
4432                 {
4433                     if (!activateLiveIn.isSet(predNode->sendID))
4434                     {
4435                         // If not in the live set of current instruction,
4436                         // (The live in set will be changed during instruction scan)
4437                         // remove the dependence from success list of previous node
4438                         // The dependence SBID assignment only depends on the succ nodes.
4439                         for (auto succ_it = predNode->succs.begin();
4440                             succ_it != predNode->succs.end();
4441                             succ_it++)
4442                         {
4443                             SBDEP_ITEM& currSucc = (*succ_it);
4444                             if (currSucc.node == node)
4445                             {
4446                                 //Don't do remove previous edge here.
4447                                 //1. Conflict with outer loop
4448                                 //2. There is no preds info required any more in following handling
4449                                 predNode->succs.erase(succ_it);
4450                                 prunedEdgeNum++;
4451                                 if (predNode->globalID != -1)
4452                                 {
4453                                     if (predNode->getBBID() != node->getBBID() &&
4454                                         !killedToken.isSet(predNode->getLastInstruction()->getSetToken()) &&
4455                                         (!(fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
4456                                            fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation)) ||
4457                                         !((fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
4458                                             fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation)) &&
4459                                           BBVector[node->getBBID()]->dominators.isSet(predNode->getBBID()))))
4460                                     {
4461                                         prunedDiffBBEdgeNum++;
4462 #ifdef DEBUG_VERBOSE_ON
4463                                         std::cerr << "Diff BB Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;;
4464 #endif
4465                                     }
4466                                     else if (predNode->getBBID() != node->getBBID())
4467                                     {
4468                                         prunedDiffBBSameTokenEdgeNum++;
4469 #ifdef DEBUG_VERBOSE_ON
4470                                         std::cerr << "Diff BB Same Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;;
4471 #endif
4472                                     }
4473                                     else
4474                                     {
4475                                         prunedGlobalEdgeNum++;
4476 #ifdef DEBUG_VERBOSE_ON
4477                                         std::cerr << "Global Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;;
4478 #endif
4479                                     }
4480                                 }
4481 #ifdef DEBUG_VERBOSE_ON
4482                                 else
4483                                 {
4484                                     std::cerr << "Local Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;;
4485                                 }
4486 #endif
4487                                 break;
4488                             }
4489                         }
4490                     }
4491                     else //In live in set
4492                     {
4493                         // Kill the dependence if it's a AW dependence
4494                         // What about WAR?
4495                         if (type == RAW || type == WAW)
4496                         {
4497                             int token = predNode->getLastInstruction()->getSetToken();
4498                             if (token != (unsigned short)UNKNOWN_TOKEN)
4499                             {
4500                                 activateLiveIn -= allTokenNodesMap[token].bitset;
4501                                 killedToken.set(token, true);
4502                             }
4503                         }
4504                     }
4505                 }
4506             }
4507 
4508             // Current instruction is marked as alive
4509             // How to kill the old one? Especially the WAR?
4510             // Token reuse will kill all previous nodes with same token? yes
4511             if (tokenHonourInstruction(node->GetInstruction()) && !node->GetInstruction()->isEOT())
4512             {
4513                 int token = node->getLastInstruction()->getSetToken();
4514                 if (token != (unsigned short)UNKNOWN_TOKEN)
4515                 {
4516                     activateLiveIn -= allTokenNodesMap[token].bitset;
4517                     activateLiveIn.set(node->sendID, true);
4518                 }
4519             }
4520         }
4521     }
4522 }
4523 
getLiveOutToken(unsigned allSendNum,const SBNODE_VECT * SBNodes)4524 void G4_BB_SB::getLiveOutToken(unsigned allSendNum,
4525     const SBNODE_VECT* SBNodes)
4526 {
4527     //Empty BB
4528     if (first_node == -1)
4529     {
4530         return;
4531     }
4532 
4533     uint32_t totalTokenNum = builder.kernel.getNumSWSBTokens();
4534     unsigned* liveNodeID = (unsigned*)mem.alloc(sizeof(unsigned) * totalTokenNum);
4535 
4536     if (tokeNodesMap.size() == 0)
4537     {
4538         tokeNodesMap.resize(totalTokenNum);
4539 
4540         //Each token ID has a bitset for all possible send instructions' ID
4541         for (size_t i = 0; i < totalTokenNum; i++)
4542         {
4543             tokeNodesMap[i] = BitSet(allSendNum, false);
4544             liveNodeID[i] = 0;
4545         }
4546     }
4547     else
4548     {
4549         for (size_t i = 0; i < totalTokenNum; i++)
4550         {
4551             tokeNodesMap[i].clear();
4552             liveNodeID[i] = 0;
4553         }
4554     }
4555 
4556     // Scan instructions forward to get the live out of current BB
4557     for (int i = first_node; i <= last_node; i++)
4558     {
4559         SBNode* node = (*SBNodes)[i];
4560 
4561         //Check the previous node.
4562         for (const SBDEP_ITEM& curPred : node->preds)
4563         {
4564             DepType type = curPred.type;
4565             SBNode* predNode = curPred.node;
4566 
4567             if ((predNode == node) ||
4568                 (predNode->getBBID() != node->getBBID()) ||
4569                 (predNode->getNodeID() > node->getNodeID()))
4570             {
4571                 continue;
4572             }
4573 
4574 
4575             //If there is a .dst dependence, kill all nodes with same token
4576             if (tokenHonourInstruction(predNode->getLastInstruction()) && (type == RAW || type == WAW))
4577             {
4578                 if (predNode->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4579                 {
4580                     unsigned short token = predNode->getLastInstruction()->getSetToken();
4581                     // 1:  send r112                   {$9}
4582                     // 2:  send r18                    {$9}
4583                     // 3:  send r112                   {$9}
4584                     // 4:  send xxx,     r18           {12}
4585                     //
4586                     // Instruction 4 may clear the $9 because of instruction 2
4587                     // liveNodeID is used to track the live node id of each send. predNode can kill
4588                     if (liveNodeID[token] < predNode->getNodeID())
4589                     {
4590                         tokeNodesMap[token].clear(); //Kill all dependence in following instructions with the same token
4591 
4592                         //Record the killed token by current BB, Kill may kill all previous nodes which reach current node
4593                         killedTokens.set(token, true);  //Set previous token send killed in current BB
4594                     }
4595                 }
4596             }
4597         }
4598 
4599         //Token reuse will kill all previous nodes with same token
4600         //Will have only one?, yes, for BB local scan
4601         if (tokenHonourInstruction(node->getLastInstruction()) &&
4602             !node->getLastInstruction()->isEOT() &&
4603             node->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4604         {
4605             unsigned short token = node->getLastInstruction()->getSetToken();
4606             tokeNodesMap[token].clear();
4607 
4608             //For future live in, will always be killed by current instruction
4609             killedTokens.set(token, true);
4610 
4611             //Current node may be in live out, if not be killed in following insts.
4612             tokeNodesMap[token].set(node->sendID, true);
4613             liveNodeID[token] = node->getNodeID();
4614         }
4615     }
4616 
4617     for (size_t i = 0; i < totalTokenNum; i++)
4618     {
4619         liveOutTokenNodes |= tokeNodesMap[i];
4620     }
4621 }
4622 //
4623 // Scan to check which global send operand for sends will be killed by current BB.
4624 // Note that there is no guarantee the send operand will in the live in set of BB.
4625 // !!! Note that: since this "may kill" info is used in global analysis, "may kill" is not accurate, here we in fact record the "definitely kill".
setSendOpndMayKilled(LiveGRFBuckets * globalSendsLB,SBNODE_VECT * SBNodes,PointsToAnalysis & p)4626 void G4_BB_SB::setSendOpndMayKilled(LiveGRFBuckets* globalSendsLB,
4627     SBNODE_VECT* SBNodes,
4628     PointsToAnalysis& p)
4629 {
4630     std::vector<SBBucketDesc> BDvec;
4631     if (first_node == -1)
4632     {
4633         return;
4634     }
4635 
4636     bool addGlobalSLMWARWA = false;
4637     for (int i = first_node; i <= last_node; i++)
4638     {
4639         SBNode* node = (*SBNodes)[i];
4640         G4_INST* curInst = (*SBNodes)[i]->GetInstruction();
4641 
4642         if (curInst->isLabel())
4643         {
4644             continue;
4645         }
4646 
4647         BDvec.clear();
4648         getGRFBucketDescs(node, BDvec, true);
4649         if (!BDvec.size())
4650         {
4651             continue;
4652         }
4653 
4654         // For all bucket descriptors of curInst
4655         for (const SBBucketDesc& BD : BDvec) {
4656             const int& curBucket = BD.bucket;
4657             const Gen4_Operand_Number& curOpnd = BD.opndNum;
4658             const SBFootprint* curFootprint = BD.footprint;
4659 
4660             for (LiveGRFBuckets::BN_iterator bn_it = globalSendsLB->begin(curBucket);
4661                 bn_it != globalSendsLB->end(curBucket);)
4662             {
4663                 SBBucketNode* liveBN = (*bn_it);
4664                 SBNode* curLiveNode = liveBN->node;
4665                 Gen4_Operand_Number liveOpnd = liveBN->opndNum;
4666                 const SBFootprint* liveFootprint = liveBN->footprint;
4667                 G4_INST* liveInst = liveFootprint->inst;
4668 
4669                 //Send operands are all GRF aligned, there is no overlap checking required.
4670                 //Fix me, this is not right, for math instruction, less than 1 GRF may happen.
4671                 //Find DEP type
4672                 unsigned short internalOffset = 0;
4673                 bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);
4674                 if (!hasOverlap)
4675                 {
4676                     ++bn_it;
4677                     continue;
4678                 }
4679 
4680                 DepType dep = getDepForOpnd(liveOpnd, curOpnd);
4681 
4682                 //For SBID global liveness analysis, both explicit and implicit kill counted.
4683                 if (dep == RAW || dep == WAW)
4684                 {
4685                     send_may_kill.setDst(curLiveNode->globalID, true);
4686                     if (dep == WAW)
4687                     {
4688                         send_WAW_may_kill.set(curLiveNode->globalID, true);
4689                     }
4690                 }
4691 
4692                 if (dep == WAR &&
4693                     WARDepRequired(liveInst, curFootprint->inst))
4694                 {
4695                     send_may_kill.setSrc(curLiveNode->globalID, true);
4696                 }
4697 
4698                 //FIXME: for NODEP, there is optimization chance.
4699                 //               if (hasSameFunctionID(liveInst, curInst))
4700                 //                    send  null,  r1, r73, ...   {$0}
4701                 //                    send  null,  r1, r60, ...   {$1}
4702                 //                    add    r60...    {$1.src}
4703                 //                    add    r73        // There is no need to set {$0.src}
4704                 //
4705                 //                    send  null,  r1, r73, ...   {$0}
4706                 //                    send  null,  r1, r60, ...   {$1}
4707                 //                    add    r73        {$0.src} // We need to set {$0.src}
4708                 //                    add    r60...    {$1.src}
4709                 //if (dep == NODEP && !hasSameFunctionID(liveInst, curInst)) //Conservative, only different pipeline, we will insert dependence tracking
4710                 //{
4711                 //    send_may_kill->setSrc(curLiveNode->globalID, true);
4712                 //}
4713 
4714                 assert(dep != DEPTYPE_MAX && "dep unassigned?");
4715                 ++bn_it;
4716             }
4717         }
4718 
4719         if (!addGlobalSLMWARWA && builder.hasSLMWARIssue() && curInst->isSend() &&
4720             (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
4721         {
4722             for (int curBucket = 0; curBucket < globalSendsLB->getNumOfBuckets(); curBucket++)
4723             {
4724                 for (LiveGRFBuckets::BN_iterator bn_it = globalSendsLB->begin(curBucket);
4725                     bn_it != globalSendsLB->end(curBucket);)
4726                 {
4727                     SBBucketNode* liveBN = (*bn_it);
4728                     SBNode* curLiveNode = liveBN->node;
4729                     G4_INST* liveInst = liveBN->footprint->inst;
4730 
4731                     if (liveInst->isSend() &&
4732                         isSLMMsg(liveInst) && liveInst->getDst() != nullptr && !liveInst->getDst()->isNullReg())
4733                     {
4734                         send_may_kill.setDst(curLiveNode->globalID, true);
4735                     }
4736                     ++bn_it;
4737                 }
4738             }
4739             addGlobalSLMWARWA = true;
4740         }
4741     }
4742 }
4743 
getFootprintForOperand(SBNode * node,G4_INST * inst,G4_Operand * opnd,Gen4_Operand_Number opndNum)4744 bool G4_BB_SB::getFootprintForOperand(SBNode* node,
4745     G4_INST* inst,
4746     G4_Operand* opnd,
4747     Gen4_Operand_Number opndNum)
4748 {
4749     int startingBucket = UNINIT_BUCKET;
4750     bool hasDistOneAReg = false;
4751     bool footprintOperand = false;
4752     bool isAccReg = false;
4753     bool isFlagReg = false;
4754     SBFootprint* footprint = nullptr;
4755     G4_VarBase* base = opnd->getBase();
4756 
4757     assert(base && "If no base, then the operand is not touched by the instr.");
4758 
4759     G4_VarBase* phyReg = (base->isRegVar()) ? base->asRegVar()->getPhyReg() : base;
4760 
4761     switch (phyReg->getKind())
4762     {
4763     case G4_VarBase::VK_phyGReg:
4764         startingBucket = 0;
4765         footprintOperand = true;
4766         break;
4767     case G4_VarBase::VK_phyAReg:
4768         if (phyReg->isSrReg() ||
4769             phyReg->isCrReg() ||
4770             phyReg->isSpReg() ||
4771             phyReg->isIpReg() ||
4772             phyReg->isTmReg() ||
4773             phyReg->isMaskReg() ||
4774             phyReg->isDbgReg())
4775         {
4776             hasDistOneAReg = true;
4777         }
4778         isAccReg = phyReg->isAccReg();
4779         isFlagReg = phyReg->isFlag();
4780         break;
4781     case G4_VarBase::VK_regVar:
4782         assert(0 && "Should not be a regvar. PhyReg is extracted from regvar.");
4783         break;
4784     default:
4785         assert(0 && "Bad kind");
4786         break;
4787     }
4788 
4789     if (footprintOperand)
4790     {
4791         // Create one or more buckets and push them into the vector
4792         footprint = getFootprintForGRF(opnd, opndNum, inst, startingBucket, inst->isSend());
4793         node->setFootprint(footprint, opndNum);
4794     }
4795 
4796     if ((builder.hasThreeALUPipes() || builder.hasFourALUPipes()))
4797     {
4798         if (isAccReg)
4799         {
4800             footprint = getFootprintForACC(opnd, opndNum, inst);
4801             node->setFootprint(footprint, opndNum);
4802         }
4803         if (isFlagReg)
4804         {
4805             footprint = getFootprintForFlag(opnd, opndNum, inst);
4806             node->setFootprint(footprint, opndNum);
4807         }
4808     }
4809 
4810 
4811     return hasDistOneAReg;
4812 }
4813 
getGRFFootprintForIndirect(SBNode * node,Gen4_Operand_Number opnd_num,G4_Operand * opnd,PointsToAnalysis & p)4814 void G4_BB_SB::getGRFFootprintForIndirect(SBNode* node,
4815     Gen4_Operand_Number opnd_num,
4816     G4_Operand* opnd,
4817     PointsToAnalysis& p)
4818 {
4819     G4_Declare* addrdcl = nullptr;
4820     SBFootprint* footprint = nullptr;
4821     G4_Type type = opnd->getType();
4822 
4823     if (opnd_num == Opnd_dst)
4824     {
4825         G4_DstRegRegion* dstrgn = opnd->asDstRegRegion();
4826         addrdcl = GetTopDclFromRegRegion(dstrgn);
4827     }
4828     else if (opnd_num == Opnd_src0 ||
4829         opnd_num == Opnd_src1 ||
4830         opnd_num == Opnd_src2 ||
4831         opnd_num == Opnd_src3)
4832     {
4833         G4_SrcRegRegion* srcrgn = opnd->asSrcRegRegion();
4834         addrdcl = GetTopDclFromRegRegion(srcrgn);
4835     }
4836     else
4837     {
4838         assert(0);
4839     }
4840 
4841 #ifdef DEBUG_VERBOSE_ON
4842     std::cerr << addrdcl->getName() << ":" << std::endl;
4843     std::cerr << node->getNodeID() << ":";
4844     node->GetInstruction()->dump();
4845     std::cerr << "Point to: ";
4846 #endif
4847 
4848     if (addrdcl == nullptr)
4849     {
4850         assert(0);
4851         return;
4852     }
4853 
4854     G4_RegVar* ptvar = NULL;
4855     int vid = 0;
4856 
4857     unsigned char offset = 0;
4858     while ((ptvar = p.getPointsTo(addrdcl->getRegVar(), vid++, offset)) != NULL)
4859     {
4860 
4861         uint32_t varID = ptvar->getId();
4862         G4_Declare* dcl = ptvar->getDeclare();
4863         G4_RegVar* var = NULL;
4864 
4865         while (dcl->getAliasDeclare())
4866         {
4867             dcl = dcl->getAliasDeclare();
4868         }
4869 
4870 
4871         int linearizedStart = 0;
4872         int linearizedEnd = 0;
4873 
4874         if (dcl->isSpilled()) //FIXME: Lost point analysis tracking due to spill, assume all registers are touched
4875         {
4876             linearizedEnd = totalGRFNum * numEltPerGRF<Type_UB>() - 1;
4877         }
4878         else
4879         {
4880             var = dcl->getRegVar();
4881 
4882             MUST_BE_TRUE(var->getId() == varID, "RA verification error: Invalid regVar ID!");
4883             MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid dst reg!");
4884 
4885             uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
4886             uint32_t regOff = var->getPhyRegOff();
4887 
4888             {
4889                 linearizedStart = regNum * numEltPerGRF<Type_UB>() + regOff * TypeSize(dcl->getElemType());
4890                 linearizedEnd = regNum * numEltPerGRF<Type_UB>() + regOff * TypeSize(dcl->getElemType()) + dcl->getByteSize() - 1;
4891             }
4892         }
4893 
4894 
4895         void* allocedMem = mem.alloc(sizeof(SBFootprint));
4896         footprint = new (allocedMem)SBFootprint(GRF_T, type, (unsigned short)linearizedStart, (unsigned short)linearizedEnd, node->GetInstruction());
4897         node->setFootprint(footprint, opnd_num);
4898 #ifdef DEBUG_VERBOSE_ON
4899         int startingBucket = linearizedStart / numEltPerGRF<Type_UB>();
4900         int endingBucket = linearizedEnd / numEltPerGRF<Type_UB>();
4901         std::cerr << dcl->getName() << "<" << startingBucket << "," << endingBucket << ">";
4902 #endif
4903     }
4904 #ifdef DEBUG_VERBOSE_ON
4905     std::cerr << std::endl;
4906 #endif
4907     return;
4908 }
4909 
4910 //Create Buckets
getGRFBuckets(SBNode * node,const SBFootprint * footprint,Gen4_Operand_Number opndNum,std::vector<SBBucketDesc> & BDvec,bool GRFOnly)4911 void G4_BB_SB::getGRFBuckets(SBNode* node,
4912     const SBFootprint* footprint,
4913     Gen4_Operand_Number opndNum,
4914     std::vector<SBBucketDesc>& BDvec,
4915     bool GRFOnly)
4916 {
4917     for (const SBFootprint* curFootprint = footprint; curFootprint != nullptr; curFootprint = curFootprint->next)
4918     {
4919         if (GRFOnly && (curFootprint->fType != GRF_T))
4920         {
4921             continue;
4922         }
4923 
4924         int startingBucket = curFootprint->LeftB / numEltPerGRF<Type_UB>();
4925         int endingBucket = curFootprint->RightB / numEltPerGRF<Type_UB>();
4926         if (curFootprint->fType == ACC_T)
4927         {
4928             int aregOffset = totalGRFNum + builder.getNumScalarRegisters();
4929             startingBucket = startingBucket + aregOffset;
4930             endingBucket = endingBucket + aregOffset;
4931         }
4932         int numBuckets = endingBucket - startingBucket + 1;
4933         for (int j = startingBucket;
4934             j < (startingBucket + numBuckets); j++)
4935         {
4936             BDvec.push_back(SBBucketDesc(j, opndNum, node, curFootprint));
4937         }
4938     }
4939 }
4940 
getGRFFootPrintOperands(SBNode * node,G4_INST * inst,Gen4_Operand_Number first_opnd,Gen4_Operand_Number last_opnd,PointsToAnalysis & p)4941 bool G4_BB_SB::getGRFFootPrintOperands(SBNode* node,
4942     G4_INST* inst,
4943     Gen4_Operand_Number first_opnd,
4944     Gen4_Operand_Number last_opnd,
4945     PointsToAnalysis& p)
4946 {
4947     bool hasDistOneAreg = false;
4948     for (Gen4_Operand_Number opndNum = first_opnd; opndNum <= last_opnd; opndNum = (Gen4_Operand_Number)(opndNum + 1))
4949     {
4950 
4951         G4_Operand* opnd = inst->getOperand(opndNum);
4952 
4953         if (!opnd || !opnd->getBase())
4954         {
4955             continue;
4956         }
4957 
4958         if (opnd->isLabel() || opnd->isImm())
4959         {
4960             continue;
4961         }
4962 
4963         hasDistOneAreg |= getFootprintForOperand(node, inst, opnd, opndNum);
4964 
4965 
4966         //Get bucket for indirect access
4967         if (hasIndirection(opnd, opndNum))
4968         {
4969             getGRFFootprintForIndirect(node, opndNum, opnd, p);
4970         }
4971     }
4972 
4973     return hasDistOneAreg;
4974 }
4975 
getGRFBucketsForOperands(SBNode * node,Gen4_Operand_Number first_opnd,Gen4_Operand_Number last_opnd,std::vector<SBBucketDesc> & BDvec,bool GRFOnly)4976 void G4_BB_SB::getGRFBucketsForOperands(SBNode* node,
4977     Gen4_Operand_Number first_opnd,
4978     Gen4_Operand_Number last_opnd,
4979     std::vector<SBBucketDesc>& BDvec,
4980     bool GRFOnly)
4981 {
4982     for (Gen4_Operand_Number opndNum = first_opnd; opndNum <= last_opnd; opndNum = (Gen4_Operand_Number)(opndNum + 1))
4983     {
4984         const SBFootprint* footprint = node->getFirstFootprint(opndNum);
4985         if (!footprint || (GRFOnly && (footprint->fType != GRF_T)))
4986         {
4987             continue;
4988         }
4989         getGRFBuckets(node, footprint, opndNum, BDvec, GRFOnly);
4990     }
4991 
4992     return;
4993 }
4994 
getGRFFootPrint(SBNode * node,PointsToAnalysis & p)4995 bool G4_BB_SB::getGRFFootPrint(SBNode* node, PointsToAnalysis& p)
4996 {
4997     bool hasDistOneAReg = false;
4998     //We get the description for source first, so for current instruction, the scan order is src0, src1, src2, src3, dst
4999     for (G4_INST* inst : node->instVec)
5000     {
5001         hasDistOneAReg |= getGRFFootPrintOperands(node, inst, Opnd_src0, Opnd_src3, p);
5002         hasDistOneAReg |= getGRFFootPrintOperands(node, inst, Opnd_pred, Opnd_implAccDst, p);
5003         hasDistOneAReg |= getGRFFootPrintOperands(node, inst, Opnd_dst, Opnd_dst, p);
5004     }
5005 
5006     return hasDistOneAReg;
5007 }
5008 
getGRFBucketDescs(SBNode * node,std::vector<SBBucketDesc> & BDvec,bool GRFOnly)5009 void G4_BB_SB::getGRFBucketDescs(SBNode* node, std::vector<SBBucketDesc>& BDvec, bool GRFOnly)
5010 {
5011     //We get the description for source first, so for current instruction, the scan order is src0, src1, src2, src3, dst
5012     getGRFBucketsForOperands(node, Opnd_src0, Opnd_src3, BDvec, GRFOnly);
5013     if (!GRFOnly)
5014     {
5015         getGRFBucketsForOperands(node, Opnd_pred, Opnd_implAccDst, BDvec, GRFOnly);
5016     }
5017     getGRFBucketsForOperands(node, Opnd_dst, Opnd_dst, BDvec, GRFOnly);
5018 
5019     return;
5020 }
5021 
5022 
5023 // Clear the killed bucket nodes
5024 // May be killed by 4 ways
5025 // 1. distance > SWSB_MAX_ALU_DEPENDENCE_DISTANCE
5026 // 2. instruction killed.
5027 // 3. source operands killed.
5028 // 4. operand killed.
5029 // FIXME:
5030 // 1. scanning all buckets is time cost.
5031 // 2. some time, only 1 way checking is required.
5032 // 3. the function is called for every instruction, it's compilation time waste.
clearKilledBucketNodeXeLP(LiveGRFBuckets * LB,int ALUID)5033 void G4_BB_SB::clearKilledBucketNodeXeLP(LiveGRFBuckets* LB, int ALUID)
5034 {
5035     for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
5036     {
5037         for (LiveGRFBuckets::BN_iterator it = LB->begin(curBucket); it != LB->end(curBucket);)
5038         {
5039             SBBucketNode* liveBN = (*it);
5040             SBNode* curLiveNode = liveBN->node;
5041 
5042             if ((distanceHonourInstruction(curLiveNode->GetInstruction()) &&
5043                 ((ALUID - curLiveNode->getALUID()) > curLiveNode->getMaxDepDistance())) ||
5044                 curLiveNode->isInstKilled() ||
5045                 (curLiveNode->isSourceKilled() &&
5046                     liveBN->opndNum >= Opnd_src0 &&
5047                     liveBN->opndNum <= Opnd_src3))
5048             {
5049                 LB->killOperand(it);
5050                 continue;
5051             }
5052 
5053             ++it;
5054         }
5055     }
5056 }
5057 
clearKilledBucketNodeXeHP(LiveGRFBuckets * LB,int integerID,int floatID,int longID,int mathID)5058 void G4_BB_SB::clearKilledBucketNodeXeHP(LiveGRFBuckets* LB, int integerID, int floatID, int longID, int mathID)
5059 {
5060     for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
5061     {
5062         for (LiveGRFBuckets::BN_iterator it = LB->begin(curBucket); it != LB->end(curBucket);)
5063         {
5064             SBBucketNode* liveBN = (*it);
5065             SBNode* curLiveNode = liveBN->node;
5066 
5067             if (curLiveNode->isInstKilled() ||
5068                 (curLiveNode->isSourceKilled() &&
5069                     liveBN->opndNum >= Opnd_src0 &&
5070                     liveBN->opndNum <= Opnd_src3))
5071             {
5072                 LB->killOperand(it);
5073                 continue;
5074             }
5075 
5076             //Long pipeline must be checked first because it's definition is different with Integer and Float
5077             if (curLiveNode->GetInstruction()->isLongPipeInstructionXe() &&
5078                 ((longID - curLiveNode->getLongID()) > SWSB_MAX_ALU_DEPENDENCE_DISTANCE_64BIT))
5079             {
5080                 LB->killOperand(it);
5081                 continue;
5082             }
5083 
5084             if (curLiveNode->GetInstruction()->isIntegerPipeInstructionXe() &&
5085                 ((integerID - curLiveNode->getIntegerID()) > SWSB_MAX_ALU_DEPENDENCE_DISTANCE))
5086             {
5087                 LB->killOperand(it);
5088                 continue;
5089             }
5090 
5091             if (curLiveNode->GetInstruction()->isFloatPipeInstructionXe() &&
5092                 ((floatID - curLiveNode->getFloatID()) > SWSB_MAX_ALU_DEPENDENCE_DISTANCE))
5093             {
5094                 LB->killOperand(it);
5095                 continue;
5096             }
5097 
5098             if (curLiveNode->GetInstruction()->isMath() &&
5099                 builder.hasFixedCycleMathPipe() &&
5100                 (mathID - curLiveNode->getMathID() > SWSB_MAX_MATH_DEPENDENCE_DISTANCE))
5101             {
5102                 LB->killOperand(it);
5103                 continue;
5104             }
5105 
5106             ++it;
5107         }
5108     }
5109 }
5110 
clearSLMWARWAissue(SBNode * curNode,LiveGRFBuckets * LB)5111 void G4_BB_SB::clearSLMWARWAissue(SBNode* curNode, LiveGRFBuckets* LB)
5112 {
5113     for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
5114     {
5115         for (LiveGRFBuckets::BN_iterator it = LB->begin(curBucket); it != LB->end(curBucket);)
5116         {
5117             SBBucketNode* liveBN = (*it);
5118             SBNode* curLiveNode = liveBN->node;
5119             G4_INST* liveInst = liveBN->footprint->inst;
5120 
5121             if (liveInst->isSend() &&
5122                 isSLMMsg(liveInst) && liveInst->getDst() != nullptr && !liveInst->getDst()->isNullReg())
5123             {
5124                 createAddGRFEdge(curLiveNode, curNode, RAW, DEP_EXPLICT);
5125                 curLiveNode->setInstKilled(true);  //Instruction level kill
5126                 LB->killOperand(it);
5127                 continue;
5128             }
5129 
5130             ++it;
5131         }
5132     }
5133 }
5134 
setDistance(const SBFootprint * footprint,SBNode * node,SBNode * liveNode,bool dstDep)5135 void G4_BB_SB::setDistance(const SBFootprint* footprint, SBNode* node, SBNode* liveNode, bool dstDep)
5136 {
5137     if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5138     {
5139         unsigned prevID = 0;
5140         unsigned currentID = 0;
5141         switch (liveNode->ALUPipe)
5142         {
5143         case PIPE_INT:
5144             prevID = liveNode->getIntegerID();
5145             if (prevID < latestDepALUID[PIPE_INT])
5146             {
5147                 return;
5148             }
5149             latestDepALUID[PIPE_INT] = prevID;
5150             currentID = node->ALUPipe == PIPE_INT ? node->getIntegerID() : integerID;
5151             break;
5152         case PIPE_FLOAT:
5153             prevID = liveNode->getFloatID();
5154             if (prevID < latestDepALUID[PIPE_FLOAT])
5155             {
5156                 return;
5157             }
5158             latestDepALUID[PIPE_FLOAT] = prevID;
5159             currentID = node->ALUPipe == PIPE_FLOAT ? node->getFloatID() : floatID;
5160             break;
5161         case PIPE_LONG:
5162             prevID = liveNode->getLongID();
5163             if (prevID < latestDepALUID[PIPE_LONG])
5164             {
5165                 return;
5166             }
5167             latestDepALUID[PIPE_LONG] = prevID;
5168             currentID = node->ALUPipe == PIPE_LONG ? node->getLongID() : longID;
5169             break;
5170         case PIPE_MATH:
5171             prevID = liveNode->getMathID();
5172             if (prevID < latestDepALUID[PIPE_MATH])
5173             {
5174                 return;
5175             }
5176             latestDepALUID[PIPE_MATH] = prevID;
5177             currentID = node->ALUPipe == PIPE_MATH ? node->getMathID() : mathID;
5178             break;
5179         default:
5180             assert(0 && "None ALU pipe");
5181             return;
5182         }
5183         SBDISTDEP_ITEM depItem;
5184         depItem.liveNodePipe = liveNode->ALUPipe;
5185         depItem.nodePipe = node->ALUPipe;
5186         depItem.operandType = node->GetInstruction()->getDataTypePipeXe(footprint->type);
5187         depItem.dstDep = dstDep;
5188         if (node->GetInstruction()->isSend())
5189         {
5190             depItem.operandType = PIPE_SEND;
5191         }
5192         assert(currentID > prevID && "Wrong node ALU ID");
5193         node->setDistance(currentID - prevID);
5194         node->distDep.push_back(depItem);
5195     }
5196     else
5197     {
5198         auto dist = node->getALUID() - liveNode->getALUID();
5199         assert(dist <= liveNode->getMaxDepDistance() && "dist should not exceed the max dep distance");
5200         node->setDistance(dist);
5201     }
5202 }
5203 
setSpecialDistance(SBNode * node)5204 void G4_BB_SB::setSpecialDistance(SBNode* node)
5205 {
5206     G4_INST* inst = node->GetInstruction();
5207     if (!inst->getDst())
5208     {
5209         return;
5210     }
5211 
5212     if (inst->getDst()->isA0())
5213     {
5214         SBDISTDEP_ITEM depItem;
5215         depItem.liveNodePipe = PIPE_FLOAT;
5216         depItem.nodePipe = node->ALUPipe;
5217         depItem.operandType = PIPE_INT;
5218         depItem.dstDep = false;
5219         node->setDistance(1);
5220         node->distDep.push_back(depItem);
5221     }
5222 
5223     return;
5224 }
5225 //The merged footprint is ordered from back to front instructions in the macro
5226 //As a result if killed, is the back instruction killed, which means front instructions are killed as well.
footprintMerge(SBNode * node,const SBNode * nextNode)5227 void G4_BB_SB::footprintMerge(SBNode* node, const SBNode* nextNode)
5228 {
5229     for (Gen4_Operand_Number opndNum
5230         : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5231     {
5232         SBFootprint* nextfp = nextNode->getFirstFootprint(opndNum);
5233 
5234         if (nextfp != nullptr)
5235         {
5236             if (node->GetInstruction()->isDpas())
5237             {
5238                 nextfp->setOffset(node->getDPASSize());
5239             }
5240             node->setFootprint(nextfp, opndNum);
5241         }
5242     }
5243 
5244     return;
5245 }
5246 
hasInternalDependenceWithinDPAS(SBNode * node)5247 bool G4_BB_SB::hasInternalDependenceWithinDPAS(SBNode* node)
5248 {
5249     const SBFootprint* dstfp = node->getFirstFootprint(Opnd_dst);
5250 
5251     for (Gen4_Operand_Number opndNum
5252         : {Opnd_src0, Opnd_src1, Opnd_src2})
5253     {
5254         const SBFootprint* srcfp = node->getFirstFootprint(opndNum);
5255         unsigned short internalOffset = 0;
5256         if (dstfp->hasOverlap(srcfp, internalOffset))
5257         {
5258             if (opndNum == Opnd_src1)
5259             {
5260                 assert(0);
5261             }
5262             //For 8X8, it's allowed that dst and src0 share same registers (not internal dep). But not including partial overlap.
5263             if (opndNum == Opnd_src0)
5264             {
5265                 const G4_INST* curInst = node->getLastInstruction();
5266                 const G4_InstDpas* dpasInst = curInst->asDpasInst();
5267                 uint8_t D = dpasInst->getSystolicDepth();
5268 
5269                 if (D == 8) //Works only for 8x8
5270                 {
5271                     if ((dstfp->LeftB == srcfp->LeftB) && (dstfp->RightB == srcfp->RightB))
5272                     {
5273                         continue;
5274                     }
5275                 }
5276             }
5277 
5278             return true;
5279         }
5280     }
5281 
5282     return false;
5283 }
5284 
5285 //No WAR/RAW/WAW dependence within a DPAS macro
hasDependenceBetweenDPASNodes(SBNode * node,SBNode * nextNode)5286 bool G4_BB_SB::hasDependenceBetweenDPASNodes(SBNode* node, SBNode* nextNode)
5287 {
5288     for (Gen4_Operand_Number opndNum
5289         : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5290     {
5291         const SBFootprint* fp = node->getFirstFootprint(opndNum);
5292         if (opndNum == Opnd_dst)
5293         {
5294             for (Gen4_Operand_Number opndNum2
5295                 : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5296             {
5297                 const SBFootprint* nextfp = nextNode->getFirstFootprint(opndNum2);
5298                 unsigned short internalOffset = 0;
5299                 if (fp->hasOverlap(nextfp, internalOffset))
5300                 {
5301                     return true;
5302                 }
5303 
5304                 if (opndNum2 == Opnd_dst && nextfp->hasOverlap(fp, internalOffset))
5305                 {
5306                     return true;
5307                 }
5308             }
5309         }
5310     }
5311 
5312     return false;
5313 }
5314 
5315 #define SRC2_CACHE_SIZE 1024
src2FootPrintCachePVC(SBNode * curNode,SBNode * nextNode) const5316 bool G4_BB_SB::src2FootPrintCachePVC(SBNode * curNode, SBNode * nextNode) const
5317 {
5318     unsigned short GRFSize = getGRFSize();
5319     BitSet cachedGRF(totalGRFNum, false);
5320 
5321     for (const SBFootprint* fp = curNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5322     {
5323         unsigned short leftB = fp->LeftB / GRFSize;
5324         unsigned short rightB = fp->RightB / GRFSize;
5325         for (unsigned short i = leftB; i <= rightB; i++)
5326         {
5327             cachedGRF.set(i, true);
5328         }
5329     }
5330 
5331     for (const SBFootprint* fp = nextNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5332     {
5333         unsigned short leftB = fp->LeftB / GRFSize;
5334         unsigned short rightB = fp->RightB / GRFSize;
5335         for (unsigned short i = leftB; i <= rightB; i++)
5336         {
5337             cachedGRF.set(i, true);
5338         }
5339     }
5340 
5341     unsigned short cachedGRFNum = 0;
5342     for (unsigned short i = 0; i < totalGRFNum; i++)
5343     {
5344         if (cachedGRF.isSet(i))
5345         {
5346             cachedGRFNum++;
5347         }
5348     }
5349 
5350     return cachedGRFNum <= (SRC2_CACHE_SIZE + GRFSize - 1) / GRFSize;
5351 }
5352 
src2SameFootPrintDiffType(SBNode * curNode,SBNode * nextNode) const5353 bool G4_BB_SB::src2SameFootPrintDiffType(SBNode * curNode, SBNode * nextNode) const
5354 {
5355     unsigned short GRFSize = getGRFSize();
5356 
5357     for (const SBFootprint* fp = curNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5358     {
5359         unsigned short leftB = fp->LeftB / GRFSize;
5360         unsigned short rightB = fp->RightB / GRFSize;
5361         G4_Type type = fp->type;
5362 
5363         for (const SBFootprint* nextfp = nextNode->getFirstFootprint(Opnd_src2); nextfp; nextfp = nextfp->next)
5364         {
5365             unsigned short nextLeftB = nextfp->LeftB / GRFSize;
5366             unsigned short nextRightB = nextfp->RightB / GRFSize;
5367             G4_Type nextType = nextfp->type;
5368 
5369             if (!(nextLeftB > rightB || nextRightB < leftB))
5370             {
5371                 if (type != nextType)
5372                 {
5373                     return true;
5374                 }
5375             }
5376         }
5377     }
5378 
5379     return false;
5380 }
5381 
5382 //restrict a macro to :
5383 //    1. Consecutive instructions of same opcode, same datatype in all sources and dest and same register for Src1.
5384 //  2. Allow having variable repeat count
isLastDpas(SBNode * curNode,SBNode * nextNode)5385 bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
5386 {
5387     G4_INST* curInst = curNode->getLastInstruction();
5388     G4_INST* nextInst = nextNode->GetInstruction();
5389     if (nextInst == nullptr || !nextInst->isDpas())
5390     {
5391         return true;
5392     }
5393 
5394     if (!hasSameExecMask(curInst, nextInst))
5395     {
5396         return true;
5397     }
5398     //All types should be same for all operands.
5399     for (Gen4_Operand_Number opndNum
5400         : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5401     {
5402         if (curNode->getFirstFootprint(opndNum) && nextNode->getFirstFootprint(opndNum) &&
5403             curNode->getFirstFootprint(opndNum)->type != nextNode->getFirstFootprint(opndNum)->type)
5404         {
5405             return true;
5406         }
5407     }
5408 
5409     G4_InstDpas* dpasInst = curInst->asDpasInst();
5410     G4_Operand* srcOpnd1 = curInst->getSrc(1);
5411     G4_Operand* srcOpnd2 = curInst->getSrc(2);
5412     unsigned short leftBound1 = srcOpnd1->getLinearizedStart();
5413     unsigned short leftBound2 = srcOpnd2->getLinearizedStart();
5414     uint8_t curD = dpasInst->getSystolicDepth();
5415     uint8_t curC = dpasInst->getRepeatCount();
5416     int curSrc1Reg = leftBound1 / numEltPerGRF<Type_UB>();
5417     int curSrc2Reg = leftBound2 / numEltPerGRF<Type_UB>();
5418 
5419     G4_InstDpas* nextDpasInst = nextInst->asDpasInst();
5420     uint8_t nextD = nextDpasInst->getSystolicDepth();
5421     uint8_t nextC = nextDpasInst->getRepeatCount();
5422 
5423     //Same depth
5424     if (curD != nextD)
5425     {
5426         return true;
5427     }
5428 
5429     if (VISA_WA_CHECK(builder.getPWaTable(), Wa_16011859583) ||
5430         VISA_WA_CHECK(builder.getPWaTable(), Wa_14012420496) ||
5431         builder.getOption(vISA_NoDPASMacro))
5432     {
5433         if (curD != 8 || nextD != 8 || curC != 8 || nextC != 8)
5434         {
5435             return true;
5436         }
5437     }
5438 
5439     srcOpnd1 = nextDpasInst->getSrc(1);
5440     srcOpnd2 = nextDpasInst->getSrc(2);
5441     leftBound1 = srcOpnd1->getLinearizedStart();
5442     leftBound2 = srcOpnd2->getLinearizedStart();
5443     int nextSrc1Reg = leftBound1 / numEltPerGRF<Type_UB>();
5444     int nextSrc2Reg = leftBound2 / numEltPerGRF<Type_UB>();
5445 
5446     if (builder.hasSrc2ReadSupression() &&
5447         builder.hasSrc2ReadSupressionSameRegSameType() &&
5448         src2SameFootPrintDiffType(curNode, nextNode))
5449     {
5450         return true;
5451     }
5452 
5453     //Same src1 or src2
5454     if (curSrc1Reg == nextSrc1Reg ||
5455         (builder.hasSrc2ReadSupression() &&  (curSrc2Reg == nextSrc2Reg &&
5456             curC == nextC &&
5457             curC == 8)))
5458     {
5459         return false;
5460     }
5461 
5462     // Using {Atomic} in the last line of a macro (such as in the lines I highlighted) has some implications in the hardware implementation:
5463     //1. In 8x8 macros (such as the one you pasted) is fine.
5464     //2. In other repetitions, it will cause that the src1 of the next macro will be ignored.
5465     // Hardware uses {Atomic} to indicate that the next instruction will reuse the src1. In an 8x8, they always verify
5466 
5467     if (builder.hasSrc2ReadSupression() &&
5468         curC == nextC &&
5469         curC == 8 &&
5470         src2FootPrintCachePVC(curNode, nextNode) &&
5471         curNode->getFirstFootprint(Opnd_src2)->isWholeOverlap(nextNode->getFirstFootprint(Opnd_src2)))
5472     {
5473         return false;
5474     }
5475 
5476     return true;
5477 }
5478 
pushItemToQueue(std::vector<unsigned> * nodeIDQueue,unsigned nodeID)5479 void G4_BB_SB::pushItemToQueue(std::vector<unsigned> *nodeIDQueue, unsigned nodeID)
5480 {
5481     nodeIDQueue->push_back(nodeID);
5482 
5483     if (nodeIDQueue->size() > SWSB_MAX_ALU_DEPENDENCE_DISTANCE_VALUE)
5484     {
5485         nodeIDQueue->erase(nodeIDQueue->begin());
5486     }
5487 }
5488 
hasInternalDependence(SBNode * nodeFirst,SBNode * nodeNext)5489 bool G4_BB_SB::hasInternalDependence(SBNode* nodeFirst, SBNode* nodeNext)
5490 {
5491     for (Gen4_Operand_Number opndNum1
5492         : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5493     {
5494         const SBFootprint* firstfp = nodeFirst->getFirstFootprint(opndNum1);
5495 
5496         for (Gen4_Operand_Number opndNum2
5497             : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5498         {
5499             if (opndNum1 > Opnd_dst && opndNum2 > Opnd_dst) //Don't track read after read.
5500             {
5501                 continue;
5502             }
5503 
5504             const SBFootprint* secondfp = nodeNext->getFirstFootprint(opndNum2);
5505             unsigned short internalOffset = 0;
5506             if (firstfp->hasOverlap(secondfp, internalOffset))
5507             {
5508                 return true;
5509             }
5510         }
5511     }
5512 
5513     return false;
5514 }
5515 
5516 
is2xDPBlockCandidate(G4_INST * inst,bool accDST)5517 bool G4_BB_SB::is2xDPBlockCandidate(G4_INST* inst, bool accDST)
5518 {
5519     if (inst->opcode() != G4_mad)
5520     {
5521         return false;
5522     }
5523 
5524     if (inst->getPredicate())
5525     {
5526         return false;
5527     }
5528 
5529     if (inst->getExecSize() != g4::SIMD16)
5530     {
5531         return false;
5532     }
5533 
5534     if (!inst->getDst() || inst->getDst()->isNullReg())
5535     {
5536         return false;
5537     }
5538 
5539     if (accDST && !inst->getDst()->isAccReg())
5540     {
5541         return false;
5542     }
5543 
5544     for (Gen4_Operand_Number opndNum
5545         : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5546     {
5547         G4_Operand* opnd = inst->getOperand(opndNum);
5548 
5549         if (opnd->getType() != G4_Type::Type_DF)
5550         {
5551             return false;
5552         }
5553     }
5554 
5555     return true;
5556 }
5557 
SBDDD(G4_BB * bb,LiveGRFBuckets * & LB,LiveGRFBuckets * & globalSendsLB,SBNODE_VECT * SBNodes,SBNODE_VECT * SBSendNodes,SBBUCKET_VECTOR * globalSendOpndList,SWSB_INDEXES * indexes,uint32_t & globalSendNum,PointsToAnalysis & p,std::map<G4_Label *,G4_BB_SB * > * LabelToBlockMap)5558 void G4_BB_SB::SBDDD(G4_BB* bb,
5559     LiveGRFBuckets*& LB,
5560     LiveGRFBuckets*& globalSendsLB,
5561     SBNODE_VECT* SBNodes,
5562     SBNODE_VECT* SBSendNodes,
5563     SBBUCKET_VECTOR* globalSendOpndList,
5564     SWSB_INDEXES* indexes,
5565     uint32_t& globalSendNum,
5566     PointsToAnalysis& p,
5567     std::map<G4_Label*, G4_BB_SB*>* LabelToBlockMap)
5568 {
5569     nodeID = indexes->instIndex;
5570     ALUID = indexes->ALUIndex;
5571     integerID = indexes->integerIndex;
5572     floatID = indexes->floatIndex;
5573     longID = indexes->longIndex;
5574     DPASID = indexes->DPASIndex;
5575     mathID = indexes->mathIndex;
5576     first_DPASID = indexes->DPASIndex;
5577 
5578     for (int i = 0; i < PIPE_DPAS; i++)
5579     {
5580         latestDepALUID[i] = indexes->latestDepALUID[i];
5581         latestInstID[i] = &indexes->latestInstID[i];
5582     }
5583     SBNODE_LIST tmpSBSendNodes;
5584     bool hasFollowDistOneAReg = false;
5585 
5586     std::list<G4_INST*>::iterator iInst(bb->begin()), iInstEnd(bb->end()), iInstNext(bb->begin());
5587     for (; iInst != iInstEnd; ++iInst)
5588     {
5589         SBNode* node = nullptr;
5590         G4_INST* curInst = *iInst;
5591         iInstNext = iInst;
5592         iInstNext++;
5593         G4_INST* nextInst = nullptr;
5594         if (iInstNext != iInstEnd)
5595         {
5596             nextInst = *iInstNext;
5597         }
5598 
5599         if (curInst->isLabel())
5600         {
5601             (*LabelToBlockMap)[curInst->getLabel()] = this;
5602             continue;
5603         }
5604 
5605         //For the instructions not counted in the distance, we assign the same ALUID as the following
5606         node = new (mem)SBNode(nodeID, ALUID, bb->getId(), curInst);
5607         SBNodes->emplace_back(node);
5608         curInst->setLocalId(0);
5609 
5610         if (builder.hasA0WARHWissue() && builder.hasThreeALUPipes())
5611         {
5612             setSpecialDistance(node);
5613         }
5614         //Record the node IDs of the instructions in BB
5615         if (first_node == -1)
5616         {
5617             first_node = nodeID;
5618         }
5619         last_node = nodeID;
5620         nodeID++;
5621 
5622         //For architecture registers ce#, sp, sr0.#, cr0.#, ip, tm0, dbg0, set distance 1
5623         if (hasFollowDistOneAReg)
5624         {
5625             node->setDistance(1);
5626             node->setFollowDistOneAReg();
5627             hasFollowDistOneAReg = false;
5628             if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5629             {
5630                 node->instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
5631             }
5632         }
5633 
5634         hasFollowDistOneAReg = getGRFFootPrint(node, p);
5635 
5636         //For architecture registers ce#, sp, sr0.#, cr0.#, ip, tm0, dbg0, set distance 1
5637         if (hasFollowDistOneAReg)
5638         {
5639             node->setDistance(1);
5640             node->setDistOneAReg();
5641             if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5642             {
5643                 node->instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
5644             }
5645         }
5646 
5647         //Support for the mad block in DPAS pipeline
5648         if (builder.has2xDP() &&
5649             builder.getOption(vISA_ScheduleFor2xSP) &&
5650             is2xDPBlockCandidate(curInst, true))
5651         {
5652             int depDistance = curInst->getDst()->getLinearizedEnd() - curInst->getDst()->getLinearizedStart() + 1;
5653             std::list<G4_INST*>::iterator iNextInst = iInst;
5654             iNextInst++;
5655             G4_INST* nInst = *iNextInst;
5656             while (is2xDPBlockCandidate(nInst, false))
5657             {
5658                 SBNode nextNode(nodeID, ALUID, bb->getId(), nInst);
5659                 getGRFFootPrint(&nextNode, p);
5660 
5661                 if (hasInternalDependence(node, &nextNode))
5662                 {
5663                     break;
5664                 }
5665                 depDistance += nInst->getDst()->getLinearizedEnd() - nInst->getDst()->getLinearizedStart() + 1;
5666                 iNextInst ++;
5667                 nInst = *iNextInst;
5668                 if (iInstNext == iInstEnd)
5669                 {
5670                     break;
5671                 }
5672                 if (depDistance >= getGRFSize() * 8)
5673                 {
5674                     break;
5675                 }
5676             }
5677 
5678             if (depDistance >= getGRFSize() * 8)
5679             {
5680                 curInst->setNoACCSBSet();
5681             }
5682         }
5683 
5684         // Support for atomic write combine
5685         // Treat block instructions as one in distance calculation.
5686         // The write combine in the local scheduling guarantee that all instructions in the block belong to same instruction pipeline.
5687         auto isWriteCombineBlockCandidate = [&](G4_INST * inst)
5688         {
5689             return (inst->opcode() == G4_mov &&
5690                 IS_BTYPE(inst->getDst()->getType()) &&
5691                 (IS_BTYPE(inst->getSrc(0)->getType()) || IS_WTYPE(inst->getSrc(0)->getType()) || IS_DTYPE(inst->getSrc(0)->getType()) || inst->getSrc(0)->getType() == Type_F) &&
5692                 inst->getPredicate() == nullptr);
5693         };
5694 
5695         if (builder.getOption(vISA_writeCombine) && isWriteCombineBlockCandidate(curInst) && curInst->isAtomicInst())
5696         {
5697             while (nextInst && isWriteCombineBlockCandidate(nextInst))
5698             {
5699                 SBNode nextNode = SBNode(nodeID, ALUID, bb->getId(), nextInst);
5700                 getGRFFootPrint(&nextNode, p);
5701                 footprintMerge(node, &nextNode);
5702                 node->addInstruction(nextInst);
5703 
5704                 curInst = nextInst;
5705                 iInst = iInstNext;
5706                 iInstNext++;
5707                 nextInst = *iInstNext;
5708 
5709                 if (!curInst->isAtomicInst())
5710                 {
5711                     break;
5712                 }
5713             }
5714 
5715             // check last instruction in the block is correct or not
5716             assert(curInst && isWriteCombineBlockCandidate(curInst) && !curInst->isAtomicInst() && "the last instruction in the write combine block is wrong");
5717         }
5718 
5719         //Support for DPAS
5720         //To fully provide the efficiency of DPAS pipeline
5721         //We'd like to promote the dependence to or before the first instruction of a DPAS block
5722         //At the same time, push all dependence BD to the last instruction.
5723         //Keeping the dependence within a DPAS block will drop performance a lot.
5724         if (curInst->isDpas())
5725         {
5726             unsigned dpas_count = 0;
5727             if (nextInst && nextInst->isDpas())
5728             {
5729                 SBNode nextNode;
5730                 bool sameSrcDst = false;
5731                 while (curInst != nullptr && curInst->isDpas())
5732                 {
5733                     //following instructions, first instruction is in node already
5734                     if (dpas_count != 0)
5735                     {
5736                         if (nextNode.getNodeID() != -1)
5737                         {
5738                             footprintMerge(node, &nextNode);
5739                         }
5740                         node->addInstruction(curInst);
5741                         const G4_InstDpas* dpasInst = curInst->asDpasInst();
5742                         node->addDPASSize(dpasInst->getRepeatCount());
5743                     }
5744                     else  //If the first node has internal dependence, break immediately
5745                     {
5746                         if (hasInternalDependenceWithinDPAS(node))
5747                         {
5748                             break;
5749                         }
5750                     }
5751 
5752                     nextNode = SBNode(nodeID, ALUID, bb->getId(), nextInst);
5753                     getGRFFootPrint(&nextNode, p);
5754 
5755                     //Has dependence cannot be merged into same node.
5756                     //Different Depth, src1 and type cannot be merged
5757                     //Same register reuse in dest and src cannot be a part of a macro, even the last one.
5758                     if (sameSrcDst ||
5759                         isLastDpas(node, &nextNode) ||
5760                         hasDependenceBetweenDPASNodes(node, &nextNode))
5761                     {
5762                         break;
5763                     }
5764 
5765                     if (hasInternalDependenceWithinDPAS(&nextNode))
5766                     {
5767                         sameSrcDst = true;
5768                     }
5769 
5770                     curInst->setOptionOn(InstOpt_Atomic);
5771                     dpas_count++;
5772 
5773                     curInst = nextInst;
5774                     iInst = iInstNext;
5775                     iInstNext++;
5776                     if (iInstNext == iInstEnd)
5777                     {
5778                         if (nextNode.getNodeID() != -1)
5779                         {
5780                             footprintMerge(node, &nextNode);
5781                         }
5782                         node->addInstruction(curInst);
5783                         nextInst = nullptr;
5784                         break;
5785                     }
5786                     nextInst = *iInstNext;
5787                 }
5788                 curInst = node->GetInstruction();
5789             }
5790         }
5791         if (node->getLastInstruction()->isDpas())
5792         {
5793             node->setDPASID(DPASID);
5794             DPASID += node->getDPASSize();
5795         }
5796 
5797         //Get buckets for all GRF registers which are used in curInst
5798         std::vector<SBBucketDesc> BDvec;
5799         std::vector<SBBucketDesc> liveBDvec;
5800         BDvec.clear();
5801         liveBDvec.clear();
5802 
5803         getGRFBucketDescs(node, BDvec, false);
5804         if (node->instVec.size() > 1)
5805         {
5806             getGRFBucketDescs(node, liveBDvec, false);
5807         }
5808 
5809         if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5810         {
5811             node->ALUPipe = curInst->getInstructionPipeXe();
5812         }
5813 
5814         // For ALU instructions without GRF usage
5815         if (distanceHonourInstruction(curInst))
5816         {
5817             ALUID++;
5818 
5819             if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5820             {
5821                 switch (node->ALUPipe)
5822                 {
5823                 case PIPE_INT:
5824                     node->setIntegerID(integerID);
5825                     pushItemToQueue(latestInstID[PIPE_INT], node->getNodeID());
5826                     integerID++;
5827                     break;
5828                 case PIPE_FLOAT:
5829                     node->setFloatID(floatID);
5830                     pushItemToQueue(latestInstID[PIPE_FLOAT], node->getNodeID());
5831                     floatID++;
5832                     break;
5833                 case PIPE_LONG:
5834                     node->setLongID(longID);
5835                     pushItemToQueue(latestInstID[PIPE_LONG], node->getNodeID());
5836                     longID++;
5837                     break;
5838                 case PIPE_MATH:
5839                     node->setMathID(mathID);
5840                     pushItemToQueue(latestInstID[PIPE_MATH], node->getNodeID());
5841                     mathID++;
5842                     break;
5843                 default:
5844                     ASSERT_USER(curInst->hasNoPipe(), "Unexpected instruction found in distance ");
5845                 }
5846             }
5847 
5848             if (!BDvec.size())
5849             {
5850                 if (ALUID >= SWSB_MAX_ALU_DEPENDENCE_DISTANCE && ALUID != node->getALUID())
5851                 {
5852                     if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5853                     {
5854                         clearKilledBucketNodeXeHP(LB, integerID, floatID, longID, mathID);
5855                     }
5856                     else
5857                     {
5858                         clearKilledBucketNodeXeLP(LB, ALUID);
5859                     }
5860                 }
5861                 continue;
5862             }
5863         }
5864 
5865         // Considering instruction level liveness kill, i.e killing the live instructions/operands,
5866         // the dependence checking order must be RAR/RAW --> WAR/WAW, the bucket descriptions in BDvec must in the order of src->dst.
5867         // If WAW is done first, RAW may be missed:
5868         //    If both live and current instructions are in-order instructions, WAW no dependence required, but RAW is required.
5869         //    If both live and current instructions are out-of-order instructions, WAW and RAW have same effect.
5870         //    If live is in-order and current is out-of-order, WAW and RAW have same effect.
5871         //    If live is out-of-order and current is in-order, WAW and RAW have same effect.
5872         // If RAR is done before WAR, WAR will not be missed:
5873         //    If both live and current instructions are in-order instructions, both RAR and WAR are not required.
5874         //    If both live and current instructions are out-of-order instructions,
5875         //                                   same pipeline, both RAR and WAR are not required
5876         //                                   different pipeline, both R are kept for RAR, and WAR dependence is required, RAR will not cause WAR miss.
5877         //    If live is in-order and current is out-of-order, WAW and RAW have same effect.
5878         //    If live is out-of-order and current is in-order, WAW and RAW have same effect.
5879         //                                   Both R will be kept, RAR will not cause WAR miss.
5880         // For WAW and RAW, once explicit dependencies are required, kill the liveness of instruction.
5881         // For WAR, once explicit dependencies is required, kill the source operands.
5882         // Others, only operand kill.
5883         bool instKill = false;
5884 
5885         // For all bucket descriptors of curInst
5886         for (const SBBucketDesc& BD : BDvec) {
5887             const int& curBucket = BD.bucket;
5888             const Gen4_Operand_Number& curOpnd = BD.opndNum;
5889             const SBFootprint* curFootprint = BD.footprint;
5890 
5891             // Check liveness for each live curBucket node.
5892             // Add explicit dependence if liveness is killed and there is no implicit dependence
5893             for (LiveGRFBuckets::BN_iterator bn_it = LB->begin(curBucket);
5894                 bn_it != LB->end(curBucket);)
5895             {
5896                 SBBucketNode* liveBN = (*bn_it);
5897                 SBNode* liveNode = liveBN->node;
5898 
5899                 if (liveNode->isInstKilled() ||
5900                     (liveNode->isSourceKilled() &&
5901                         liveBN->opndNum >= Opnd_src0 &&
5902                         liveBN->opndNum <= Opnd_src3))
5903                 {
5904                     ++bn_it;
5905                     continue;
5906                 }
5907 
5908                 unsigned short internalOffset = 0;
5909                 Gen4_Operand_Number liveOpnd = liveBN->opndNum;
5910                 const SBFootprint* liveFootprint = liveBN->footprint;
5911                 G4_INST* liveInst = liveFootprint->inst;
5912 
5913                 bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);
5914                 bool hasRMWOverlap = false;
5915                 if (builder.hasFourALUPipes() && distanceHonourInstruction(liveInst) &&
5916                     distanceHonourInstruction(curInst))
5917                 {
5918                     hasOverlap = curFootprint->hasOverlap(liveFootprint, hasRMWOverlap, internalOffset);
5919                 }
5920 
5921                 //RAW:                     R kill W    R-->live       explicit dependence
5922                 //WAW: same pipeline and inorder   W2 kill W1  W2-->live      implicit dependence
5923                 //WAW: different pipelines or OOO  W2 kill W1  W2-->live      explict dependence
5924                 //WAR: different pipelines W kill R    W-->live       explicit dependence
5925                 //WAR: same pipeline       W kill R    W-->live       implicit dependence
5926                 //RAR: same pipeline               R2 kill R1  R2-->live      no dependence
5927                 //RAR: different pipelines         no kill     R1,R2-->live   no dependence
5928                 //Find DEP type
5929                 DepType dep = getDepForOpnd(liveOpnd, curOpnd);
5930 
5931                 //W/A for the read suppression caused issue
5932                 //1)(~f0.0.anyv) math.cos(2 | M0)      r23.7<2>:hf   r11.7<4; 2, 2> : hf{ $14 }
5933                 //2)             mul(8 | M0)               acc0.0<1>:ud  r35.3<8; 8, 0> : ud   r23.0<8; 4, 0> : uw   //With execution mask, only r23.0~r23.3 are read
5934                 //3)             mach(8 | M0)              r52.0<1>:ud   r35.3<8; 8, 0> : ud   r23.0<4; 4, 0> : ud{ $14.dst }
5935                 //FIXME, For performance, we need check the 3rd instruction as well
5936 
5937                 if (!hasOverlap &&
5938                     !builder.hasFixedCycleMathPipe() &&
5939                     dep == RAW &&
5940                     liveInst->isMath() && !curInst->isMath() &&
5941                     builder.hasRSForSpecificPlatform() &&
5942                     (!hasSamePredicator(liveInst, curInst) || builder.hasMathRSIsuue()))
5943                 {
5944                     hasOverlap = curFootprint->hasGRFGrainOverlap(liveFootprint);
5945                 }
5946 
5947                 if (!hasOverlap)
5948                 {
5949                     ++bn_it;
5950                     continue;
5951                 }
5952 
5953                 if (tokenHonourInstruction(liveInst))
5954                 {
5955                     if (dep == RAW || dep == WAW) {
5956                         if (builder.getOption(vISA_EnableDPASTokenReduction) &&
5957                             node->getLastInstruction()->isDpas() &&
5958                             liveNode->getLastInstruction()->isDpas() &&
5959                             curFootprint->isWholeOverlap(liveFootprint))
5960                         {
5961                             if ((node->getDPASID() + curFootprint->offset - (liveNode->getDPASID() + internalOffset) < tokenAfterDPASCycle))
5962                             {
5963                                 LB->killOperand(bn_it);
5964                                 createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
5965                                 liveNode->setInstKilled(true);  //Instruction level kill
5966                                 instKill = true;
5967                                 continue;
5968                             }
5969                             else if (dep == WAW)  //For RAW, we cannot
5970                             {
5971                                 LB->killOperand(bn_it);
5972                                 continue;
5973                             }
5974                         }
5975                         else
5976                         {
5977                             LB->killOperand(bn_it);
5978                             createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
5979                             liveNode->setInstKilled(true);  //Instruction level kill
5980                             instKill = true;
5981                             continue;
5982                         }
5983                     }
5984 
5985                     if (dep == WAR) {
5986                         bool killed = false;
5987 
5988                         //Killed if region overlap
5989                         if (curFootprint->isWholeOverlap(liveFootprint))
5990                         {
5991                             LB->killOperand(bn_it);
5992                             liveNode->setAR();
5993                             if (WARDepRequired(liveInst, curInst))
5994                             {
5995                                 liveNode->setSourceKilled(true);
5996                             }
5997                             killed = true;
5998                         }
5999 
6000                         //Different pipeline/functionID, added Edge
6001                         //If not whole region overlap, still killed
6002                         if (WARDepRequired(liveInst, curInst))
6003                         {
6004                             if (!killed)
6005                             {
6006                                 LB->killOperand(bn_it);
6007                                 liveNode->setAR();
6008                                 liveNode->setSourceKilled(true);
6009                                 killed = true;
6010                             }
6011 
6012                             if (builder.getOption(vISA_EnableDPASTokenReduction) &&
6013                                 node->getLastInstruction()->isDpas() &&
6014                                 liveNode->getLastInstruction()->isDpas() &&
6015                                 curFootprint->isWholeOverlap(liveFootprint))
6016                             {
6017                                 //
6018                                 //  dpasw.8x7(8 | M0)         r84 : f         r84 : f             r52 : bf            r14.0 : bf{ Atomic }
6019                                 //  dpasw.8x7(8 | M0)         r92 : f         r92 : f             r52 : bf            r22.0 : bf{ Atomic }
6020                                 //  dpasw.8x7(8 | M0)         r100 : f        r100 : f            r52 : bf            r30.0 : bf{ Atomic }
6021                                 //  dpasw.8x7(8 | M0)         r108 : f        r108 : f            r52 : bf            r38.0 : bf{ Atomic }
6022                                 //  dpasw.8x7(8 | M0)         r116 : f        r116 : f            r52 : bf            r46.0 : bf{ $5 }
6023                                 //  sync.nop                      null{ Compacted, $5.src }
6024                                 //  (W)send.dc0(16 | M0)         r52      r6      null    0x0         0x28805FE  {$0}
6025                                 //
6026                                 //  Although there is WAR dependence because of r52. However, due to the read suppression, the sync.nop is not required.
6027                                 //  The DPAS in-order GRF read cycles can cover the GRF read of r52 to r58.
6028 
6029                                 if (liveOpnd == Opnd_src1)
6030                                 {
6031                                     if (node->getDPASID() + curFootprint->offset - liveNode->getDPASID() <= TOKEN_AFTER_READ_DPAS_CYCLE)
6032                                     {
6033                                         createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
6034                                     } //else do nothing, previous whole region check kill the bucket node already.
6035                                 }
6036                                 else  //src0, src2
6037                                 {
6038                                     if (node->getDPASID() + curFootprint->offset - (liveNode->getDPASID() + internalOffset) <= TOKEN_AFTER_READ_DPAS_CYCLE)
6039                                     {
6040                                         createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
6041                                     } //else do nothing
6042                                 }
6043                             }
6044                             else
6045                             {
6046                                 createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
6047                             }
6048                         }  //else, same pipeline, there is no need to set the dependence.
6049 
6050                         if (killed)
6051                         {
6052                             continue;
6053                         }
6054                     }
6055 
6056                     if (dep == NODEP &&
6057                         hasSameFunctionID(liveInst, curInst) &&
6058                         hasSamePredicator(liveInst, curInst) &&
6059                         hasSameExecMask(liveInst, curInst))
6060                     {
6061                         if (curFootprint->isWholeOverlap(liveFootprint))
6062                         {
6063                             LB->killOperand(bn_it);
6064                             continue;
6065                         }
6066                     }
6067                     assert(dep != DEPTYPE_MAX && "dep unassigned?");
6068                 }
6069 
6070                 if (distanceHonourInstruction(liveInst))
6071                 {
6072                     if (dep == RAW &&
6073                         (curBucket < (totalGRFNum + (int)builder.getNumScalarRegisters())))
6074                     {//Only need track GRF RAW dependence
6075                         LB->killOperand(bn_it);
6076                         setDistance(curFootprint, node, liveNode, false);
6077                         liveNode->setInstKilled(true);  //Instrtuction level kill
6078                         instKill = true;
6079                         continue;
6080                     }
6081 
6082                     if (dep == WAW) {
6083                         bool killed = false;
6084                         //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
6085                         if (curFootprint->isWholeOverlap(liveFootprint))
6086                         {
6087                             LB->killOperand(bn_it);
6088                             killed = true;
6089                         }
6090 
6091                         if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6092                         {
6093                             if (!distanceHonourInstruction(curInst) ||
6094                                 node->ALUPipe != liveNode->ALUPipe ||
6095                                 (node->ALUPipe == liveNode->ALUPipe && hasRMWOverlap))
6096                             {
6097                                 if (!killed)
6098                                 {
6099                                     LB->killOperand(bn_it);
6100                                     killed = true;
6101                                 }
6102 
6103                                 setDistance(curFootprint, node, liveNode, true);
6104                                 liveNode->setInstKilled(true); //Instruction level kill
6105                                 instKill = true;
6106                             }
6107                         }
6108                         else if (!curInst->distanceHonourInstruction()
6109                                 || (liveInst->isLongPipeInstructionXe() && !curInst->isLongPipeInstructionXe())
6110                                 )
6111                             {
6112                                 if (!killed)
6113                                 {
6114                                     LB->killOperand(bn_it);
6115                                     killed = true;
6116                                 }
6117                                 setDistance(curFootprint, node, liveNode, true);
6118                                 liveNode->setInstKilled(true); //Instruction level kill
6119                                 instKill = true;
6120                             }
6121 
6122                         if (killed)
6123                         {
6124                             continue;
6125                         }
6126                     }
6127 
6128                     if (dep == WAR) {
6129                         bool killed = false;
6130                         //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
6131                         if (curFootprint->isWholeOverlap(liveFootprint))
6132                         {
6133                             LB->killOperand(bn_it);
6134                             killed = true;
6135                         }
6136 
6137                         if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6138                         {
6139                             if (!curInst->distanceHonourInstruction() || node->ALUPipe != liveNode->ALUPipe)
6140                             {
6141                                 if (!killed)
6142                                 {
6143                                     LB->killOperand(bn_it);
6144                                     killed = true;
6145                                 }
6146                                 setDistance(curFootprint, node, liveNode, true);
6147                                 liveNode->setInstKilled(true); //Instruction level kill
6148                             }
6149                         }
6150                         else if (!hasSameFunctionID(liveInst, curInst))
6151                         {
6152                             if (!killed)
6153                             {
6154                                 LB->killOperand(bn_it);
6155                                 killed = true;
6156                             }
6157                             setDistance(curFootprint, node, liveNode, true);
6158                             liveNode->setSourceKilled(true);
6159                         }
6160 
6161                         if (killed)
6162                         {
6163                             continue;
6164                         }
6165                     }
6166 
6167                     if (dep == NODEP && hasSameFunctionID(liveInst, curInst))
6168                     {
6169                         if (curFootprint->isWholeOverlap(liveFootprint))
6170                         {
6171                             if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6172                             {
6173                                 if (node->ALUPipe == liveNode->ALUPipe)
6174                                 {
6175                                     LB->killOperand(bn_it);
6176                                     continue;
6177                                 }
6178                             }
6179                             else
6180                             {
6181                                 LB->killOperand(bn_it);
6182                                 continue;
6183                             }
6184                         }
6185                     }
6186                     assert(dep != DEPTYPE_MAX && "dep unassigned?");
6187                 }
6188 
6189                 ++bn_it;
6190             }
6191         }
6192 
6193         if (node->distDep.size())
6194         {
6195             if (builder.hasFiveALUPipes())
6196             {
6197                 node->finalizeDistanceType2(builder, latestInstID);
6198             }
6199             else if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6200             {
6201                 node->finalizeDistanceType1(builder, latestInstID);
6202             }
6203         }
6204 
6205         if ((builder.getOption(vISA_EnableSwitch) && node->GetInstruction()->isYieldInst()) ||
6206             (node->GetInstruction()->isCall() || node->GetInstruction()->isFCall()) ||
6207             (VISA_WA_CHECK(builder.getPWaTable(), Wa_14013672992) && node->GetInstruction()->isEOT()))
6208         {
6209             node->setDistance(1);
6210             if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6211             {
6212                 node->instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
6213             }
6214         }
6215 
6216         //Simplify the LB according to the distance, and if the node is killed
6217         if (instKill ||
6218             (ALUID >= SWSB_MAX_ALU_DEPENDENCE_DISTANCE && ALUID != node->getALUID()))
6219         {
6220             if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6221             {
6222                 clearKilledBucketNodeXeHP(LB, integerID, floatID, longID, mathID);
6223             }
6224             else
6225             {
6226                 clearKilledBucketNodeXeLP(LB, ALUID);
6227             }
6228         }
6229 
6230         if (builder.hasSLMWARIssue() && curInst->isSend() &&
6231             (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
6232         {
6233             clearSLMWARWAissue(node, LB);
6234         }
6235 
6236         // Add buckets of current instruction to bucket list
6237         if (node->instVec.size() > 1)
6238         {
6239             std::map<const SBFootprint*, std::vector<SBBucketNode*>> bucketNodes;
6240             for (const SBBucketDesc& BD : liveBDvec)
6241             {
6242                 auto iter = std::find_if(bucketNodes[BD.footprint].begin(), bucketNodes[BD.footprint].end(),
6243                     [&BD](SBBucketNode* node) {return BD.opndNum == node->opndNum; });
6244                 if (iter != bucketNodes[BD.footprint].end())
6245                 {
6246                     LB->add((*iter), BD.bucket);
6247                 }
6248                 else
6249                 {
6250                     void* allocedMem = mem.alloc(sizeof(SBBucketNode));
6251                     SBBucketNode* newNode = new (allocedMem)SBBucketNode(node, BD.opndNum, BD.footprint);
6252                     bucketNodes[BD.footprint].push_back(newNode);
6253                     LB->add(newNode, BD.bucket);
6254                 }
6255             }
6256         }
6257         else
6258         {
6259             std::vector<SBBucketNode*>  bucketNodes(Opnd_total_num, nullptr);  //The coarse grained footprint of operands
6260             for (const SBBucketDesc& BD : BDvec)
6261             {
6262                 if (bucketNodes[BD.opndNum] == nullptr)
6263                 {
6264                     void* allocedMem = mem.alloc(sizeof(SBBucketNode));
6265                     SBBucketNode* newNode = new (allocedMem)SBBucketNode(node, BD.opndNum, BD.footprint);
6266                     bucketNodes[BD.opndNum] = newNode;
6267                 }
6268 
6269                 LB->add(bucketNodes[BD.opndNum], BD.bucket);
6270             }
6271         }
6272 
6273         // Record token sensitive nodes.
6274         if (tokenHonourInstruction(curInst))
6275         {
6276             if (first_send_node == -1)
6277             {
6278                 first_send_node = SBSendNodes->size();
6279             }
6280             last_send_node = SBSendNodes->size();
6281             node->setSendID(int(SBSendNodes->size()));
6282             // The dep delay of the node should be constant, so we can
6283             // calculate and save it for future uses.
6284             node->setDepDelay(swsb.calcDepDelayForNode(node));
6285             SBSendNodes->push_back(node);
6286         }
6287     }
6288 
6289     //Check the live out token nodes after the scan of current BB.
6290     //Record the nodes and the buckets for global analysis.
6291     for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
6292     {
6293         for (auto it = LB->begin(curBucket); it != LB->end(curBucket);)
6294         {
6295             SBBucketNode* liveBN = (*it);
6296             SBNode* node = liveBN->node;
6297 
6298             //Only the live outs from current BB
6299             if (tokenHonourInstruction(node->GetInstruction()) &&
6300                 (int)node->getNodeID() >= first_node &&
6301                 (int)node->getNodeID() <= last_node)
6302             {
6303                 if (liveBN->getSendID() == -1)
6304                 {
6305                     if (send_start == -1)
6306                     {
6307                         send_start = (int)globalSendOpndList->size();
6308                     }
6309 
6310                     //Record all send operands which live out current BB.
6311                     globalSendOpndList->push_back(liveBN);
6312                     send_end = (int)globalSendOpndList->size() - 1;
6313 
6314                     //Record the position of the node in global send operands list.
6315                     liveBN->setSendID(send_end);
6316                 }
6317 
6318                 //Set global send instruction ID
6319                 if (liveBN->node->globalID == -1)
6320                 {
6321                     liveBN->node->globalID = globalSendNum;
6322                     globalSendNum++;
6323                 }
6324 
6325                 //Record all buckets of the send operand
6326                 globalSendsLB->add(liveBN, curBucket);
6327                 LB->killSingleOperand(it);
6328                 continue;
6329             }
6330             ++it;
6331         }
6332     }
6333 
6334     //return the node ID and ALU ID for following BB
6335     indexes->ALUIndex = ALUID;
6336     indexes->instIndex = nodeID;
6337     indexes->integerIndex = integerID;
6338     indexes->floatIndex = floatID;
6339     indexes->longIndex = longID;
6340     indexes->DPASIndex = DPASID;
6341     indexes->mathIndex = mathID;
6342     last_DPASID = DPASID;
6343 
6344     for (int i = 0; i < PIPE_DPAS; i++)
6345     {
6346         indexes->latestDepALUID[i] = latestDepALUID[i];
6347     }
6348 
6349 #ifdef DEBUG_VERBOSE_ON
6350     std::cerr << "\nLIVE OUT: \n";
6351     LB->dumpLives();
6352 #endif
6353 
6354     return;
6355 }
6356 
6357 //#ifdef DEBUG_VERBOSE_ON
6358 
dumpLiveInfo(const SBBUCKET_VECTOR * globalSendOpndList,unsigned globalSendNum,const SBBitSets * send_kill) const6359 void G4_BB_SB::dumpLiveInfo(const SBBUCKET_VECTOR* globalSendOpndList, unsigned globalSendNum, const SBBitSets* send_kill) const
6360 {
6361     std::cerr << "\nBB" << bb->getId() << ":" << first_node << "-" << last_node << ", succ<";
6362     for (const G4_BB* succ : bb->Succs)
6363     {
6364         std::cerr << succ->getId() << ",";
6365     }
6366     std::cerr << "> pred<";
6367     for (const G4_BB* pred : bb->Preds)
6368     {
6369         std::cerr << pred->getId() << ",";
6370     }
6371 
6372     std::cerr << "> JIPSucc <";
6373     for (const G4_BB_SB* succ : Succs)
6374     {
6375         std::cerr << succ->getBB()->getId() << ",";
6376     }
6377     std::cerr << "> JIPPred <";
6378     for (const G4_BB_SB* pred : Preds)
6379     {
6380         std::cerr << pred->getBB()->getId() << ",";
6381     }
6382     std::cerr << ">";
6383     if (bb->getBBType() & G4_BB_CALL_TYPE)
6384     {
6385         std::cerr << ":CALL";
6386     }
6387     if (bb->getBBType() & G4_BB_INIT_TYPE)
6388     {
6389         std::cerr << ":INIT";
6390     }
6391     if (bb->getBBType() & G4_BB_EXIT_TYPE)
6392     {
6393         std::cerr << ":EXIT";
6394     }
6395     if (bb->getBBType() & G4_BB_RETURN_TYPE)
6396     {
6397         std::cerr << ":RETURN";
6398     }
6399     std::cerr << std::endl;
6400 
6401     for (size_t i = 0; i < globalSendOpndList->size(); i++)
6402     {
6403         const SBBucketNode* sNode = (*globalSendOpndList)[i];
6404         std::cerr << i << ": ";
6405         sNode->dump();
6406     }
6407     std::cerr << std::endl;
6408 
6409     std::cerr << "Live In:  ";
6410     std::cerr << std::endl;
6411     if (send_live_in.getSize() != 0)
6412     {
6413         std::cerr << "\tdst:  ";
6414         for (const SBBucketNode* sNode : *globalSendOpndList)
6415         {
6416             if (sNode->opndNum == Opnd_dst &&
6417                 send_live_in.isDstSet(sNode->node->globalID))
6418             {
6419                 sNode->dump();
6420             }
6421         }
6422         std::cerr << std::endl;
6423 
6424         std::cerr << "\tsrc:  ";
6425         for (const SBBucketNode* sNode : *globalSendOpndList)
6426         {
6427             if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6428                 send_live_in.isSrcSet(sNode->node->globalID))
6429             {
6430                 sNode->dump();
6431             }
6432         }
6433         std::cerr << std::endl;
6434     }
6435     std::cerr << std::endl;
6436 
6437     std::cerr << "May Kill: ";
6438     std::cerr << std::endl;
6439     if (send_may_kill.getSize() != 0)
6440     {
6441         std::cerr << "\tdst:  ";
6442         for (const SBBucketNode* sNode : *globalSendOpndList)
6443         {
6444             if (sNode->opndNum == Opnd_dst &&
6445                 send_may_kill.isDstSet(sNode->node->globalID))
6446             {
6447                 sNode->dump();
6448             }
6449         }
6450         std::cerr << std::endl;
6451         std::cerr << "\tsrc:  ";
6452         for (const SBBucketNode* sNode : *globalSendOpndList)
6453         {
6454             if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6455                 send_may_kill.isSrcSet(sNode->node->globalID))
6456             {
6457                 sNode->dump();
6458             }
6459         }
6460         std::cerr << std::endl;
6461     }
6462     std::cerr << std::endl;
6463 
6464     std::cerr << "WAW May Kill: ";
6465     std::cerr << std::endl;
6466     if (send_WAW_may_kill.getSize() != 0)
6467     {
6468         std::cerr << "\tdst:  ";
6469         for (const SBBucketNode* sNode : *globalSendOpndList)
6470         {
6471             if (sNode->opndNum == Opnd_dst &&
6472                 send_WAW_may_kill.isSet(sNode->node->globalID))
6473             {
6474                 sNode->dump();
6475             }
6476         }
6477         std::cerr << std::endl;
6478     }
6479     std::cerr << std::endl;
6480 
6481     std::cerr << "Killed:   ";
6482     std::cerr << std::endl;
6483     if (send_kill != nullptr)
6484     {
6485         std::cerr << "\tdst:  ";
6486         for (const SBBucketNode* sNode : *globalSendOpndList)
6487         {
6488             if (sNode->opndNum == Opnd_dst &&
6489                 send_kill->isDstSet(sNode->node->globalID))
6490             {
6491                 sNode->dump();
6492             }
6493         }
6494         std::cerr << std::endl;
6495         std::cerr << "\tsrc:  ";
6496         for (const SBBucketNode* sNode : *globalSendOpndList)
6497         {
6498             if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6499                 send_kill->isSrcSet(sNode->node->globalID))
6500             {
6501                 sNode->dump();
6502             }
6503         }
6504         std::cerr << std::endl;
6505     }
6506     std::cerr << std::endl;
6507 
6508     std::cerr << "Scalar Killed:   ";
6509     std::cerr << std::endl;
6510     if (send_live_out.getSize() != 0)
6511     {
6512         std::cerr << "\tdst:  ";
6513         for (const SBBucketNode* sNode : *globalSendOpndList)
6514         {
6515             if (sNode->opndNum == Opnd_dst &&
6516                 send_kill_scalar.isDstSet(sNode->node->globalID))
6517             {
6518                 sNode->dump();
6519             }
6520         }
6521         std::cerr << std::endl;
6522         std::cerr << "\tsrc:  ";
6523         for (const SBBucketNode* sNode : *globalSendOpndList)
6524         {
6525             if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6526                 send_kill_scalar.isSrcSet(sNode->node->globalID))
6527             {
6528                 sNode->dump();
6529             }
6530         }
6531         std::cerr << std::endl;
6532     }
6533     std::cerr << std::endl;
6534 
6535     std::cerr << "Live Out: ";
6536     std::cerr << std::endl;
6537     if (send_live_out.getSize() != 0)
6538     {
6539         std::cerr << "\tdst:  ";
6540         for (const SBBucketNode* sNode : *globalSendOpndList)
6541         {
6542             if (sNode->opndNum == Opnd_dst &&
6543                 send_live_out.isDstSet(sNode->node->globalID))
6544             {
6545                 sNode->dump();
6546             }
6547         }
6548         std::cerr << std::endl;
6549         std::cerr << "\tsrc:  ";
6550         for (const SBBucketNode* sNode : *globalSendOpndList)
6551         {
6552             if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6553                 send_live_out.isSrcSet(sNode->node->globalID))
6554             {
6555                 sNode->dump();
6556             }
6557         }
6558         std::cerr << std::endl;
6559     }
6560     std::cerr << std::endl;
6561 
6562 }
6563 //#endif
6564 
dumpTokenLiveInfo()6565 void SWSB::dumpTokenLiveInfo()
6566 {
6567     for (size_t i = 0; i < BBVector.size(); i++)
6568     {
6569         G4_BB* bb = BBVector[i]->getBB();
6570 
6571         std::cerr << "\nBB" << bb->getId() << ":" << BBVector[i]->first_node << "-" << BBVector[i]->last_node << ", succ<";
6572         for (std::list<G4_BB*>::iterator sit = bb->Succs.begin(); sit != bb->Succs.end(); ++sit)
6573         {
6574             std::cerr << (*sit)->getId() << ",";
6575         }
6576         std::cerr << "> pred<";
6577         for (std::list<G4_BB*>::iterator pit = bb->Preds.begin(); pit != bb->Preds.end(); ++pit)
6578         {
6579             std::cerr << (*pit)->getId() << ",";
6580         }
6581 
6582         std::cerr << "> JIPSucc <";
6583         for (std::list<G4_BB_SB*>::iterator pit = BBVector[i]->Succs.begin(); pit != BBVector[i]->Succs.end(); ++pit)
6584         {
6585             std::cerr << (*pit)->getBB()->getId() << ",";
6586         }
6587         std::cerr << "> JIPPred <";
6588         for (std::list<G4_BB_SB*>::iterator pit = BBVector[i]->Preds.begin(); pit != BBVector[i]->Preds.end(); ++pit)
6589         {
6590             std::cerr << (*pit)->getBB()->getId() << ",";
6591         }
6592         std::cerr << ">";
6593         if (bb->getBBType() & G4_BB_CALL_TYPE)
6594         {
6595             std::cerr << ":CALL";
6596         }
6597         if (bb->getBBType() & G4_BB_INIT_TYPE)
6598         {
6599             std::cerr << ":INIT";
6600         }
6601         if (bb->getBBType() & G4_BB_EXIT_TYPE)
6602         {
6603             std::cerr << ":EXIT";
6604         }
6605         if (bb->getBBType() & G4_BB_RETURN_TYPE)
6606         {
6607             std::cerr << ":RETURN";
6608         }
6609         std::cerr << std::endl;
6610 
6611         if (fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
6612             fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation))
6613         {
6614             std::cerr << "Doms: ";
6615 
6616             for (size_t k = 0; k < BBVector.size(); k++)
6617             {
6618                 if (k != i &&
6619                     BBVector[i]->dominators.isSet(k))
6620                 {
6621                     std::cerr << "#BB" << k << ", ";
6622                 }
6623             }
6624             std::cerr << std::endl;
6625         }
6626 
6627         std::cerr << "Live Out: ";
6628         std::cerr << std::endl;
6629         if (BBVector[i]->liveOutTokenNodes.getSize() != 0)
6630         {
6631             for (SBNODE_VECT_ITER node_it = SBSendNodes.begin();
6632                 node_it != SBSendNodes.end();
6633                 node_it++)
6634             {
6635                 SBNode* node = (*node_it);
6636                 if (BBVector[i]->liveOutTokenNodes.isSet(node->sendID))
6637                 {
6638                     std::cerr << " #" << node->getNodeID() << ":" << node->sendID << ":" << node->GetInstruction()->getSetToken();
6639                 }
6640             }
6641             std::cerr << std::endl;
6642         }
6643 
6644         std::cerr << "Killed Tokens: ";
6645         std::cerr << std::endl;
6646         if (BBVector[i]->killedTokens.getSize() != 0)
6647         {
6648             uint32_t totalTokenNum = kernel.getNumSWSBTokens();
6649             for (uint32_t k = 0; k < totalTokenNum; k++)
6650             {
6651                 if (BBVector[i]->killedTokens.isSet(k))
6652                 {
6653                     std::cerr << " #" << k << ", ";
6654                 }
6655             }
6656         }
6657         std::cerr << std::endl;
6658 
6659     }
6660 
6661     return;
6662 }
6663 
getLiveBucketsFromFootprint(const SBFootprint * firstFootprint,SBBucketNode * sBucketNode,LiveGRFBuckets * send_use_kills) const6664 void G4_BB_SB::getLiveBucketsFromFootprint(const SBFootprint* firstFootprint, SBBucketNode* sBucketNode, LiveGRFBuckets* send_use_kills) const
6665 {
6666     const SBFootprint* footprint = firstFootprint;
6667 
6668     while (footprint)
6669     {
6670         int startBucket = footprint->LeftB / numEltPerGRF<Type_UB>();
6671         int endBucket = footprint->RightB / numEltPerGRF<Type_UB>();
6672 
6673         //We only track the global dependence for GRF
6674         if (footprint->fType != GRF_T)
6675         {
6676             footprint = footprint->next;
6677             continue;
6678         }
6679 
6680         for (int j = startBucket; j < endBucket + 1; j++)
6681         {
6682             send_use_kills->add(sBucketNode, j);
6683         }
6684         footprint = footprint->next;
6685     }
6686 
6687     return;
6688 }
6689 
6690 /*
6691 * Note that the fall through dependencies are captured in the SBDDD linear scan already
6692 */
addGlobalDependence(unsigned globalSendNum,SBBUCKET_VECTOR * globalSendOpndList,SBNODE_VECT * SBNodes,PointsToAnalysis & p,bool afterWrite)6693 void SWSB::addGlobalDependence(unsigned globalSendNum, SBBUCKET_VECTOR* globalSendOpndList, SBNODE_VECT* SBNodes, PointsToAnalysis& p, bool afterWrite)
6694 {
6695     for (size_t i = 0; i < BBVector.size(); i++)
6696     {
6697         //Get global send operands killed by current BB
6698         SBBitSets send_kill(globalSendNum);
6699         send_kill |= BBVector[i]->send_live_in;
6700         send_kill &= BBVector[i]->send_may_kill;
6701 
6702 #ifdef DEBUG_VERBOSE_ON
6703         BBVector[i]->dumpLiveInfo(globalSendOpndList, globalSendNum, &send_kill);
6704 #endif
6705         //Change the global send operands into live bucket for liveness scan
6706         //Instruction level liveness kill:
6707         //   For token dependence, there is only implicit RAR and WAR dependencies.
6708         //   the order of the operands are scanned is not an issue anymore.
6709         //   i.e explicit RAW and WAW can cover all other dependences.
6710         LiveGRFBuckets send_use_kills(mem, kernel.getNumRegTotal(), BBVector[i]->getBB()->getKernel());
6711         for (SBBucketNode* sBucketNode : *globalSendOpndList)
6712         {
6713             SBNode* sNode = sBucketNode->node;
6714             if (send_kill.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
6715                 sBucketNode->opndNum == Opnd_src1 ||
6716                 sBucketNode->opndNum == Opnd_src2 ||
6717                 sBucketNode->opndNum == Opnd_src3))
6718             {
6719                 BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
6720             }
6721             if (send_kill.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
6722             {
6723                 BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
6724             }
6725             sNode->setInstKilled(false);
6726             sNode->setSourceKilled(false);
6727         }
6728 
6729         if (BBVector[i]->first_node == -1)
6730         {
6731             continue;
6732         }
6733 
6734         //Scan BB again to figure out the dependence caused by global send operands
6735         std::vector<SBBucketDesc> BDvec;
6736         for (int j = BBVector[i]->first_node; j <= BBVector[i]->last_node; j++)
6737         {
6738             SBNode* node = (*SBNodes)[j];
6739             G4_INST* curInst = node->getLastInstruction();
6740 
6741             BDvec.clear();
6742             BBVector[i]->getGRFBucketDescs(node, BDvec, true);
6743             if (!BDvec.size())
6744             {
6745                 continue;
6746             }
6747 
6748             bool instKill = false;
6749             // For all bucket descriptors of curInst
6750             for (const SBBucketDesc& BD : BDvec)
6751             {
6752                 const int& curBucket = BD.bucket;
6753                 const Gen4_Operand_Number& curOpnd = BD.opndNum;
6754                 const SBFootprint* curFootprint = BD.footprint;
6755 
6756                 for (LiveGRFBuckets::BN_iterator bn_it = send_use_kills.begin(curBucket);
6757                     bn_it != send_use_kills.end(curBucket);)
6758                 {
6759                     SBBucketNode* liveBN = (*bn_it);
6760                     SBNode* curLiveNode = liveBN->node;
6761                     Gen4_Operand_Number liveOpnd = liveBN->opndNum;
6762                     const SBFootprint* liveFootprint = liveBN->footprint;
6763                     G4_INST* liveInst = liveFootprint->inst;
6764                     unsigned short internalOffset = 0;
6765                     bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);
6766 
6767                     //Find DEP type
6768                     DepType dep = getDepForOpnd(liveOpnd, curOpnd);
6769 
6770                     //RAW:                     R kill W    R-->live       explicit dependence
6771                     //WAW:                     W2 kill W1  W2-->live      explicit dependence
6772                     //WAW: same pipeline/inorder W2 kill W1  W2-->live      implicit dependence
6773                     //WAR: different pipelines W kill R    W-->live       explicit dependence
6774                     //WAR: same pipeline       W kill R    W-->live       implicit dependence
6775                     //RAR: sample pipeline     R2 kill R1  R2-->live      implicit dependence
6776                     //RAR: different pipelines   no kill     R1,R2-->live   no dependence
6777                     if (hasOverlap)
6778                     {
6779                         assert(tokenHonourInstruction(liveInst));
6780                         if (dep == RAW || dep == WAW)
6781                         {
6782                             if (BBVector[i]->isGRFEdgeAdded(curLiveNode, node, dep, DEP_EXPLICT))
6783                             {
6784                                 send_use_kills.killOperand(bn_it);
6785                                 curLiveNode->setInstKilled(true);  //Instruction level kill
6786                                 instKill = true;
6787                                 continue;
6788                             }
6789                             //WAW need be tracked in both scalar and SIMD control flow
6790                             //The reason is that:
6791                             // 1. RA track the liveness in use-->define way
6792                             // 2. SWSB track  in define-->use way.
6793                             // For the case like following
6794                             //
6795                             //   if
6796                             //    v1 <--    //v1 is never be used
6797                             //    if
6798                             //       <--v1
6799                             //    endif
6800                             //   endif
6801                             //   v2 <--
6802                             //RA may assign same register to v1 and v2.
6803                             //Scalar CFG cannot capture the dependence v1-->v2 when they are assigned with same registers.
6804                             if (afterWrite || dep == WAW)  //There is no RAW kill for SIMDCF
6805                             {
6806                                 if (fg.builder->getOption(vISA_EnableDPASTokenReduction) &&
6807                                     node->getLastInstruction()->isDpas() &&
6808                                     curLiveNode->getLastInstruction()->isDpas() &&
6809                                     curFootprint->isWholeOverlap(liveFootprint))
6810                                 {
6811                                     if (node->getDPASID() > curLiveNode->getDPASID())
6812                                     {
6813                                         if ((node->getDPASID() + curFootprint->offset - (curLiveNode->getDPASID() + internalOffset) < tokenAfterDPASCycle))
6814                                         {
6815                                             send_use_kills.killOperand(bn_it);
6816                                             BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6817                                             curLiveNode->setInstKilled(true);  //Instruction level kill
6818                                             instKill = true;
6819                                             continue;
6820                                         }
6821                                         else if (dep == WAW)
6822                                         {
6823                                             send_use_kills.killOperand(bn_it);
6824                                             continue;
6825                                         }
6826                                     }
6827 
6828                                     if (node->getDPASID() <= curLiveNode->getDPASID())
6829                                     {
6830                                         unsigned loopStartBB = BBVector[node->getBBID()]->getLoopStartBBID();
6831                                         unsigned loopEndBB = BBVector[curLiveNode->getBBID()]->getLoopEndBBID();
6832                                         if (loopStartBB != -1 && loopEndBB != -1)
6833                                         {
6834                                             unsigned frontDist = node->getDPASID() - BBVector[loopStartBB]->first_DPASID;
6835                                             unsigned endDist = BBVector[loopEndBB]->last_DPASID - curLiveNode->getDPASID();
6836 
6837                                             //Note that if node and live node are in different but nest loop, the calculation will be conservative
6838                                             if ((int)(frontDist + endDist + curFootprint->offset - internalOffset) < tokenAfterDPASCycle)
6839                                             {
6840                                                 send_use_kills.killOperand(bn_it);
6841                                                 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6842                                                 curLiveNode->setInstKilled(true);  //Instruction level kill
6843                                                 instKill = true;
6844                                                 continue;
6845                                             }
6846                                             else if (dep == WAW)
6847                                             {
6848                                                 send_use_kills.killOperand(bn_it);
6849                                                 continue;
6850                                             }
6851                                         }
6852                                         else
6853                                         {
6854                                             send_use_kills.killOperand(bn_it);
6855                                             BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6856                                             curLiveNode->setInstKilled(true);
6857                                             instKill = true;
6858                                             continue;
6859                                         }
6860                                     }
6861                                 }
6862                                 else
6863                                 {
6864                                     send_use_kills.killOperand(bn_it);
6865                                     BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6866                                     curLiveNode->setInstKilled(true);  //Instruction level kill
6867                                     instKill = true;
6868                                     continue;
6869                                 }
6870                             }
6871                         }
6872 
6873                         if (dep == WAR)
6874                         {
6875                             bool killed = false;
6876                             //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
6877                             if (curFootprint->isWholeOverlap(liveFootprint))
6878                             {
6879                                 send_use_kills.killOperand(bn_it);
6880                                 if (WARDepRequired(liveInst, curInst))
6881                                     //Implicit dependence cannot block the following instruction from issue.
6882                                 {
6883                                     curLiveNode->setSourceKilled(true);
6884                                 }
6885                                 curLiveNode->setAR();
6886                                 killed = true;
6887                             }
6888 
6889                             if (WARDepRequired(liveInst, curInst))
6890                             {
6891                                 if (!killed)
6892                                 {
6893                                     send_use_kills.killOperand(bn_it);
6894                                     curLiveNode->setSourceKilled(true);
6895                                     curLiveNode->setAR();
6896                                     killed = true;
6897                                 }
6898                                 instKill = true;
6899                                 if (!afterWrite) //After read dependence is more comprehensive in SIMDCF, so add edge only in SIMDCF pass
6900                                 {
6901                                     BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6902                                 }
6903                             }
6904                             else
6905                             {
6906                                 if (!afterWrite) //After read dependence is more comprehensive in SIMDCF, so add edge only in SIMDCF pass
6907                                 {
6908                                     BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_IMPLICIT);
6909                                 }
6910                             }
6911 
6912                             if (killed)
6913                             {
6914                                 continue;
6915                             }
6916                         }
6917 
6918                         if (dep == NODEP &&
6919                             hasSameFunctionID(liveInst, curInst) &&
6920                             hasSamePredicator(liveInst, curInst) &&
6921                             hasSameExecMask(liveInst, curInst))
6922                         {
6923                             if (curFootprint->isWholeOverlap(liveFootprint))
6924                             {
6925                                 send_use_kills.killOperand(bn_it);
6926                                 continue;
6927                             }
6928                         }
6929                     }
6930 
6931                     assert(dep != DEPTYPE_MAX && "dep unassigned?");
6932                     ++bn_it;
6933                 }
6934             }
6935 
6936             if (instKill)
6937             {
6938                 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
6939                 {
6940                     BBVector[i]->clearKilledBucketNodeXeHP(&send_use_kills, 0, 0, 0, 0);
6941                 }
6942                 else
6943                 {
6944                     BBVector[i]->clearKilledBucketNodeXeLP(&send_use_kills, 0);
6945                 }
6946             }
6947             if (fg.builder->hasSLMWARIssue() && curInst->isSend() &&
6948                 (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
6949             {
6950                 BBVector[i]->clearSLMWARWAissue(node, &send_use_kills);
6951             }
6952         }
6953     }
6954 
6955     return;
6956 }
6957 
addReachingDefineSet(SBNode * node,SBBitSets * globalLiveSet,SBBitSets * localLiveSet)6958 void SWSB::addReachingDefineSet(SBNode* node, SBBitSets* globalLiveSet, SBBitSets* localLiveSet)
6959 {
6960     if (node->reachingSends.getSize() == 0)
6961     {
6962         node->reachingSends = SBBitSets(SBSendNodes.size());
6963     }
6964 
6965     node->reachingSends |= *globalLiveSet;
6966 
6967     node->reachingSends |= *localLiveSet;
6968 
6969     return;
6970 }
6971 
addReachingUseSet(SBNode * node,SBNode * use)6972 void SWSB::addReachingUseSet(SBNode* node, SBNode* use)
6973 {
6974     if (use->getSendUseID() != -1)
6975     {
6976         if (node->reachedUses.getSize() == 0)
6977         {
6978             node->reachedUses = SBBitSets(SBSendUses.size());
6979         }
6980 
6981         node->reachedUses.setDst(use->getSendUseID(), true);
6982     }
6983 
6984     return;
6985 }
6986 
//
// Adds dependence edges caused by global send operands and records reaching-send information.
// For every BB in BBVector:
//   1. The global sends that are live into the BB are split into the ones the BB may kill
//      (send_kill) and the ones that stay live through it (send_live_through).
//   2. The operands of the killed sends are loaded into a live bucket set (send_use_kills).
//   3. The live range (earliest/latest node ID) of every send node of the BB is set.
//   4. The BB is scanned node by node; every bucket descriptor of a node is compared against the
//      live buckets, edges are created via createAddGRFEdge() and killed operands are removed.
//   5. Every node that ends up with predecessors gets its reachingSends set updated and is
//      appended to SBSendUses (its send-use ID is its index in that vector).
// Parameters:
//   globalSendNum      - size of the send_kill / send_live_through bit sets (tested with SBNode::globalID)
//   globalSendOpndList - bucket nodes of the global send operands (owning node + operand number)
//   SBNodes            - node vector, indexed from first_node to last_node of each BB
//   p                  - not referenced in this function body
//   afterWrite         - when true, RAW overlaps may create edges and no WAR edge is added;
//                        when false, only WAW (of RAW/WAW) creates new edges and WAR edges are added.
//                        According to the comments below, false corresponds to the SIMDCF pass.
//
void SWSB::addGlobalDependenceWithReachingDef(unsigned globalSendNum, SBBUCKET_VECTOR* globalSendOpndList, SBNODE_VECT* SBNodes, PointsToAnalysis& p, bool afterWrite)
{
    for (size_t i = 0; i < BBVector.size(); i++)
    {
        //Get global send operands killed by current BB
        SBBitSets send_kill(globalSendNum);
        //send_live records the sends that are live from outside of the BB but killed by the BB (set by send ID)
        SBBitSets send_live(SBSendNodes.size());

        SBBitSets send_live_through(globalSendNum);
        //send_reach_all records all the global sends that are live through the BB (set by send ID)
        SBBitSets send_reach_all(SBSendNodes.size());

        //send_kill = live_in & may_kill;  send_live_through = live_in - send_kill
        send_kill |= BBVector[i]->send_live_in;
        send_kill &= BBVector[i]->send_may_kill;
        send_live_through |= BBVector[i]->send_live_in;
        send_live_through -= send_kill;

#ifdef DEBUG_VERBOSE_ON
        BBVector[i]->dumpLiveInfo(globalSendOpndList, globalSendNum, &send_kill);
#endif
        //Change the global send operands into live bucket for liveness scan
        //Instruction level liveness kill:
        //   For token dependence, there is only implicit RAR and WAR dependencies.
        //   The order in which the operands are scanned is not an issue anymore.
        //   i.e explicit RAW and WAW can cover all other dependences.
        LiveGRFBuckets send_use_kills(mem, kernel.getNumRegTotal(), BBVector[i]->getBB()->getKernel());
        for (size_t j = 0; j < globalSendOpndList->size(); j++)
        {
            SBBucketNode* sBucketNode = (*globalSendOpndList)[j];
            SBNode* sNode = sBucketNode->node;
            //Killed source operand: make it live in the buckets and mark the send's source as live
            if (send_kill.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
                sBucketNode->opndNum == Opnd_src1 ||
                sBucketNode->opndNum == Opnd_src2 ||
                sBucketNode->opndNum == Opnd_src3))
            {
                BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
                send_live.setSrc(sNode->getSendID(), true);
            }
            //Killed destination operand: same handling for the destination
            if (send_kill.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
            {
                BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
                send_live.setDst(sNode->getSendID(), true);
            }

            //Operands that are live through the BB are only recorded in send_reach_all
            if (send_live_through.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
                sBucketNode->opndNum == Opnd_src1 ||
                sBucketNode->opndNum == Opnd_src2 ||
                sBucketNode->opndNum == Opnd_src3))
            {
                send_reach_all.setSrc(sNode->getSendID(), true);
            }
            if (send_live_through.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
            {
                send_reach_all.setDst(sNode->getSendID(), true);
            }
            //Reset the per-node kill flags before this BB is scanned
            sNode->setInstKilled(false);
            sNode->setSourceKilled(false);
        }

        //first_node == -1: the BB has no node to scan
        if (BBVector[i]->first_node == -1)
        {
            continue;
        }

        BBVector[i]->localReachingSends = SBBitSets(SBSendNodes.size());

        if (BBVector[i]->first_send_node != -1)
        {
            for (int j = BBVector[i]->first_send_node; j <= BBVector[i]->last_send_node; j++)
            {
                SBNode* node = SBSendNodes[j];

                //Get the live range for the local ones
                if (node->globalID == -1)
                {
                    assert(node->getBBID() == i);

                    node->setLiveEarliestID(node->getNodeID());
                    node->setLiveLatestID(node->getNodeID());
                    if (node->succs.size())
                    {
                        //Feed every successor into the latest live ID
                        for (int k = 0; k < (int)(node->succs.size()); k++)
                        {
                            SBDEP_ITEM& curSucc = node->succs[k];
                            SBNode* succ = curSucc.node;

                            node->setLiveLatestID(succ->getNodeID(), succ->getBBID());
                        }
                    }
                    else
                    {
                        //No successor: the latest live ID is the last node of the BB
                        node->setLiveLatestID(BBVector[i]->last_node);
                    }
                }
                else
                {
                    //Global send: live from its own node to the last node of the BB
                    node->setLiveEarliestID(node->getNodeID());
                    node->setLiveLatestID(BBVector[i]->last_node);
                }
            }
        }
        localTokenUsage.clear(); //Reset the list of token nodes collected for this BB

        //Scan BB again to figure out the dependence caused by global send operands
        std::vector<SBBucketDesc> BDvec;
        for (int j = BBVector[i]->first_node; j <= BBVector[i]->last_node; j++)
        {
            SBNode* node = (*SBNodes)[j];
            G4_INST* curInst = (*SBNodes)[j]->getLastInstruction();

            BDvec.clear();
            BBVector[i]->getGRFBucketDescs(node, BDvec, true);
            //Nodes without any bucket descriptor are skipped entirely
            if (!BDvec.size())
            {
                continue;
            }

            //Track all the token nodes defined in current BB
            if (tokenHonourInstruction(node->GetInstruction()))
            {
                addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
                node->reachingSends |= send_reach_all;

                expireLocalIntervals(node->getNodeID(), i);
                //A node with a non-null destination is recorded as dst, otherwise as src
                if (node->GetInstruction()->getDst() != nullptr &&
                    !node->GetInstruction()->getDst()->isNullReg())
                {
                    BBVector[i]->localReachingSends.setDst(node->sendID, true);
                }
                else
                {
                    BBVector[i]->localReachingSends.setSrc(node->sendID, true);
                }
                localTokenUsage.push_back(node); //Add to the live node
            }

            bool instKill = false;
            // For all bucket descriptors of curInst
            for (const SBBucketDesc& BD : BDvec)
            {
                const int& curBucket = BD.bucket;
                const Gen4_Operand_Number& curOpnd = BD.opndNum;
                const SBFootprint* curFootprint = BD.footprint;

                //bn_it is advanced at the end of the loop body; every kill path uses "continue"
                //right after killOperand(bn_it) and therefore skips the increment.
                for (LiveGRFBuckets::BN_iterator bn_it = send_use_kills.begin(curBucket);
                    bn_it != send_use_kills.end(curBucket);)
                {
                    SBBucketNode* liveBN = (*bn_it);
                    SBNode* curLiveNode = liveBN->node;
                    Gen4_Operand_Number liveOpnd = liveBN->opndNum;
                    const SBFootprint* liveFootprint = liveBN->footprint;
                    G4_INST* liveInst = liveFootprint->inst;
                    unsigned short internalOffset = 0;
                    bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);

                    //Find DEP type
                    DepType dep = getDepForOpnd(liveOpnd, curOpnd);

                    //RAW:                     R kill W    R-->live       explicit dependence
                    //WAW:                     W2 kill W1  W2-->live      explicit dependence
                    //WAW: same pipeline/inorder W2 kill W1  W2-->live      implicit dependence
                    //WAR: different pipelines W kill R    W-->live       explicit dependence
                    //WAR: same pipeline       W kill R    W-->live       implicit dependence
                    //RAR: same pipeline       R2 kill R1  R2-->live      implicit dependence
                    //RAR: different pipelines   no kill     R1,R2-->live   no dependence
                    if (hasOverlap)
                    {
                        assert(tokenHonourInstruction(liveInst));
                        if (dep == RAW || dep == WAW)
                        {
                            //A RAW/WAW edge between the two nodes already exists: kill the operand
                            //without creating another edge.
                            if (BBVector[i]->isGRFEdgeAdded(curLiveNode, node, dep, DEP_EXPLICT))
                            {
                                send_use_kills.killOperand(bn_it);
                                curLiveNode->setInstKilled(true);  //Instruction level kill
                                instKill = true;
                                addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
                                send_live.setDst(curLiveNode->getSendID(), false);
                                continue;
                            }
                            //WAW need be tracked in both scalar and SIMD control flow
                            //The reason is that:
                            // 1. RA track the liveness in use-->define way
                            // 2. SWSB track  in define-->use way.
                            // For the case like following
                            //
                            //   if
                            //    v1 <--    //v1 is never be used
                            //    if
                            //       <--v1
                            //    endif
                            //   endif
                            //   v2 <--
                            //RA may assign same register to v1 and v2.
                            //Scalar CFG cannot capture the dependence v1-->v2 when they are assigned with same registers.
                            if (afterWrite || dep == WAW)  //There is no RAW kill for SIMDCF
                            {
                                //DPAS-to-DPAS whole overlap with the token reduction option enabled:
                                //the decision depends on the DPAS ID distance compared with tokenAfterDPASCycle.
                                if (fg.builder->getOption(vISA_EnableDPASTokenReduction) &&
                                    node->getLastInstruction()->isDpas() &&
                                    curLiveNode->getLastInstruction()->isDpas() &&
                                    curFootprint->isWholeOverlap(liveFootprint))
                                {
                                    //The current node has the larger DPAS ID
                                    if (node->getDPASID() > curLiveNode->getDPASID())
                                    {
                                        //Distance below the threshold: explicit edge and instruction level kill
                                        if ((node->getDPASID() + curFootprint->offset - (curLiveNode->getDPASID() + internalOffset) < tokenAfterDPASCycle))
                                        {
                                            send_use_kills.killOperand(bn_it);
                                            BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
                                            curLiveNode->setInstKilled(true);  //Instruction level kill
                                            instKill = true;
                                            continue;
                                        }
                                        //Otherwise a WAW only kills the operand, no edge is created
                                        else if (dep == WAW)
                                        {
                                            send_use_kills.killOperand(bn_it);
                                            continue;
                                        }
                                    }

                                    //The current node's DPAS ID is not larger than the live one.
                                    //NOTE(review): presumably a dependence carried around a loop, since the
                                    //loop start/end BBs are consulted - confirm.
                                    if (node->getDPASID() <= curLiveNode->getDPASID())
                                    {
                                        unsigned loopStartBB = BBVector[node->getBBID()]->getLoopStartBBID();
                                        unsigned loopEndBB = BBVector[curLiveNode->getBBID()]->getLoopEndBBID();

                                        if (loopStartBB != -1 && loopEndBB != -1)
                                        {
                                            //DPAS count from the loop start BB to the current node, and from
                                            //the live node to the loop end BB
                                            unsigned frontDist = node->getDPASID() - BBVector[loopStartBB]->first_DPASID;
                                            unsigned endDist = BBVector[loopEndBB]->last_DPASID - curLiveNode->getDPASID();

                                            if ((int)(frontDist + endDist + curFootprint->offset - internalOffset) < tokenAfterDPASCycle)
                                            {
                                                send_use_kills.killOperand(bn_it);
                                                BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
                                                curLiveNode->setInstKilled(true);  //Instruction level kill
                                                instKill = true;
                                                continue;
                                            }
                                            else if (dep == WAW)
                                            {
                                                send_use_kills.killOperand(bn_it);
                                                continue;
                                            }
                                        }
                                        else
                                        {
                                            //No loop start/end BB recorded: always add the explicit edge
                                            send_use_kills.killOperand(bn_it);
                                            BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
                                            curLiveNode->setInstKilled(true);  //Instruction level kill
                                            instKill = true;
                                            continue;
                                        }
                                    }
                                }
                                else
                                {
                                    //General case: explicit edge and instruction level kill
                                    send_use_kills.killOperand(bn_it);
                                    BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
                                    curLiveNode->setInstKilled(true);  //Instruction level kill
                                    instKill = true;

                                    //Kill from live
                                    addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
                                    send_live.setDst(curLiveNode->getSendID(), false);
                                    continue;
                                }
                            }
                        }

                        if (dep == WAR)
                        {
                            bool killed = false;
                            //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
                            if (curFootprint->isWholeOverlap(liveFootprint))
                            {
                                send_use_kills.killOperand(bn_it);
                                if (WARDepRequired(liveInst, curInst))
                                    //Implicit dependence cannot block the following instruction from issue.
                                {
                                    curLiveNode->setSourceKilled(true);
                                }
                                curLiveNode->setAR();
                                killed = true;
                            }

                            if (WARDepRequired(liveInst, curInst))
                            {
                                //A required WAR kills the operand even on partial overlap
                                if (!killed)
                                {
                                    send_use_kills.killOperand(bn_it);
                                    curLiveNode->setSourceKilled(true);
                                    curLiveNode->setAR();
                                    killed = true;
                                }
                                instKill = true;
                                if (!afterWrite) //After read dependence is more comprehensive in SIMDCF, so add edge only in SIMDCF pass
                                {
                                    BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
                                }
                            }
                            else
                            {
                                if (!afterWrite) //After read dependence is more comprehensive in SIMDCF, so add edge only in SIMDCF pass
                                {
                                    BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_IMPLICIT);
                                }
                            }
                            if (killed)
                            {
                                //The source of the live send is no longer live after this node
                                addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
                                send_live.setSrc(curLiveNode->getSendID(), false);
                                continue;
                            }
                        }

                        //No dependence type, same function ID / predicate / exec mask, and whole overlap:
                        //the live operand is dropped without creating an edge
                        if (dep == NODEP &&
                            hasSameFunctionID(liveInst, curInst) &&
                            hasSamePredicator(liveInst, curInst) &&
                            hasSameExecMask(liveInst, curInst))
                        {
                            if (curFootprint->isWholeOverlap(liveFootprint))
                            {
                                send_use_kills.killOperand(bn_it);
                                continue;
                            }
                        }
                    }

                    assert(dep != DEPTYPE_MAX && "dep unassigned?");
                    ++bn_it;
                }
            }

            //A node with at least one predecessor is registered as a send use;
            //its send-use ID is its index in SBSendUses
            if (node->preds.size() != 0)
            {
                addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
                node->reachingSends |= send_reach_all;
                node->setSendUseID(SBSendUses.size());
                SBSendUses.push_back(node);
            }

            //After an instruction level kill, clean up the bucket nodes of the killed nodes
            if (instKill)
            {
                if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
                {
                    BBVector[i]->clearKilledBucketNodeXeHP(&send_use_kills, 0, 0, 0, 0);
                }
                else
                {
                    BBVector[i]->clearKilledBucketNodeXeLP(&send_use_kills, 0);
                }
            }
            //SLM WAR workaround: applies to SLM sends that have no destination or are fences
            if (fg.builder->hasSLMWARIssue() && curInst->isSend() &&
                (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
            {
                BBVector[i]->clearSLMWARWAissue(node, &send_use_kills);
            }
        }
    }

    return;
}
7348 
7349 //
7350 //Works only for RAW and WAW
7351 //Check if edge has been added during the data dependence analysis for SIMDCF control flow.
7352 //If it's added, the tracking for the corresponding bucket will be killed
7353 //
isGRFEdgeAdded(const SBNode * pred,const SBNode * succ,DepType d,SBDependenceAttr a)7354 bool G4_BB_SB::isGRFEdgeAdded(const SBNode* pred, const SBNode* succ, DepType d, SBDependenceAttr a)
7355 {
7356     // When there are multiple dependence edges between two instructions
7357     // We think the RAW and WAW > WAR, which means if WAR co-exists with any other, it will be dropped.
7358     // This is especially important for send instructions. when there are multiple dependencies from same send instruction.
7359     // For the case like following, only the dst
7360     //1. Send r2-r5, r8, ....    $1
7361     //   ...
7362     //7. Add  r8,  r2, r10   test $1D
7363     // For WAW and RAW, we think they are equal.
7364     for (const SBDEP_ITEM& curSucc : pred->succs)
7365     {
7366         if (curSucc.node == succ)
7367         {
7368             //If there is dependence edges already current edge will be ignored if it's WAR
7369             //if exist dependence is RAW or WAW, there is no need to add new edges
7370             if (curSucc.type == RAW || curSucc.type == WAW)
7371             {
7372                 return true;
7373             }
7374         }
7375     }
7376 
7377     return false;
7378 }
7379 
removePredsEdges(SBNode * node,SBNode * pred)7380 void SWSB::removePredsEdges(SBNode* node, SBNode* pred)
7381 {
7382     for (auto pred_it = node->preds.begin();
7383         pred_it != node->preds.end();)
7384     {
7385         if ((*pred_it).node == pred)
7386         {
7387             pred_it = node->preds.erase(pred_it);
7388             continue;
7389         }
7390         pred_it++;
7391     }
7392 
7393     return;
7394 }
7395 
createAddGRFEdge(SBNode * pred,SBNode * succ,DepType d,SBDependenceAttr a)7396 void G4_BB_SB::createAddGRFEdge(SBNode* pred, SBNode* succ, DepType d, SBDependenceAttr a)
7397 {
7398     // When there are multiple dependence edges between two instructions
7399     // We think the RAW and WAW > WAR, which means if WAR co-exists with any other, it will be dropped.
7400     // This is especially important for send instructions. When there are multiple dependencies from same send instruction.
7401     // For the case like following, only the dst
7402     //1. Send r2-r5, r8, ....    $1
7403     //   ...
7404     //7. Add  r8,  r2, r10   test $1D
7405     // For WAW and RAW, we think they are equal.
7406 
7407     for (int i = 0; i < (int)(pred->succs.size()); i++)
7408     {
7409         SBDEP_ITEM& curSucc = pred->succs[i];
7410         if (curSucc.node == succ)
7411         {
7412             //If there is dependence edges already current edge will be ignored if it's WAR
7413             //if exist dependence is RAW or WAW, there is no need to add new edges
7414             if (d == WAR || curSucc.type == RAW || curSucc.type == WAW)
7415             {
7416                 return;
7417             }
7418             //Otherwise, d == RAW or d == WAW, but curSucc.type == WAR
7419             //Change the dependency type to d
7420             curSucc.type = d;
7421             curSucc.attr = a;
7422             bool findPred = false;
7423             for (int j = 0; j < (int)(succ->preds.size()); j++)
7424             {
7425                 SBDEP_ITEM& curPred = succ->preds[j];
7426 
7427                 if (curPred.node == pred)
7428                 {
7429                     curPred.type = d;
7430                     curPred.attr = a;
7431                     findPred = true;
7432                 }
7433             }
7434             assert(findPred);
7435             return;
7436         }
7437     }
7438 
7439     // No edge with the same successor exists. Append this edge.
7440     SBDEP_ITEM newEdge = SBDEP_ITEM(succ, d, a);
7441     pred->succs.emplace_back(newEdge);
7442     newEdge = SBDEP_ITEM(pred, d, a);
7443     succ->preds.emplace_back(newEdge);
7444     return;
7445 }
7446 
7447 
emitRegInfo(std::ostream & output,G4_INST * inst,int offset)7448 void G4_BB::emitRegInfo(std::ostream& output, G4_INST* inst, int offset)
7449 {
7450     output << "#" << inst->getLexicalId() << "|" << offset << ":";
7451     G4_DstRegRegion* dstOpnd = inst->getDst();
7452 
7453     if (dstOpnd &&
7454         !dstOpnd->isIndirect() &&
7455         dstOpnd->isGreg())
7456     {
7457         uint32_t byteAddress = dstOpnd->getLinearizedStart();
7458         unsigned dstReg0 = byteAddress / numEltPerGRF<Type_UB>();
7459         output << " {";
7460         output << "D:" << dstReg0;
7461         output << "}";
7462     }
7463 
7464     for (int i = 0; i < inst->getNumSrc(); i++)
7465     {
7466         G4_Operand* srcOpnd = inst->getSrc(i);
7467         if (srcOpnd)
7468         {
7469             if (srcOpnd->isSrcRegRegion() &&
7470                 srcOpnd->asSrcRegRegion()->getBase() &&
7471                 !srcOpnd->asSrcRegRegion()->isIndirect() &&
7472                 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
7473             {
7474                 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
7475                 if (baseVar->isGreg()) {
7476                     uint32_t byteAddress = srcOpnd->getLinearizedStart();
7477                     unsigned srcReg = byteAddress / numEltPerGRF<Type_UB>();
7478                     output << " {";
7479                     output << "S" << i;
7480                     output << ":" << srcReg;
7481                     output << "}";
7482                 }
7483             }
7484         }
7485     }
7486 
7487     output << std::endl;
7488     return;
7489 }
7490 
isSWSBRequired(IR_Builder * builder,G4_INST * inst)7491 static bool isSWSBRequired(IR_Builder* builder, G4_INST* inst)
7492 {
7493     // Iterate over all operands and create buckets.
7494     for (Gen4_Operand_Number opndNum
7495         : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_src3, Opnd_dst}) {
7496         G4_Operand* opnd = inst->getOperand(opndNum);
7497         // Skip if no operand or the operand is not touched by the instruction
7498         if (!opnd || !opnd->getBase()) {
7499             continue;
7500         }
7501         if (opnd->isLabel() || opnd->isImm())
7502         {
7503             continue;
7504         }
7505 
7506         G4_VarBase* base = opnd->getBase();
7507         assert(base && "If no base, then the operand is not touched by the instr.");
7508         G4_VarBase* phyReg = (base->isRegVar()) ? base->asRegVar()->getPhyReg() : base;
7509 
7510         if (phyReg->getKind() == G4_VarBase::VK_phyGReg)
7511         {
7512             return true;
7513         }
7514         if (phyReg->getKind() == G4_VarBase::VK_phyAReg)
7515         {
7516             if (phyReg->getAreg()->getArchRegType() == AREG_A0)
7517             {
7518                 return true;
7519             }
7520             if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7521             {
7522                 return true;
7523             }
7524         }
7525 
7526     }
7527 
7528     return false;
7529 }
7530 
setForceDebugSWSB(IR_Builder * builder,G4_BB * bb,INST_LIST_ITER inst_it)7531 static G4_INST* setForceDebugSWSB(IR_Builder* builder, G4_BB* bb, INST_LIST_ITER inst_it)
7532 {
7533     G4_INST* inst = (*inst_it);
7534     G4_INST* syncInst = nullptr;
7535 
7536     if (!isSWSBRequired(builder, inst))
7537     {
7538         return nullptr;
7539     }
7540 
7541     if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7542     {
7543         if (!inst->tokenHonourInstruction())
7544         {
7545             inst->setDistance(1);
7546             inst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7547         }
7548         else
7549         {
7550             G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7551             G4_INST* extraSyncInst = builder->createSync(G4_sync_nop, src0);
7552             extraSyncInst->setDistance(1);
7553             extraSyncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7554             bb->insertBefore(inst_it, extraSyncInst);
7555         }
7556     }
7557     else
7558     {
7559         inst->setDistance(1);
7560     }
7561 
7562     if (inst->tokenHonourInstruction())
7563     {
7564         inst->setSetToken(0);
7565         if (inst->isEOT())
7566         {
7567             inst->setDistance(1);
7568             if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7569             {
7570                 inst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7571             }
7572         }
7573         G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7574         syncInst = builder->createSync(G4_sync_nop, src0);
7575         G4_Operand* opnd = inst->getOperand(Opnd_dst);
7576         SWSBTokenType tokenType = SWSBTokenType::TOKEN_NONE;
7577         if (!opnd || !opnd->getBase() || opnd->isNullReg())
7578         {
7579             tokenType = SWSBTokenType::AFTER_READ;
7580         }
7581         else
7582         {
7583             tokenType = SWSBTokenType::AFTER_WRITE;
7584         }
7585         syncInst->setToken(0);
7586         syncInst->setTokenType(tokenType);
7587     }
7588 
7589     return syncInst;
7590 }
7591 
forceDebugSWSB(G4_Kernel * kernel)7592 void vISA::forceDebugSWSB(G4_Kernel* kernel)
7593 {
7594     BB_LIST_ITER bbEnd = kernel->fg.end();
7595     int instID = 0;
7596 
7597     for (BB_LIST_ITER bb_it = kernel->fg.begin();
7598         bb_it != bbEnd;
7599         bb_it++)
7600     {
7601         G4_BB* bb = (*bb_it);
7602         if (bb->size() > 0)
7603         {
7604             INST_LIST_ITER inst_end = bb->end();
7605             for (INST_LIST_ITER inst_it = bb->begin();
7606                 inst_it != inst_end;
7607                 inst_it++)
7608             {
7609                 G4_INST* inst = (*inst_it);
7610                 G4_INST* newInst = nullptr;
7611 
7612                 newInst = setForceDebugSWSB(kernel->fg.builder, bb, inst_it);
7613                 inst->setLexicalId(instID);
7614                 instID++;
7615 
7616                 if (newInst)
7617                 {
7618                     INST_LIST_ITER new_it = inst_it;
7619                     new_it++;
7620                     bb->insertBefore(new_it, newInst);
7621                     newInst->setLexicalId(instID);
7622                     instID++;
7623                     if (new_it == bb->end())
7624                     {
7625                         break;
7626                     }
7627                     inst_it++;
7628                 }
7629             }
7630         }
7631     }
7632 }
7633 
setInstructionStallSWSB(IR_Builder * builder,G4_BB * bb,INST_LIST_ITER & inst_it)7634 static void setInstructionStallSWSB(IR_Builder* builder,
7635     G4_BB* bb,
7636     INST_LIST_ITER& inst_it)
7637 {
7638     G4_INST* inst = *inst_it;
7639     INST_LIST_ITER next_it = inst_it;
7640     next_it++;
7641 
7642     if (!inst->distanceHonourInstruction() &&
7643         !inst->tokenHonourInstruction())
7644     {
7645         return;
7646     }
7647 
7648     if (inst->distanceHonourInstruction())
7649     {
7650         G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7651         G4_INST* extraSyncInst = builder->createSync(G4_sync_nop, src0);
7652         extraSyncInst->setDistance(1);
7653         if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7654         {
7655             extraSyncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7656         }
7657         bb->insertBefore(inst_it, extraSyncInst);
7658 
7659         return;
7660     }
7661 
7662     if (inst->tokenHonourInstruction())
7663     {
7664         G4_SrcRegRegion* src0_1 = builder->createNullSrc(Type_UD);
7665         G4_INST* extraSyncInst = builder->createSync(G4_sync_nop, src0_1);
7666         extraSyncInst->setDistance(1);
7667         if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7668         {
7669             extraSyncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7670         }
7671         bb->insertBefore(inst_it, extraSyncInst);
7672 
7673         if (!inst->isEOT())
7674         {
7675             G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7676             G4_INST* syncInst = builder->createSync(G4_sync_nop, src0);
7677 
7678             unsigned short token = inst->getSetToken();
7679             SWSBTokenType tokenType = SWSBTokenType::TOKEN_NONE;
7680             G4_Operand* opnd = inst->getOperand(Opnd_dst);
7681             if (!opnd || !opnd->getBase() || opnd->isNullReg())
7682             {
7683                 tokenType = SWSBTokenType::AFTER_READ;
7684             }
7685             else
7686             {
7687                 tokenType = SWSBTokenType::AFTER_WRITE;
7688             }
7689             syncInst->setToken(token);
7690             syncInst->setTokenType(tokenType);
7691             inst_it = bb->insertBefore(next_it, syncInst);
7692         }
7693     }
7694 
7695     return;
7696 }
7697 
setInstructionBarrierSWSB(IR_Builder * builder,G4_BB * bb,INST_LIST_ITER & inst_it)7698 static void setInstructionBarrierSWSB(IR_Builder* builder,
7699     G4_BB* bb,
7700     INST_LIST_ITER& inst_it)
7701 {
7702 
7703     G4_INST* syncAllRdInst = nullptr;
7704     G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7705     syncAllRdInst = builder->createSync(G4_sync_allrd, src0);
7706     syncAllRdInst->setDistance(1);
7707     if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7708     {
7709         syncAllRdInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7710     }
7711     INST_LIST_ITER next_it = inst_it;
7712     next_it++;
7713     inst_it = bb->insertBefore(next_it, syncAllRdInst);
7714 
7715     G4_INST* syncAllWrInst = nullptr;
7716     src0 = builder->createNullSrc(Type_UD);
7717     syncAllWrInst = builder->createSync(G4_sync_allwr, src0);
7718 
7719     next_it = inst_it;
7720     next_it++;
7721     inst_it = bb->insertBefore(next_it, syncAllWrInst);
7722 }
7723 
7724 
singleInstStallSWSB(G4_Kernel * kernel,uint32_t instID,uint32_t endInstID,bool is_barrier)7725 void vISA::singleInstStallSWSB(G4_Kernel* kernel, uint32_t instID, uint32_t endInstID, bool is_barrier)
7726 {
7727     BB_LIST_ITER bbEnd = kernel->fg.end();
7728 
7729     for (BB_LIST_ITER bb_it = kernel->fg.begin();
7730         bb_it != bbEnd;
7731         bb_it++)
7732     {
7733         G4_BB* bb = (*bb_it);
7734 
7735         if (bb->size() > 0)
7736         {
7737             INST_LIST_ITER inst_end = bb->end();
7738             for (INST_LIST_ITER inst_it = bb->begin();
7739                 inst_it != inst_end;
7740                 inst_it++)
7741             {
7742                 G4_INST* inst = (*inst_it);
7743 
7744                 if (is_barrier && inst->getLexicalId() == instID)
7745                 {
7746                     setInstructionBarrierSWSB(kernel->fg.builder, bb, inst_it);
7747                 }
7748                 else
7749                 {
7750 
7751                     if ((inst->getLexicalId() <= (int)endInstID &&
7752                         inst->getLexicalId() >= (int)instID) ||
7753                         (inst->getLexicalId() == instID))
7754                     {
7755                         setInstructionStallSWSB(kernel->fg.builder, bb, inst_it);
7756                     }
7757                 }
7758             }
7759         }
7760     }
7761 }
7762 
dumpImmDom(ImmDominator * dom) const7763 void SWSB::dumpImmDom(ImmDominator* dom) const
7764 {
7765     for (auto bb : fg)
7766     {
7767         printf("BB%d %d:%d - SUCC:", bb->getId(), BBVector[bb->getId()]->first_node, BBVector[bb->getId()]->last_node);
7768         for (auto succ : bb->Succs)
7769         {
7770             printf("BB%d, ", succ->getId());
7771         }
7772         printf("--PRED:");
7773         for (auto pred : bb->Preds)
7774         {
7775             printf("BB%d, ", pred->getId());
7776         }
7777         auto& idomBB = dom->getIDoms()[bb->getId()];
7778         assert(idomBB != nullptr);
7779         printf("\n\t iDOM: BB%d -- DOM SUCC: ", dom->getIDoms()[bb->getId()]->getId());
7780         for (const G4_BB_SB* succ : BBVector[bb->getId()]->domSuccs)
7781         {
7782             printf("BB%d, ", succ->getBB()->getId());
7783         }
7784         printf("\n");
7785     }
7786 }
7787