1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "SWSB_G4IR.h"
10 #include "Dependencies_G4IR.h"
11 #include "../G4_Opcode.h"
12 #include "../Timer.h"
13 #include "../RegAlloc.h"
14 #include "visa_wa.h"
15
16 #include <algorithm>
17 #include <fstream>
18 #include <functional>
19 #include <sstream>
20 #include <queue>
21
22 using namespace vISA;
23
24 static uint8_t getDPASPipelineCycle(uint8_t repc)
25 {
26 switch (repc)
27 {
28 case REP_1:
29 return DPAS_8x1_CYCLE;
30 case REP_2:
31 return DPAS_8x2_CYCLE;
32 case REP_4:
33 return DPAS_8x4_CYCLE;
34 case REP_8:
35 return DPAS_8x8_CYCLE;
36 default:
37 assert(0 && "Unexpected DPAS repeat count");
38 }
39
40 return 0;
41 }
42
43 static uint8_t getDPASGRFReadCycle(uint8_t repc)
44 {
45 switch (repc)
46 {
47 case REP_1:
48 return DPAS_8x1_GRFREAD_CYCLE;
49 case REP_2:
50 return DPAS_8x2_GRFREAD_CYCLE;
51 case REP_4:
52 return DPAS_8x4_GRFREAD_CYCLE;
53 case REP_8:
54 return DPAS_8x8_GRFREAD_CYCLE;
55 default:
56 assert(0 && "Unexpected DPAS repeat count");
57 }
58
59 return 0;
60 }
61
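// Returns true if the two instructions belong to the same in-order "function" for
// scoreboarding: sends match only when they target the same SFID, math-pipe and DPAS
// instructions match only their own kind, and all remaining (ALU) instructions are
// treated as one group.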
62 static bool hasSameFunctionID(const G4_INST* inst1, const G4_INST* inst2)
63 {
64 if (inst1->isSend() && inst2->isSend())
65 {
66 G4_SendDesc* msgDesc1 = inst1->getMsgDesc();
67 G4_SendDesc* msgDesc2 = inst2->getMsgDesc();
68
69 if (msgDesc1->isSLM() && msgDesc2->isSLM())
70 {
71 return (msgDesc1->getSFID() == msgDesc2->getSFID());
72 }
73 else if (msgDesc1->isSLM() || msgDesc2->isSLM())
74 {
75 return false;
76 }
77
78 return (msgDesc1->getSFID() == msgDesc2->getSFID());
79 }
80 else if (inst1->isSend() || inst2->isSend())
81 {
82 return false;
83 }
84 else if (inst1->isMathPipeInst() && inst2->isMathPipeInst())
85 {
86 return true;
87 }
88 else if (inst1->isDpas() && inst2->isDpas())
89 {
90 return true;
91 }
92 else if (inst1->isDpas() || inst2->isDpas())
93 {
94 return false;
95 }
96 else if (inst1->isMathPipeInst() || inst2->isMathPipeInst())
97 {
98 return false;
99 }
100 else
101 {
102 return true;
103 }
104 }
105
106 static bool isSLMMsg(const G4_INST* inst)
107 {
108 assert(inst->isSend());
109 const G4_SendDesc* msgDesc = inst->getMsgDesc();
110 if (msgDesc->isSLM())
111 {
112 return true;
113 }
114 return false;
115 }
116
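// A send that reads memory but has no GRF destination (null dst) is treated as a prefetch.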
117 static bool isPrefetch(const G4_INST* inst)
118 {
119 if(!inst->isSend())
120 {
121 return false;
122 }
123
124 const G4_SendDesc* msgDesc = inst->getMsgDesc();
125 if (msgDesc->isRead() && (inst->getDst() == nullptr || inst->getDst()->isNullReg()))
126 {
127 return true;
128 }
129 return false;
130 }
131
132 static bool isFence(const G4_INST* inst)
133 {
134 assert(inst->isSend());
135 const G4_SendDesc* msgDesc = inst->getMsgDesc();
136 if (msgDesc->isFence())
137 {
138 return true;
139 }
140 return false;
141 }
142
143 static bool hasSamePredicator(const G4_INST* inst1, const G4_INST* inst2)
144 {
145 G4_Predicate* pred1 = inst1->getPredicate();
146 G4_Predicate* pred2 = inst2->getPredicate();
147
148 if (pred1 && pred2)
149 {
150 bool flagRegNumValid = true;
151 unsigned short refOff1 = pred1->getBase()->ExRegNum(flagRegNumValid);
152 unsigned short subRefOff1 = pred1->getBase()->asRegVar()->getPhyRegOff();
153 unsigned short refOff2 = pred2->getBase()->ExRegNum(flagRegNumValid);
154 unsigned short subRefOff2 = pred2->getBase()->asRegVar()->getPhyRegOff();
155
156 if (refOff1 == refOff2 &&
157 subRefOff1 == subRefOff2)
158 {
159 return true;
160 }
161 return false;
162 }
163
164 if (pred1 || pred2)
165 {
166 return false;
167 }
168
169 if (inst1->isWriteEnableInst() || inst2->isWriteEnableInst())
170 {
171 return false;
172 }
173
174 return true;
175 }
176
177 static bool hasSameExecMask(const G4_INST* inst1, const G4_INST* inst2)
178 {
179 uint16_t mask1 = inst1->getMaskOffset();
180 uint16_t mask2 = inst2->getMaskOffset();
181 if (mask1 != mask2)
182 {
183 return false;
184 }
185
186 unsigned char execSize1 = inst1->getExecSize();
187 unsigned char execSize2 = inst2->getExecSize();
188 if (execSize1 != execSize2)
189 {
190 return false;
191 }
192
193 return true;
194 }
195
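// Returns true when an explicit WAR dependence is required: the two instructions are on
// different functions/pipes, or they differ in predication or execution mask.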
196 static bool WARDepRequired(const G4_INST* inst1, const G4_INST* inst2)
197 {
198 if (!hasSameFunctionID(inst1, inst2) ||
199 (hasSameFunctionID(inst1, inst2) &&
200 (!hasSamePredicator(inst1, inst2) ||
201 !hasSameExecMask(inst1, inst2))))
202 {
203 return true;
204 }
205
206 return false;
207 }
208
209 // check if two operands occupy overlapping GRFs
210 // we put them here instead of inside G4_Operand since this check is only valid after RA
211 // It's the caller's responsibility to ensure that opnd1 and opnd2 are both GRF allocated
212 static bool operandOverlap(G4_Operand* opnd1, G4_Operand* opnd2)
213 {
214 return (opnd1->getLinearizedStart() <= opnd2->getLinearizedStart() &&
215 opnd1->getLinearizedEnd() > opnd2->getLinearizedStart()) ||
216 (opnd2->getLinearizedStart() <= opnd1->getLinearizedStart() &&
217 opnd2->getLinearizedEnd() > opnd1->getLinearizedStart());
218 }
219
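// Map a DPAS operand precision to the G4_Type used for footprint size computation;
// BF8 and TF32 map to Type_UNDEF here.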
220 static G4_Type getDPASDataType(GenPrecision p)
221 {
222 switch (p)
223 {
224 case GenPrecision::U1:
225 case GenPrecision::U2:
226 case GenPrecision::U4:
227 case GenPrecision::U8: return Type_UB;
228 case GenPrecision::S2:
229 case GenPrecision::S4:
230 case GenPrecision::S8: return Type_B;
231 case GenPrecision::FP16: return Type_HF;
232 case GenPrecision::BF16: return Type_BF;
233 case GenPrecision::BF8: return Type_UNDEF;
234 case GenPrecision::TF32: return Type_UNDEF;
235 default:
236 assert(false && "illegal Operand Precision");
237 return Type_UD;
238 }
239 }
240
241 // Compute the range of registers touched by OPND.
242 SBFootprint* G4_BB_SB::getFootprintForGRF(
243 G4_Operand* opnd,
244 Gen4_Operand_Number opnd_num,
245 G4_INST* inst,
246 int startingBucket,
247 bool mustBeWholeGRF)
248 {
249 unsigned short LB = 0;
250 unsigned short RB = 0;
251 int aregOffset = totalGRFNum;
252 G4_Type type = opnd->getType();
253 if (inst->opcode() == G4_fcvt &&
254 (type == Type_UB ||
255 (type == Type_UD && builder.hasPartialInt64Support())))
256 {
257 type = Type_F;
258 }
259 if (inst->opcode() == G4_srnd)
260 { // srnd ub hf hf | srnd hf f f
261 type = inst->getSrc(0)->getType();
262 }
263
264 if (inst->isDpas() && (opnd_num == Opnd_src1 || opnd_num == Opnd_src2))
265 {
266 if (opnd_num == Opnd_src1)
267 {
268 type = getDPASDataType(inst->asDpasInst()->getSrc1Precision());
269 }
270 if (opnd_num == Opnd_src2)
271 {
272 type = getDPASDataType(inst->asDpasInst()->getSrc2Precision());
273 }
274 }
275
276 switch (opnd_num) {
277 case Opnd_src0:
278 case Opnd_src1:
279 case Opnd_src2:
280 case Opnd_src3:
281 case Opnd_dst:
282 LB = (unsigned short)opnd->getLinearizedStart();
283 RB = (unsigned short)opnd->getLinearizedEnd();
284 if (inst->isSend())
285 {
286 assert((LB % numEltPerGRF<Type_UB>()) == 0);
287 //For the operands of the send instructions,
288 //we use the message length to avoid inconsistency with the HW requirement.
289 //
290 if (opnd_num == Opnd_src0)
291 {
292 RB = LB + numEltPerGRF<Type_UB>() * inst->getMsgDesc()->getSrc0LenRegs() - 1;
293 }
294
295 if (inst->isSplitSend() &&
296 opnd_num == Opnd_src1)
297 {
298 RB = LB + numEltPerGRF<Type_UB>() * inst->getMsgDesc()->getSrc1LenRegs() - 1;
299 }
300
301 if (opnd_num == Opnd_dst)
302 {
303 int dstSize = inst->getMsgDesc()->getDstLenRegs();
304 // DG2 A0 W/A to treat SIMD8 SLM load with single GRF return as two GRF return
305 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_14012562260) &&
306 inst->getExecSize() <= 8 && isSLMMsg(inst) && dstSize == 1)
307 {
308 if ((LB / numEltPerGRF<Type_UB>()) < 127)
309 {
310 dstSize = 2;
311 }
312 }
313
314 if ((LB / numEltPerGRF<Type_UB>()) < (unsigned short)(totalGRFNum - 1))
315 {
316 RB = LB + numEltPerGRF<Type_UB>() * dstSize - 1;
317 }
318 }
319
320 assert(RB < (numEltPerGRF<Type_UB>() * aregOffset) && "Out of register bound");
321 }
322 //HW WA for DPAS src2: treat every src2 as an 8x8 src2 to avoid the read suppression issue
323 if (builder.hasDPASSrc2ReadSuppressionDepIssue() &&
324 inst->opcode() == G4_dpas && opnd_num == Opnd_src2)
325 {
326 const G4_InstDpas* dpasInst = inst->asDpasInst();
327 uint32_t bytesPerLane = dpasInst->getSrc2SizePerLaneInByte();
328 uint32_t bytes = bytesPerLane * 8* 8;
329 RB = LB + bytes - 1;
330 }
331
332 //HW WA for DPAS src1: treat every src1 as 8 GRFs in size
333 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_14013341720) &&
334 inst->opcode() == G4_dpas && opnd_num == Opnd_src1)
335 {
336 uint32_t bytes = getGRFSize() * 8;
337 RB = LB + bytes - 1;
338 }
339 break;
340 default:
341 assert(0 && "Bad opnd");
342 }
343
344 void* allocedMem = mem.alloc(sizeof(SBFootprint));
345 if (startingBucket >= aregOffset)
346 {
347 LB = startingBucket * numEltPerGRF<Type_UB>() + LB;
348 RB = startingBucket * numEltPerGRF<Type_UB>() + RB;
349 }
350
351 //This is a WA which assumes the whole GRF is touched by a send instruction, no matter how much of it actually holds valid data.
352 //FIXME: this is not true for media block read/write, which can specify a byte-level size in the descriptor and requires no GRF alignment.
353 if (mustBeWholeGRF)
354 {
355 LB = (LB / numEltPerGRF<Type_UB>()) * numEltPerGRF<Type_UB>();
356 RB = ((RB / numEltPerGRF<Type_UB>()) + 1) * numEltPerGRF<Type_UB>() - 1;
357 }
358
359 SBFootprint* footprint = new (allocedMem)SBFootprint(GRF_T, type, LB, RB, inst);
360
361 return footprint;
362 }
363
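// Returns true when the operand's type and the execution size imply that the operation
// uses both accumulator registers, so the ACC footprint below is widened to two
// GRF-sized units.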
364 bool needBothAcc(IR_Builder& builder, G4_INST* inst, G4_Operand * opnd)
365 {
366 switch (opnd->getType())
367 {
368 case Type_F:
369 return inst->getExecSize() == G4_ExecSize(builder.getNativeExecSize() * 2);
370 case Type_HF:
371 case Type_BF:
372 return false;
373 case Type_DF:
374 return inst->getExecSize() > G4_ExecSize(builder.getNativeExecSize() / 2);
375 default:
376 return true;
377 }
378 }
379
380
381 // Compute the range of registers touched by OPND.
382 SBFootprint* G4_BB_SB::getFootprintForACC(G4_Operand* opnd,
383 Gen4_Operand_Number opnd_num,
384 G4_INST* inst)
385 {
386 unsigned short LB = 0;
387 unsigned short RB = 0;
388 G4_Type type = opnd->getType();
389
390 switch (opnd_num) {
391 case Opnd_src0:
392 case Opnd_src1:
393 case Opnd_src2:
394 case Opnd_src3:
395 case Opnd_dst:
396 case Opnd_implAccSrc:
397 case Opnd_implAccDst:
398 LB = (unsigned short)opnd->getLinearizedStart();
399 RB = (unsigned short)opnd->getLinearizedEnd();
400 break;
401 default:
402 assert(0 && "Bad opnd");
403 }
404
405 if (needBothAcc(builder, inst, opnd))
406 {
407 if (((RB - LB + 1) / numEltPerGRF<Type_UB>()) < 2)
408 {
409 RB = LB + numEltPerGRF<Type_UB>() * 2 - 1;
410 }
411 }
412 int regNum = 0;
413 if (opnd->isDstRegRegion())
414 regNum += opnd->asDstRegRegion()->getRegOff();
415 else if (opnd->isSrcRegRegion())
416 regNum += opnd->asSrcRegRegion()->getRegOff();
417
418 LB += regNum * numEltPerGRF<Type_UB>();
419 RB += regNum * numEltPerGRF<Type_UB>();
420
421 void* allocedMem = mem.alloc(sizeof(SBFootprint));
422 SBFootprint* footprint = nullptr;
423
424 footprint = new (allocedMem)SBFootprint(ACC_T, type, LB, RB, inst);
425
426 return footprint;
427 }
428
429 // Compute the range of flag registers touched by OPND.
430 // Treat each 16 bits of the flag register as a bucket unit of GRF size:
431 // 64-byte GRF: each bit means 8 bytes
432 // 32-byte GRF: each bit means 4 bytes
433 SBFootprint* G4_BB_SB::getFootprintForFlag(G4_Operand* opnd,
434 Gen4_Operand_Number opnd_num,
435 G4_INST* inst)
436 {
437 unsigned short LB = 0;
438 unsigned short RB = 0;
439 G4_Type type = opnd->getType();
440 bool valid = true;
441 unsigned subRegOff = opnd->getBase()->ExSubRegNum(valid);
442 LB = (unsigned short)(opnd->getLeftBound() + subRegOff * 16) * FLAG_TO_GRF_MAP;
443 RB = (unsigned short)(opnd->getRightBound() + subRegOff * 16) * FLAG_TO_GRF_MAP;
444
445 LB += (builder.kernel.getNumRegTotal() + builder.getNumScalarRegisters() + builder.kernel.getNumAcc()) * numEltPerGRF<Type_UB>();
446 RB += (builder.kernel.getNumRegTotal() + builder.getNumScalarRegisters() + builder.kernel.getNumAcc()) * numEltPerGRF<Type_UB>();
447
448 void* allocedMem = mem.alloc(sizeof(SBFootprint));
449 SBFootprint* footprint = nullptr;
450
451 footprint = new (allocedMem)SBFootprint(FLAG_T, type, LB, RB, inst);
452
453 return footprint;
454 }
455
456
457 static bool compareInterval(SBNode* n1, SBNode* n2)
458 {
459 return n1->getLiveStartID() < n2->getLiveStartID();
460 }
461
462 static bool compareBBStart(G4_BB_SB* b1, G4_BB_SB* b2)
463 {
464 return b1->first_node < b2->first_node;
465 }
466
467 static bool nodeSortCompare(SBDEP_ITEM dep1, SBDEP_ITEM dep2)
468 {
469 if (dep1.node->getBBID() < dep2.node->getBBID())
470 {
471 return true;
472 }
473 else if (dep1.node->getBBID() == dep2.node->getBBID())
474 {
475 return (dep1.node->getNodeID() < dep2.node->getNodeID());
476 }
477
478 return false;
479 }
480
481 // Return TRUE if opnd corresponding to opndNum has indirect access.
482 static inline bool hasIndirection(const G4_Operand* opnd, Gen4_Operand_Number opndNum) {
483 switch (opndNum) {
484 case Opnd_dst:
485 return opnd->asDstRegRegion()->isIndirect();
486 case Opnd_src0:
487 case Opnd_src1:
488 case Opnd_src2:
489 return opnd->asSrcRegRegion()->isIndirect();
490 case Opnd_src3:
491 case Opnd_pred:
492 case Opnd_condMod:
493 case Opnd_implAccSrc:
494 case Opnd_implAccDst:
495 return false;
496 default:
497 assert(0 && "Bad opndNum");
498 return false; // Unreachable
499 }
500 }
501
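// True for instructions whose dependences are tracked with the in-order distance
// encoding: everything except token-honour instructions, wait, nop and halt.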
502 static inline bool distanceHonourInstruction(const G4_INST* inst)
503 {
504 return !inst->tokenHonourInstruction() && !inst->isWait() && inst->opcode() != G4_nop && inst->opcode() != G4_halt;
505 }
506
507 static inline bool tokenHonourInstruction(const G4_INST* inst)
508 {
509 return inst->tokenHonourInstruction();
510 }
511
512 //Generate the dependence distance
513 void SWSB::setDefaultDistanceAtFirstInstruction()
514 {
515 for (auto bb : fg)
516 {
517 for (auto it = bb->begin();
518 it != bb->end();
519 it++)
520 {
521 if (!(*it)->isLabel())
522 {
523 (*it)->setDistance(1);
524 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
525 {
526 (*it)->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
527 }
528 if (fg.builder->getFCPatchInfo()->getFCComposableKernel() && fg.builder->hasFourALUPipes())
529 {
530 insertSyncAllWRInstruction(bb, 0, it, (*it)->getCISAOff(), (*it)->getLineNo());
531 insertSyncAllRDInstruction(bb, 0, it, (*it)->getCISAOff(), (*it)->getLineNo());
532 }
533
534 return;
535 }
536 }
537 }
538 }
539
540 void SWSB::addSIMDEdge(G4_BB_SB* pred, G4_BB_SB* succ)
541 {
542 pred->Succs.push_back(succ);
543 succ->Preds.push_back(pred);
544 }
545
546 // Build the SIMD CFG for global WAR dependence tracking
547 // 1. When building the CFG, all branch edges except the backedge use the JIP target.
548 // 2. For join and endif instructions which are not split out and sit at the head of a BB, we propagate the edge.
549 //    For example, with BBs a, b, c, d: if BB b starts with a join whose JIP targets d and there is an edge from a to b, we add an edge from a to d instead of from b to d.
550 void SWSB::SWSBBuildSIMDCFG()
551 {
552 //Build parallel control flow graph
553 for (size_t i = 0; i < BBVector.size(); i++)
554 {
555 G4_BB_SB* currBB = BBVector[i];
556 const G4_INST* lastInst = currBB->getBB()->back();
557 for (const G4_INST* firstInst : *currBB->getBB())
558 {
559 if (firstInst->isLabel())
560 continue;
561
562 if (firstInst != lastInst &&
563 G4_Inst_Table[firstInst->opcode()].instType == InstTypeFlow)
564 {
565 if (firstInst->asCFInst()->getJip())
566 {
567 G4_Operand* jip = firstInst->asCFInst()->getJip();
568 G4_BB_SB* targetBB = labelToBlockMap[jip->asLabel()];
569
570 //Do we need to propagate edge for fall through preds?
571 for (G4_BB_SB* predBB : currBB->Preds)
572 {
573 addSIMDEdge(predBB, targetBB);
574 }
575 }
576 }
577 break;
578 }
579
580 if (lastInst->isEOT())
581 {
582 continue;
583 }
584
585 if (G4_Inst_Table[lastInst->opcode()].instType == InstTypeFlow)
586 {
587 G4_opcode op = lastInst->opcode();
588
589 if (op == G4_jmpi)
590 {
591 G4_Operand* jip = lastInst->getSrc(0);
592 G4_BB_SB* targetBB = labelToBlockMap[jip->asLabel()];
593 addSIMDEdge(currBB, targetBB);
594 if (lastInst->getPredicate())
595 {
596 if (i + 1 != BBVector.size())
597 {
598 addSIMDEdge(currBB, BBVector[i + 1]);
599 }
600 }
601 }
602 else if (lastInst->isReturn() || lastInst->isCall() ||
603 lastInst->isFReturn() || lastInst->isFCall())
604 {
605 for (const G4_BB* bb : currBB->getBB()->Succs)
606 {
607 unsigned bbID = bb->getId();
608 addSIMDEdge(currBB, BBVector[bbID]);
609 }
610 }
611 else if (lastInst->asCFInst()->getJip())
612 {
613 if (op == G4_goto)
614 {
615 G4_Operand* jip = lastInst->asCFInst()->getJip();
616 G4_Operand* uip = lastInst->asCFInst()->getUip();
617 G4_BB_SB* jipBB = labelToBlockMap[jip->asLabel()];
618 G4_BB_SB* uipBB = labelToBlockMap[uip->asLabel()];
619 if (jipBB != uipBB && jipBB->first_node > uipBB->first_node)
620 {//backedge, goto uip
621 addSIMDEdge(currBB, uipBB);
622 }
623 else //goto jip
624 {
625 addSIMDEdge(currBB, jipBB);
626 }
627
628 if (lastInst->getPredicate())
629 {
630 if (i + 1 != BBVector.size())
631 {
632 addSIMDEdge(currBB, BBVector[i + 1]);
633 }
634 }
635 }
636 else if (op == G4_break)
637 {
638 G4_Operand* jip = lastInst->asCFInst()->getJip();
639 G4_Operand* uip = lastInst->asCFInst()->getUip();
640 G4_BB_SB* jipBB = labelToBlockMap[jip->asLabel()];
641 G4_BB_SB* uipBB = labelToBlockMap[uip->asLabel()];
642 if (jipBB == uipBB)
643 {
644 G4_BB* bb = jipBB->getBB();
645 unsigned bbID = bb->getId();
646 assert(bbID + 1 != BBVector.size());
647 addSIMDEdge(currBB, BBVector[bbID + 1]);
648 }
649 else //Add the jip edge to the CFG
650 {
651 addSIMDEdge(currBB, jipBB);
652 }
653 if (i + 1 != BBVector.size())
654 {
655 addSIMDEdge(currBB, BBVector[i + 1]);
656 }
657 }
658 else
659 {
660 G4_Operand* jip = lastInst->asCFInst()->getJip();
661 G4_BB_SB* targetBB = labelToBlockMap[jip->asLabel()];
662 addSIMDEdge(currBB, targetBB);
663 if (i + 1 != BBVector.size())
664 {
665 addSIMDEdge(currBB, BBVector[i + 1]);
666 }
667 }
668 }
669 else
670 {
671 if (i + 1 != BBVector.size())
672 {
673 addSIMDEdge(currBB, BBVector[i + 1]);
674 }
675 }
676 }
677 else
678 {
679 if (i + 1 != BBVector.size())
680 {
681 addSIMDEdge(currBB, BBVector[i + 1]);
682 }
683 }
684 }
685 }
686
687 //Generate the dependence distance
688 void SWSB::SWSBDepDistanceGenerator(PointsToAnalysis& p, LiveGRFBuckets& LB, LiveGRFBuckets& globalSendsLB)
689 {
690 BB_LIST_ITER ib(fg.begin()), bend(fg.end());
691
692 //Initialize global data
693 BBVector.resize(fg.size());
694
695 //Set distance 1 at the first instruction in case there are runtime-inserted instructions in the prolog
696 if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D ||
697 fg.builder->getOptions()->getOption(vISA_SWSBStitch) )
698 {
699 setDefaultDistanceAtFirstInstruction();
700 }
701
702 unsigned nestLoopLevel = 0;
703 //Local dependence analysis
704 for (; ib != bend; ++ib)
705 {
706 BBVector[(*ib)->getId()] = new (mem)G4_BB_SB(
707 *this,
708 *(fg.builder),
709 mem,
710 *ib,
711 &SBNodes,
712 &SBSendNodes,
713 &globalSendOpndList,
714 &indexes,
715 globalSendNum,
716 &LB,
717 &globalSendsLB,
718 p,
719 &labelToBlockMap,
720 tokenAfterDPASCycle);
721 if ((*ib)->getNestLevel())
722 {
723 nestLoopLevel = nestLoopLevel < (*ib)->getNestLevel() ? (*ib)->getNestLevel() : nestLoopLevel;
724 }
725 }
726 }
727
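// For BBs ending with call/fcall or return/fret, add explicit edges from any global send
// operands that are still live out (WAR for sources, RAW for destinations) so outstanding
// sends are settled at the call boundary; returns additionally get a distance of 1
// (DISTALL on platforms with three or four in-order ALU pipes).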
728 void SWSB::handleFuncCall()
729 {
730 for (G4_BB_SB *bb : BBVector)
731 {
732 if (bb->last_node == -1)
733 {
734 continue;
735 }
736
737 SBNode* node = SBNodes[bb->last_node];
738
739 if ((node->GetInstruction()->isCall() || node->GetInstruction()->isFCall()) ||
740 (node->GetInstruction()->isReturn() || node->GetInstruction()->isFReturn()))
741 {
742 LiveGRFBuckets send_use_out(mem, kernel.getNumRegTotal(), *fg.getKernel());
743 for (const SBBucketNode* sBucketNode : globalSendOpndList)
744 {
745 SBNode* sNode = sBucketNode->node;
746 if (bb->send_live_out.isSrcSet(sNode->globalID) &&
747 (sBucketNode->opndNum == Opnd_src0 ||
748 sBucketNode->opndNum == Opnd_src1 ||
749 sBucketNode->opndNum == Opnd_src2 ||
750 sBucketNode->opndNum == Opnd_src3))
751 {
752 bb->createAddGRFEdge(sNode, node, WAR, DEP_EXPLICT);
753 }
754 if (bb->send_live_out.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
755 {
756 bb->createAddGRFEdge(sNode, node, RAW, DEP_EXPLICT);
757 }
758 }
759 }
760 if (node->GetInstruction()->isReturn() ||
761 node->GetInstruction()->isFReturn())
762 {
763 node->GetInstruction()->setDistance(1);
764 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
765 {
766 node->GetInstruction()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
767 }
768 }
769 }
770 }
771
772 void SWSB::SWSBGlobalTokenGenerator(PointsToAnalysis& p, LiveGRFBuckets& LB, LiveGRFBuckets& globalSendsLB)
773 {
774 allTokenNodesMap.resize(totalTokenNum);
775 for (TokenAllocation& nodeMap : allTokenNodesMap)
776 {
777 nodeMap.bitset = BitSet(SBSendNodes.size(), false);
778 }
779
780 const bool enableGlobalTokenAllocation = fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation);
781 const bool enableDistPropTokenAllocation = fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation);
782 // Get the live-out and may-kill bit sets
783 for (G4_BB_SB *bb : BBVector)
784 {
785 bb->send_live_in = SBBitSets(globalSendNum);
786 bb->send_live_out = SBBitSets(globalSendNum);
787 bb->send_def_out = SBBitSets(globalSendNum);
788
789 bb->send_live_in_scalar = SBBitSets(globalSendNum);
790 bb->send_live_out_scalar = SBBitSets(globalSendNum);
791 bb->send_kill_scalar = SBBitSets(globalSendNum);
792 bb->liveInTokenNodes = BitSet(SBSendNodes.size(), false);
793 bb->liveOutTokenNodes = BitSet(SBSendNodes.size(), false);
794 bb->killedTokens = BitSet(totalTokenNum, false);
795
796 if (enableGlobalTokenAllocation || enableDistPropTokenAllocation)
797 {
798 bb->tokenLiveInDist = (unsigned*)mem.alloc(sizeof(unsigned) * globalSendNum);
799 bb->tokenLiveOutDist = (unsigned*)mem.alloc(sizeof(unsigned) * globalSendNum);
800 for (unsigned k = 0; k < globalSendNum; k++)
801 {
802 bb->tokenLiveInDist[k] = -1;
803 bb->tokenLiveOutDist[k] = -1;
804 }
805 }
806 if (bb->send_start != -1)
807 {
808 for (int k = bb->send_start; k <= bb->send_end; k++)
809 {
810 if (globalSendOpndList[k]->opndNum == Opnd_dst)
811 {
812 bb->send_def_out.setDst(globalSendOpndList[k]->node->globalID, true);
813 bb->send_live_out.setDst(globalSendOpndList[k]->node->globalID, true);
814 }
815 if (globalSendOpndList[k]->opndNum == Opnd_src0 ||
816 globalSendOpndList[k]->opndNum == Opnd_src1 ||
817 globalSendOpndList[k]->opndNum == Opnd_src2 ||
818 globalSendOpndList[k]->opndNum == Opnd_src3)
819 {
820 bb->send_def_out.setSrc(globalSendOpndList[k]->node->globalID, true);
821 bb->send_live_out.setSrc(globalSendOpndList[k]->node->globalID, true);
822 }
823 }
824 }
825
826 bb->send_may_kill = SBBitSets(globalSendNum);
827 bb->send_WAW_may_kill = BitSet(globalSendNum, false);
828 bb->setSendOpndMayKilled(&globalSendsLB, &SBNodes, p);
829
830 #ifdef DEBUG_VERBOSE_ON
831 bb->dumpLiveInfo(&globalSendOpndList, globalSendNum, nullptr);
832 #endif
833 }
834
835 /*
836 Loop info is used to reduce the tokens required for certain instructions, or to count the backedge delay for token reuse.
837 We do token reduction and count the backedge delay only for natural loops, i.e. those with a backedge;
838 if the instruction distance is far enough, there is no need to set the dependence.
839 For an irreducible flow graph, these optimizations are not applied.
840 */
841 for (G4_BB_SB *bb : BBVector)
842 {
843 for (auto&& be : kernel.fg.backEdges)
844 {
845 auto loopIt = kernel.fg.naturalLoops.find(be);
846
847 if (loopIt != kernel.fg.naturalLoops.end())
848 {
849 auto&& bbsInLoop = loopIt->second;
850
851 auto bb1InLoop = bbsInLoop.find(bb->getBB());
852 if (bb1InLoop != bbsInLoop.end())
853 {
854 if (bb->getLoopStartBBID() != -1)
855 {
856 //Innermost loop only
857 if (bb->getLoopStartBBID() <= be.second->getId() &&
858 bb->getLoopEndBBID() >= be.first->getId())
859 {
860 bb->setLoopStartBBID(be.second->getId());
861 bb->setLoopEndBBID(be.first->getId());
862 }
863 }
864 else
865 {
866 bb->setLoopStartBBID(be.second->getId());
867 bb->setLoopEndBBID(be.first->getId());
868 }
869 }
870 }
871 }
872 }
873
874 //Global analysis until no live in change
875 SWSBGlobalScalarCFGReachAnalysis();
876
877 //Add dependence according to analysis result
878 if (enableGlobalTokenAllocation || enableDistPropTokenAllocation)
879 {
880 addGlobalDependenceWithReachingDef(globalSendNum, &globalSendOpndList, &SBNodes, p, true);
881 }
882 else
883 {
884 addGlobalDependence(globalSendNum, &globalSendOpndList, &SBNodes, p, true);
885 }
886
887 handleFuncCall();
888
889 for (G4_BB_SB *bb : BBVector)
890 {
891 bb->send_live_in_scalar = bb->send_live_in;
892 bb->send_live_out_scalar = bb->send_live_out;
893 }
894
895 SWSBBuildSIMDCFG();
896
897 SWSBGlobalSIMDCFGReachAnalysis();
898
899 //Add dependence according to analysis result
900 addGlobalDependence(globalSendNum, &globalSendOpndList, &SBNodes, p, false);
901
902 //SWSB token allocation with linear scan algorithm.
903 if (enableGlobalTokenAllocation)
904 {
905 tokenAllocationGlobal();
906 }
907 else if (enableDistPropTokenAllocation)
908 {
909 tokenAllocationGlobalWithPropogation();
910 }
911 else if (fg.builder->getOptions()->getOption(vISA_QuickTokenAllocation))
912 {
913 quickTokenAllocation();
914 }
915 else
916 {
917 tokenAllocation();
918 }
919
920 //Insert test instructions in case there are more dependences than the token field of an instruction can encode.
921 insertTest();
922 }
923
924 static FCPatchingInfo::RegAccessType
925 getRegAccessType(Gen4_Operand_Number OpndNo) {
926 if (OpndNo == Opnd_dst)
927 return FCPatchingInfo::Fully_Def;
928 return FCPatchingInfo::Fully_Use;
929 }
930
931 static unsigned getRegAccessPipe(G4_INST* Inst) {
932 FCPatchingInfo::RegAccessPipe Pipe = FCPatchingInfo::Pipe_ALU;
933 unsigned SFID = 0;
934
935 if (Inst->isSend())
936 {
937 Pipe = FCPatchingInfo::Pipe_Send;
938 SFID = SFIDtoInt(Inst->getMsgDesc()->getSFID()) & 0xF; // 4-bit SFID
939 }
940 else if (Inst->isMathPipeInst())
941 {
942 Pipe = FCPatchingInfo::Pipe_Math;
943 }
944 else if (Inst->isDpas())
945 {
946 Pipe = FCPatchingInfo::Pipe_Dpas;
947 }
948
949 // Pipe ID is encoded as (SFID[3:0] | P[3:0]), where P is ALU, Math, or Send.
950 return unsigned(Pipe) | (SFID << 4);
951 }
952
953 static void updateRegAccess(FCPatchingInfo* FCPI, SBNode* Node,
954 Gen4_Operand_Number OpndNo, unsigned NumRegs) {
955 for (auto F = Node->getFirstFootprint(OpndNo); F != nullptr; F = F->next) {
956 unsigned L = F->LeftB / numEltPerGRF<Type_UB>();
957 unsigned R = F->RightB / numEltPerGRF<Type_UB>();
958 if (F->fType != GRF_T)
959 {
960 continue;
961 }
962 ASSERT_USER(L < NumRegs, "Invalid register left bound!");
963 ASSERT_USER(R < NumRegs, "Invalid register right bound!");
964 for (unsigned n = L; n <= R; ++n) {
965 FCPatchingInfo::RegAccess Acc;
966 Acc.Type = getRegAccessType(OpndNo);
967 Acc.RegNo = n;
968 Acc.Pipe = getRegAccessPipe(Node->GetInstruction());
969 Acc.Inst = Node->GetInstruction();
970 Acc.Token = Acc.Inst->getSetToken();
971 // Update the first access list & map.
972 if (!FCPI->RegFirstAccessMap.count(n)) {
973 FCPI->RegFirstAccessList.push_back(Acc);
974 FCPI->RegFirstAccessMap[n] = &FCPI->RegFirstAccessList.back();
975 }
976 // Update the last access list & map.
977 if (FCPI->RegLastAccessMap.count(n)) {
978 if (Acc.Type == FCPatchingInfo::Fully_Def) {
979 // Remove previous accesses.
980 auto PrevAcc = FCPI->RegLastAccessMap[n];
981 while (PrevAcc) {
982 auto Next = PrevAcc->Next;
983 auto PrevAccInst = PrevAcc->Inst;
984 auto PrevAccRegNo = PrevAcc->RegNo;
985 // Remove all previous accesses on the same GRF.
986 FCPI->RegLastAccessList.remove_if(
987 [=](const FCPatchingInfo::RegAccess& A) {
988 return (A.Inst == PrevAccInst) &&
989 (A.RegNo == PrevAccRegNo); });
990 PrevAcc = Next;
991 }
992 }
993 else {
994 // Remove previous accesses with the same pipe.
995 auto PrevAcc = FCPI->RegLastAccessMap[n];
996 while (PrevAcc) {
997 if (PrevAcc->Type == FCPatchingInfo::Fully_Use &&
998 PrevAcc->Pipe != Acc.Pipe) {
999 // Not the same, re-link them.
1000 std::swap(Acc.Next, PrevAcc->Next);
1001 std::swap(Acc.Next, PrevAcc);
1002 continue;
1003 }
1004 auto Next = PrevAcc->Next;
1005 auto PrevAccInst = PrevAcc->Inst;
1006 auto PrevAccRegNo = PrevAcc->RegNo;
1007 // Remove all previous accesses on the same GRF and the same pipe.
1008 FCPI->RegLastAccessList.remove_if(
1009 [=](const FCPatchingInfo::RegAccess& A) {
1010 return (A.Inst == PrevAccInst) &&
1011 (A.RegNo == PrevAccRegNo); });
1012 PrevAcc = Next;
1013 }
1014 }
1015 }
1016 FCPI->RegLastAccessList.push_back(Acc);
1017 FCPI->RegLastAccessMap[n] = &FCPI->RegLastAccessList.back();
1018 }
1019 }
1020 }
1021
1022 static void insertSyncBarrier(FCPatchingInfo* FCPI, SBNode* Node,
1023 unsigned NumRegs) {
1024 // Skip if sync barrier is already inserted.
1025 if (FCPI->RegFirstAccessList.size() == 0 || FCPI->RegFirstAccessList.back().RegNo == unsigned(-1))
1026 return;
1027
1028 // Sync barrier is a special relocation where all registers are forced to be
1029 // synchronized.
1030 FCPatchingInfo::RegAccess Acc;
1031 Acc.Type = FCPatchingInfo::Fully_Use;
1032 Acc.RegNo = unsigned(-1); // A special register.
1033 // Sync barrier is inserted just before this instruction.
1034 Acc.Inst = Node->GetInstruction();
1035
1036 // Append this access into the first access list.
1037 FCPI->RegFirstAccessList.push_back(Acc);
1038 // Update the first access map.
1039 for (unsigned n = 0; n < NumRegs; ++n) {
1040 if (FCPI->RegFirstAccessMap.count(n))
1041 continue;
1042 FCPI->RegFirstAccessMap[n] = &FCPI->RegFirstAccessList.back();
1043 }
1044 // Invalidate the last access list & map.
1045 FCPI->RegLastAccessMap.clear();
1046 FCPI->RegLastAccessList.clear();
1047 }
1048
1049 static bool isBranch(SBNode* N) {
1050 auto Inst = N->GetInstruction();
1051 if (!Inst->isFlowControl())
1052 return false;
1053 // Skip function call/ret.
1054 if (Inst->isCall() || Inst->isReturn() ||
1055 Inst->opcode() == G4_pseudo_fc_call ||
1056 Inst->opcode() == G4_pseudo_fc_ret)
1057 return false;
1058 return true;
1059 }
1060
1061 static void updatePatchInfo(FCPatchingInfo* FCPI, SBNode* Node,
1062 unsigned NumRegs, unsigned NumTokens) {
1063 // TODO: Branch is not supported in the current FC patch info as it
1064 // involves complicated handling. Issue a sync barrier just before the
1065 // first flow control instruction.
1066 if (isBranch(Node)) {
1067 insertSyncBarrier(FCPI, Node, NumRegs);
1068 return;
1069 }
1070 // Update access maps.
1071 updateRegAccess(FCPI, Node, Opnd_src0, NumRegs);
1072 updateRegAccess(FCPI, Node, Opnd_src1, NumRegs);
1073 updateRegAccess(FCPI, Node, Opnd_src2, NumRegs);
1074 // Per inst, 'use' access always happens before 'def' access.
1075 updateRegAccess(FCPI, Node, Opnd_dst, NumRegs);
1076 }
1077
1078 static void updateTokenSet(FCPatchingInfo* FCPI, SBNODE_VECT& Nodes,
1079 unsigned NumTokens) {
1080 std::set<G4_INST*> LastAccInsts;
1081 // Collect last access instructions.
1082 for (auto I = FCPI->RegLastAccessList.begin(),
1083 E = FCPI->RegLastAccessList.end(); I != E; ++I) {
1084 LastAccInsts.insert(I->Inst);
1085 }
1086 // Scan node for tokens used in non-last access instructions.
1087 for (auto NI = Nodes.begin(), NE = Nodes.end(); NI != NE; ++NI) {
1088 auto Inst = (*NI)->GetInstruction();
1089 if (LastAccInsts.count(Inst))
1090 continue;
1091 auto T = Inst->getSetToken();
1092 // Skip if token is not allocated.
1093 if (T == (unsigned short)(-1))
1094 return;
1095 ASSERT_USER(T < NumTokens, "Invalid token number!");
1096 FCPI->AllocatedToken.insert(T);
1097 }
1098 }
1099
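// Build the flow-control patching info used for FC composable kernels: record per-GRF
// first/last accesses for every node, append last-access records for sends still live out
// of the exit BBs, and collect the tokens that remain allocated.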
1100 void SWSB::genSWSBPatchInfo() {
1101 unsigned NumRegs = kernel.getNumRegTotal();
1102 auto FCPI = fg.builder->getFCPatchInfo();
1103 for (auto Node : SBNodes) {
1104 updatePatchInfo(FCPI, Node, NumRegs, totalTokenNum);
1105 }
1106
1107 #if 1
1108 //Update the live out tokens according to the live out of the exit BB of the kernel.
1109 for (G4_BB* bb : fg)
1110 {
1111 if (bb->Succs.size() == 0 &&
1112 BBVector[bb->getId()]->Succs.size() == 0)
1113 {
1114 LiveGRFBuckets send_use_out(mem, kernel.getNumRegTotal(), *fg.getKernel());
1115 for (size_t i = 0; i < globalSendOpndList.size(); i++)
1116 {
1117 SBBucketNode* sBucketNode = globalSendOpndList[i];
1118 SBNode* sNode = sBucketNode->node;
1119 if (BBVector[bb->getId()]->send_live_out.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
1120 sBucketNode->opndNum == Opnd_src1 ||
1121 sBucketNode->opndNum == Opnd_src2 ||
1122 sBucketNode->opndNum == Opnd_src3))
1123 {
1124 BBVector[bb->getId()]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_out);
1125 }
1126 if (BBVector[bb->getId()]->send_live_out.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
1127 {
1128 BBVector[bb->getId()]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_out);
1129 }
1130 }
1131
1132 for (unsigned curBucket = 0; curBucket < kernel.getNumRegTotal(); curBucket++)
1133 {
1134 for (LiveGRFBuckets::BN_iterator bn_it = send_use_out.begin(curBucket);
1135 bn_it != send_use_out.end(curBucket); ++bn_it)
1136 {
1137 SBBucketNode* liveBN = (*bn_it);
1138 SBNode* curLiveNode = liveBN->node;
1139 Gen4_Operand_Number liveOpnd = liveBN->opndNum;
1140
1141 FCPatchingInfo::RegAccess Acc;
1142 Acc.Type = getRegAccessType(liveOpnd);
1143 Acc.RegNo = curBucket;
1144 Acc.Pipe = getRegAccessPipe(curLiveNode->GetInstruction());
1145 Acc.Inst = curLiveNode->GetInstruction();
1146 Acc.Token = Acc.Inst->getSetToken();
1147 FCPI->RegLastAccessList.push_back(Acc);
1148 FCPI->RegLastAccessMap[curBucket] = &FCPI->RegLastAccessList.back();
1149 }
1150 }
1151 }
1152 }
1153 #endif
1154
1155 updateTokenSet(FCPI, SBNodes, totalTokenNum);
1156
1157 #if defined(DEBUG_VERBOSE_ON)
1158 // First access.
1159 std::cerr << "FirstAccess:\n";
1160 auto& FirstAccess = FCPI->RegFirstAccessList;
1161 for (auto& Access : FirstAccess) {
1162 fprintf(stderr, "r%03u.%s", Access.RegNo,
1163 (Access.Type == FCPatchingInfo::Fully_Def ? "def" : "use"));
1164 fprintf(stderr, ", P%04x", Access.Pipe);
1165 if (Access.Token != (unsigned short)(-1))
1166 fprintf(stderr, ", $%u", Access.Token);
1167 fprintf(stderr, ":");
1168 Access.Inst->dump();
1169 }
1170 // Last access.
1171 std::cerr << "LastAccess:\n";
1172 auto& LastAccess = FCPI->RegLastAccessList;
1173 for (auto& Access : LastAccess) {
1174 fprintf(stderr, "r%03u.%s", Access.RegNo,
1175 (Access.Type == FCPatchingInfo::Fully_Def ? "def" : "use"));
1176 fprintf(stderr, ", P%04x", Access.Pipe);
1177 if (Access.Token != (unsigned short)(-1))
1178 fprintf(stderr, ", $%u", Access.Token);
1179 fprintf(stderr, ":");
1180 Access.Inst->dump();
1181 }
1182 // Allocated token.
1183 std::cerr << "AllocatedToken:\n";
1184 for (unsigned t = 0; t != NumTokens; ++t) {
1185 if (!FCPI->AllocatedToken.count(t))
1186 continue;
1187 if (t != 0)
1188 fprintf(stderr, ", ");
1189 fprintf(stderr, "$%u", t);
1190 }
1191 fprintf(stderr, "\n");
1192 #endif
1193 }
1194
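// Iteratively OR each block's dominator bit set with that of its immediate dominator
// until a fixed point is reached.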
1195 void SWSB::getDominators(ImmDominator* dom)
1196 {
1197 //BBVector[bb->getId()]->tokenAssigned = true;
1198 bool changed = true;
1199
1200 while (changed)
1201 {
1202 changed = false;
1203
1204 for (size_t i = 0; i < BBVector.size(); i++)
1205 {
1206 BitSet currDoms = BBVector[i]->dominators;
1207 if (dom->getIDoms()[i] != BBVector[i]->getBB())
1208 {
1209 currDoms |= BBVector[dom->getIDoms()[i]->getId()]->dominators;
1210 }
1211
1212 if (currDoms != BBVector[i]->dominators)
1213 {
1214 changed = true;
1215 BBVector[i]->dominators = currDoms;
1216 }
1217 }
1218 }
1219 }
1220
1221 //
1222 //Entry to the software scoreboard generator
1223 //
1224 void SWSB::SWSBGenerator()
1225 {
1226 DEBUG_VERBOSE("[SWSB]: Starting...");
1227 PointsToAnalysis p(kernel.Declares, kernel.fg.getNumBB());
1228 p.doPointsToAnalysis(kernel.fg);
1229
1230 kernel.fg.reassignBlockIDs();
1231 kernel.fg.findBackEdges();
1232 kernel.fg.findNaturalLoops();
1233
1234 //Note that getNumFlagRegisters() treat each 16 bits as a flag register
1235 LiveGRFBuckets LB(mem, kernel.getNumRegTotal() + fg.builder->getNumScalarRegisters() + kernel.getNumAcc() + fg.builder->getNumFlagRegisters(), kernel);
1236 LiveGRFBuckets globalSendsLB(mem, kernel.getNumRegTotal() + fg.builder->getNumScalarRegisters() + kernel.getNumAcc() + fg.builder->getNumFlagRegisters(), kernel);
1237
1238 SWSBDepDistanceGenerator(p, LB, globalSendsLB);
1239
1240 #ifdef DEBUG_VERBOSE_ON
1241 dumpDepInfo();
1242 #endif
1243
1244 if (fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
1245 fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation))
1246 {
1247 auto& dom = fg.getImmDominator();
1248
1249 //Build dom tree
1250 for (size_t i = 0; i < BBVector.size(); i++)
1251 {
1252 G4_BB* bb = BBVector[i]->getBB();
1253 BBVector[i]->dominators = BitSet(BBVector.size(), false);
1254 BBVector[i]->dominators.set(i, true);
1255
1256 if (dom.getIDoms()[bb->getId()] != bb)
1257 {
1258 BBVector[dom.getIDoms()[bb->getId()]->getId()]->domSuccs.push_back(BBVector[i]);
1259 BBVector[i]->domPreds.push_back(BBVector[dom.getIDoms()[bb->getId()]->getId()]);
1260 }
1261 }
1262
1263 for (size_t i = 0; i < BBVector.size(); i++)
1264 {
1265 if (BBVector[i]->domSuccs.size())
1266 {
1267 BBVector[i]->domSuccs.sort(compareBBStart);
1268 }
1269 }
1270
1271 getDominators(&dom);
1272 #ifdef DEBUG_VERBOSE_ON
1273 dumpImmDom(&dom);
1274 #endif
1275 }
1276
1277 if (SBSendNodes.size())
1278 {
1279 SWSBGlobalTokenGenerator(p, LB, globalSendsLB);
1280 }
1281 else
1282 {
1283 handleFuncCall();
1284 insertTest();
1285 }
1286
1287 if (fg.builder->getFCPatchInfo()->getFCComposableKernel())
1288 {
1289 genSWSBPatchInfo();
1290 }
1291
1292 #ifdef DEBUG_VERBOSE_ON
1293 std::cerr << "\n" << "Dependence Graph:" << "\n";
1294
1295 for (const SBNode* node : SBNodes)
1296 {
1297 G4_INST* inst = node->GetInstruction();
1298 std::cerr << node->getNodeID() << ":\t";
1299 inst->dump();
1300 std::cerr << "Succs:";
1301 for (const SBDEP_ITEM& curSucc : node->succs)
1302 {
1303 std::cerr << curSucc.node->getNodeID() << ",";
1304 }
1305 std::cerr << "\n";
1306 std::cerr << "Preds:";
1307 for (const SBDEP_ITEM& curPred : node->preds)
1308 {
1309 std::cerr << curPred.node->getNodeID() << ",";
1310 }
1311 std::cerr << "\n\n";
1312 }
1313 #endif
1314
1315 return;
1316 }
1317
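// Returns the number of cycles before the token set by this node can be reused without a
// stall: the after-read latency for sends with a null destination, otherwise the
// after-write latency of the message type (SLM/sampler/other memory), the math pipe
// latency, or the DPAS latency.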
1318 unsigned SWSB::calcDepDelayForNode(const SBNode* curNode) const
1319 {
1320 const G4_INST* inst = curNode->GetInstruction();
1321 int reuseDelay = 0;
1322
1323 if (inst->isSend())
1324 {
1325 if (inst->getDst() == nullptr ||
1326 inst->getDst()->isNullReg())
1327 {
1328 return TOKEN_AFTER_READ_CYCLE;
1329 }
1330
1331 const G4_SendDesc* msgDesc = inst->getMsgDesc();
1332 if (msgDesc->isSLM())
1333 {
1334 reuseDelay = tokenAfterWriteSendSlmCycle;
1335 }
1336 else if (msgDesc->isSampler())
1337 {
1338 reuseDelay = tokenAfterWriteSendSamplerCycle;
1339 }
1340 else
1341 {
1342 reuseDelay = tokenAfterWriteSendMemoryCycle;
1343 }
1344 }
1345 else if (inst->isMathPipeInst())
1346 {
1347 if (fg.builder->hasFixedCycleMathPipe())
1348 {
1349 assert(0 && "Math instruction is assigned token which is not supported in fixed math cycle platform");
1350 }
1351
1352 reuseDelay = tokenAfterWriteMathCycle;
1353 }
1354 else if (inst->isDpas())
1355 {
1356 reuseDelay = tokenAfterDPASCycle;
1357 }
1358 else
1359 {
1360 assert(0 && "unexpected token reuse instruction");
1361 }
1362
1363 return reuseDelay;
1364 }
1365
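// Estimate the cost of reusing reuseNode's token for the node at nodeID.
// Returns {remaining stall delay if reused, instruction distance}; both are scaled by the
// difference in loop nesting level, and the backedge distance is taken into account when
// the two nodes sit in the same innermost loop.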
1366 std::pair<int, int> SWSB::examineNodeForTokenReuse(
1367 unsigned nodeID,
1368 unsigned nodeDelay,
1369 const SBNode *reuseNode,
1370 unsigned char nestLoopLevel,
1371 unsigned curLoopStartBB,
1372 unsigned curLoopEndBB) const
1373 {
1374 int reuseDelay = 0;
1375 int curDistance = 0;
1376 //The reuse node is before current node.
1377 if (nodeID > reuseNode->getNodeID())
1378 {
1379 unsigned curNodeDelay = reuseNode->getDepDelay();
1380
1381 //reuse Delay is not accurate in different loop level
1382 reuseDelay = curNodeDelay - (nodeID - reuseNode->getNodeID());
1383
1384 //If too far, count distance
1385 if (reuseDelay < 0)
1386 {
1387 curDistance = nodeID - reuseNode->getNodeID();
1388 }
1389 }
1390 else //The reuse node is after current node
1391 {
1392 reuseDelay = nodeDelay - (reuseNode->getNodeID() - nodeID);
1393 if (reuseDelay < 0)
1394 {
1395 curDistance = reuseNode->getNodeID() - nodeID;
1396 }
1397 }
1398
1399 const G4_BB_SB *bb = BBVector[reuseNode->getBBID()];
1400 unsigned char curNodeNestLoopLevel = bb->getBB()->getNestLevel();
1401 unsigned loopLevelDiff = std::abs(curNodeNestLoopLevel - nestLoopLevel);
1402 constexpr unsigned loopFactorForTokenReuse = 5;
1403 if (reuseDelay > 0)
1404 {
1405 reuseDelay /= loopFactorForTokenReuse * loopLevelDiff + 1;
1406 }
1407 else
1408 {
1409 curDistance *= loopFactorForTokenReuse * loopLevelDiff + 1;
1410 if (nestLoopLevel && loopLevelDiff == 0)
1411 {
1412 if (curLoopStartBB == -1 || curLoopEndBB == -1)
1413 {
1414 curLoopStartBB = bb->getLoopStartBBID();
1415 curLoopEndBB = bb->getLoopEndBBID();
1416 }
1417 //Count the backedge, if the backedge distance is short, take it
1418 if (curLoopStartBB != -1 && curLoopEndBB != -1)
1419 {
1420 unsigned loopStartID = BBVector[curLoopStartBB]->first_node;
1421 unsigned loopEndID = BBVector[curLoopEndBB]->last_node;
1422
1423 // The reused node may be in the same loop as the current node.
1424 int backEdgeDistance = loopEndID - loopStartID - curDistance;
1425
1426 if (reuseNode->getNodeID() < loopStartID || reuseNode->getNodeID() > loopEndID)
1427 {
1428 // Or it may be in another loop with the same nesting level,
1429 // where the current backedge cannot cover the distance
1430 // loop1 {
1431 // node1
1432 // }
1433 //
1434 // loop2 {
1435 // node2
1436 // }
1437 curDistance = curDistance * (nestLoopLevel * loopFactorForTokenReuse + 1);
1438 }
1439 else
1440 {
1441 curDistance = std::min(curDistance, backEdgeDistance);
1442 }
1443 }
1444 }
1445 }
1446 return std::make_pair(reuseDelay, curDistance);
1447 }
1448
1449 //The algorithm for reuse selection: pick the live range which causes the least stall delay for the current live range.
1450 //FIXME: for global variables this is not accurate, because the AFTER_SOURCE and AFTER_WRITE uses may be in different branches.
1451 //Try not to reuse tokens set in adjacent instructions.
1452 SBNode * SWSB::reuseTokenSelection(const SBNode * node) const
1453 {
1454 int delay = tokenAfterWriteSendSamplerCycle; //Assume the longest one
1455 int distance = 0; //Distance between the node
1456 const unsigned nodeID = node->getNodeID();
1457 const unsigned nodeDelay = node->getDepDelay(); // The longest delay the node may cause.
1458 const unsigned char nestLoopLevel = BBVector[node->getBBID()]->getBB()->getNestLevel();
1459 const unsigned loopStartBB = BBVector[node->getBBID()]->getLoopStartBBID();
1460 const unsigned loopEndBB = BBVector[node->getBBID()]->getLoopEndBBID();
1461
1462 assert(linearScanLiveNodes.size() <= totalTokenNum);
1463
1464 //The live nodes whose dependencies are not resolved in current node.
1465 SBNode* candidateNode = linearScanLiveNodes.front();
1466 for (SBNode* curNode : linearScanLiveNodes)
1467 {
1468 int maxTokenDelay = std::numeric_limits<int>::min(); //The delay may cause if reuse
1469 int minTokenDistance = std::numeric_limits<int>::max(); //The distance from the reused node
1470 // The token may be reused already, so check the 2 nodes that are
1471 // closest to the node using the same token. In most cases the
1472 // token allocation is done in ascending order. So, searching backward
1473 // should be fast. As for searching forward, only do that if there's
1474 // indeed a such node.
1475 const unsigned short token = curNode->getLastInstruction()->getSetToken();
1476 const unsigned lastBefore = allTokenNodesMap[token].bitset.findLastIn(0, node->getSendID());
1477 unsigned firstAfter = -1;
1478 if (node->getSendID() < allTokenNodesMap[token].maxSendID)
1479 {
1480 firstAfter = allTokenNodesMap[token].bitset.findFirstIn(node->getSendID() + 1,
1481 allTokenNodesMap[token].maxSendID + 1);
1482 }
1483 if (lastBefore != -1)
1484 {
1485 assert(allTokenNodesMap[token].bitset.isSet(lastBefore));
1486 const SBNode* n = SBSendNodes[lastBefore];
1487 auto res = examineNodeForTokenReuse(nodeID, nodeDelay, n, nestLoopLevel, loopStartBB, loopEndBB);
1488 //Largest reuse delay
1489 maxTokenDelay = std::max(maxTokenDelay, res.first);
1490 //Closest distance
1491 minTokenDistance = std::min(minTokenDistance, res.second);
1492 }
1493 if (firstAfter != -1)
1494 {
1495 assert(allTokenNodesMap[token].bitset.isSet(firstAfter));
1496 const SBNode* n = SBSendNodes[firstAfter];
1497 auto res = examineNodeForTokenReuse(nodeID, nodeDelay, n, nestLoopLevel, loopStartBB, loopEndBB);
1498 //Largest reuse delay
1499 maxTokenDelay = std::max(maxTokenDelay, res.first);
1500 //Closest distance
1501 minTokenDistance = std::min(minTokenDistance, res.second);
1502 }
1503
1504 // Smallest one is the best one
1505 // if Distance is not 0, count the distance, otherwise, use the delay.
1506 // Distance not being 0 means there are candidates whose distance is larger than the delay
1507 if (!distance && maxTokenDelay > 0)
1508 {
1509 if (maxTokenDelay < delay)
1510 {
1511 delay = maxTokenDelay;
1512 candidateNode = curNode;
1513 }
1514 }
1515 else if (minTokenDistance > distance)
1516 {
1517 distance = minTokenDistance;
1518 candidateNode = curNode;
1519 }
1520 }
1521
1522 return candidateNode;
1523 }
1524
1525 /*
1526 * Returns true if the cycles of the instruction which occupies the token have already elapsed at currentID, so the token can be reused.
1527 */
1528 bool SWSB::cycleExpired(const SBNode* node, int currentID) const
1529 {
1530 if (node->GetInstruction()->isSend())
1531 {
1532 const G4_SendDesc* msgDesc = node->GetInstruction()->getMsgDesc();
1533
1534 if (msgDesc->isSLM())
1535 {
1536 return tokenAfterWriteSendSlmCycle <= (currentID - node->getLiveStartID());
1537 }
1538 else if (msgDesc->isSampler())
1539 {
1540 return tokenAfterWriteSendSamplerCycle <= (currentID - node->getLiveStartID());
1541 }
1542 else
1543 {
1544 return tokenAfterWriteSendMemoryCycle <= (currentID - node->getLiveStartID());
1545 }
1546 }
1547 else if (node->GetInstruction()->isMathPipeInst())
1548 {
1549 if (fg.builder->hasFixedCycleMathPipe())
1550 {
1551 assert(0 && "Math instruction is assigned token which is not supported in fixed math cycle platform");
1552 }
1553 return tokenAfterWriteMathCycle <= (currentID - node->getLiveStartID());
1554 }
1555 else if (node->GetInstruction()->isDpas())
1556 {
1557 return tokenAfterDPASCycle <= (int)(currentID - node->getLiveStartID());
1558 }
1559 else
1560 {
1561 assert(0 && "unexpected token reuse instruction");
1562 }
1563
1564 return true;
1565 }
1566
1567 //
1568 // Token dependence reduction tries to remove unnecessary dependences when token reuse happens,
1569 // such as in the following case:
1570 // 1. send r20,... { $0 }
1571 // ...
1572 // 20. send r30, ... { $0 }
1573 // 21. add r40 r20 r60 { $0.dst }
1574 // There is no need to set a dependence for instruction 21,
1575 // because the reuse guarantees that the dependence from instruction 1 is resolved before token $0 can be reused.
1576 // FIXME: Dominator info is required for global reduction
1577 //
1578 void SWSB::tokenDepReduction(SBNode* n1, SBNode* n2)
1579 {
1580 SBNode* node1 = n1;
1581 SBNode* node2 = n2;
1582
1583 assert(node1 != node2);
1584 if (n1->getNodeID() > n2->getNodeID())
1585 {
1586 node1 = n2;
1587 node2 = n1;
1588 }
1589
1590 if (!fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
1591 {
1592 unsigned node1BBID = node1->getBBID();
1593 unsigned node2BBID = node2->getBBID();
1594
1595 for (auto node_it = node1->succs.begin();
1596 node_it != node1->succs.end();
1597 )
1598 {
1599 SBDEP_ITEM& curSucc1 = (*node_it);
1600 SBNode* succ1 = curSucc1.node;
1601 unsigned bbID1 = succ1->getBBID();
1602
1603 //node1(previous) and node2(current) are in same BB: kill all live out of node1
1604 // BB:
1605 // node1
1606 // node2
1607 //
1608 //Or the succs of node1 and node2 are in the same BB: kill all succs of node1 which are after node2
1609 // FIXME: will this one conflict with global dependence reduction?
1610 //BB:
1611 // node2
1612 // succ(node1)
1613 //if ((node1BBID == node2BBID && bbID1 != node2BBID) ||
1614 // (node1BBID != node2BBID && bbID1 == node2BBID && succ1->getNodeID() > node2->getNodeID()))
1615 //{
1616 // node_it = node1->succs.erase(node_it);//FIXME, if the succ is the token instruction, do we need free the tokens assigned to the instruction because of the dependence
1617 // continue;
1618 //}
1619
1620 //When two successors are in the same BB, the previous one kills the following one
1621 // FIXME: This may not be good, because the policy is trying to keep the longest dependence and move the short one
1622 // Of course, if the two predecessors live in from different branches, we can only kill the longer one
1623 bool killed = false;
1624 for (auto node2_it = node2->succs.begin();
1625 node2_it != node2->succs.end();
1626 )
1627 {
1628 SBDEP_ITEM& curSucc2 = (*node2_it);
1629 const SBNode* succ2 = curSucc2.node;
1630 unsigned bbID2 = succ2->getBBID();
1631
1632 if (bbID1 == bbID2 &&
1633 bbID1 != node1BBID &&
1634 bbID2 != node2BBID &&
1635 succ2 != succ1)
1636 {
1637 //succ2 is ahead
1638 if (succ1->getNodeID() > succ2->getNodeID())
1639 {
1640 if (curSucc2.attr == DEP_EXPLICT &&
1641 (curSucc1.type == curSucc2.type ||
1642 curSucc2.type == RAW ||
1643 curSucc2.type == WAW))
1644 {
1645 //succ1 killed
1646 killed = true;
1647 break;
1648 }
1649 }
1650 else
1651 {
1652 if (curSucc1.attr == DEP_EXPLICT &&
1653 (curSucc1.type == curSucc2.type ||
1654 curSucc1.type == RAW ||
1655 curSucc1.type == WAW))
1656 {
1657 node2_it = node2->succs.erase(node2_it);
1658 continue;
1659 }
1660 }
1661 }
1662 node2_it++;
1663 }
1664
1665 if (killed)
1666 {
1667 node_it = node1->succs.erase(node_it);
1668 continue;
1669 }
1670
1671 node_it++;
1672 }
1673
1674 //Remove the succs of node2 which are in the same BB as node1 and are behind node1
1675 for (auto node_it = node2->succs.begin();
1676 node_it != node2->succs.end();
1677 )
1678 {
1679 const SBNode* succ2 = node_it->node;
1680 unsigned bbID2 = succ2->getBBID();
1681
1682 if ((node1BBID != node2BBID && bbID2 == node1BBID && succ2->getNodeID() > node1->getNodeID()))
1683 {
1684 node_it = node2->succs.erase(node_it);
1685 continue;
1686 }
1687
1688 node_it++;
1689 }
1690 }
1691
1692 n2->setLiveLatestID(n1->getLiveEndID(), n1->getLiveEndBBID());
1693 linearScanLiveNodes.remove(n1);
1694
1695 #ifdef DEBUG_VERBOSE_ON
1696 printf("remove token 1: %d\n", n1->getLastInstruction()->getSetToken());
1697 #endif
1698 return;
1699 }
1700
1701 /*
1702 *
1703 * We need cycle based expiration because for the case like
1704 * send null, r2... {$0}
1705 * add r2 {$0.src}
1706 * send r20 r9... {$0}
1707 * The second send should not be assigned with $0.
1708 * In the compiler, if the live range of r2 ends at the second instruction, token $0 is treated as free.
1709 * However, the SBID $0 is cleared only when the first send finishes execution.
1710 * Assigning the same token to the third instruction would therefore cause a long stall.
1711 * We delay the end of the intervals' live ranges until the cycles are all consumed, so that the token is not reassigned immediately.
1712 * But if the dependence is a .dst dependence, the live range is over: the stall lasts until the instruction finishes anyway.
1713 *
1714 */
1715 void SWSB::expireIntervals(unsigned startID)
1716 {
1717 for (SBNODE_LIST_ITER node_it = linearScanLiveNodes.begin();
1718 node_it != linearScanLiveNodes.end();
1719 )
1720 {
1721 SBNode* curNode = (*node_it);
1722 if (curNode->getLiveEndID() <= startID)
1723 {
1724 const SBNode* node = linearScanLiveNodes.front();
1725 if (node->hasAWDep() || cycleExpired(node, startID))
1726 {
1727 unsigned short token = node->getLastInstruction()->getSetToken();
1728
1729 assert(token != (unsigned short)-1);
1730 node_it = linearScanLiveNodes.erase(node_it);
1731 #ifdef DEBUG_VERBOSE_ON
1732 printf("remove token %d:\n", token);
1733 #endif
1734 //Remove token to free list
1735 freeTokenList[token] = nullptr;
1736 if (topIndex == -1)
1737 {
1738 topIndex = token;
1739 }
1740 continue;
1741 }
1742 }
1743 else
1744 {
1745 break;
1746 }
1747 node_it++;
1748 }
1749 }
1750
1751 //GraphColoring could provide a more accurate version.
1752 //For linear scan, only instructions that have not been assigned a token before can be used for this optimization.
1753 //This is to avoid false token sharing.
1754 //What's the impact on token reduction?
1755 //Token reduction removes the succ, i.e. removes the dependence.
1756 //NOTE THAT: token reduction happens only when we run out of tokens.
1757 void SWSB::shareToken(const SBNode* node, const SBNode* succ, unsigned short token)
1758 {
1759 if (node->getBBID() == succ->getBBID())
1760 {
1761 return;
1762 }
1763
1764 for (const SBDEP_ITEM& curPred : succ->preds)
1765 {
1766 const SBNode* succPred = curPred.node;
1767
1768 if (node->getBBID() != succPred->getBBID() &&
1769 succPred->getLastInstruction()->getTokenType() == G4_INST::SWSBTokenType::TOKEN_NONE &&
1770 tokenHonourInstruction(succPred->getLastInstruction()))
1771 {
1772 G4_BB_SB* curBB = BBVector[node->getBBID()];
1773 G4_BB_SB* succPredBB = BBVector[succPred->getBBID()];
1774 //FIXME: Comparing only the defining BBs is not enough. It may cause extra delay?
1775 if (!(curBB->send_live_in.isDstSet((unsigned)succPred->globalID) ||
1776 curBB->send_live_in.isSrcSet((unsigned)succPred->globalID) ||
1777 succPredBB->send_live_in.isDstSet((unsigned)node->globalID) ||
1778 succPredBB->send_live_in.isSrcSet((unsigned)node->globalID)
1779 ))
1780 {
1781 succPred->getLastInstruction()->setSetToken(token);
1782 }
1783 }
1784 }
1785
1786 return;
1787 }
1788
1789 void SWSB::assignDepToken(SBNode* node)
1790 {
1791 unsigned short token = node->getLastInstruction()->getSetToken();
1792 assert(token != (unsigned short)-1 && "Failed to add token dependence to the node without token");
1793
1794 //Set the dependent tokens for successors of current send
1795 //Remove the unnecessary dependent tokens within the same BB. This work could be done when adding the edge;
1796 //however, that pass is bucket based, and it is harder there to sort the different GRF dependences.
1797 //
1798 //1. Send r2-r5, r8, .... $1
1799 // ...
1800 //7. Add r8, r16, r10 test $1S
1801 //8. Add r12, r4, r14 test $1D
1802 //If the WAR dependence comes first, as in instruction 7, we still need to keep the dependence for 8.
1803 //
1804 //1. Send r2-r5, r8, .... $1
1805 // ...
1806 //7. Add r12, r4, r14 test $1D
1807 //8. Add r8, r16, r10
1808 //If instead the RAW dependence happens first, as in instruction 7, there is NO need for a token on 8.
1809
1810 for (const SBDEP_ITEM& curSucc : node->succs)
1811 {
1812 SBNode* succ = curSucc.node;
1813 DepType type = curSucc.type;
1814 SBDependenceAttr attr = curSucc.attr;
1815
1816 if (attr == DEP_IMPLICIT)
1817 {
1818 continue;
1819 }
1820
1821 //Same token, reuse happened, no need to set a dep token.
1822 if (tokenHonourInstruction(succ->getLastInstruction()) &&
1823 succ->getLastInstruction()->getSetToken() == token && (succ->instVec.size() <= 1)) //If the node has more than one instruction, the token reuse cannot guard the last instruction.
1824 {
1825 continue;
1826 }
1827
1828 //set dependence token if live
1829 SWSBTokenType tokenType = type == WAR ? SWSBTokenType::AFTER_READ : SWSBTokenType::AFTER_WRITE;
1830 succ->setDepToken(token, tokenType, node);
1831 #ifdef DEBUG_VERBOSE_ON
1832 dumpSync(node, succ, token, tokenType);
1833 #endif
1834 }
1835 }
1836
1837 void SWSB::assignDepTokens()
1838 {
1839 for (SBNode* node : SBSendNodes)
1840 {
1841 G4_INST* inst = node->getLastInstruction();
1842
1843 if (inst->isEOT())
1844 {
1845 continue;
1846 }
1847
1848 unsigned short token = inst->getSetToken();
1849 if (token != (unsigned short)-1)
1850 {
1851 assignDepToken(node);
1852 }
1853 }
1854 }
1855
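//assignToken picks the SBID for a send node: if the node was not pre-assigned, use the next free token
//(topIndex) or, when none is free, reuse the token of the node selected by reuseTokenSelection() after
//reducing its dependences; if the node was pre-assigned through reuse, just refresh the freeTokenList
//entry. The chosen token is then propagated to unassigned RAW/WAW successors that are token-honour
//instructions, so that chained out-of-order instructions can share one SBID (see the math example below).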
1856 void SWSB::assignToken(SBNode* node,
1857 unsigned short assignedToken,
1858 uint32_t& AWtokenReuseCount,
1859 uint32_t& ARtokenReuseCount,
1860 uint32_t& AAtokenReuseCount)
1861 {
1862 unsigned short token = (unsigned short)UNKNOWN_TOKEN;
1863
1864 if (assignedToken == (unsigned short)UNKNOWN_TOKEN)
1865 {
1866 //Get token
1867 if (topIndex != -1)
1868 {
1869 //Have free token
1870 token = topIndex;
1871 freeTokenList[token] = node; //Cannot be moved after setTopTokenIndex();
1872 setTopTokenIndex();
1873 #ifdef DEBUG_VERBOSE_ON
1874 printf("Use free token: %d, QUEUE SIZE: %d\n", token, linearScanLiveNodes.size());
1875 #endif
1876 }
1877 else
1878 {
1879 //Have no free, use the oldest
1880 SBNode* oldNode = reuseTokenSelection(node);
1881 token = oldNode->getLastInstruction()->getSetToken();
1882 tokenDepReduction(oldNode, node);
1883 freeTokenList[token] = node;
1884 #ifdef DEBUG_VERBOSE_ON
1885 printf("Reuse token: %d, current: %d %d, reuse: %d %d, QUEUE SIZE: %d\n", token, node->getSendID(), node->getNodeID(), oldNode->getSendID(), oldNode->getNodeID(), linearScanLiveNodes.size());
1886 #endif
1887 tokenReuseCount++;
1888 if (oldNode->hasAWDep())
1889 {
1890 AWtokenReuseCount++;
1891 }
1892 else if (oldNode->hasARDep())
1893 {
1894 ARtokenReuseCount++;
1895 }
1896 else
1897 {
1898 AAtokenReuseCount++;
1899 }
1900 node->setTokenReuseNode(oldNode);
1901 }
1902 }
1903 else
1904 {
1905 //This reuse pred node may itself have been reused already
1906 //when we ran short of free SBIDs, so it may not be in the active list.
1907 token = assignedToken;
1908 if (freeTokenList[token] != nullptr)
1909 { //If the live range of the predecessor node ends at the current node, the pred node may have expired already. Otherwise do the dependence reduction.
1910 SBNode* pred = freeTokenList[token];
1911 tokenDepReduction(pred, node);
1912 }
1913 freeTokenList[token] = node;
1914 if (topIndex == token)
1915 {
1916 setTopTokenIndex();
1917 }
1918 #ifdef DEBUG_VERBOSE_ON
1919 printf("Reuse token: %d, QUEUE SIZE: %d\n", token, linearScanLiveNodes.size());
1920 #endif
1921 }
1922 #ifdef DEBUG_VERBOSE_ON
1923 printf("Assigned token: %d, node: %d, send: %d, QUEUE SIZE: %d\n", token, node->getNodeID(), node->getSendID(), linearScanLiveNodes.size());
1924 #endif
1925
1926 //Set token to send
1927 node->getLastInstruction()->setSetToken(token);
1928 //For token reduction
1929 allTokenNodesMap[token].set(node->sendID);
1930
1931 //Sort succs according to the BBID and node ID.
1932 std::sort(node->succs.begin(), node->succs.end(), nodeSortCompare);
1933 for (auto node_it = node->succs.begin();
1934 node_it != node->succs.end();
1935 )
1936 {
1937 const SBDEP_ITEM& curSucc = (*node_it);
1938 SBNode* succ = curSucc.node;
1939 SBDependenceAttr attr = curSucc.attr;
1940
1941 if (attr == DEP_IMPLICIT)
1942 {
1943 node_it++;
1944 continue;
1945 }
1946
1947 // In a case like the following:
1948 // 1. math.rsqrt r20 r10 { $1 }
1949 // 2. math.in r50 r20 { $1 }
1950 // 3. mul r60 r50 r40 { $1.dst }
1951 if (tokenHonourInstruction(succ->getLastInstruction()))
1952 {
1953 unsigned distance = succ->getSendID() > node->getSendID() ? succ->getSendID() - node->getSendID() : node->getSendID() - succ->getSendID();
1954 if ((fg.builder->getOptions()->getOption(vISA_EnableISBIDBUNDLE) ||
1955 distance < totalTokenNum))
1956 {
1957 if ((curSucc.type == RAW || curSucc.type == WAW) &&
1958 succ->getLastInstruction()->getSetToken() == (unsigned short)UNKNOWN_TOKEN)
1959 {
1960 if (fg.builder->getOptions()->getOption(vISA_EnableDPASTokenReduction))
1961 {
1962 // If no instruction depends on the DPAS, no SBID is needed
1963 if (!(succ->GetInstruction()->isDpas() && succ->succs.size() == 0))
1964 {
1965 succ->getLastInstruction()->setSetToken(token);
1966 node->setLiveLatestID(succ->getLiveEndID(), succ->getLiveEndBBID());
1967 allTokenNodesMap[token].set(succ->sendID);
1968 succ->setTokenReuseNode(node);
1969 continue;
1970 }
1971 }
1972 else
1973 {
1974 succ->getLastInstruction()->setSetToken(token);
1975 node->setLiveLatestID(succ->getLiveEndID(), succ->getLiveEndBBID());
1976 allTokenNodesMap[token].set(succ->sendID);
1977 succ->setTokenReuseNode(node);
1978 continue;
1979 }
1980 }
1981 }
1982 }
1983
1984 node_it++;
1985 }
1986
1987 return;
1988 }
1989
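//addToLiveList inserts the node into linearScanLiveNodes, keeping the list sorted by live-end ID
//(ties broken by live-start ID, then node ID), and checks that the number of occupied freeTokenList
//slots matches the number of live nodes.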
1990 void SWSB::addToLiveList(SBNode* node)
1991 {
1992 bool insert = false;
1993 assert(linearScanLiveNodes.size() < totalTokenNum);
1994 for (SBNODE_LIST_ITER node_it = linearScanLiveNodes.begin();
1995 node_it != linearScanLiveNodes.end();
1996 node_it++)
1997 {
1998 const SBNode* curNode = (*node_it);
1999
2000 //Sort in ascending order of the live-range end ID.
2001 if (curNode->getLiveEndID() > node->getLiveEndID())
2002 {
2003 linearScanLiveNodes.insert(node_it, node);
2004 insert = true;
2005 break;
2006 }
2007 else if (curNode->getLiveEndID() == node->getLiveEndID())
2008 {
2009 if (curNode->getLiveStartID() > node->getLiveStartID())
2010 {
2011 linearScanLiveNodes.insert(node_it, node);
2012 insert = true;
2013 break;
2014 }
2015 else if (curNode->getLiveStartID() == node->getLiveStartID())
2016 {
2017 if (curNode->getNodeID() > node->getNodeID())
2018 {
2019 linearScanLiveNodes.insert(node_it, node);
2020 insert = true;
2021 break;
2022 }
2023 }
2024 }
2025 }
2026
2027 if (!insert)
2028 {
2029 linearScanLiveNodes.push_back(node);
2030 }
2031
2032 unsigned usedToken = 0;
2033 for (const SBNode *node : freeTokenList)
2034 {
2035 if (node != nullptr)
2036 {
2037 usedToken++;
2038 }
2039 }
2040 assert(usedToken == linearScanLiveNodes.size());
2041
2042 #ifdef DEBUG_VERBOSE_ON
2043 printf("Add token: %d\n", node->getLastInstruction()->getSetToken());
2044 #endif
2045 return;
2046 }
2047
2048 //
2049 // Global reaching define analysis for tokens
2050 //
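//Informally, the fixed point computed below is:
//  liveIn(BB)  |= union of liveOut(P) over all scalar and SIMD-CF predecessors P
//  liveOut(BB) |= liveIn(BB) - nodes of every token killed in BB
//Both sets only ever grow, which is why the iteration in SWSBGlobalTokenAnalysis() terminates.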
2051 bool SWSB::globalTokenReachAnalysis(G4_BB* bb)
2052 {
2053 bool changed = false;
2054 unsigned bbID = bb->getId();
2055
2056 // Do nothing for the entry BB
2057 // Because it has no live in
2058 if (bb->Preds.empty())
2059 {
2060 return false;
2061 }
2062
2063 assert(BBVector[bbID]->liveInTokenNodes.getSize() != 0);
2064
2065 BitSet temp_live_in(unsigned(SBSendNodes.size()), false);
2066 temp_live_in = BBVector[bbID]->liveInTokenNodes;
2067
2068 //Union the live-out of all SIMD-CF predecessor BBs into the live-in of the current BB.
2069 for (const G4_BB_SB* predBB : BBVector[bbID]->Preds)
2070 {
2071 unsigned predID = predBB->getBB()->getId();
2072 temp_live_in |= BBVector[predID]->liveOutTokenNodes;
2073 }
2074
2075 //Union the live-out of all scalar predecessor BBs into the live-in of the current BB.
2076 for (const G4_BB* predBB : bb->Preds)
2077 {
2078 unsigned predID = predBB->getId();
2079 temp_live_in |= BBVector[predID]->liveOutTokenNodes;
2080 }
2081
2082 //If changed, take the new live-in; otherwise do nothing.
2083 if (temp_live_in != BBVector[bbID]->liveInTokenNodes)
2084 {
2085 changed = true;
2086 BBVector[bbID]->liveInTokenNodes = temp_live_in;
2087 }
2088
2089 //Calculate the live out according to the live in and killed tokens in current BB
2090 for (uint32_t token = 0; token < totalTokenNum; token++)
2091 {
2092 if (BBVector[bbID]->killedTokens.isSet(token))
2093 {
2094 temp_live_in -= allTokenNodesMap[token].bitset;
2095 }
2096 }
2097
2098 //Get the new live-out.
2099 //FIXME: is this right? The live-out is only ever grown.
2100 //Originally we only had the local live-out.
2101 //Should we separate the local live-out from the total live-out?
2102 //Not necessary: whatever can be live-out will always be live-out.
2103 BBVector[bbID]->liveOutTokenNodes |= temp_live_in;
2104
2105 return changed;
2106 }
2107
2108 void SWSB::SWSBGlobalTokenAnalysis()
2109 {
2110 bool change = true;
2111 while (change)
2112 {
2113 change = false;
2114 for (G4_BB* bb : fg)
2115 {
2116 if (globalTokenReachAnalysis(bb))
2117 {
2118 change = true;
2119 }
2120 }
2121 }
2122 }
2123
2124 void SWSB::SWSBGlobalScalarCFGReachAnalysis()
2125 {
2126 bool change = true;
2127 while (change)
2128 {
2129 change = false;
2130 for (G4_BB* bb : fg)
2131 {
2132 if (globalDependenceDefReachAnalysis(bb))
2133 {
2134 change = true;
2135 }
2136 }
2137 }
2138 }
2139
2140 void SWSB::SWSBGlobalSIMDCFGReachAnalysis()
2141 {
2142 bool change = true;
2143 while (change)
2144 {
2145 change = false;
2146 for (G4_BB* bb : fg)
2147 {
2148 if (globalDependenceUseReachAnalysis(bb))
2149 {
2150 change = true;
2151 }
2152 }
2153 }
2154 }
2155
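//setTopTokenIndex finds the next free slot in freeTokenList, scanning circularly from the current
//topIndex; topIndex becomes -1 when every token is occupied.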
2156 void SWSB::setTopTokenIndex()
2157 {
2158 int startIndex = topIndex;
2159 if (topIndex == -1)
2160 {
2161 startIndex = 0;
2162 }
2163 for (int i = startIndex; i < (int)totalTokenNum; i++)
2164 {
2165 if (freeTokenList[i] == nullptr)
2166 {
2167 topIndex = i;
2168 return;
2169 }
2170 }
2171 for (int i = 0; i < startIndex; i++)
2172 {
2173 if (freeTokenList[i] == nullptr)
2174 {
2175 topIndex = i;
2176 return;
2177 }
2178 }
2179
2180 topIndex = -1;
2181 }
2182
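//propogateDist updates, for each globally live send, the distance (in instructions) from its definition
//to this BB: the live-in distance is the minimum of the predecessors' live-out distances, and the
//live-out distance is the live-in distance plus the BB size for sends that pass through the BB unkilled.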
2183 bool SWSB::propogateDist(G4_BB* bb)
2184 {
2185 bool changed = false;
2186 unsigned bbID = bb->getId();
2187
2188 if (bb->Preds.empty())
2189 {
2190 return false;
2191 }
2192
2193 assert(BBVector[bbID]->send_live_in.getSize() != 0);
2194
2195 SBBitSets temp_live_in(globalSendNum);
2196 temp_live_in = BBVector[bbID]->send_live_in;
2197 std::vector<unsigned> tokenLiveInDist;
2198 tokenLiveInDist.resize(globalSendNum);
2199
2200 for (unsigned i = 0; i < globalSendNum; i++)
2201 {
2202 tokenLiveInDist[i] = BBVector[bbID]->tokenLiveInDist[i];
2203 }
2204
2205 //Get the live-out from all predecessor BBs
2206 for (const G4_BB* predBB : bb->Preds)
2207 {
2208 unsigned predID = predBB->getId();
2209
2210 for (unsigned i = 0; i < globalSendNum; i++)
2211 {
2212 if (BBVector[predID]->send_live_out.isDstSet(i) &&
2213 BBVector[predID]->tokenLiveOutDist[i] != -1 &&
2214 BBVector[predID]->tokenLiveOutDist[i] < tokenLiveInDist[i])
2215 {
2216 tokenLiveInDist[i] = BBVector[predID]->tokenLiveOutDist[i];
2217 }
2218 }
2219 }
2220
2221 //Update the live in
2222 for (unsigned i = 0; i < globalSendNum; i++)
2223 {
2224 if (tokenLiveInDist[i] != BBVector[bbID]->tokenLiveInDist[i] &&
2225 tokenLiveInDist[i] != -1)
2226 {
2227 changed = true;
2228 BBVector[bbID]->tokenLiveInDist[i] = tokenLiveInDist[i];
2229 }
2230 }
2231
2232 //Update the live out
2233 if (changed)
2234 {
2235 for (unsigned i = 0; i < globalSendNum; i++)
2236 {
2237 if (BBVector[bbID]->send_live_in.isDstSet(i) &&
2238 BBVector[bbID]->send_live_out.isDstSet(i) &&
2239 !BBVector[bbID]->send_may_kill.isDstSet(i))
2240 {
2241 BBVector[bbID]->tokenLiveOutDist[i] = BBVector[bbID]->tokenLiveInDist[i] + bb->size();
2242 }
2243 }
2244 }
2245
2246 return changed;
2247 }
2248
2249 void SWSB::calculateDist()
2250 {
2251 #ifdef DEBUG_VERBOSE_ON
2252 globalSBNodes.resize(globalSendNum);
2253 #endif
2254 //Initialize all live-out distances
2255 for (SBNode* node : SBSendNodes)
2256 {
2257 if (BBVector[node->getBBID()]->send_live_out.isDstSet(node->globalID))
2258 {
2259 BBVector[node->getBBID()]->tokenLiveOutDist[node->globalID] = BBVector[node->getBBID()]->last_node - node->getNodeID();
2260 #ifdef DEBUG_VERBOSE_ON
2261 globalSBNodes[node->globalID] = node;
2262 #endif
2263 }
2264 }
2265
2266 bool change = true;
2267 while (change)
2268 {
2269 change = false;
2270 for (G4_BB* bb : fg)
2271 {
2272 if (propogateDist(bb))
2273 {
2274 change = true;
2275 }
2276 }
2277 }
2278
2279 #ifdef DEBUG_VERBOSE_ON
2280 for (size_t i = 0; i < BBVector.size(); i++)
2281 {
2282 std::cerr << "BB" << i << ": " << BBVector[i]->first_node << "-" << BBVector[i]->last_node << ", succ<";
2283 for (std::list<G4_BB*>::iterator sit = BBVector[i]->getBB()->Succs.begin(); sit != BBVector[i]->getBB()->Succs.end(); ++sit)
2284 {
2285 std::cerr << (*sit)->getId() << ",";
2286 }
2287 std::cerr << "> pred<";
2288 for (std::list<G4_BB*>::iterator pit = BBVector[i]->getBB()->Preds.begin(); pit != BBVector[i]->getBB()->Preds.end(); ++pit)
2289 {
2290 std::cerr << (*pit)->getId() << ",";
2291 }
2292
2293 std::cerr << ">\n liveIn:";
2294 for (unsigned k = 0; k < globalSendNum; k++)
2295 {
2296 if (BBVector[i]->tokenLiveInDist[k] != -1)
2297 {
2298 std::cerr << " n" << globalSBNodes[k]->getNodeID() << ":" << BBVector[i]->tokenLiveInDist[k];
2299 }
2300 }
2301 std::cerr << "\n liveout:";
2302 for (unsigned k = 0; k < globalSendNum; k++)
2303 {
2304 if (BBVector[i]->tokenLiveOutDist[k] != -1)
2305 {
2306 std::cerr << " n" << globalSBNodes[k]->getNodeID() << ":" << BBVector[i]->tokenLiveOutDist[k];
2307 }
2308 }
2309 std::cerr << "\n\n";
2310 }
2311 #endif
2312
2313 }
2314
2315
2316 /* Quick token allocation: allocate the tokens in round-robin order.
2317 */
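//For example (illustrative), with totalTokenNum == 16 the non-EOT sends simply receive
//$0, $1, ..., $15, $0, $1, ... in program order, and assignDepTokens() then attaches the
//corresponding .dst/.src dependences to their consumers.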
2318 void SWSB::quickTokenAllocation()
2319 {
2320 uint32_t token = 0;
2321
2322 //Linear scan
2323 for (SBNode* node : SBSendNodes)
2324 {
2325 if (node->getLastInstruction()->isEOT())
2326 {
2327 continue;
2328 }
2329
2330 assert(node->getLastInstruction()->getSetToken() == (unsigned short)UNKNOWN_TOKEN);
2331 node->getLastInstruction()->setSetToken(token);
2332 if (token >= totalTokenNum - 1)
2333 {
2334 token = 0;
2335 }
2336 else
2337 {
2338 token ++;
2339 }
2340 }
2341
2342 assignDepTokens();
2343 }
2344
2345 /* A linear scan algorithm is used for the token allocation,
2346 * based on the assumption that instruction scheduling has already placed the instructions as well as possible.
2347 * FIXME: instruction scheduling doesn't consider the token pressure issue.
2348 */
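//Sketch of the loop below: for each send node in liveness order,
//  1. expire intervals that ended before its start (expireIntervals),
//  2. pick or reuse an SBID (assignToken), possibly propagating it to dependent out-of-order nodes,
//  3. insert the node into the active list sorted by live-end ID (addToLiveList).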
2349 void SWSB::tokenAllocation()
2350 {
2351 //build live intervals
2352 buildLiveIntervals();
2353
2354 //Initial free token list
2355 freeTokenList.resize(totalTokenNum);
2356 topIndex = 0;
2357
2358 tokenProfile.setTokenInstructionCount((int)SBSendNodes.size());
2359 uint32_t AWTokenReuseCount = 0;
2360 uint32_t ARTokenReuseCount = 0;
2361 uint32_t AATokenReuseCount = 0;
2362 uint32_t mathInstCount = 0;
2363 //Linear scan
2364 //Assign tokens to nodes in the order of liveness. Here we only need to
2365 //iterate SB nodes in that order, and don't actually need to sort
2366 //SBSendNodes as it might be referenced through allTokenNodesMap.
2367 auto sortInLivenessOrder = [](const SBNODE_VECT& vec) {
2368 SBNODE_VECT sorted(vec.size());
2369 std::partial_sort_copy(vec.begin(), vec.end(), sorted.begin(), sorted.end(), compareInterval);
2370 return sorted;
2371 };
2372 const bool enableSendTokenReduction = fg.builder->getOptions()->getOption(vISA_EnableSendTokenReduction);
2373 const bool enableDPASTokenReduction = fg.builder->getOptions()->getOption(vISA_EnableDPASTokenReduction);
2374 for (SBNode* node : sortInLivenessOrder(SBSendNodes))
2375 {
2376 unsigned startID = node->getLiveStartID();
2377 G4_INST* inst = node->getLastInstruction();
2378 #ifdef DEBUG_VERBOSE_ON
2379 printf("\n=======nodeID: %d, startID: %d, endID: %d\n", node->getNodeID(), node->getLiveStartID(), node->getLiveEndID());
2380 #endif
2381 if (inst->isEOT())
2382 {
2383 continue;
2384 }
2385
2386 if (enableSendTokenReduction && node->succs.size() == 0)
2387 {
2388 continue;
2389 }
2390
2391 if (enableDPASTokenReduction)
2392 {
2393 //If no instruction depends on the DPAS instruction, no SBID is needed
2394 if (inst->isDpas() && node->succs.size() == 0)
2395 {
2396 continue;
2397 }
2398 }
2399
2400 if (inst->isMathPipeInst())
2401 {
2402 mathInstCount++;
2403 }
2404
2405 expireIntervals(startID);
2406
2407 unsigned short assignedToken = node->getLastInstruction()->getSetToken();
2408 //If token reuse happened, and the live range of old node is longer than current one,
2409 //we will keep the old one in the active list.
2410 assignToken(node, assignedToken,
2411 AWTokenReuseCount,
2412 ARTokenReuseCount,
2413 AATokenReuseCount);
2414
2415 addToLiveList(node);
2416 }
2417
2418 #ifdef DEBUG_VERBOSE_ON
2419 dumpTokeAssignResult();
2420 #endif
2421
2422 if (fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
2423 {
2424 for (G4_BB_SB* sb_bb : BBVector)
2425 {
2426 sb_bb->getLiveOutToken(unsigned(SBSendNodes.size()), &SBNodes);
2427 }
2428 #ifdef DEBUG_VERBOSE_ON
2429 dumpTokenLiveInfo();
2430 #endif
2431 SWSBGlobalTokenAnalysis();
2432
2433 #ifdef DEBUG_VERBOSE_ON
2434 dumpTokenLiveInfo();
2435 #endif
2436
2437 unsigned prunedEdgeNum = 0;
2438 unsigned prunedGlobalEdgeNum = 0;
2439 unsigned prunedDiffBBEdgeNum = 0;
2440 unsigned prunedDiffBBSameTokenEdgeNum = 0;
2441 tokenEdgePrune(prunedEdgeNum, prunedGlobalEdgeNum, prunedDiffBBEdgeNum, prunedDiffBBSameTokenEdgeNum);
2442 tokenProfile.setPrunedEdgeNum(prunedEdgeNum);
2443 tokenProfile.setPrunedGlobalEdgeNum(prunedGlobalEdgeNum);
2444 tokenProfile.setPrunedDiffBBEdgeNum(prunedDiffBBEdgeNum);
2445 tokenProfile.setPrunedDiffBBSameTokenEdgeNum(prunedDiffBBSameTokenEdgeNum);
2446 }
2447
2448 assignDepTokens();
2449
2450 tokenProfile.setAWTokenReuseCount(AWTokenReuseCount);
2451 tokenProfile.setARTokenReuseCount(ARTokenReuseCount);
2452 tokenProfile.setAATokenReuseCount(AATokenReuseCount);
2453 tokenProfile.setMathInstCount(mathInstCount);
2454 }
2455
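//reuseTokenSelectionGlobal chooses, among the tokens reachable at this point (reachTokenArray and,
//optionally, the sibling uses in reachUseArray), the token whose reuse is expected to cost the least:
//a candidate's overhead is its remaining latency (depDelay minus the distance to the current node)
//plus any overhead already accumulated on it, and the token with the smallest worst-case overhead wins.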
2456 unsigned short SWSB::reuseTokenSelectionGlobal(SBNode* node, G4_BB* bb, SBNode*& candidateNode, bool& fromSibling)
2457 {
2458 SBBitSets temp_live_in(globalSendNum);
2459 temp_live_in = BBVector[bb->getId()]->send_live_in;
2460 unsigned short reuseToken = (unsigned short)UNKNOWN_TOKEN;
2461 unsigned nodeReuseOverhead = -1;
2462
2463 tokenReuseCount++;
2464 for (unsigned int i = 0; i < totalTokenNum; i++)
2465 {
2466 unsigned nodeDist = -1;
2467 unsigned tokenReuseOverhead = 0;
2468 SBNode* candidateTokenNode = nullptr;
2469 unsigned short curToken = (unsigned short)UNKNOWN_TOKEN;
2470 bool fromUse = false;
2471
2472 for (SBNode* liveNode : *reachTokenArray[i])
2473 {
2474 unsigned liveNodeDelay = liveNode->getDepDelay();
2475 unsigned liveNodeOverhead = 0;
2476
2477 //What about a global send that comes back to the current BB?
2478 //It shouldn't be assigned.
2479 if ((liveNode->globalID != -1) &&
2480 (BBVector[bb->getId()]->tokenLiveInDist[liveNode->globalID] != -1) &&
2481 (liveNode->getBBID() != bb->getId() || liveNode->getNodeID() > node->getNodeID()) )
2482 {
2483 nodeDist = BBVector[bb->getId()]->tokenLiveInDist[liveNode->globalID] + (node->getNodeID() - BBVector[bb->getId()]->first_node);
2484 }
2485 else
2486 {
2487 if (liveNode->getBBID() == bb->getId())
2488 {
2489 nodeDist = node->getNodeID() - liveNode->getNodeID();
2490 }
2491 else //Not a dst live-out global, whose distance is not calculated; use the node distance
2492 {
2493 nodeDist = node->getNodeID() > liveNode->getNodeID() ? node->getNodeID() - liveNode->getNodeID() : liveNode->getNodeID() - node->getNodeID();
2494 }
2495 }
2496
2497 liveNodeOverhead = (liveNodeDelay > nodeDist ? (liveNodeDelay - nodeDist) : 0);
2498 liveNodeOverhead += liveNode->reuseOverhead;
2499
2500 if ((candidateTokenNode == nullptr) || (liveNodeOverhead > tokenReuseOverhead))
2501 {
2502 tokenReuseOverhead = liveNodeOverhead;
2503 candidateTokenNode = liveNode;
2504 curToken = i;
2505 fromUse = false;
2506 }
2507 }
2508
2509 if (fromSibling)
2510 {
2511 for (SBNode* useNode : *reachUseArray[i])
2512 {
2513 unsigned nodeDelay = node->getDepDelay();
2514 unsigned nodeOverhead = 0;
2515
2516 //What about a global send that comes back to the current BB?
2517 //It shouldn't be assigned.
2518 if ((node->globalID != -1) &&
2519 (BBVector[useNode->getBBID()]->tokenLiveInDist[node->globalID] != -1) &&
2520 (useNode->getBBID() != bb->getId() || useNode->getNodeID() > node->getNodeID()))
2521 {
2522 nodeDist = BBVector[useNode->getBBID()]->tokenLiveInDist[node->globalID] + (useNode->getNodeID() - BBVector[useNode->getBBID()]->first_node);
2523 }
2524 else
2525 {
2526 assert(useNode->getBBID() == bb->getId());
2527 nodeDist = node->getNodeID() - useNode->getNodeID();
2528 }
2529
2530 nodeOverhead = (nodeDelay > nodeDist ? (nodeDelay - nodeDist) : 0);
2531 nodeOverhead += node->reuseOverhead;
2532
2533 if ((candidateTokenNode == nullptr) || (nodeOverhead > tokenReuseOverhead))
2534 {
2535 tokenReuseOverhead = nodeOverhead;
2536 candidateTokenNode = useNode;
2537 curToken = i;
2538 fromUse = true;
2539 }
2540 }
2541 }
2542
2543 if (candidateTokenNode && (tokenReuseOverhead < nodeReuseOverhead))
2544 {
2545 nodeReuseOverhead = tokenReuseOverhead;
2546 candidateNode = candidateTokenNode;
2547 reuseToken = curToken;
2548 fromSibling = fromUse;
2549 }
2550 }
2551
2552 assert(candidateNode != nullptr);
2553 if (!fromSibling)
2554 {
2555 node->reuseOverhead += nodeReuseOverhead;
2556 }
2557
2558 return reuseToken;
2559 }
2560
2561 void SWSB::expireLocalIntervals(unsigned startID, unsigned BBID)
2562 {
2563 for (SBNODE_VECT_ITER it = localTokenUsage.begin(); it != localTokenUsage.end();)
2564 {
2565 SBNode* node = (*it);
2566
2567 if (node->getLiveEndID() < startID)
2568 {
2569 it = localTokenUsage.erase(it);
2570 BBVector[BBID]->localReachingSends.setDst(node->sendID, false);
2571 continue;
2572 }
2573 it++;
2574 }
2575 }
2576
2577 void SWSB::assignTokenToPred(SBNode* node, SBNode* pred, G4_BB* bb)
2578 {
2579 unsigned predDist = -1;
2580 SBNode* canidateNode = nullptr;
2581
2582 assert(pred->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN);
2583
2584 for (auto node_it = node->preds.begin();
2585 node_it != node->preds.end(); node_it++)
2586 {
2587 SBDEP_ITEM& curPred = (*node_it);
2588 SBNode* otherPred = curPred.node;
2589 DepType type = curPred.type;
2590 unsigned dist = 0;
2591
2592 if (otherPred == pred)
2593 {
2594 continue;
2595 }
2596
2597 if (tokenHonourInstruction(otherPred->getLastInstruction()) &&
2598 (otherPred->getLastInstruction()->getSetToken() == (unsigned short)UNKNOWN_TOKEN) &&
2599 (type == RAW || type == WAW || otherPred->getLastInstruction()->getDst() == nullptr))
2600 {
2601 if ((!otherPred->reachingSends.isDstSet(pred->sendID)) &&
2602 (!pred->reachingSends.isDstSet(otherPred->sendID)))
2603 {
2604 if (otherPred->globalID != -1 &&
2605 BBVector[node->getBBID()]->tokenLiveInDist[otherPred->globalID] != -1)
2606 {
2607 dist = BBVector[node->getBBID()]->tokenLiveInDist[otherPred->globalID] + (node->getNodeID() - BBVector[node->getBBID()]->first_node);
2608 }
2609 else
2610 {
2611 assert(otherPred->getBBID() == bb->getId());
2612 dist = node->getNodeID() - otherPred->getNodeID();
2613 }
2614 if (dist < predDist)
2615 {
2616 canidateNode = otherPred;
2617 predDist = dist;
2618 }
2619 }
2620 }
2621 }
2622
2623 if (canidateNode != nullptr)
2624 {
2625 canidateNode->getLastInstruction()->setSetToken(pred->getLastInstruction()->getSetToken());
2626 #ifdef DEBUG_VERBOSE_ON
2627 printf("Node: %d, PRED assign: %d, token: %d\n", node->getNodeID(), canidateNode->getNodeID(), canidateNode->getLastInstruction()->getSetToken());
2628 #endif
2629 }
2630 }
2631
2632 bool SWSB::assignTokenWithPred(SBNode* node, G4_BB* bb)
2633 {
2634 unsigned predDist = -1;
2635 SBNode* canidateNode = nullptr;
2636 for (auto node_it = node->preds.begin();
2637 node_it != node->preds.end(); node_it++)
2638 {
2639 SBDEP_ITEM& curPred = (*node_it);
2640 SBNode* pred = curPred.node;
2641 DepType type = curPred.type;
2642 unsigned dist = 0;
2643
2644 if (tokenHonourInstruction(pred->getLastInstruction()) &&
2645 (pred->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN) &&
2646 ((type == RAW) ||(type == WAW) || (pred->getLastInstruction()->getDst() == nullptr)))
2647 {
2648 if ((pred->globalID != -1) &&
2649 (BBVector[bb->getId()]->tokenLiveInDist[pred->globalID] != -1))
2650 {
2651 dist = BBVector[bb->getId()]->tokenLiveInDist[pred->globalID] + (node->getNodeID() - BBVector[bb->getId()]->first_node);
2652 }
2653 else
2654 {
2655 if (fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation))
2656 {
2657 if (pred->getBBID() == bb->getId())
2658 {
2659 dist = node->getNodeID() - pred->getNodeID();
2660 }
2661 else
2662 {
2663 #ifdef DEBUG_VERBOSE_ON
2664 printf("Untracked distance: pred: BB%d:%d -- succ: BB%d:%d\n", pred->getBBID(), pred->getNodeID(), node->getBBID(), node->getNodeID());
2665 #endif
2666 dist = node->getNodeID() - BBVector[bb->getId()]->first_node;
2667 }
2668 }
2669 else
2670 {
2671 assert(pred->getBBID() == bb->getId());
2672 dist = node->getNodeID() - pred->getNodeID();
2673 }
2674 }
2675 if (dist < predDist)
2676 {
2677 canidateNode = pred;
2678 predDist = dist;
2679 }
2680 }
2681 }
2682
2683 if (canidateNode != nullptr)
2684 {
2685 node->getLastInstruction()->setSetToken(canidateNode->getLastInstruction()->getSetToken());
2686 allTokenNodesMap[canidateNode->getLastInstruction()->getSetToken()].set(node->sendID);
2687 #ifdef DEBUG_VERBOSE_ON
2688 printf("Node: %d, pred reuse assign: %d, token: %d\n", node->getNodeID(), canidateNode->getNodeID(), node->getLastInstruction()->getSetToken());
2689 #endif
2690 return true;
2691 }
2692
2693 return false;
2694 }
2695
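//allocateToken performs the per-BB global assignment for sends that are still unassigned: it first
//collects, per token, the already-assigned sends that reach the current node (reachTokenArray) and
//their pending uses (reachUseArray), then tries in order
//  1. the token of a RAW/WAW predecessor (assignTokenWithPred),
//  2. a token held only by exclusive nodes of the successors (coalescing),
//  3. the first token nobody can reach,
//  4. otherwise the cheapest token to reuse (reuseTokenSelectionGlobal).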
2696 void SWSB::allocateToken(G4_BB* bb)
2697 {
2698 if ((BBVector[bb->getId()]->first_send_node == -1) ||
2699 BBVector[bb->getId()]->tokenAssigned)
2700 {
2701 return;
2702 }
2703
2704 BBVector[bb->getId()]->localReachingSends = SBBitSets(SBSendNodes.size());
2705
2706 assert((BBVector[bb->getId()]->last_send_node != -1) &&
2707 (BBVector[bb->getId()]->first_send_node <= BBVector[bb->getId()]->last_send_node));
2708
2709 SBBitSets send_live(SBSendNodes.size());
2710 SBBitSets send_use(SBSendUses.size());
2711
2712 for (int i = BBVector[bb->getId()]->first_send_node; i <= BBVector[bb->getId()]->last_send_node; i++)
2713 {
2714 SBNode* node = SBSendNodes[i];
2715
2716 if (node->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
2717 {
2718 continue;
2719 }
2720
2721 if (node->getLastInstruction()->isDpas() && node->succs.size() == 0 &&
2722 fg.builder->getOptions()->getOption(vISA_EnableDPASTokenReduction))
2723 {
2724 continue;
2725 }
2726
2727 send_live = node->reachingSends; //The tokens will reach current node
2728
2729 for (unsigned k = 0; k < totalTokenNum; k++)
2730 {
2731 reachTokenArray[k]->clear();
2732 reachUseArray[k]->clear();
2733 }
2734
2735 for (size_t k = 0; k < SBSendNodes.size(); k++)
2736 {
2737 SBNode* liveNode = SBSendNodes[k];
2738 if ((liveNode->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN) &&
2739 (send_live.isDstSet(k) ||
2740 (send_live.isSrcSet(k) &&
2741 isPrefetch(liveNode->getLastInstruction()))))
2742 {
2743 reachTokenArray[liveNode->getLastInstruction()->getSetToken()]->push_back(liveNode);
2744 }
2745 }
2746
2747 if (!fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation) && (node->reachedUses.getSize() != 0))
2748 {
2749 send_use = node->reachedUses; //The uses of other sends can be reached by current node.
2750 for (size_t k = 0; k < SBSendUses.size(); k++)
2751 {
2752 SBNode* liveNode = SBSendUses[k];
2753 if (send_use.isDstSet(k))
2754 {
2755 for (size_t m = 0; m < liveNode->preds.size(); m++)
2756 {
2757 SBDEP_ITEM& curPred = liveNode->preds[m];
2758 SBNode* pred = curPred.node;
2759 if (pred->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
2760 {
2761 reachUseArray[pred->getLastInstruction()->getSetToken()]->push_back(liveNode);
2762 }
2763 }
2764 }
2765 }
2766 }
2767
2768 if (!assignTokenWithPred(node, bb))
2769 {
2770 bool assigned = false;
2771
2772 //Assigned with coalescing
2773 if (!fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation) && (node->reachedUses.getSize() != 0))
2774 {
2775 for (size_t i = 0; i < node->succs.size(); i++)
2776 {
2777 SBDEP_ITEM& curSucc = node->succs[i];
2778
2779 if (!curSucc.exclusiveNodes.size())
2780 {
2781 continue;
2782 }
2783
2784 for (size_t j = 0; j < curSucc.exclusiveNodes.size(); j++)
2785 {
2786 SBNode* exclusiveNode = curSucc.exclusiveNodes[j];
2787 unsigned short exToken = exclusiveNode->getLastInstruction()->getSetToken();
2788 if (exToken != (unsigned short)UNKNOWN_TOKEN)
2789 {
2790 if (reachTokenArray[exToken]->size() == 0 &&
2791 reachUseArray[exToken]->size() == 0)
2792 {
2793 node->getLastInstruction()->setSetToken(exToken);
2794 allTokenNodesMap[exToken].set(node->sendID);
2795 #ifdef DEBUG_VERBOSE_ON
2796 printf("node: %d :: Use exclusive token: %d\n", node->getNodeID(), exToken);
2797 #endif
2798 assigned = true;
2799 break;
2800 }
2801 }
2802 }
2803 }
2804 }
2805
2806 if (!assigned)
2807 {
2808 //Assigned with first free token
2809 for (unsigned k = 0; k < totalTokenNum; k++)
2810 {
2811 if ((reachTokenArray[k]->size() == 0) &&
2812 (reachUseArray[k]->size() == 0))
2813 {
2814 node->getLastInstruction()->setSetToken(k);
2815 allTokenNodesMap[k].set(node->sendID);
2816 assigned = true;
2817 #ifdef DEBUG_VERBOSE_ON
2818 printf("node: %d :: Use free token: %d\n", node->getNodeID(), k);
2819 #endif
2820 break;
2821 }
2822 }
2823 }
2824
2825 //All tokens are assigned
2826 if (!assigned)
2827 {
2828 SBNode* reuseNode = nullptr;
2829 bool reuseSibling = !fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation) && (node->reachedUses.getSize() != 0);
2830 unsigned short reuseToken = reuseTokenSelectionGlobal(node, bb, reuseNode, reuseSibling);
2831
2832 #ifdef DEBUG_VERBOSE_ON
2833 if (!reuseSibling)
2834 {
2835 printf("node: %d :: Reuse token: %d, from node: %d\n", node->getNodeID(), reuseToken, reuseNode->getNodeID());
2836 }
2837 else
2838 {
2839 printf("node: %d :: Reuse token: %d, from use node: %d\n", node->getNodeID(), reuseToken, reuseNode->getNodeID());
2840 }
2841 #endif
2842
2843 node->getLastInstruction()->setSetToken(reuseToken);
2844 allTokenNodesMap[reuseToken].set(node->sendID);
2845 }
2846 }
2847 }
2848 }
2849
2850 void SWSB::tokenAllocationBB(G4_BB* bb)
2851 {
2852 //Token allocation
2853 allocateToken(bb);
2854 BBVector[bb->getId()]->tokenAssigned = true;
2855
2856 //Depth-first allocation.
2857 for (const G4_BB_SB *succ : BBVector[bb->getId()]->domSuccs)
2858 {
2859 if (!succ->tokenAssigned)
2860 {
2861 tokenAllocationBB(succ->getBB());
2862 }
2863 }
2864 }
2865
2866 void SWSB::tokenAllocationWithDistPropogationPerBB(G4_BB* bb)
2867 {
2868 propogateDist(bb);
2869 allocateToken(bb);
2870 BBVector[bb->getId()]->tokenAssigned = true;
2871
2872 for (const G4_BB_SB *succ : BBVector[bb->getId()]->domSuccs)
2873 {
2874 if (!succ->tokenAssigned)
2875 {
2876 tokenAllocationWithDistPropogationPerBB(succ->getBB());
2877 }
2878 }
2879 }
2880
2881 void SWSB::tokenAllocationWithDistPropogation()
2882 {
2883 #ifdef DEBUG_VERBOSE_ON
2884 globalSBNodes.resize(globalSendNum);
2885 #endif
2886 //Initialize all live-out distances
2887 for (const SBNode* node : SBSendNodes)
2888 {
2889 if (BBVector[node->getBBID()]->send_live_out.isDstSet(node->globalID))
2890 {
2891 BBVector[node->getBBID()]->tokenLiveOutDist[node->globalID] = BBVector[node->getBBID()]->last_node - node->getNodeID();
2892 #ifdef DEBUG_VERBOSE_ON
2893 globalSBNodes[node->globalID] = node;
2894 #endif
2895 }
2896 }
2897
2898 tokenAllocationWithDistPropogationPerBB(*fg.begin());
2899
2900 #ifdef DEBUG_VERBOSE_ON
2901 for (size_t i = 0; i < BBVector.size(); i++)
2902 {
2903 const G4_BB_SB *bb = BBVector[i];
2904 std::cerr << "BB" << i << ": " << bb->first_node << "-" << bb->last_node << ", succ<";
2905 for (const G4_BB* succ : bb->getBB()->Succs)
2906 {
2907 std::cerr << succ->getId() << ",";
2908 }
2909 std::cerr << "> pred<";
2910 for (const G4_BB* pred : bb->getBB()->Preds)
2911 {
2912 std::cerr << pred->getId() << ",";
2913 }
2914
2915 std::cerr << ">\n liveIn:";
2916 for (unsigned k = 0; k < globalSendNum; k++)
2917 {
2918 if (bb->tokenLiveInDist[k] != -1)
2919 {
2920 std::cerr << " n" << globalSBNodes[k]->getNodeID() << ":" << bb->tokenLiveInDist[k];
2921 }
2922 }
2923 std::cerr << "\n liveout:";
2924 for (unsigned k = 0; k < globalSendNum; k++)
2925 {
2926 if (bb->tokenLiveOutDist[k] != -1)
2927 {
2928 std::cerr << " n" << globalSBNodes[k]->getNodeID() << ":" << bb->tokenLiveOutDist[k];
2929 }
2930 }
2931 std::cerr << "\n\n";
2932 }
2933 #endif
2934
2935 }
2936
2937 void SWSB::buildExclusiveForCoalescing()
2938 {
2939 for (SBNode* node : SBSendNodes)
2940 {
2941 G4_INST* inst = node->getLastInstruction();
2942
2943 if (inst->isEOT())
2944 {
2945 continue;
2946 }
2947
2948 //If the current node is one with a local live range, reuse cannot happen, because other nodes can definitely reach it.
2949 if (node->globalID == -1)
2950 {
2951 continue;
2952 }
2953
2954 SBBitSets send_live(SBSendNodes.size());
2955
2956 for (SBDEP_ITEM& curSucc : node->succs)
2957 {
2958 SBNode* succ = curSucc.node;
2959 DepType type = curSucc.type;
2960 if (((type == RAW) || (type == WAW)) && (succ->reachingSends.getSize() != 0))
2961 {
2962 send_live = succ->reachingSends;
2963 //FIXME: the complexity may be a little too high, n*n*succSize
2964 for (size_t k = 0; k < SBSendNodes.size(); k++)
2965 {
2966 SBNode* liveNode = SBSendNodes[k];
2967 if (send_live.isDstSet(k) &&
2968 (liveNode != node) &&
2969 (!(liveNode->reachingSends.isDstSet(node->sendID) ||
2970 node->reachingSends.isDstSet(liveNode->sendID)) ||
2971 tokenHonourInstruction(succ->GetInstruction())))
2972 //If the use is a token-honour instruction and is assigned the same token as the pred,
2973 //it will cause a dependence anyway, which cannot be removed.
2974 //FIXME: but one send can depend on multiple previous sends.
2975 //Only the token set on the send causes a non-removable dependence.
2976 {
2977 addReachingUseSet(liveNode, succ);
2978 }
2979 }
2980 }
2981
2982 if ((succ->preds.size() <= 1) ||( curSucc.exclusiveNodes.size()))
2983 {
2984 continue;
2985 }
2986
2987 if (!((succ->getBBID() == node->getBBID() && succ->getNodeID() > node->getNodeID()) ||
2988 (succ->getBBID() != node->getBBID())))
2989 {
2990 continue;
2991 }
2992
2993 for (const SBDEP_ITEM& curPred : succ->preds)
2994 {
2995 DepType type = curPred.type;
2996 SBNode* pred = curPred.node;
2997
2998 if (pred == node)
2999 {
3000 continue;
3001 }
3002
3003 if (type == WAW || type == RAW)
3004 {
3005 if (!((succ->getBBID() == pred->getBBID() && succ->getNodeID() > pred->getNodeID()) ||
3006 (succ->getBBID() != pred->getBBID())))
3007 {
3008 continue;
3009 }
3010
3011 curSucc.exclusiveNodes.push_back(pred);
3012 }
3013 }
3014 }
3015 }
3016
3017 return;
3018 }
3019
3020 void SWSB::tokenAllocationGlobalWithPropogation()
3021 {
3022 #ifdef DEBUG_VERBOSE_ON
3023 dumpDepInfo();
3024 #endif
3025
3026 buildExclusiveForCoalescing();
3027
3028 reachTokenArray.resize(totalTokenNum);
3029 reachUseArray.resize(totalTokenNum);
3030
3031 for (int bucket_i = 0; bucket_i != (int)totalTokenNum; ++bucket_i)
3032 {
3033 void* allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3034 reachTokenArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3035
3036 allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3037 reachUseArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3038 }
3039
3040 tokenAllocationWithDistPropogation();
3041
3042 if (fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
3043 {
3044 for (G4_BB_SB *bb : BBVector)
3045 {
3046 bb->getLiveOutToken(unsigned(SBSendNodes.size()), &SBNodes);
3047 }
3048 #ifdef DEBUG_VERBOSE_ON
3049 dumpTokenLiveInfo();
3050 #endif
3051
3052 SWSBGlobalTokenAnalysis();
3053
3054 #ifdef DEBUG_VERBOSE_ON
3055 dumpTokenLiveInfo();
3056 #endif
3057
3058
3059 unsigned prunedEdgeNum = 0;
3060 unsigned prunedGlobalEdgeNum = 0;
3061 unsigned prunedDiffBBEdgeNum = 0;
3062 unsigned prunedDiffBBSameTokenEdgeNum = 0;
3063 tokenEdgePrune(prunedEdgeNum, prunedGlobalEdgeNum, prunedDiffBBEdgeNum, prunedDiffBBSameTokenEdgeNum);
3064 tokenProfile.setPrunedEdgeNum(prunedEdgeNum);
3065 tokenProfile.setPrunedGlobalEdgeNum(prunedGlobalEdgeNum);
3066 tokenProfile.setPrunedDiffBBEdgeNum(prunedDiffBBEdgeNum);
3067 tokenProfile.setPrunedDiffBBSameTokenEdgeNum(prunedDiffBBSameTokenEdgeNum);
3068 }
3069
3070 assignDepTokens();
3071 }
3072
3073 void SWSB::tokenAllocationGlobal()
3074 {
3075 G4_BB* bb = *fg.begin();
3076
3077 #ifdef DEBUG_VERBOSE_ON
3078 dumpDepInfo();
3079 #endif
3080
3081 calculateDist();
3082
3083 buildExclusiveForCoalescing();
3084
3085 reachTokenArray.resize(totalTokenNum);
3086 reachUseArray.resize(totalTokenNum);
3087
3088 for (int bucket_i = 0; bucket_i != (int)totalTokenNum; ++bucket_i)
3089 {
3090 void* allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3091 reachTokenArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3092
3093 allocedMem = mem.alloc(sizeof(SBNODE_VECT));
3094 reachUseArray[bucket_i] = new (allocedMem) SBNODE_VECT();
3095 }
3096
3097 tokenAllocationBB(bb);
3098
3099 if (fg.builder->getOptions()->getOption(vISA_SWSBDepReduction))
3100 {
3101 for (G4_BB_SB *bb : BBVector)
3102 {
3103 bb->getLiveOutToken(unsigned(SBSendNodes.size()), &SBNodes);
3104 }
3105 #ifdef DEBUG_VERBOSE_ON
3106 dumpTokenLiveInfo();
3107 #endif
3108
3109 SWSBGlobalTokenAnalysis();
3110
3111 #ifdef DEBUG_VERBOSE_ON
3112 dumpTokenLiveInfo();
3113 #endif
3114
3115
3116 unsigned prunedEdgeNum = 0;
3117 unsigned prunedGlobalEdgeNum = 0;
3118 unsigned prunedDiffBBEdgeNum = 0;
3119 unsigned prunedDiffBBSameTokenEdgeNum = 0;
3120 tokenEdgePrune(prunedEdgeNum, prunedGlobalEdgeNum, prunedDiffBBEdgeNum, prunedDiffBBSameTokenEdgeNum);
3121 tokenProfile.setPrunedEdgeNum(prunedEdgeNum);
3122 tokenProfile.setPrunedGlobalEdgeNum(prunedGlobalEdgeNum);
3123 tokenProfile.setPrunedDiffBBEdgeNum(prunedDiffBBEdgeNum);
3124 tokenProfile.setPrunedDiffBBSameTokenEdgeNum(prunedDiffBBSameTokenEdgeNum);
3125 }
3126
3127 assignDepTokens();
3128 }
3129
3130 G4_INST* SWSB::insertSyncInstruction(G4_BB* bb, INST_LIST_ITER nextIter, int CISAOff, int lineNo)
3131 {
3132 G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3133 G4_INST* syncInst = fg.builder->createSync(G4_sync_nop, src0);
3134 bb->insertBefore(nextIter, syncInst);
3135 syncInstCount++;
3136
3137 return syncInst;
3138 }
3139
3140 G4_INST* SWSB::insertSyncInstructionAfter(G4_BB* bb, INST_LIST_ITER iter, int CISAOff, int lineNo)
3141 {
3142 INST_LIST_ITER nextIter = iter;
3143 nextIter++;
3144 G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3145 G4_INST* syncInst = fg.builder->createSync(G4_sync_nop, src0);
3146 bb->insertBefore(nextIter, syncInst);
3147 syncInstCount++;
3148
3149 return syncInst;
3150 }
3151
3152 G4_INST* SWSB::insertTestInstruction(G4_BB* bb, INST_LIST_ITER nextIter, int CISAOff, int lineNo, bool countSync)
3153 {
3154 G4_INST* nopInst = fg.builder->createNop(InstOpt_NoOpt);
3155 bb->insertBefore(nextIter, nopInst);
3156 if (countSync)
3157 {
3158 syncInstCount++;
3159 }
3160
3161 return nopInst;
3162 }
3163
3164 G4_INST* SWSB::insertSyncAllRDInstruction(G4_BB* bb, unsigned int SBIDs, INST_LIST_ITER nextIter, int CISAOff, int lineNo)
3165 {
3166 G4_INST* syncInst;
3167 if (SBIDs)
3168 {
3169 G4_Imm* src0 = fg.builder->createImm(SBIDs, Type_UD);
3170 syncInst = fg.builder->createSync(G4_sync_allrd, src0);
3171 ARSyncInstCount++;
3172 }
3173 else
3174 {
3175 G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3176 syncInst = fg.builder->createSync(G4_sync_allrd, src0);
3177 ARSyncAllCount++;
3178 }
3179 bb->insertBefore(nextIter, syncInst);
3180
3181 return syncInst;
3182 }
3183
3184 G4_INST* SWSB::insertSyncAllWRInstruction(G4_BB* bb, unsigned int SBIDs, INST_LIST_ITER nextIter, int CISAOff, int lineNo)
3185 {
3186 G4_INST* syncInst;
3187 if (SBIDs)
3188 {
3189 G4_Imm* src0 = fg.builder->createImm(SBIDs, Type_UD);
3190 syncInst = fg.builder->createSync(G4_sync_allwr, src0);
3191 AWSyncInstCount++;
3192 }
3193 else
3194 {
3195 G4_SrcRegRegion* src0 = fg.builder->createNullSrc(Type_UD);
3196 syncInst = fg.builder->createSync(G4_sync_allwr, src0);
3197 AWSyncAllCount++;
3198 }
3199 bb->insertBefore(nextIter, syncInst);
3200
3201 return syncInst;
3202 }
3203
3204 bool SWSB::insertSyncToken(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens, bool& keepDst, bool removeAllToken)
3205 {
3206 //A non-test instruction can only have
3207 // 1. non-send: one Dst token with distance, or
3208 // 2. send: distance only, or
3209 // 3. one Dst token, or
3210 // 4. one Src token
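//Illustrative example (hypothetical tokens): if removeAllToken is false and the node carries
//{$1.dst, $2.dst, $3.dst, $5.src} on an instruction with regDist, then $1.dst stays on the
//instruction (the only combination the encoding allows), $2.dst and $3.dst are merged into a single
//sync.allwr 0xC, and the src dependence becomes a separate sync.nop {$5.src}.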
3211 unsigned short dst = 0;
3212 unsigned short src = 0;
3213 std::vector<std::pair<unsigned short, unsigned>> dst_loc;
3214 std::vector<std::pair<unsigned short, unsigned>> src_loc;
3215
3216 bool multipleDst = false;
3217 bool multipleSrc = false;
3218 unsigned short token = (unsigned short)-1;
3219 unsigned short dstToken = (unsigned short)-1;
3220 unsigned short srcToken = (unsigned short)-1;
3221 SWSBTokenType type = G4_INST::SWSBTokenType::TOKEN_NONE;
3222 bool insertedSync = false;
3223
3224 for (unsigned int i = 0; i < node->getDepTokenNum();)
3225 {
3226 G4_INST* synAllInst = nullptr;
3227 token = node->getDepToken(i, type);
3228 unsigned depNodeID = node->getDepTokenNodeID(i);
3229 unsigned short bitToken = (unsigned short)(1 << token);
3230 assert(token != (unsigned short)UNKNOWN_TOKEN);
3231
3232 switch (type)
3233 {
3234 case SWSBTokenType::AFTER_WRITE:
3235 case SWSBTokenType::AFTER_READ:
3236 {
3237 if (dstTokens->isSet(token) || (type == SWSBTokenType::AFTER_READ && srcTokens->isSet(token)))
3238 {
3239 //Do BB-level clean up,
3240 //so that there is no redundant sync like the following:
3241 // sync.nop {$1.src}
3242 // sync.nop {$1.src}
3243 // or
3244 // sync.nop {$1.dst}
3245 // sync.nop {$1.src}
3246 // or
3247 // mov {$1.dst}
3248 // add {$1.src}
3249 node->eraseDepToken(i);
3250 continue;
3251 }
3252 else
3253 {
3254 if (!tokenHonourInstruction(inst) && //For send and math, no dependent token
3255 !removeAllToken &&
3256 !keepDst && //No dst token kept yet
3257 (!inst->getDistance() || //There is no distance dependence
3258 type == SWSBTokenType::AFTER_WRITE)) //Or only the dst token can be kept
3259 //FIXME: for token-honour instructions, we don't support mem-dst-only or mem-src-only modes.
3260 // To support these two modes, the precondition is that the current instruction has no SBID.
3261 {
3262 //Token is kept in original instruction
3263 keepDst = true;
3264 inst->setToken(token);
3265 inst->setTokenType(type);
3266 inst->setTokenLoc(token, depNodeID);
3267 token = (unsigned short)UNKNOWN_TOKEN;
3268 i++;
3269 continue;
3270 }
3271
3272 if (type == SWSBTokenType::AFTER_READ)
3273 {
3274 src |= bitToken;
3275 src_loc.push_back(std::make_pair(token, depNodeID));
3276 if (!multipleSrc && (src & ~bitToken))
3277 {
3278 multipleSrc = true;
3279 }
3280 srcToken = token;
3281 srcTokens->set(token, true);
3282 }
3283 else
3284 {
3285 assert(type == SWSBTokenType::AFTER_WRITE);
3286 dst |= bitToken;
3287 dst_loc.push_back(std::make_pair(token, depNodeID));
3288 if (!multipleDst && (dst & ~bitToken))
3289 {
3290 multipleDst = true;
3291 }
3292 dstToken = token;
3293 dstTokens->set(token, true);
3294 }
3295
3296 node->eraseDepToken(i);
3297 continue;
3298 }
3299 }
3300 break;
3301 case SWSBTokenType::READ_ALL:
3302 {
3303 assert(token == (unsigned short)UNKNOWN_TOKEN);
3304 node->eraseDepToken(i);
3305 synAllInst = insertSyncAllRDInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3306 synAllInst->setLexicalId(newInstID);
3307 i++;
3308 continue;
3309 }
3310 break;
3311 case SWSBTokenType::WRITE_ALL:
3312 {
3313 assert(token == (unsigned short)UNKNOWN_TOKEN);
3314 node->eraseDepToken(i);
3315 synAllInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3316 synAllInst->setLexicalId(newInstID);
3317 i++;
3318 continue;
3319 }
3320 break;
3321 default:
3322 assert(0);
3323 break;
3324 }
3325 i++;
3326 }
3327
3328 G4_INST* synInst;
3329 if (dst)
3330 {
3331 if (dst == 0xFFFF)
3332 {
3333 synInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3334 }
3335 else if (multipleDst)
3336 {
3337 synInst = insertSyncAllWRInstruction(bb, dst, inst_it, inst->getCISAOff(), inst->getLineNo());
3338 }
3339 else
3340 {
3341 synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3342 synInst->setToken(dstToken);
3343 synInst->setTokenType(SWSBTokenType::AFTER_WRITE);
3344 }
3345 synInst->setLexicalId(newInstID);
3346 insertedSync = true;
3347 for (auto loc:dst_loc)
3348 {
3349 synInst->setTokenLoc(loc.first, loc.second);
3350 }
3351 }
3352
3353 if (src)
3354 {
3355 if (src == 0xFFFF)
3356 {
3357 synInst = insertSyncAllRDInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3358 }
3359 else if (multipleSrc)
3360 {
3361 synInst = insertSyncAllRDInstruction(bb, src, inst_it, inst->getCISAOff(), inst->getLineNo());
3362 }
3363 else
3364 {
3365 synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3366 synInst->setToken(srcToken);
3367 synInst->setTokenType(SWSBTokenType::AFTER_READ);
3368 }
3369 synInst->setLexicalId(newInstID);
3370 insertedSync = true;
3371 for (auto loc:src_loc)
3372 {
3373 synInst->setTokenLoc(loc.first, loc.second);
3374 }
3375 }
3376
3377 return insertedSync;
3378 }
3379
3380 /*
3381 * For Xe, sync can be used for distance and token at the same time.
3382 * The encoding limitations for instruction-attached dependence info:
3383 * a. Has to be attached to the instruction
3384 * 1. memSet
3385 * b. Others
3386 * 1. Only regDist can be used when there is a memSet for DPAS/math
3387 * 2. Only regDistAll can be used when there is a memSet for send
3388 * 3. Only regDist can be used when there is a mem.dst for ALU instructions
3389 * c. To be consistent with the previous version (TGLLP)
3390 * 1. Try to attach the distance to the original instruction.
3391 * 2. The only exception is the memSet for out-of-order instructions
3392 *
3393 * SWSB format - non DPAS/send/math (in-order)
3394 * 7 6 5 4 3 2 1 0
3395 * 0 0 0 0 0 0 0 0
3396 * 0 0 0 0 0 regDist
3397 * 0 0 0 0 1 regDistAll
3398 * 0 0 0 1 0 regDistFloat
3399 * 0 0 0 1 1 regDistInt
3400 * 0 0 1 0 memSBid dst
3401 * 0 0 1 1 memSBid src
3402 * 0 1 0 0 R R R R
3403 * 0 1 0 1 R regDistLong
3404 * 0 1 1 R R R R R
3405 * 1 regDist memSBid dst
3406 *
3407 * SWSB format - DPAS/math (out-of-order)
3408 * 0 0 0 0 0 0 0 0
3409 * 0 0 0 0 0 regDist
3410 * 0 0 0 0 1 regDistAll
3411 * 0 0 0 1 0 regDistFloat
3412 * 0 0 0 1 1 regDistInt
3413 * 0 0 1 0 memSBid dst
3414 * 0 0 1 1 memSBid src
3415 * 0 1 0 0 memSBid set
3416 * 0 1 0 1 R regDistLong
3417 * 0 1 1 R R R R R
3418 * 1 regDist memSBid set
3419 *
3420 * SWSB format -send (out-of-order)
3421 * 0 0 0 0 0 0 0 0
3422 * 0 0 0 0 0 regDist
3423 * 0 0 0 0 1 regDistAll
3424 * 0 0 0 1 0 regDistFloat
3425 * 0 0 0 1 1 regDistInt
3426 * 0 0 1 0 memSBid dst
3427 * 0 0 1 1 memSBid src
3428 * 0 1 0 0 memSBid set
3429 * 0 1 0 1 R regDistLong
3430 * 0 1 1 R R R R R
3431 * 1 regDistAll memSBid set
3432 */
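//Illustrative example (hypothetical): a send carrying an int-pipe distance (regDistInt) and setting $3
//cannot encode both, since a send may only combine regDistAll with an SBID set; the distance is peeled
//off into a preceding sync.nop and the send keeps only {$3}. Dependence tokens of out-of-order
//instructions are always moved to separate sync instructions (insertSyncToken with removeAllToken true).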
3433 bool SWSB::insertSyncXe(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens)
3434 {
3435 G4_INST::DistanceType distType = node->GetInstruction()->getDistanceTypeXe();
3436 bool insertedSync = false;
3437 bool keepDst = false;
3438 bool isCloseALUType = node->GetInstruction()->isClosestALUType();
3439
3440 if (tokenHonourInstruction(inst))
3441 {
3442 //regDist $.set
3443 if (inst->isDpas())
3444 {
3445 if (distType != G4_INST::DistanceType::DIST_NONE &&
3446 distType != G4_INST::DistanceType::DIST)
3447 {
3448 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3449 synInst->setDistance(inst->getDistance());
3450 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3451 inst->setDistance(0);
3452 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3453 insertedSync = true;
3454 }
3455 }
3456
3457 if (inst->isMathPipeInst())
3458 {
3459 if (isCloseALUType && distType != G4_INST::DistanceType::DIST_NONE)
3460 {
3461 node->GetInstruction()->setDistanceTypeXe(G4_INST::DistanceType::DIST);
3462 distType = G4_INST::DistanceType::DIST;
3463 }
3464 if (distType != G4_INST::DistanceType::DIST_NONE &&
3465 distType != G4_INST::DistanceType::DIST)
3466 {
3467 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3468 synInst->setDistance(inst->getDistance());
3469 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3470 inst->setDistance(0);
3471 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3472 insertedSync = true;
3473 }
3474 }
3475
3476 // regDistAll $.set
3477 if (inst->isSend())
3478 {
3479 if (isCloseALUType && distType != G4_INST::DistanceType::DIST_NONE && (*inst_it) == inst)
3480 {
3481 node->GetInstruction()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
3482 distType = G4_INST::DistanceType::DISTALL;
3483 }
3484 if ((distType != G4_INST::DistanceType::DIST_NONE &&
3485 distType != G4_INST::DistanceType::DISTALL) || ((*inst_it) != inst))
3486 {
3487 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3488 synInst->setDistance(inst->getDistance());
3489 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3490 inst->setDistance(0);
3491 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3492 insertedSync = true;
3493 }
3494 }
3495 //For out-of-order instructions, all dependence tokens are moved out to sync instructions
3496 insertedSync |= insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, true);
3497 }
3498 else
3499 {
3500 // regDist $.dst
3501 //For in-order instructions, try to keep the distance in the original instruction
3502 if (distType == G4_INST::DistanceType::DIST ||
3503 distType == G4_INST::DistanceType::DIST_NONE)
3504 {
3505 insertedSync = insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, false);
3506 }
3507 else
3508 {
3509 //Move all token dependence out
3510 insertedSync = insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, true);
3511 }
3512 }
3513
3514 return insertedSync;
3515 }
3516
3517 //For dpas/dpasw instructions
3518 // RegDist SBID.set
3519 // RegDist SBID.src
3520 // RegDist SBID.dst
3521 //For send instruction
3522 // RegDistAll SBID.set
3523 // RegDistFloat SBID.set
3524 // RegDistInt SBID.set
3525 //For non-send / non-dpas/dpasw instructions
3526 // RegDist SBID.dst
3527 // RegDist SBID.src
3528 // RegDistAll SBID.dst
3529 bool SWSB::insertSyncTokenPVC(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens, bool removeAllToken)
3530 {
3531 //SBID.set > SBID.dst > SBID.src
3532 unsigned int dst = 0;
3533 unsigned int src = 0;
3534 bool keepDst = false;
3535 bool multipleDst = false;
3536 bool multipleSrc = false;
3537 unsigned short token = (unsigned short)-1;
3538 unsigned short dstToken = (unsigned short)-1;
3539 unsigned short srcToken = (unsigned short)-1;
3540 std::vector<std::pair<unsigned short, unsigned>> dst_loc;
3541 std::vector<std::pair<unsigned short, unsigned>> src_loc;
3542 SWSBTokenType type = G4_INST::SWSBTokenType::TOKEN_NONE;
3543 bool insertedSync = false;
3544
3545 for (unsigned int i = 0; i < node->getDepTokenNum();)
3546 {
3547 token = node->getDepToken(i, type);
3548 unsigned depNodeID = node->getDepTokenNodeID(i);
3549 unsigned int bitToken = (unsigned int)(1 << token);
3550 assert(token != (unsigned short)UNKNOWN_TOKEN);
3551
3552 switch (type)
3553 {
3554 case SWSBTokenType::AFTER_WRITE:
3555 {
3556 if (dstTokens->isSet(token))
3557 {
3558 //Do BB-level clean up,
3559 //so that there is no redundant sync like the following:
3560 // sync.nop {$1.src}
3561 // sync.nop {$1.src}
3562 // or
3563 // sync.nop {$1.dst}
3564 // sync.nop {$1.src}
3565 // or
3566 // mov {$1.dst}
3567 // add {$1.src}
3568 node->eraseDepToken(i);
3569 continue;
3570 }
3571 else
3572 {
3573 if (!removeAllToken && //No set one marked.
3574 !keepDst) //No dst one kept yet
3575 {
3576 //Token is kept in original instruction
3577 keepDst = true;
3578 inst->setToken(token);
3579 inst->setTokenType(SWSBTokenType::AFTER_WRITE);
3580 inst->setTokenLoc(token, depNodeID);
3581 token = (unsigned short)UNKNOWN_TOKEN;
3582 i++;
3583 continue;
3584 }
3585
3586 dst |= bitToken;
3587 dst_loc.push_back(std::make_pair(token, depNodeID));
3588 if (!multipleDst && (dst & ~bitToken))
3589 {
3590 multipleDst = true;
3591 }
3592 dstToken = token;
3593 dstTokens->set(token, true);
3594
3595 node->eraseDepToken(i);
3596 continue;
3597 }
3598 }
3599 break;
3600 default:
3601 assert(type == SWSBTokenType::AFTER_READ && "Wrong dependence type");
3602 break;
3603 }
3604 i++;
3605 }
3606
3607 bool keepSrc = false;
3608 for (unsigned int i = 0; i < node->getDepTokenNum();)
3609 {
3610 token = node->getDepToken(i, type);
3611 unsigned depNodeID = node->getDepTokenNodeID(i);
3612 unsigned int bitToken = (unsigned int)(1 << token);
3613 assert(token != (unsigned short)UNKNOWN_TOKEN);
3614
3615 switch (type)
3616 {
3617 case SWSBTokenType::AFTER_READ:
3618 {
3619 if (dstTokens->isSet(token) || (type == SWSBTokenType::AFTER_READ && srcTokens->isSet(token)))
3620 {
3621 node->eraseDepToken(i);
3622 continue;
3623 }
3624 else
3625 {
3626 if (!removeAllToken &&
3627 !keepDst &&
3628 !keepSrc)
3629 {
3630 //Token is kept in original instruction
3631 keepSrc = true;
3632 inst->setToken(token);
3633 inst->setTokenType(SWSBTokenType::AFTER_READ);
3634 inst->setTokenLoc(token, depNodeID);
3635 token = (unsigned short)UNKNOWN_TOKEN;
3636 i++;
3637 continue;
3638 }
3639 src |= bitToken;
3640 src_loc.push_back(std::make_pair(token, depNodeID));
3641 if (!multipleSrc && (src & ~bitToken))
3642 {
3643 multipleSrc = true;
3644 }
3645 srcToken = token;
3646 srcTokens->set(token, true);
3647
3648 node->eraseDepToken(i);
3649 continue;
3650 }
3651 }
3652 break;
3653 default:
3654 assert(type == SWSBTokenType::AFTER_WRITE && "Wrong dependence type");
3655 break;
3656 }
3657 i++;
3658 }
3659
3660 G4_INST* synInst;
3661
3662 if (dst)
3663 {
3664 if (dst == 0xFFFFFFFF)
3665 {
3666 synInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3667 }
3668 else if (multipleDst)
3669 {
3670 synInst = insertSyncAllWRInstruction(bb, dst, inst_it, inst->getCISAOff(), inst->getLineNo());
3671 }
3672 else
3673 {
3674 synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3675 synInst->setToken(dstToken);
3676 synInst->setTokenType(SWSBTokenType::AFTER_WRITE);
3677 }
3678 synInst->setLexicalId(newInstID);
3679 for (auto loc:dst_loc)
3680 {
3681 synInst->setTokenLoc(loc.first, loc.second);
3682 }
3683 insertedSync = true;
3684 }
3685
3686 if (src)
3687 {
3688 if (src == 0xFFFFFFFF)
3689 {
3690 synInst = insertSyncAllRDInstruction(bb, 0, inst_it, inst->getCISAOff(), inst->getLineNo());
3691 }
3692 else if (multipleSrc)
3693 {
3694 synInst = insertSyncAllRDInstruction(bb, src, inst_it, inst->getCISAOff(), inst->getLineNo());
3695 }
3696 else
3697 {
3698 synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3699 synInst->setToken(srcToken);
3700 synInst->setTokenType(SWSBTokenType::AFTER_READ);
3701 }
3702 synInst->setLexicalId(newInstID);
3703 for (auto loc:src_loc)
3704 {
3705 synInst->setTokenLoc(loc.first, loc.second);
3706 }
3707 insertedSync = true;
3708 }
3709
3710 return insertedSync;
3711 }
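//Illustrative sketch of the splitting done above (assumed tokens, schematic syntax):
//an out-of-order instruction keeps at most one dependence token on itself, the rest
//are moved to sync instructions in front of it, e.g.
//   sync.nop   null {$2.dst}             // single extra AFTER_WRITE token
//   sync.allrd null 0x28                 // multiple AFTER_READ tokens $3/$5 folded into a mask
//   add (8) r10.0<1>:d ...  {$1.dst}     // one token kept on the original instruction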
3712
3713 //If it depends on multiple different ALU pipelines
3714 // If all operand types match the ALU pipelines --> regDist
3715 // otherwise --> regDistAll
3716 //If it depends on a single different ALU pipeline and other same ALU pipelines
3717 // If all operand types match the ALU pipelines --> regDist
3718 // otherwise --> regDistAll
3719 //If it depends on multiple same ALU pipelines
3720 // If all operand types match the ALU pipeline --> accurate/regDist
3721 // otherwise --> accurate
3722 //If it depends on a single ALU pipeline
3723 // If the operand types match the ALU pipeline --> accurate/regDist
3724 // otherwise --> accurate
3725 //
3726 //Note that:
3727 // 1. one instruction can have multiple operands.
3728 // 2. an instruction belongs to a single pipeline
3729 //Combo:
3730 //For dpas/dpasw instructions
3731 // RegDist SBID.set
3732 // RegDist SBID.src
3733 // RegDist SBID.dst
3734 //For send instruction
3735 // RegDistAll SBID.set
3736 // RegDistFloat SBID.set
3737 // RegDistInt SBID.set
3738 //For non-send / non-dpas/dpasw instructions
3739 // RegDist SBID.dst
3740 // RegDist SBID.src
3741 // RegDistAll SBID.dst
3742 bool SWSB::insertSyncPVC(G4_BB * bb, SBNode * node, G4_INST * inst, INST_LIST_ITER inst_it, int newInstID, BitSet * dstTokens, BitSet * srcTokens)
3743 {
3744 G4_INST::DistanceType distType = node->GetInstruction()->getDistanceTypeXe();
3745 bool operandTypeIndicated = node->GetInstruction()->isOperandTypeIndicated();
3746 bool insertedSync = false;
3747
3748 if (tokenHonourInstruction(inst))
3749 {
3750 if (inst->getDistance())
3751 {
3752 //For dpas/dpasw instructions
3753 // RegDist SBID.set
3754 // RegDist SBID.src
3755 // RegDist SBID.dst
3756 if (inst->isDpas() ||
3757 inst->isMathPipeInst()) //math will be filtered out by tokenHonourInstruction on PVC
3758 {
3759 if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN ||
3760 node->getDepTokenNum())
3761 {
3762 if (!operandTypeIndicated)
3763 {
3764 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3765 synInst->setDistance(inst->getDistance());
3766 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3767 inst->setDistance(0);
3768 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3769 insertedSync = true;
3770 }
3771 else if (inst->getDistanceTypeXe() != G4_INST::DistanceType::DIST &&
3772 inst->getDistanceTypeXe() != G4_INST::DistanceType::DISTALL)
3773 {
3774 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST);
3775 }
3776 }
3777 }
3778
3779 //For send instruction
3780 // RegDistAll SBID.set
3781 // RegDistFloat SBID.set
3782 // RegDistInt SBID.set
3783 if (inst->isSend())
3784 {
3785 if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
3786 { //SBID.set > SBID.dst > SBID.src > distance
3787 if (!(distType == G4_INST::DistanceType::DISTALL ||
3788 distType == G4_INST::DistanceType::DISTINT ||
3789 distType == G4_INST::DistanceType::DISTFLOAT) || (inst != (*inst_it)))
3790 {
3791 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3792 synInst->setDistance(inst->getDistance());
3793 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3794 inst->setDistance(0);
3795 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3796 insertedSync = true;
3797 }
3798 }
3799 else if (node->getDepTokenNum()) //Keep only the SBID deps in the instruction
3800 {
3801 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3802 synInst->setDistance(inst->getDistance());
3803 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3804 inst->setDistance(0);
3805 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3806 insertedSync = true;
3807 }
3808 }
3809 }
3810 }
3811 else
3812 {
3813 //For non-send / non-dpas/dpasw instructions
3814 // RegDist SBID.dst
3815 // RegDist SBID.src
3816 // RegDistAll SBID.dst
3817 if (inst->getDistance())
3818 {
3819 if (inst->opcode() == G4_mad && inst->hasNoACCSBSet())
3820 {
3821 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3822 synInst->setDistance(inst->getDistance());
3823 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3824 inst->setDistance(0);
3825 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3826 insertedSync = true;
3827 }
3828 else if (node->getDepTokenNum()) //Keep only the SBID deps in the instruction
3829 {
3830 if (!operandTypeIndicated && distType != G4_INST::DistanceType::DISTALL)
3831 {
3832 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3833 synInst->setDistance(inst->getDistance());
3834 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3835 inst->setDistance(0);
3836 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3837 insertedSync = true;
3838 }
3839
3840 if (operandTypeIndicated && distType != G4_INST::DistanceType::DIST && distType != G4_INST::DistanceType::DISTALL)
3841 {
3842 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST);
3843 }
3844
3845 if (distType == G4_INST::DistanceType::DISTALL)
3846 {
3847 bool hasAfterWrite = false;
3848 for (int i = 0; i < (int)node->getDepTokenNum(); i++)
3849 {
3850 unsigned short token = (unsigned short)-1;
3851 SWSBTokenType type = SWSBTokenType::TOKEN_NONE;
3852 token = node->getDepToken(i, type);
3853 if (type == SWSBTokenType::AFTER_WRITE)
3854 {
3855 hasAfterWrite = true;
3856 }
3857 }
3858 if (!hasAfterWrite)
3859 {
3860 G4_INST* synInst = insertSyncInstruction(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3861 synInst->setDistance(inst->getDistance());
3862 synInst->setDistanceTypeXe(inst->getDistanceTypeXe());
3863 inst->setDistance(0);
3864 inst->setDistanceTypeXe(G4_INST::DistanceType::DIST_NONE);
3865 insertedSync = true;
3866 }
3867 }
3868 }
3869 }
3870 }
3871
3872 bool removeAllTokenDep = (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN);
3873 removeAllTokenDep = removeAllTokenDep || (inst->opcode() == G4_mad && inst->hasNoACCSBSet());
3874 //For out-of-order instruction, all dependence token will be moved out to sync
3875 insertedSync |= insertSyncTokenPVC(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, removeAllTokenDep);
3876
3877 return insertedSync;
3878 }
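//Hypothetical PVC example of the send rule above (assumed token/distance values,
//schematic syntax): a send may keep its SBID.set only together with an A@/I@/F@
//distance; any other distance form is first split out to a sync.nop:
//   sync.nop (1) null {@2}     // plain regDist moved off the send
//   send (8) ...      {$4}     // send keeps only the SBID.set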
3879
3880 void SWSB::insertSync(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens)
3881 {
3882 //Handle the instruction that follows a distance-1 architecture register instruction.
3883 bool insertedSync = false;
3884 bool keepDst = false;
3885 INST_LIST_ITER prevIt = inst_it;
3886 if (node->followDistOneAreg())
3887 {
3888 prevIt--;
3889 }
3890
3891 //Architecture register instruction
3892 bool hasValidNextInst = false;
3893 if (node->hasDistOneAreg())
3894 {
3895 INST_LIST_ITER nextIt = inst_it;
3896 nextIt++;
3897 if (nextIt != bb->end())
3898 {
3899 G4_INST *nextInst = *nextIt;
3900 if (tokenHonourInstruction(nextInst) ||
3901 distanceHonourInstruction(nextInst))
3902 {
3903 hasValidNextInst = true;
3904 }
3905 }
3906 }
3907
3908 if (fg.builder->hasFourALUPipes()) //PVC
3909 {
3910 insertedSync = insertSyncPVC(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens);
3911 }
3912 else if (fg.builder->hasThreeALUPipes()) //XeHP_SDV
3913 {
3914 insertedSync = insertSyncXe(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens);
3915 }
3916 else //TGLLP
3917 {
3918 insertedSync = insertSyncToken(bb, node, inst, inst_it, newInstID, dstTokens, srcTokens, keepDst, false);
3919 }
3920
3921 if (node->followDistOneAreg() && insertedSync)
3922 {
3923 G4_INST* syncInst = insertSyncInstructionAfter(bb, prevIt, inst->getCISAOff(), inst->getLineNo());
3924 syncInst->setDistance(1);
3925 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
3926 {
3927 syncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
3928 }
3929 }
3930
3931 if (node->hasDistOneAreg() && !hasValidNextInst)
3932 {
3933 G4_INST* syncInst = insertSyncInstructionAfter(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
3934 syncInst->setDistance(1);
3935 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
3936 {
3937 syncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
3938 }
3939 }
3940 }
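//A hypothetical sketch of the distance-1 architecture register handling above
//(assumed instructions and tokens; three/four ALU pipe platforms shown):
//   mov (1)  cr0.0<1>:ud ...     // architecture register write
//   sync.nop null {A@1}          // re-inserted so the dist-1 guarantee still holds
//   sync.nop null {$1.dst}       // sync generated for the following instruction
//   add (8)  ...                 // the instruction that followed the cr0 write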
3941
3942 //
3943 // Insert the test instruction according to the token assignment result. Re-assign the node id.
3944 // Except for the test instruction, one instruction can have at most one token.
3945 // SWSB format - non send
3946 // 7 6 5 4 3 2 1 0
3947 // 0 0 0 0 0 0 0 0 No dependency
3948 // 0 0 0 0 regDist dst Reg only dep (1-15)
3949 // 0 0 0 1 R R R R Reserved
3950 // 0 0 1 0 R memSBid dst Memory dst only dep (0-7)
3951 // 0 0 1 1 R memSBid src Memory src only dep (0-7)
3952 // 0 1 R R R R R R Reserved for Future extensions
3953 // 1 memSBid dst regDist dst Reg and Memory dst dep
3954 //
3955 // SWSB format - send
3956 // 0 0 0 0 0 0 0 0 No dependency
3957 // 0 0 0 0 regDist dst Reg only dep (1-15)
3958 // 0 0 0 1 R memSBid set SBid allocation only (0-7)
3959 // 0 0 1 0 R memSBid dst Memory dst only dep (0-7)
3960 // 0 0 1 1 R memSBid src Memory src only dep (0-7)
3961 // 0 1 R R R R R R Reserved for Future extensions
3962 // 1 memSBid set regDist dst SBid allocation and Reg only dep (1-15)
3963 //
3964 // 8bits [7:0] 8bits [15:8] 4bits [27:24] 1 bit [29] 1bit [30] 16bits [47:32]
3965 // test = 0x70 SWSB subOpcode CmptCtrl = 1 DebugCtrl
3966 // 0000 - Only SWSB check
3967 // 0001 - Check Send status 2bits x 8 Sbid
3968 // 00 - SBid not checked
3969 // 01 - reserved
3970 // 10 - Check for data sent out
3971 // 11 - Check for data received
3972 // 0010 - Check Address Register Dep 1bits x 16 address registers
3973 // 0 - Not checked
3974 // 1 - Check for Register dependency
3975 // others - Reserved
3976 //
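// A worked reading of the non-send table above (assumed values, reserved bits as 0):
//   regDist = 2 only                 -> 0b0000'0010 = 0x02
//   memSBid dst = $3 only            -> 0b0010'0011 = 0x23
//   memSBid dst = $3 + regDist = 2   -> 0b1011'0010 = 0xB2 (bit7 = 1, bits[6:4] = SBID, bits[3:0] = regDist)
//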
3977 void SWSB::insertTest()
3978 {
3979 SBNODE_VECT_ITER node_it = SBNodes.begin();
3980 int newInstID = 0;
3981
3982 for (G4_BB* bb : fg)
3983 {
3984 BitSet dstTokens(totalTokenNum, false);
3985 BitSet srcTokens(totalTokenNum, false);
3986
3987 std::list<G4_INST*>::iterator inst_it(bb->begin()), iInstNext(bb->begin());
3988 while (iInstNext != bb->end())
3989 {
3990 inst_it = iInstNext;
3991 iInstNext++;
3992 G4_INST* inst = *inst_it;
3993
3994 if (inst->isLabel())
3995 {
3996 continue;
3997 }
3998
3999 SBNode* node = *node_it;
4000 assert(node->GetInstruction() == inst);
4001
4002 bool fusedSync = false;
4003 //HW W/A
4004 //For fused URB sends or typed writes, HW cannot decode the dependence info of the second send instruction.
4005 //Software must check and promote it to before the first instruction.
4006 //If the second one is EOT instruction, syncAll is required.
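//A hypothetical sketch of the resulting code for the non-EOT case (assumed tokens,
//schematic syntax): the dependences of the second send, including the wait on its
//own SBID, are promoted to before the first (atomic) send:
//   sync.nop null {$2.dst}     // promoted dependence of the second send
//   send (8) ... {$1}          // first send of the fused pair
//   send (8) ... {$2}          // second send; HW cannot decode its own SWSB info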
4007 if (inst->isSend() &&
4008 inst->isAtomicInst())
4009 {
4010 INST_LIST_ITER tmp_it = inst_it;
4011 tmp_it++;
4012 if (tmp_it != bb->end())
4013 {
4014 const G4_INST* nextInst = *tmp_it;
4015
4016 if (nextInst->isSend())
4017 {
4018 G4_INST* synInst = nullptr;
4019 if (nextInst->isEOT())
4020 {
4021 //If the second is EOT, sync all can be inserted directly, because EOT has no token info
4022 synInst = insertSyncAllWRInstruction(bb, 0, inst_it, inst->getCISAOff(), nextInst->getLineNo());
4023 synInst->setLexicalId(newInstID);
4024 }
4025 else
4026 {
4027 fusedSync = true;
4028 if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4029 {
4030 dstTokens.set(inst->getSetToken(), false);
4031 srcTokens.set(inst->getSetToken(), false);
4032 }
4033 }
4034 }
4035 }
4036 }
4037 else if ((kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()) && inst->isSend() && inst->getDst())
4038 {
4039 //Stack call is using the NOMASK save and restore.
4040 //This means there will be RAW dependence generated along the SIMD control flow.
4041 //Such as in following case, {$1.dst} is required.
4042 //if()
4043 //{
4044 // ...
4045 // R1 --> save();
4046 // Fcall_0
4047 // R1 <-- restore(); {$1}
4048 // ...
4049 //}
4050 //else
4051 //{
4052 // ...
4053 // R1 --> save() {$1.dst}
4054 // Fcall_1
4055 // R1 <-- restore();
4056 // ...
4057 //}
4058 //RAW dependence tracking in SWSB is scalar-control-flow based, because traditional RA will not generate this kind of dependence.
4059 //At the same time, since we handle SWSB for stack calls conservatively, we can handle this dependence specially.
4060 G4_Declare *dstDcl = GetTopDclFromRegRegion((G4_DstRegRegion *)inst->getDst());
4061 if (std::find(kernel.callerRestoreDecls.begin(), kernel.callerRestoreDecls.end(), dstDcl) != kernel.callerRestoreDecls.end())
4062 {
4063 G4_INST* syncInst = insertSyncInstructionAfter(bb, inst_it, inst->getCISAOff(), inst->getLineNo());
4064 unsigned short dstToken = (unsigned short)-1;
4065 dstToken = node->getLastInstruction()->getSetToken();
4066 syncInst->setToken(dstToken);
4067 syncInst->setTokenType(SWSBTokenType::AFTER_WRITE);
4068 }
4069 }
4070 if (fusedSync)
4071 {
4072 insertSync(bb, node, inst, inst_it, newInstID, &dstTokens, &srcTokens);
4073 inst->setLexicalId(newInstID);
4074 newInstID++;
4075
4076 INST_LIST_ITER tmp_it = inst_it;
4077 inst_it++;
4078 iInstNext++;
4079 node_it++;
4080 inst = *inst_it;
4081 node = *node_it;
4082 if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4083 {
4084 dstTokens.set(inst->getSetToken(), false);
4085 srcTokens.set(inst->getSetToken(), false);
4086 }
4087 //tmp_it keeps the position at which newly generated instructions are inserted.
4088 insertSync(bb, node, inst, tmp_it, newInstID, &dstTokens, &srcTokens);
4089 unsigned short token = inst->getSetToken();
4090 if (token != (unsigned short)UNKNOWN_TOKEN)
4091 {
4092 G4_INST* synInst = insertSyncInstruction(bb, tmp_it, inst->getCISAOff(), inst->getLineNo());
4093 synInst->setToken(token);
4094 synInst->setTokenType(SWSBTokenType::AFTER_WRITE);
4095 synInst->setLexicalId(newInstID);
4096 }
4097 }
4098 else
4099 {
4100 insertSync(bb, node, inst, inst_it, newInstID, &dstTokens, &srcTokens);
4101 }
4102
4103 if (inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4104 {
4105 dstTokens.set(inst->getSetToken(), false);
4106 srcTokens.set(inst->getSetToken(), false);
4107 }
4108
4109 inst->setLexicalId(newInstID);
4110 for (unsigned i = 1; i < node->instVec.size(); i++)
4111 {
4112 inst = *iInstNext;
4113 inst->setLexicalId(newInstID);
4114 iInstNext++;
4115 }
4116
4117 if (tokenHonourInstruction(inst) && inst->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4118 {
4119 dstTokens.set(inst->getSetToken(), false);
4120 srcTokens.set(inst->getSetToken(), false);
4121 }
4122
4123 newInstID++;
4124 node_it++;
4125 }
4126 }
4127
4128 tokenProfile.setSyncInstCount(syncInstCount);
4129 tokenProfile.setMathReuseCount(mathReuseCount);
4130 tokenProfile.setAWSyncInstCount(AWSyncInstCount);
4131 tokenProfile.setARSyncInstCount(ARSyncInstCount);
4132 tokenProfile.setAWSyncAllCount(AWSyncAllCount);
4133 tokenProfile.setARSyncAllCount(ARSyncAllCount);
4134 tokenProfile.setTokenReuseCount(tokenReuseCount);
4135 }
4136
4137 void SWSB::dumpDepInfo() const
4138 {
4139 for (const SBNode* node : SBNodes)
4140 {
4141 if (node->GetInstruction()->isEOT())
4142 {
4143 continue;
4144 }
4145
4146 const G4_INST* inst = node->GetInstruction();
4147 std::cerr << node->getNodeID() << ":\t";
4148 inst->dump();
4149 std::cerr << "Succs:";
4150 for (const SBDEP_ITEM& curSucc : node->succs)
4151 {
4152 std::cerr << curSucc.node->getNodeID() << ":" << ((curSucc.attr == DEP_EXPLICT) ? "E" : "I") << ", ";
4153 if (curSucc.type == RAW || curSucc.type == WAW)
4154 {
4155 std::cerr << "AW;";
4156 }
4157 else
4158 {
4159 std::cerr << "AR;";
4160 }
4161 }
4162 std::cerr << "\n";
4163 std::cerr << "Preds:";
4164 for (const SBDEP_ITEM& curPred : node->preds)
4165 {
4166 std::cerr << curPred.node->getNodeID() << ":" << ((curPred.attr == DEP_EXPLICT) ? "E" : "I") << ", ";
4167 }
4168 std::cerr << "\n\n";
4169 }
4170 }
4171
4172 void SWSB::dumpLiveIntervals() const
4173 {
4174 std::cerr << "Internal:" << "\n";
4175 for (const SBNode* node : SBSendNodes)
4176 {
4177 if (node->GetInstruction()->isEOT())
4178 {
4179 continue;
4180 }
4181 node->dumpInterval();
4182 }
4183 }
4184
4185 void SWSB::dumpTokeAssignResult() const
4186 {
4187 std::cerr << "Internal:" << "\n";
4188 for (const SBNode* node : SBSendNodes)
4189 {
4190 if (node->GetInstruction()->isEOT())
4191 {
4192 continue;
4193 }
4194 node->dumpAssignedTokens();
4195 }
4196 }
4197
4198 void SWSB::dumpSync(const SBNode* tokenNode, const SBNode* syncNode, unsigned short token, SWSBTokenType type) const
4199 {
4200 std::cerr << "#" << syncNode->getNodeID() << "(" << token << ",";
4201 std::cerr << ((type == SWSBTokenType::AFTER_READ) ? "AR" : "AW") << ")";
4202 std::cerr << ": " << "#" << tokenNode->getNodeID() << "(" << tokenNode->getLiveStartID() << "-" << tokenNode->getLiveEndID() << ")\n";
4203 }
4204
4205 void SWSB::buildLiveIntervals()
4206 {
4207 // For all send nodes
4208 // Set the live ranges according to dependence edges
4209 const bool trueDepOnly = fg.builder->getOptions()->getOption(vISA_TrueDepOnly);
4210 for (SBNode* node : SBSendNodes)
4211 {
4212 node->setLiveEarliestID(node->getNodeID(), node->getBBID());
4213 node->setLiveLatestID(node->getNodeID(), node->getBBID());
4214 for (SBDEP_ITEM& curSucc : node->succs)
4215 {
4216 const SBNode* succ = curSucc.node;
4217 if (trueDepOnly && node->GetInstruction()->isDpas() && node->getBBID() != succ->getBBID())
4218 {
4219 node->setLiveLatestID(BBVector[node->getBBID()]->last_node, node->getBBID());
4220 }
4221 else
4222 {
4223 node->setLiveLatestID(succ->getNodeID(), succ->getBBID());
4224 }
4225 }
4226 }
4227
4228 #ifdef DEBUG_VERBOSE_ON
4229 dumpLiveIntervals();
4230 dumpDepInfo();
4231 #endif
4232
4233 //For global send nodes
4234 //According to the layout, extend the live range of each send operand to
4235 //the start of the first live-in BB and the end of the last live-out BB
4236 for (BB_LIST_ITER ib(fg.begin()), bend(fg.end()); ib != bend; ++ib)
4237 {
4238 unsigned bbID = (*ib)->getId();
4239 G4_BB_SB* sb_bb = BBVector[bbID];
4240 SBBitSets& send_live_in = sb_bb->send_live_in;
4241 SBBitSets& send_live_out = sb_bb->send_live_out;
4242 SBBitSets& send_live_in_scalar = sb_bb->send_live_in_scalar;
4243 SBBitSets& send_live_out_scalar = sb_bb->send_live_out_scalar;
4244
4245 if (send_live_in.isEmpty())
4246 {
4247 continue;
4248 }
4249
4250 for (SBBucketNode* bucketNode : globalSendOpndList)
4251 {
4252 SBNode* node = bucketNode->node;
4253 int globalID = node->globalID;
4254
4255 if (trueDepOnly && node->GetInstruction()->isDpas())
4256 {
4257 continue;
4258 }
4259
4260 if (bucketNode->opndNum == Opnd_dst)
4261 {
4262 if (sb_bb->first_node != -1 &&
4263 send_live_in_scalar.isDstSet((unsigned)globalID))
4264 {
4265 if (!(*ib)->Preds.empty() || !(sb_bb->Preds.empty()))
4266 {
4267 node->setLiveEarliestID(sb_bb->first_node, bbID);
4268 }
4269 }
4270 //FIXME: implicit dependences still have an issue.
4271 //The live range of an implicit dependence may not be counted. That should be acceptable, but it may cause extra delay. ...
4272 if (sb_bb->first_node != -1 &&
4273 send_live_out_scalar.isDstSet((unsigned)globalID))
4274 {
4275 if (!(*ib)->Succs.empty() || !(sb_bb->Succs.empty()))
4276 {
4277 node->setLiveLatestID(sb_bb->last_node, bbID);
4278 }
4279 }
4280 }
4281 else if (!trueDepOnly)
4282 {
4283 if (sb_bb->first_node != -1 &&
4284 send_live_in.isSrcSet((unsigned)globalID))
4285 {
4286 if (!(*ib)->Preds.empty() || !(sb_bb->Preds.empty()))
4287 {
4288 node->setLiveEarliestID(sb_bb->first_node, bbID);
4289 }
4290 }
4291 //FIXME: implicit dependences still have an issue.
4292 //The live range of an implicit dependence may not be counted. That should be acceptable, but it may cause extra delay. ...
4293 if (sb_bb->first_node != -1 &&
4294 send_live_out.isSrcSet((unsigned)globalID))
4295 {
4296 if (!(*ib)->Succs.empty() || !(sb_bb->Succs.empty()))
4297 {
4298 node->setLiveLatestID(sb_bb->last_node, bbID);
4299 }
4300 }
4301 }
4302 }
4303 }
4304 #ifdef DEBUG_VERBOSE_ON
4305 dumpLiveIntervals();
4306 #endif
4307 return;
4308 }
4309
4310 //
4311 // live_in(BBi) = Union(def_out(BBj)) // BBj is predecessor of BBi
4312 // live_out(BBi) += live_in(BBi) - may_kill(BBi)
4313 //
4314 bool SWSB::globalDependenceDefReachAnalysis(G4_BB* bb)
4315 {
4316 bool changed = false;
4317 unsigned bbID = bb->getId();
4318
4319 if (bb->Preds.empty())
4320 {
4321 return false;
4322 }
4323
4324 SBBitSets temp_live_in(globalSendNum);
4325 temp_live_in = BBVector[bbID]->send_live_in;
4326
4327 for (const G4_BB* predBB : bb->Preds)
4328 {
4329 unsigned predID = predBB->getId();
4330 temp_live_in |= BBVector[predID]->send_live_out;
4331 }
4332
4333 if (temp_live_in != BBVector[bbID]->send_live_in)
4334 {
4335 changed = true;
4336 BBVector[bbID]->send_live_in = temp_live_in;
4337 }
4338
4339 //Record the killed dst and src during the scalar control-flow iteration
4340 SBBitSets temp_kill(globalSendNum);
4341 temp_kill = temp_live_in;
4342 temp_kill &= BBVector[bbID]->send_may_kill;
4343 BBVector[bbID]->send_kill_scalar |= temp_kill;
4344
4345 temp_kill = temp_live_in;
4346 temp_kill.src &= BBVector[bbID]->send_may_kill.dst;
4347 BBVector[bbID]->send_kill_scalar.src |= temp_kill.src;
4348
4349 //Kill nodes:
4350 //once dst is killed, src is definitely killed as well
4351 temp_live_in -= BBVector[bbID]->send_may_kill;
4352 temp_live_in.src -= BBVector[bbID]->send_may_kill.dst;
4353
4354 BBVector[bbID]->send_live_out |= temp_live_in;
4355
4356 return changed;
4357 }
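//A small worked example of the equations above, on a hypothetical CFG
//BB1->BB2, BB1->BB3, {BB2,BB3}->BB4:
//   send_live_in(BB4)   = send_live_out(BB2) | send_live_out(BB3)
//   send_live_out(BB4) |= send_live_in(BB4) - send_may_kill(BB4)
//The returned "changed" flag lets the caller iterate this per-BB transfer function
//until no BB's send_live_in changes any more.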
4358
4359 //
4360 // live_in(BBi) = Union(def_out(BBj)) // BBj is predecessor of BBi
4361 // live_out(BBi) += live_in(BBi) - may_kill(BBi)
4362 //
4363 bool SWSB::globalDependenceUseReachAnalysis(G4_BB* bb)
4364 {
4365 bool changed = false;
4366 unsigned bbID = bb->getId();
4367
4368 if (bb->Preds.empty())
4369 {
4370 return false;
4371 }
4372
4373 SBBitSets temp_live_in(globalSendNum);
4374 temp_live_in = BBVector[bbID]->send_live_in;
4375
4376 for (BB_SWSB_LIST_ITER it = BBVector[bbID]->Preds.begin(); it != BBVector[bbID]->Preds.end(); it++)
4377 {
4378 G4_BB* predBB = (*it)->getBB();
4379 unsigned predID = predBB->getId();
4380 temp_live_in |= BBVector[predID]->send_live_out;
4381 }
4382
4383 if (temp_live_in != BBVector[bbID]->send_live_in)
4384 {
4385 changed = true;
4386 BBVector[bbID]->send_live_in = temp_live_in;
4387 }
4388
4389 //Kill scalar kills
4390 temp_live_in -= BBVector[bbID]->send_kill_scalar;
4391 temp_live_in.src -= BBVector[bbID]->send_may_kill.src;
4392 temp_live_in.dst -= BBVector[bbID]->send_WAW_may_kill;
4393
4394 BBVector[bbID]->send_live_out |= temp_live_in;
4395
4396 return changed;
4397 }
4398
4399
4400 void SWSB::tokenEdgePrune(unsigned& prunedEdgeNum,
4401 unsigned& prunedGlobalEdgeNum,
4402 unsigned& prunedDiffBBEdgeNum,
4403 unsigned& prunedDiffBBSameTokenEdgeNum)
4404 {
4405 for (size_t i = 0; i < BBVector.size(); i++)
4406 {
4407 if (BBVector[i]->first_node == -1)
4408 {
4409 continue;
4410 }
4411
4412 BitSet activateLiveIn(SBSendNodes.size(), false);
4413 activateLiveIn |= BBVector[i]->liveInTokenNodes;
4414
4415 //Scan the instruction nodes of current BB
4416 for (int j = BBVector[i]->first_node; j <= BBVector[i]->last_node; j++)
4417 {
4418 SBNode* node = SBNodes[j];
4419 BitSet killedToken(totalTokenNum, false); //Track the token killed by current instruction.
4420
4421 //scan the incoming dependence edges of current node
4422 for (auto node_it = node->preds.begin();
4423 node_it != node->preds.end();
4424 node_it++)
4425 {
4426 SBDEP_ITEM& curPred = (*node_it);
4427 DepType type = curPred.type;
4428 SBNode* predNode = curPred.node;
4429
4430 //If the predecessor node is a token instruction node.
4431 if (tokenHonourInstruction(predNode->GetInstruction()))
4432 {
4433 if (!activateLiveIn.isSet(predNode->sendID))
4434 {
4435 // If the predecessor is not in the live set of the current instruction
4436 // (the live-in set is updated during the instruction scan),
4437 // remove the dependence from the successor list of the previous node.
4438 // The dependence SBID assignment only depends on the succ nodes.
4439 for (auto succ_it = predNode->succs.begin();
4440 succ_it != predNode->succs.end();
4441 succ_it++)
4442 {
4443 SBDEP_ITEM& currSucc = (*succ_it);
4444 if (currSucc.node == node)
4445 {
4446 //Don't remove the pred-side edge here:
4447 //1. it would conflict with the outer loop,
4448 //2. no preds info is required any more in the following handling.
4449 predNode->succs.erase(succ_it);
4450 prunedEdgeNum++;
4451 if (predNode->globalID != -1)
4452 {
4453 if (predNode->getBBID() != node->getBBID() &&
4454 !killedToken.isSet(predNode->getLastInstruction()->getSetToken()) &&
4455 (!(fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
4456 fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation)) ||
4457 !((fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
4458 fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation)) &&
4459 BBVector[node->getBBID()]->dominators.isSet(predNode->getBBID()))))
4460 {
4461 prunedDiffBBEdgeNum++;
4462 #ifdef DEBUG_VERBOSE_ON
4463 std::cerr << "Diff BB Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;
4464 #endif
4465 }
4466 else if (predNode->getBBID() != node->getBBID())
4467 {
4468 prunedDiffBBSameTokenEdgeNum++;
4469 #ifdef DEBUG_VERBOSE_ON
4470 std::cerr << "Diff BB Same Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;
4471 #endif
4472 }
4473 else
4474 {
4475 prunedGlobalEdgeNum++;
4476 #ifdef DEBUG_VERBOSE_ON
4477 std::cerr << "Global Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;
4478 #endif
4479 }
4480 }
4481 #ifdef DEBUG_VERBOSE_ON
4482 else
4483 {
4484 std::cerr << "Local Token: " << predNode->getLastInstruction()->getSetToken() << " <Pred: " << predNode->getNodeID() << ", Succ: " << node->getNodeID() << ">" << std::endl;
4485 }
4486 #endif
4487 break;
4488 }
4489 }
4490 }
4491 else //In live in set
4492 {
4493 // Kill the dependence if it's an AW dependence
4494 // What about WAR?
4495 if (type == RAW || type == WAW)
4496 {
4497 int token = predNode->getLastInstruction()->getSetToken();
4498 if (token != (unsigned short)UNKNOWN_TOKEN)
4499 {
4500 activateLiveIn -= allTokenNodesMap[token].bitset;
4501 killedToken.set(token, true);
4502 }
4503 }
4504 }
4505 }
4506 }
4507
4508 // Current instruction is marked as alive
4509 // How to kill the old one? Especially the WAR?
4510 // Token reuse will kill all previous nodes with the same token? Yes.
4511 if (tokenHonourInstruction(node->GetInstruction()) && !node->GetInstruction()->isEOT())
4512 {
4513 int token = node->getLastInstruction()->getSetToken();
4514 if (token != (unsigned short)UNKNOWN_TOKEN)
4515 {
4516 activateLiveIn -= allTokenNodesMap[token].bitset;
4517 activateLiveIn.set(node->sendID, true);
4518 }
4519 }
4520 }
4521 }
4522 }
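//Illustrative example of the pruning above (assumed tokens):
//   (1) send r10 ... {$1}
//   (2) send r20 ... {$1}     // reuses $1: removes (1) from activateLiveIn
//   (3) add ... r10 ...       // the edge (1)->(3) is pruned here, because the token
//                             // reuse at (2) is assumed to have already drained (1)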
4523
4524 void G4_BB_SB::getLiveOutToken(unsigned allSendNum,
4525 const SBNODE_VECT* SBNodes)
4526 {
4527 //Empty BB
4528 if (first_node == -1)
4529 {
4530 return;
4531 }
4532
4533 uint32_t totalTokenNum = builder.kernel.getNumSWSBTokens();
4534 unsigned* liveNodeID = (unsigned*)mem.alloc(sizeof(unsigned) * totalTokenNum);
4535
4536 if (tokeNodesMap.size() == 0)
4537 {
4538 tokeNodesMap.resize(totalTokenNum);
4539
4540 //Each token ID has a bitset for all possible send instructions' ID
4541 for (size_t i = 0; i < totalTokenNum; i++)
4542 {
4543 tokeNodesMap[i] = BitSet(allSendNum, false);
4544 liveNodeID[i] = 0;
4545 }
4546 }
4547 else
4548 {
4549 for (size_t i = 0; i < totalTokenNum; i++)
4550 {
4551 tokeNodesMap[i].clear();
4552 liveNodeID[i] = 0;
4553 }
4554 }
4555
4556 // Scan instructions forward to get the live out of current BB
4557 for (int i = first_node; i <= last_node; i++)
4558 {
4559 SBNode* node = (*SBNodes)[i];
4560
4561 //Check the previous node.
4562 for (const SBDEP_ITEM& curPred : node->preds)
4563 {
4564 DepType type = curPred.type;
4565 SBNode* predNode = curPred.node;
4566
4567 if ((predNode == node) ||
4568 (predNode->getBBID() != node->getBBID()) ||
4569 (predNode->getNodeID() > node->getNodeID()))
4570 {
4571 continue;
4572 }
4573
4574
4575 //If there is a .dst dependence, kill all nodes with same token
4576 if (tokenHonourInstruction(predNode->getLastInstruction()) && (type == RAW || type == WAW))
4577 {
4578 if (predNode->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4579 {
4580 unsigned short token = predNode->getLastInstruction()->getSetToken();
4581 // 1: send r112 {$9}
4582 // 2: send r18 {$9}
4583 // 3: send r112 {$9}
4584 // 4: send xxx, r18 {12}
4585 //
4586 // Instruction 4 may clear $9 because of instruction 2.
4587 // liveNodeID tracks the newest live node ID for each token, so that an older predNode cannot clear a newer definition.
4588 if (liveNodeID[token] < predNode->getNodeID())
4589 {
4590 tokeNodesMap[token].clear(); //Kill all dependence in following instructions with the same token
4591
4592 //Record the token killed by the current BB; the kill may kill all previous nodes that reach the current node
4593 killedTokens.set(token, true); //Set previous token send killed in current BB
4594 }
4595 }
4596 }
4597 }
4598
4599 //Token reuse will kill all previous nodes with the same token.
4600 //There will be only one such node in a BB-local scan.
4601 if (tokenHonourInstruction(node->getLastInstruction()) &&
4602 !node->getLastInstruction()->isEOT() &&
4603 node->getLastInstruction()->getSetToken() != (unsigned short)UNKNOWN_TOKEN)
4604 {
4605 unsigned short token = node->getLastInstruction()->getSetToken();
4606 tokeNodesMap[token].clear();
4607
4608 //For future live-in, it will always be killed by the current instruction
4609 killedTokens.set(token, true);
4610
4611 //Current node may be in the live-out, if not killed by following insts.
4612 tokeNodesMap[token].set(node->sendID, true);
4613 liveNodeID[token] = node->getNodeID();
4614 }
4615 }
4616
4617 for (size_t i = 0; i < totalTokenNum; i++)
4618 {
4619 liveOutTokenNodes |= tokeNodesMap[i];
4620 }
4621 }
4622 //
4623 // Scan to check which global send operands will be killed by the current BB.
4624 // Note that there is no guarantee the send operand will be in the live-in set of the BB.
4625 // !!! Note that: since this "may kill" info is used in global analysis, "may kill" is not accurate; what we in fact record here is the "definitely kill".
4626 void G4_BB_SB::setSendOpndMayKilled(LiveGRFBuckets* globalSendsLB,
4627 SBNODE_VECT* SBNodes,
4628 PointsToAnalysis& p)
4629 {
4630 std::vector<SBBucketDesc> BDvec;
4631 if (first_node == -1)
4632 {
4633 return;
4634 }
4635
4636 bool addGlobalSLMWARWA = false;
4637 for (int i = first_node; i <= last_node; i++)
4638 {
4639 SBNode* node = (*SBNodes)[i];
4640 G4_INST* curInst = (*SBNodes)[i]->GetInstruction();
4641
4642 if (curInst->isLabel())
4643 {
4644 continue;
4645 }
4646
4647 BDvec.clear();
4648 getGRFBucketDescs(node, BDvec, true);
4649 if (!BDvec.size())
4650 {
4651 continue;
4652 }
4653
4654 // For all bucket descriptors of curInst
4655 for (const SBBucketDesc& BD : BDvec) {
4656 const int& curBucket = BD.bucket;
4657 const Gen4_Operand_Number& curOpnd = BD.opndNum;
4658 const SBFootprint* curFootprint = BD.footprint;
4659
4660 for (LiveGRFBuckets::BN_iterator bn_it = globalSendsLB->begin(curBucket);
4661 bn_it != globalSendsLB->end(curBucket);)
4662 {
4663 SBBucketNode* liveBN = (*bn_it);
4664 SBNode* curLiveNode = liveBN->node;
4665 Gen4_Operand_Number liveOpnd = liveBN->opndNum;
4666 const SBFootprint* liveFootprint = liveBN->footprint;
4667 G4_INST* liveInst = liveFootprint->inst;
4668
4669 //Send operands are all GRF aligned, so no overlap checking is required.
4670 //FIXME: this is not right for math instructions, whose operands may cover less than 1 GRF.
4671 //Find DEP type
4672 unsigned short internalOffset = 0;
4673 bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);
4674 if (!hasOverlap)
4675 {
4676 ++bn_it;
4677 continue;
4678 }
4679
4680 DepType dep = getDepForOpnd(liveOpnd, curOpnd);
4681
4682 //For SBID global liveness analysis, both explicit and implicit kills are counted.
4683 if (dep == RAW || dep == WAW)
4684 {
4685 send_may_kill.setDst(curLiveNode->globalID, true);
4686 if (dep == WAW)
4687 {
4688 send_WAW_may_kill.set(curLiveNode->globalID, true);
4689 }
4690 }
4691
4692 if (dep == WAR &&
4693 WARDepRequired(liveInst, curFootprint->inst))
4694 {
4695 send_may_kill.setSrc(curLiveNode->globalID, true);
4696 }
4697
4698 //FIXME: for NODEP, there is an optimization opportunity.
4699 // if (hasSameFunctionID(liveInst, curInst))
4700 // send null, r1, r73, ... {$0}
4701 // send null, r1, r60, ... {$1}
4702 // add r60... {$1.src}
4703 // add r73 // There is no need to set {$0.src}
4704 //
4705 // send null, r1, r73, ... {$0}
4706 // send null, r1, r60, ... {$1}
4707 // add r73 {$0.src} // We need to set {$0.src}
4708 // add r60... {$1.src}
4709 //if (dep == NODEP && !hasSameFunctionID(liveInst, curInst)) //Conservative, only different pipeline, we will insert dependence tracking
4710 //{
4711 // send_may_kill->setSrc(curLiveNode->globalID, true);
4712 //}
4713
4714 assert(dep != DEPTYPE_MAX && "dep unassigned?");
4715 ++bn_it;
4716 }
4717 }
4718
4719 if (!addGlobalSLMWARWA && builder.hasSLMWARIssue() && curInst->isSend() &&
4720 (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
4721 {
4722 for (int curBucket = 0; curBucket < globalSendsLB->getNumOfBuckets(); curBucket++)
4723 {
4724 for (LiveGRFBuckets::BN_iterator bn_it = globalSendsLB->begin(curBucket);
4725 bn_it != globalSendsLB->end(curBucket);)
4726 {
4727 SBBucketNode* liveBN = (*bn_it);
4728 SBNode* curLiveNode = liveBN->node;
4729 G4_INST* liveInst = liveBN->footprint->inst;
4730
4731 if (liveInst->isSend() &&
4732 isSLMMsg(liveInst) && liveInst->getDst() != nullptr && !liveInst->getDst()->isNullReg())
4733 {
4734 send_may_kill.setDst(curLiveNode->globalID, true);
4735 }
4736 ++bn_it;
4737 }
4738 }
4739 addGlobalSLMWARWA = true;
4740 }
4741 }
4742 }
4743
4744 bool G4_BB_SB::getFootprintForOperand(SBNode* node,
4745 G4_INST* inst,
4746 G4_Operand* opnd,
4747 Gen4_Operand_Number opndNum)
4748 {
4749 int startingBucket = UNINIT_BUCKET;
4750 bool hasDistOneAReg = false;
4751 bool footprintOperand = false;
4752 bool isAccReg = false;
4753 bool isFlagReg = false;
4754 SBFootprint* footprint = nullptr;
4755 G4_VarBase* base = opnd->getBase();
4756
4757 assert(base && "If no base, then the operand is not touched by the instr.");
4758
4759 G4_VarBase* phyReg = (base->isRegVar()) ? base->asRegVar()->getPhyReg() : base;
4760
4761 switch (phyReg->getKind())
4762 {
4763 case G4_VarBase::VK_phyGReg:
4764 startingBucket = 0;
4765 footprintOperand = true;
4766 break;
4767 case G4_VarBase::VK_phyAReg:
4768 if (phyReg->isSrReg() ||
4769 phyReg->isCrReg() ||
4770 phyReg->isSpReg() ||
4771 phyReg->isIpReg() ||
4772 phyReg->isTmReg() ||
4773 phyReg->isMaskReg() ||
4774 phyReg->isDbgReg())
4775 {
4776 hasDistOneAReg = true;
4777 }
4778 isAccReg = phyReg->isAccReg();
4779 isFlagReg = phyReg->isFlag();
4780 break;
4781 case G4_VarBase::VK_regVar:
4782 assert(0 && "Should not be a regvar. PhyReg is extracted from regvar.");
4783 break;
4784 default:
4785 assert(0 && "Bad kind");
4786 break;
4787 }
4788
4789 if (footprintOperand)
4790 {
4791 // Create one or more buckets and push them into the vector
4792 footprint = getFootprintForGRF(opnd, opndNum, inst, startingBucket, inst->isSend());
4793 node->setFootprint(footprint, opndNum);
4794 }
4795
4796 if ((builder.hasThreeALUPipes() || builder.hasFourALUPipes()))
4797 {
4798 if (isAccReg)
4799 {
4800 footprint = getFootprintForACC(opnd, opndNum, inst);
4801 node->setFootprint(footprint, opndNum);
4802 }
4803 if (isFlagReg)
4804 {
4805 footprint = getFootprintForFlag(opnd, opndNum, inst);
4806 node->setFootprint(footprint, opndNum);
4807 }
4808 }
4809
4810
4811 return hasDistOneAReg;
4812 }
4813
4814 void G4_BB_SB::getGRFFootprintForIndirect(SBNode* node,
4815 Gen4_Operand_Number opnd_num,
4816 G4_Operand* opnd,
4817 PointsToAnalysis& p)
4818 {
4819 G4_Declare* addrdcl = nullptr;
4820 SBFootprint* footprint = nullptr;
4821 G4_Type type = opnd->getType();
4822
4823 if (opnd_num == Opnd_dst)
4824 {
4825 G4_DstRegRegion* dstrgn = opnd->asDstRegRegion();
4826 addrdcl = GetTopDclFromRegRegion(dstrgn);
4827 }
4828 else if (opnd_num == Opnd_src0 ||
4829 opnd_num == Opnd_src1 ||
4830 opnd_num == Opnd_src2 ||
4831 opnd_num == Opnd_src3)
4832 {
4833 G4_SrcRegRegion* srcrgn = opnd->asSrcRegRegion();
4834 addrdcl = GetTopDclFromRegRegion(srcrgn);
4835 }
4836 else
4837 {
4838 assert(0);
4839 }
4840
4841 #ifdef DEBUG_VERBOSE_ON
4842 std::cerr << addrdcl->getName() << ":" << std::endl;
4843 std::cerr << node->getNodeID() << ":";
4844 node->GetInstruction()->dump();
4845 std::cerr << "Point to: ";
4846 #endif
4847
4848 if (addrdcl == nullptr)
4849 {
4850 assert(0);
4851 return;
4852 }
4853
4854 G4_RegVar* ptvar = NULL;
4855 int vid = 0;
4856
4857 unsigned char offset = 0;
4858 while ((ptvar = p.getPointsTo(addrdcl->getRegVar(), vid++, offset)) != NULL)
4859 {
4860
4861 uint32_t varID = ptvar->getId();
4862 G4_Declare* dcl = ptvar->getDeclare();
4863 G4_RegVar* var = NULL;
4864
4865 while (dcl->getAliasDeclare())
4866 {
4867 dcl = dcl->getAliasDeclare();
4868 }
4869
4870
4871 int linearizedStart = 0;
4872 int linearizedEnd = 0;
4873
4874 if (dcl->isSpilled()) //FIXME: points-to analysis tracking is lost due to spill; assume all registers are touched
4875 {
4876 linearizedEnd = totalGRFNum * numEltPerGRF<Type_UB>() - 1;
4877 }
4878 else
4879 {
4880 var = dcl->getRegVar();
4881
4882 MUST_BE_TRUE(var->getId() == varID, "RA verification error: Invalid regVar ID!");
4883 MUST_BE_TRUE(var->getPhyReg()->isGreg(), "RA verification error: Invalid dst reg!");
4884
4885 uint32_t regNum = var->getPhyReg()->asGreg()->getRegNum();
4886 uint32_t regOff = var->getPhyRegOff();
4887
4888 {
4889 linearizedStart = regNum * numEltPerGRF<Type_UB>() + regOff * TypeSize(dcl->getElemType());
4890 linearizedEnd = regNum * numEltPerGRF<Type_UB>() + regOff * TypeSize(dcl->getElemType()) + dcl->getByteSize() - 1;
4891 }
4892 }
4893
4894
4895 void* allocedMem = mem.alloc(sizeof(SBFootprint));
4896 footprint = new (allocedMem)SBFootprint(GRF_T, type, (unsigned short)linearizedStart, (unsigned short)linearizedEnd, node->GetInstruction());
4897 node->setFootprint(footprint, opnd_num);
4898 #ifdef DEBUG_VERBOSE_ON
4899 int startingBucket = linearizedStart / numEltPerGRF<Type_UB>();
4900 int endingBucket = linearizedEnd / numEltPerGRF<Type_UB>();
4901 std::cerr << dcl->getName() << "<" << startingBucket << "," << endingBucket << ">";
4902 #endif
4903 }
4904 #ifdef DEBUG_VERBOSE_ON
4905 std::cerr << std::endl;
4906 #endif
4907 return;
4908 }
4909
4910 //Create Buckets
4911 void G4_BB_SB::getGRFBuckets(SBNode* node,
4912 const SBFootprint* footprint,
4913 Gen4_Operand_Number opndNum,
4914 std::vector<SBBucketDesc>& BDvec,
4915 bool GRFOnly)
4916 {
4917 for (const SBFootprint* curFootprint = footprint; curFootprint != nullptr; curFootprint = curFootprint->next)
4918 {
4919 if (GRFOnly && (curFootprint->fType != GRF_T))
4920 {
4921 continue;
4922 }
4923
4924 int startingBucket = curFootprint->LeftB / numEltPerGRF<Type_UB>();
4925 int endingBucket = curFootprint->RightB / numEltPerGRF<Type_UB>();
4926 if (curFootprint->fType == ACC_T)
4927 {
4928 int aregOffset = totalGRFNum + builder.getNumScalarRegisters();
4929 startingBucket = startingBucket + aregOffset;
4930 endingBucket = endingBucket + aregOffset;
4931 }
4932 int numBuckets = endingBucket - startingBucket + 1;
4933 for (int j = startingBucket;
4934 j < (startingBucket + numBuckets); j++)
4935 {
4936 BDvec.push_back(SBBucketDesc(j, opndNum, node, curFootprint));
4937 }
4938 }
4939 }
4940
4941 bool G4_BB_SB::getGRFFootPrintOperands(SBNode* node,
4942 G4_INST* inst,
4943 Gen4_Operand_Number first_opnd,
4944 Gen4_Operand_Number last_opnd,
4945 PointsToAnalysis& p)
4946 {
4947 bool hasDistOneAreg = false;
4948 for (Gen4_Operand_Number opndNum = first_opnd; opndNum <= last_opnd; opndNum = (Gen4_Operand_Number)(opndNum + 1))
4949 {
4950
4951 G4_Operand* opnd = inst->getOperand(opndNum);
4952
4953 if (!opnd || !opnd->getBase())
4954 {
4955 continue;
4956 }
4957
4958 if (opnd->isLabel() || opnd->isImm())
4959 {
4960 continue;
4961 }
4962
4963 hasDistOneAreg |= getFootprintForOperand(node, inst, opnd, opndNum);
4964
4965
4966 //Get bucket for indirect access
4967 if (hasIndirection(opnd, opndNum))
4968 {
4969 getGRFFootprintForIndirect(node, opndNum, opnd, p);
4970 }
4971 }
4972
4973 return hasDistOneAreg;
4974 }
4975
4976 void G4_BB_SB::getGRFBucketsForOperands(SBNode* node,
4977 Gen4_Operand_Number first_opnd,
4978 Gen4_Operand_Number last_opnd,
4979 std::vector<SBBucketDesc>& BDvec,
4980 bool GRFOnly)
4981 {
4982 for (Gen4_Operand_Number opndNum = first_opnd; opndNum <= last_opnd; opndNum = (Gen4_Operand_Number)(opndNum + 1))
4983 {
4984 const SBFootprint* footprint = node->getFirstFootprint(opndNum);
4985 if (!footprint || (GRFOnly && (footprint->fType != GRF_T)))
4986 {
4987 continue;
4988 }
4989 getGRFBuckets(node, footprint, opndNum, BDvec, GRFOnly);
4990 }
4991
4992 return;
4993 }
4994
4995 bool G4_BB_SB::getGRFFootPrint(SBNode* node, PointsToAnalysis& p)
4996 {
4997 bool hasDistOneAReg = false;
4998 //We get the descriptions for the sources first, so for the current instruction the scan order is src0, src1, src2, src3, dst
4999 for (G4_INST* inst : node->instVec)
5000 {
5001 hasDistOneAReg |= getGRFFootPrintOperands(node, inst, Opnd_src0, Opnd_src3, p);
5002 hasDistOneAReg |= getGRFFootPrintOperands(node, inst, Opnd_pred, Opnd_implAccDst, p);
5003 hasDistOneAReg |= getGRFFootPrintOperands(node, inst, Opnd_dst, Opnd_dst, p);
5004 }
5005
5006 return hasDistOneAReg;
5007 }
5008
5009 void G4_BB_SB::getGRFBucketDescs(SBNode* node, std::vector<SBBucketDesc>& BDvec, bool GRFOnly)
5010 {
5011 //We get the descriptions for the sources first, so for the current instruction the scan order is src0, src1, src2, src3, dst
5012 getGRFBucketsForOperands(node, Opnd_src0, Opnd_src3, BDvec, GRFOnly);
5013 if (!GRFOnly)
5014 {
5015 getGRFBucketsForOperands(node, Opnd_pred, Opnd_implAccDst, BDvec, GRFOnly);
5016 }
5017 getGRFBucketsForOperands(node, Opnd_dst, Opnd_dst, BDvec, GRFOnly);
5018
5019 return;
5020 }
5021
5022
5023 // Clear the killed bucket nodes
5024 // May be killed in 4 ways:
5025 // 1. distance > SWSB_MAX_ALU_DEPENDENCE_DISTANCE
5026 // 2. instruction killed.
5027 // 3. source operands killed.
5028 // 4. operand killed.
5029 // FIXME:
5030 // 1. scanning all buckets is time-consuming.
5031 // 2. sometimes, only one kind of check is required.
5032 // 3. the function is called for every instruction, which wastes compilation time.
5033 void G4_BB_SB::clearKilledBucketNodeXeLP(LiveGRFBuckets* LB, int ALUID)
5034 {
5035 for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
5036 {
5037 for (LiveGRFBuckets::BN_iterator it = LB->begin(curBucket); it != LB->end(curBucket);)
5038 {
5039 SBBucketNode* liveBN = (*it);
5040 SBNode* curLiveNode = liveBN->node;
5041
5042 if ((distanceHonourInstruction(curLiveNode->GetInstruction()) &&
5043 ((ALUID - curLiveNode->getALUID()) > curLiveNode->getMaxDepDistance())) ||
5044 curLiveNode->isInstKilled() ||
5045 (curLiveNode->isSourceKilled() &&
5046 liveBN->opndNum >= Opnd_src0 &&
5047 liveBN->opndNum <= Opnd_src3))
5048 {
5049 LB->killOperand(it);
5050 continue;
5051 }
5052
5053 ++it;
5054 }
5055 }
5056 }
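//Illustrative sketch of the distance-based kill above (assumed instruction stream):
//   mul (8) r10 ...            // in-order producer, ALUID = n
//   ...                        // more than getMaxDepDistance() in-order instructions
//   add (8) ... r10 ...        // ALUID - n exceeds the max distance: the dependence
//                              // is already guaranteed in-order, so the producer's
//                              // bucket nodes can be dropped from the live set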
5057
5058 void G4_BB_SB::clearKilledBucketNodeXeHP(LiveGRFBuckets* LB, int integerID, int floatID, int longID, int mathID)
5059 {
5060 for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
5061 {
5062 for (LiveGRFBuckets::BN_iterator it = LB->begin(curBucket); it != LB->end(curBucket);)
5063 {
5064 SBBucketNode* liveBN = (*it);
5065 SBNode* curLiveNode = liveBN->node;
5066
5067 if (curLiveNode->isInstKilled() ||
5068 (curLiveNode->isSourceKilled() &&
5069 liveBN->opndNum >= Opnd_src0 &&
5070 liveBN->opndNum <= Opnd_src3))
5071 {
5072 LB->killOperand(it);
5073 continue;
5074 }
5075
5076 //Long pipeline must be checked first because its definition differs from Integer and Float
5077 if (curLiveNode->GetInstruction()->isLongPipeInstructionXe() &&
5078 ((longID - curLiveNode->getLongID()) > SWSB_MAX_ALU_DEPENDENCE_DISTANCE_64BIT))
5079 {
5080 LB->killOperand(it);
5081 continue;
5082 }
5083
5084 if (curLiveNode->GetInstruction()->isIntegerPipeInstructionXe() &&
5085 ((integerID - curLiveNode->getIntegerID()) > SWSB_MAX_ALU_DEPENDENCE_DISTANCE))
5086 {
5087 LB->killOperand(it);
5088 continue;
5089 }
5090
5091 if (curLiveNode->GetInstruction()->isFloatPipeInstructionXe() &&
5092 ((floatID - curLiveNode->getFloatID()) > SWSB_MAX_ALU_DEPENDENCE_DISTANCE))
5093 {
5094 LB->killOperand(it);
5095 continue;
5096 }
5097
5098 if (curLiveNode->GetInstruction()->isMath() &&
5099 builder.hasFixedCycleMathPipe() &&
5100 (mathID - curLiveNode->getMathID() > SWSB_MAX_MATH_DEPENDENCE_DISTANCE))
5101 {
5102 LB->killOperand(it);
5103 continue;
5104 }
5105
5106 ++it;
5107 }
5108 }
5109 }
5110
5111 void G4_BB_SB::clearSLMWARWAissue(SBNode* curNode, LiveGRFBuckets* LB)
5112 {
5113 for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
5114 {
5115 for (LiveGRFBuckets::BN_iterator it = LB->begin(curBucket); it != LB->end(curBucket);)
5116 {
5117 SBBucketNode* liveBN = (*it);
5118 SBNode* curLiveNode = liveBN->node;
5119 G4_INST* liveInst = liveBN->footprint->inst;
5120
5121 if (liveInst->isSend() &&
5122 isSLMMsg(liveInst) && liveInst->getDst() != nullptr && !liveInst->getDst()->isNullReg())
5123 {
5124 createAddGRFEdge(curLiveNode, curNode, RAW, DEP_EXPLICT);
5125 curLiveNode->setInstKilled(true); //Instruction level kill
5126 LB->killOperand(it);
5127 continue;
5128 }
5129
5130 ++it;
5131 }
5132 }
5133 }
5134
5135 void G4_BB_SB::setDistance(const SBFootprint* footprint, SBNode* node, SBNode* liveNode, bool dstDep)
5136 {
5137 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5138 {
5139 unsigned prevID = 0;
5140 unsigned currentID = 0;
5141 switch (liveNode->ALUPipe)
5142 {
5143 case PIPE_INT:
5144 prevID = liveNode->getIntegerID();
5145 if (prevID < latestDepALUID[PIPE_INT])
5146 {
5147 return;
5148 }
5149 latestDepALUID[PIPE_INT] = prevID;
5150 currentID = node->ALUPipe == PIPE_INT ? node->getIntegerID() : integerID;
5151 break;
5152 case PIPE_FLOAT:
5153 prevID = liveNode->getFloatID();
5154 if (prevID < latestDepALUID[PIPE_FLOAT])
5155 {
5156 return;
5157 }
5158 latestDepALUID[PIPE_FLOAT] = prevID;
5159 currentID = node->ALUPipe == PIPE_FLOAT ? node->getFloatID() : floatID;
5160 break;
5161 case PIPE_LONG:
5162 prevID = liveNode->getLongID();
5163 if (prevID < latestDepALUID[PIPE_LONG])
5164 {
5165 return;
5166 }
5167 latestDepALUID[PIPE_LONG] = prevID;
5168 currentID = node->ALUPipe == PIPE_LONG ? node->getLongID() : longID;
5169 break;
5170 case PIPE_MATH:
5171 prevID = liveNode->getMathID();
5172 if (prevID < latestDepALUID[PIPE_MATH])
5173 {
5174 return;
5175 }
5176 latestDepALUID[PIPE_MATH] = prevID;
5177 currentID = node->ALUPipe == PIPE_MATH ? node->getMathID() : mathID;
5178 break;
5179 default:
5180 assert(0 && "None ALU pipe");
5181 return;
5182 }
5183 SBDISTDEP_ITEM depItem;
5184 depItem.liveNodePipe = liveNode->ALUPipe;
5185 depItem.nodePipe = node->ALUPipe;
5186 depItem.operandType = node->GetInstruction()->getDataTypePipeXe(footprint->type);
5187 depItem.dstDep = dstDep;
5188 if (node->GetInstruction()->isSend())
5189 {
5190 depItem.operandType = PIPE_SEND;
5191 }
5192 assert(currentID > prevID && "Wrong node ALU ID");
5193 node->setDistance(currentID - prevID);
5194 node->distDep.push_back(depItem);
5195 }
5196 else
5197 {
5198 auto dist = node->getALUID() - liveNode->getALUID();
5199 assert(dist <= liveNode->getMaxDepDistance() && "dist should not exceed the max dep distance");
5200 node->setDistance(dist);
5201 }
5202 }
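//Hypothetical example of the cross-pipe distance computed above (three/four ALU
//pipe platforms, assumed counter values):
//   liveNode: mul (8) r10:f ...      // float pipe, floatID = 7
//   node:     add (8) ... r10:d ...  // integer pipe; current BB floatID = 9
//   distance = 9 - 7 = 2, recorded with a SBDISTDEP_ITEM {liveNodePipe = PIPE_FLOAT,
//   nodePipe = PIPE_INT}; the collected distDep items presumably drive the later
//   choice between the @, F@ and A@ encodings.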
5203
5204 void G4_BB_SB::setSpecialDistance(SBNode* node)
5205 {
5206 G4_INST* inst = node->GetInstruction();
5207 if (!inst->getDst())
5208 {
5209 return;
5210 }
5211
5212 if (inst->getDst()->isA0())
5213 {
5214 SBDISTDEP_ITEM depItem;
5215 depItem.liveNodePipe = PIPE_FLOAT;
5216 depItem.nodePipe = node->ALUPipe;
5217 depItem.operandType = PIPE_INT;
5218 depItem.dstDep = false;
5219 node->setDistance(1);
5220 node->distDep.push_back(depItem);
5221 }
5222
5223 return;
5224 }
5225 //The merged footprint is ordered from the back instruction to the front instruction of the macro.
5226 //As a result, if killed, it is the back instruction that is killed, which means the front instructions are killed as well.
5227 void G4_BB_SB::footprintMerge(SBNode* node, const SBNode* nextNode)
5228 {
5229 for (Gen4_Operand_Number opndNum
5230 : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5231 {
5232 SBFootprint* nextfp = nextNode->getFirstFootprint(opndNum);
5233
5234 if (nextfp != nullptr)
5235 {
5236 if (node->GetInstruction()->isDpas())
5237 {
5238 nextfp->setOffset(node->getDPASSize());
5239 }
5240 node->setFootprint(nextfp, opndNum);
5241 }
5242 }
5243
5244 return;
5245 }
5246
5247 bool G4_BB_SB::hasInternalDependenceWithinDPAS(SBNode* node)
5248 {
5249 const SBFootprint* dstfp = node->getFirstFootprint(Opnd_dst);
5250
5251 for (Gen4_Operand_Number opndNum
5252 : {Opnd_src0, Opnd_src1, Opnd_src2})
5253 {
5254 const SBFootprint* srcfp = node->getFirstFootprint(opndNum);
5255 unsigned short internalOffset = 0;
5256 if (dstfp->hasOverlap(srcfp, internalOffset))
5257 {
5258 if (opndNum == Opnd_src1)
5259 {
5260 assert(0);
5261 }
5262 //For 8x8, dst and src0 are allowed to share the same registers (not an internal dep), but partial overlap is not allowed.
5263 if (opndNum == Opnd_src0)
5264 {
5265 const G4_INST* curInst = node->getLastInstruction();
5266 const G4_InstDpas* dpasInst = curInst->asDpasInst();
5267 uint8_t D = dpasInst->getSystolicDepth();
5268
5269 if (D == 8) //Works only for 8x8
5270 {
5271 if ((dstfp->LeftB == srcfp->LeftB) && (dstfp->RightB == srcfp->RightB))
5272 {
5273 continue;
5274 }
5275 }
5276 }
5277
5278 return true;
5279 }
5280 }
5281
5282 return false;
5283 }
5284
5285 //A DPAS macro must not have WAR/RAW/WAW dependences between its instructions
5286 bool G4_BB_SB::hasDependenceBetweenDPASNodes(SBNode* node, SBNode* nextNode)
5287 {
5288 for (Gen4_Operand_Number opndNum
5289 : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5290 {
5291 const SBFootprint* fp = node->getFirstFootprint(opndNum);
5292 if (opndNum == Opnd_dst)
5293 {
5294 for (Gen4_Operand_Number opndNum2
5295 : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5296 {
5297 const SBFootprint* nextfp = nextNode->getFirstFootprint(opndNum2);
5298 unsigned short internalOffset = 0;
5299 if (fp->hasOverlap(nextfp, internalOffset))
5300 {
5301 return true;
5302 }
5303
5304 if (opndNum2 == Opnd_dst && nextfp->hasOverlap(fp, internalOffset))
5305 {
5306 return true;
5307 }
5308 }
5309 }
5310 }
5311
5312 return false;
5313 }
5314
5315 #define SRC2_CACHE_SIZE 1024
5316 bool G4_BB_SB::src2FootPrintCachePVC(SBNode * curNode, SBNode * nextNode) const
5317 {
5318 unsigned short GRFSize = getGRFSize();
5319 BitSet cachedGRF(totalGRFNum, false);
5320
5321 for (const SBFootprint* fp = curNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5322 {
5323 unsigned short leftB = fp->LeftB / GRFSize;
5324 unsigned short rightB = fp->RightB / GRFSize;
5325 for (unsigned short i = leftB; i <= rightB; i++)
5326 {
5327 cachedGRF.set(i, true);
5328 }
5329 }
5330
5331 for (const SBFootprint* fp = nextNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5332 {
5333 unsigned short leftB = fp->LeftB / GRFSize;
5334 unsigned short rightB = fp->RightB / GRFSize;
5335 for (unsigned short i = leftB; i <= rightB; i++)
5336 {
5337 cachedGRF.set(i, true);
5338 }
5339 }
5340
5341 unsigned short cachedGRFNum = 0;
5342 for (unsigned short i = 0; i < totalGRFNum; i++)
5343 {
5344 if (cachedGRF.isSet(i))
5345 {
5346 cachedGRFNum++;
5347 }
5348 }
5349
5350 return cachedGRFNum <= (SRC2_CACHE_SIZE + GRFSize - 1) / GRFSize;
5351 }
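//Worked example of the check above, assuming a 64-byte GRF:
//   (SRC2_CACHE_SIZE + GRFSize - 1) / GRFSize = (1024 + 63) / 64 = 16 GRFs of src2 cache.
//   If the union of src2 GRFs touched by curNode and nextNode is <= 16, the two dpas
//   instructions can still share the src2 read suppression cache.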
5352
5353 bool G4_BB_SB::src2SameFootPrintDiffType(SBNode * curNode, SBNode * nextNode) const
5354 {
5355 unsigned short GRFSize = getGRFSize();
5356
5357 for (const SBFootprint* fp = curNode->getFirstFootprint(Opnd_src2); fp; fp = fp->next)
5358 {
5359 unsigned short leftB = fp->LeftB / GRFSize;
5360 unsigned short rightB = fp->RightB / GRFSize;
5361 G4_Type type = fp->type;
5362
5363 for (const SBFootprint* nextfp = nextNode->getFirstFootprint(Opnd_src2); nextfp; nextfp = nextfp->next)
5364 {
5365 unsigned short nextLeftB = nextfp->LeftB / GRFSize;
5366 unsigned short nextRightB = nextfp->RightB / GRFSize;
5367 G4_Type nextType = nextfp->type;
5368
5369 if (!(nextLeftB > rightB || nextRightB < leftB))
5370 {
5371 if (type != nextType)
5372 {
5373 return true;
5374 }
5375 }
5376 }
5377 }
5378
5379 return false;
5380 }
5381
5382 //Restrict a macro to:
5383 // 1. Consecutive instructions with the same opcode, the same datatype in all sources and the dest, and the same register for src1.
5384 // 2. A variable repeat count is allowed.
5385 bool G4_BB_SB::isLastDpas(SBNode* curNode, SBNode* nextNode)
5386 {
5387 G4_INST* curInst = curNode->getLastInstruction();
5388 G4_INST* nextInst = nextNode->GetInstruction();
5389 if (nextInst == nullptr || !nextInst->isDpas())
5390 {
5391 return true;
5392 }
5393
5394 if (!hasSameExecMask(curInst, nextInst))
5395 {
5396 return true;
5397 }
5398 //All types should be the same for all operands.
5399 for (Gen4_Operand_Number opndNum
5400 : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_dst})
5401 {
5402 if (curNode->getFirstFootprint(opndNum) && nextNode->getFirstFootprint(opndNum) &&
5403 curNode->getFirstFootprint(opndNum)->type != nextNode->getFirstFootprint(opndNum)->type)
5404 {
5405 return true;
5406 }
5407 }
5408
5409 G4_InstDpas* dpasInst = curInst->asDpasInst();
5410 G4_Operand* srcOpnd1 = curInst->getSrc(1);
5411 G4_Operand* srcOpnd2 = curInst->getSrc(2);
5412 unsigned short leftBound1 = srcOpnd1->getLinearizedStart();
5413 unsigned short leftBound2 = srcOpnd2->getLinearizedStart();
5414 uint8_t curD = dpasInst->getSystolicDepth();
5415 uint8_t curC = dpasInst->getRepeatCount();
5416 int curSrc1Reg = leftBound1 / numEltPerGRF<Type_UB>();
5417 int curSrc2Reg = leftBound2 / numEltPerGRF<Type_UB>();
5418
5419 G4_InstDpas* nextDpasInst = nextInst->asDpasInst();
5420 uint8_t nextD = nextDpasInst->getSystolicDepth();
5421 uint8_t nextC = nextDpasInst->getRepeatCount();
5422
5423 //Same depth
5424 if (curD != nextD)
5425 {
5426 return true;
5427 }
5428
5429 if (VISA_WA_CHECK(builder.getPWaTable(), Wa_16011859583) ||
5430 VISA_WA_CHECK(builder.getPWaTable(), Wa_14012420496) ||
5431 builder.getOption(vISA_NoDPASMacro))
5432 {
5433 if (curD != 8 || nextD != 8 || curC != 8 || nextC != 8)
5434 {
5435 return true;
5436 }
5437 }
5438
5439 srcOpnd1 = nextDpasInst->getSrc(1);
5440 srcOpnd2 = nextDpasInst->getSrc(2);
5441 leftBound1 = srcOpnd1->getLinearizedStart();
5442 leftBound2 = srcOpnd2->getLinearizedStart();
5443 int nextSrc1Reg = leftBound1 / numEltPerGRF<Type_UB>();
5444 int nextSrc2Reg = leftBound2 / numEltPerGRF<Type_UB>();
5445
5446 if (builder.hasSrc2ReadSupression() &&
5447 builder.hasSrc2ReadSupressionSameRegSameType() &&
5448 src2SameFootPrintDiffType(curNode, nextNode))
5449 {
5450 return true;
5451 }
5452
5453 //Same src1 or src2
5454 if (curSrc1Reg == nextSrc1Reg ||
5455 (builder.hasSrc2ReadSupression() && (curSrc2Reg == nextSrc2Reg &&
5456 curC == nextC &&
5457 curC == 8)))
5458 {
5459 return false;
5460 }
5461
5462 // Using {Atomic} on the last instruction of a macro has implications in the hardware implementation:
5463 // 1. For 8x8 macros it is fine.
5464 // 2. For other repeat counts, it causes src1 of the next macro to be ignored.
5465 // Hardware uses {Atomic} to indicate that the next instruction will reuse src1; only for 8x8 macros is this reuse always honored.
5466
5467 if (builder.hasSrc2ReadSupression() &&
5468 curC == nextC &&
5469 curC == 8 &&
5470 src2FootPrintCachePVC(curNode, nextNode) &&
5471 curNode->getFirstFootprint(Opnd_src2)->isWholeOverlap(nextNode->getFirstFootprint(Opnd_src2)))
5472 {
5473 return false;
5474 }
5475
5476 return true;
5477 }
5478
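// pushItemToQueue maintains a sliding window of the most recently issued node IDs per pipeline:
// once the queue holds SWSB_MAX_ALU_DEPENDENCE_DISTANCE_VALUE entries, the oldest entry is
// evicted when a new one is pushed.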
5479 void G4_BB_SB::pushItemToQueue(std::vector<unsigned> *nodeIDQueue, unsigned nodeID)
5480 {
5481 nodeIDQueue->push_back(nodeID);
5482
5483 if (nodeIDQueue->size() > SWSB_MAX_ALU_DEPENDENCE_DISTANCE_VALUE)
5484 {
5485 nodeIDQueue->erase(nodeIDQueue->begin());
5486 }
5487 }
5488
5489 bool G4_BB_SB::hasInternalDependence(SBNode* nodeFirst, SBNode* nodeNext)
5490 {
5491 for (Gen4_Operand_Number opndNum1
5492 : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5493 {
5494 const SBFootprint* firstfp = nodeFirst->getFirstFootprint(opndNum1);
5495
5496 for (Gen4_Operand_Number opndNum2
5497 : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5498 {
5499 if (opndNum1 > Opnd_dst && opndNum2 > Opnd_dst) //Don't track read after read.
5500 {
5501 continue;
5502 }
5503
5504 const SBFootprint* secondfp = nodeNext->getFirstFootprint(opndNum2);
5505 unsigned short internalOffset = 0;
5506 if (firstfp->hasOverlap(secondfp, internalOffset))
5507 {
5508 return true;
5509 }
5510 }
5511 }
5512
5513 return false;
5514 }
5515
5516
5517 bool G4_BB_SB::is2xDPBlockCandidate(G4_INST* inst, bool accDST)
5518 {
5519 if (inst->opcode() != G4_mad)
5520 {
5521 return false;
5522 }
5523
5524 if (inst->getPredicate())
5525 {
5526 return false;
5527 }
5528
5529 if (inst->getExecSize() != g4::SIMD16)
5530 {
5531 return false;
5532 }
5533
5534 if (!inst->getDst() || inst->getDst()->isNullReg())
5535 {
5536 return false;
5537 }
5538
5539 if (accDST && !inst->getDst()->isAccReg())
5540 {
5541 return false;
5542 }
5543
5544 for (Gen4_Operand_Number opndNum
5545 : {Opnd_dst, Opnd_src0, Opnd_src1, Opnd_src2})
5546 {
5547 G4_Operand* opnd = inst->getOperand(opndNum);
5548
5549 if (opnd->getType() != G4_Type::Type_DF)
5550 {
5551 return false;
5552 }
5553 }
5554
5555 return true;
5556 }
5557
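// SBDDD builds the per-BB dependence DAG: it walks the instructions of bb once, creates an
// SBNode per instruction (or per merged DPAS/write-combine block), tracks the live GRF buckets
// in LB, and adds explicit token edges and in-order distance dependences on the fly.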
5558 void G4_BB_SB::SBDDD(G4_BB* bb,
5559 LiveGRFBuckets*& LB,
5560 LiveGRFBuckets*& globalSendsLB,
5561 SBNODE_VECT* SBNodes,
5562 SBNODE_VECT* SBSendNodes,
5563 SBBUCKET_VECTOR* globalSendOpndList,
5564 SWSB_INDEXES* indexes,
5565 uint32_t& globalSendNum,
5566 PointsToAnalysis& p,
5567 std::map<G4_Label*, G4_BB_SB*>* LabelToBlockMap)
5568 {
5569 nodeID = indexes->instIndex;
5570 ALUID = indexes->ALUIndex;
5571 integerID = indexes->integerIndex;
5572 floatID = indexes->floatIndex;
5573 longID = indexes->longIndex;
5574 DPASID = indexes->DPASIndex;
5575 mathID = indexes->mathIndex;
5576 first_DPASID = indexes->DPASIndex;
5577
5578 for (int i = 0; i < PIPE_DPAS; i++)
5579 {
5580 latestDepALUID[i] = indexes->latestDepALUID[i];
5581 latestInstID[i] = &indexes->latestInstID[i];
5582 }
5583 SBNODE_LIST tmpSBSendNodes;
5584 bool hasFollowDistOneAReg = false;
5585
5586 std::list<G4_INST*>::iterator iInst(bb->begin()), iInstEnd(bb->end()), iInstNext(bb->begin());
5587 for (; iInst != iInstEnd; ++iInst)
5588 {
5589 SBNode* node = nullptr;
5590 G4_INST* curInst = *iInst;
5591 iInstNext = iInst;
5592 iInstNext++;
5593 G4_INST* nextInst = nullptr;
5594 if (iInstNext != iInstEnd)
5595 {
5596 nextInst = *iInstNext;
5597 }
5598
5599 if (curInst->isLabel())
5600 {
5601 (*LabelToBlockMap)[curInst->getLabel()] = this;
5602 continue;
5603 }
5604
5605 //For the instructions not counted in the distance, we assign the same ALUID as the following instruction
5606 node = new (mem)SBNode(nodeID, ALUID, bb->getId(), curInst);
5607 SBNodes->emplace_back(node);
5608 curInst->setLocalId(0);
5609
5610 if (builder.hasA0WARHWissue() && builder.hasThreeALUPipes())
5611 {
5612 setSpecialDistance(node);
5613 }
5614 //Record the node IDs of the instructions in BB
5615 if (first_node == -1)
5616 {
5617 first_node = nodeID;
5618 }
5619 last_node = nodeID;
5620 nodeID++;
5621
5622 //For architecture registers ce#, sp, sr0.#, cr0.#, ip, tm0, dbg0, set distance 1
5623 if (hasFollowDistOneAReg)
5624 {
5625 node->setDistance(1);
5626 node->setFollowDistOneAReg();
5627 hasFollowDistOneAReg = false;
5628 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5629 {
5630 node->instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
5631 }
5632 }
5633
5634 hasFollowDistOneAReg = getGRFFootPrint(node, p);
5635
5636 //For architecture registers ce#, sp, sr0.#, cr0.#, ip, tm0, dbg0, set distance 1
5637 if (hasFollowDistOneAReg)
5638 {
5639 node->setDistance(1);
5640 node->setDistOneAReg();
5641 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5642 {
5643 node->instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
5644 }
5645 }
5646
5647 //Support for the mad block in DPAS pipeline
5648 if (builder.has2xDP() &&
5649 builder.getOption(vISA_ScheduleFor2xSP) &&
5650 is2xDPBlockCandidate(curInst, true))
5651 {
5652 int depDistance = curInst->getDst()->getLinearizedEnd() - curInst->getDst()->getLinearizedStart() + 1;
5653 std::list<G4_INST*>::iterator iNextInst = iInst;
5654 iNextInst++;
5655 G4_INST* nInst = *iNextInst;
5656 while (is2xDPBlockCandidate(nInst, false))
5657 {
5658 SBNode nextNode(nodeID, ALUID, bb->getId(), nInst);
5659 getGRFFootPrint(&nextNode, p);
5660
5661 if (hasInternalDependence(node, &nextNode))
5662 {
5663 break;
5664 }
5665 depDistance += nInst->getDst()->getLinearizedEnd() - nInst->getDst()->getLinearizedStart() + 1;
5666 iNextInst ++;
5667 nInst = *iNextInst;
5668 if (iNextInst == iInstEnd)
5669 {
5670 break;
5671 }
5672 if (depDistance >= getGRFSize() * 8)
5673 {
5674 break;
5675 }
5676 }
5677
5678 if (depDistance >= getGRFSize() * 8)
5679 {
5680 curInst->setNoACCSBSet();
5681 }
5682 }
5683
5684 // Support for atomic write combine
5685 // Treat the instructions of a block as one instruction in the distance calculation.
5686 // The write combine pass in local scheduling guarantees that all instructions in the block belong to the same instruction pipeline.
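// Illustrative shape of such a block (hypothetical registers): all movs except the last carry
// {Atomic}, and the whole block is handled as a single node below:
//   mov (1) r10.0<1>:ub r20.0<0;1,0>:ud {Atomic}
//   mov (1) r10.1<1>:ub r21.0<0;1,0>:ud {Atomic}
//   mov (1) r10.2<1>:ub r22.0<0;1,0>:ud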
5687 auto isWriteCombineBlockCandidate = [&](G4_INST * inst)
5688 {
5689 return (inst->opcode() == G4_mov &&
5690 IS_BTYPE(inst->getDst()->getType()) &&
5691 (IS_BTYPE(inst->getSrc(0)->getType()) || IS_WTYPE(inst->getSrc(0)->getType()) || IS_DTYPE(inst->getSrc(0)->getType()) || inst->getSrc(0)->getType() == Type_F) &&
5692 inst->getPredicate() == nullptr);
5693 };
5694
5695 if (builder.getOption(vISA_writeCombine) && isWriteCombineBlockCandidate(curInst) && curInst->isAtomicInst())
5696 {
5697 while (nextInst && isWriteCombineBlockCandidate(nextInst))
5698 {
5699 SBNode nextNode = SBNode(nodeID, ALUID, bb->getId(), nextInst);
5700 getGRFFootPrint(&nextNode, p);
5701 footprintMerge(node, &nextNode);
5702 node->addInstruction(nextInst);
5703
5704 curInst = nextInst;
5705 iInst = iInstNext;
5706 iInstNext++;
5707 nextInst = *iInstNext;
5708
5709 if (!curInst->isAtomicInst())
5710 {
5711 break;
5712 }
5713 }
5714
5715 // Check that the last instruction in the block is a write-combine candidate without {Atomic}
5716 assert(curInst && isWriteCombineBlockCandidate(curInst) && !curInst->isAtomicInst() && "the last instruction in the write combine block is wrong");
5717 }
5718
5719 //Support for DPAS
5720 //To fully exploit the efficiency of the DPAS pipeline,
5721 //we'd like to promote dependences to (or before) the first instruction of a DPAS block,
5722 //and at the same time push all dependence bucket descriptors (BDs) to the last instruction.
5723 //Keeping a dependence inside a DPAS block hurts performance significantly.
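// Sketch of the resulting grouping: consecutive dpas instructions that pass the checks below are
// appended to the same SBNode, with {Atomic} set on every instruction except the last one, so
// incoming dependences attach before the first instruction and outgoing dependences are produced
// from the last one.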
5724 if (curInst->isDpas())
5725 {
5726 unsigned dpas_count = 0;
5727 if (nextInst && nextInst->isDpas())
5728 {
5729 SBNode nextNode;
5730 bool sameSrcDst = false;
5731 while (curInst != nullptr && curInst->isDpas())
5732 {
5733 //For the following instructions; the first instruction is already in the node
5734 if (dpas_count != 0)
5735 {
5736 if (nextNode.getNodeID() != -1)
5737 {
5738 footprintMerge(node, &nextNode);
5739 }
5740 node->addInstruction(curInst);
5741 const G4_InstDpas* dpasInst = curInst->asDpasInst();
5742 node->addDPASSize(dpasInst->getRepeatCount());
5743 }
5744 else //If the first node has internal dependence, break immediately
5745 {
5746 if (hasInternalDependenceWithinDPAS(node))
5747 {
5748 break;
5749 }
5750 }
5751
5752 nextNode = SBNode(nodeID, ALUID, bb->getId(), nextInst);
5753 getGRFFootPrint(&nextNode, p);
5754
5755 //Instructions with a dependence cannot be merged into the same node.
5756 //Instructions with a different depth, src1, or type cannot be merged.
5757 //An instruction that reuses the same register in dst and src cannot be part of a macro, not even as the last one.
5758 if (sameSrcDst ||
5759 isLastDpas(node, &nextNode) ||
5760 hasDependenceBetweenDPASNodes(node, &nextNode))
5761 {
5762 break;
5763 }
5764
5765 if (hasInternalDependenceWithinDPAS(&nextNode))
5766 {
5767 sameSrcDst = true;
5768 }
5769
5770 curInst->setOptionOn(InstOpt_Atomic);
5771 dpas_count++;
5772
5773 curInst = nextInst;
5774 iInst = iInstNext;
5775 iInstNext++;
5776 if (iInstNext == iInstEnd)
5777 {
5778 if (nextNode.getNodeID() != -1)
5779 {
5780 footprintMerge(node, &nextNode);
5781 }
5782 node->addInstruction(curInst);
5783 nextInst = nullptr;
5784 break;
5785 }
5786 nextInst = *iInstNext;
5787 }
5788 curInst = node->GetInstruction();
5789 }
5790 }
5791 if (node->getLastInstruction()->isDpas())
5792 {
5793 node->setDPASID(DPASID);
5794 DPASID += node->getDPASSize();
5795 }
5796
5797 //Get buckets for all GRF registers which are used in curInst
5798 std::vector<SBBucketDesc> BDvec;
5799 std::vector<SBBucketDesc> liveBDvec;
5800 BDvec.clear();
5801 liveBDvec.clear();
5802
5803 getGRFBucketDescs(node, BDvec, false);
5804 if (node->instVec.size() > 1)
5805 {
5806 getGRFBucketDescs(node, liveBDvec, false);
5807 }
5808
5809 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5810 {
5811 node->ALUPipe = curInst->getInstructionPipeXe();
5812 }
5813
5814 // For ALU instructions without GRF usage
5815 if (distanceHonourInstruction(curInst))
5816 {
5817 ALUID++;
5818
5819 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5820 {
5821 switch (node->ALUPipe)
5822 {
5823 case PIPE_INT:
5824 node->setIntegerID(integerID);
5825 pushItemToQueue(latestInstID[PIPE_INT], node->getNodeID());
5826 integerID++;
5827 break;
5828 case PIPE_FLOAT:
5829 node->setFloatID(floatID);
5830 pushItemToQueue(latestInstID[PIPE_FLOAT], node->getNodeID());
5831 floatID++;
5832 break;
5833 case PIPE_LONG:
5834 node->setLongID(longID);
5835 pushItemToQueue(latestInstID[PIPE_LONG], node->getNodeID());
5836 longID++;
5837 break;
5838 case PIPE_MATH:
5839 node->setMathID(mathID);
5840 pushItemToQueue(latestInstID[PIPE_MATH], node->getNodeID());
5841 mathID++;
5842 break;
5843 default:
5844 ASSERT_USER(curInst->hasNoPipe(), "Unexpected instruction found in distance ");
5845 }
5846 }
5847
5848 if (!BDvec.size())
5849 {
5850 if (ALUID >= SWSB_MAX_ALU_DEPENDENCE_DISTANCE && ALUID != node->getALUID())
5851 {
5852 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
5853 {
5854 clearKilledBucketNodeXeHP(LB, integerID, floatID, longID, mathID);
5855 }
5856 else
5857 {
5858 clearKilledBucketNodeXeLP(LB, ALUID);
5859 }
5860 }
5861 continue;
5862 }
5863 }
5864
5865 // Considering instruction level liveness kill, i.e. killing the live instructions/operands,
5866 // the dependence checking order must be RAR/RAW --> WAR/WAW, so the bucket descriptors in BDvec must be in src->dst order.
5867 // If WAW is done first, RAW may be missed:
5868 //    If both live and current instructions are in-order instructions, no WAW dependence is required, but RAW is required.
5869 //    If both live and current instructions are out-of-order instructions, WAW and RAW have the same effect.
5870 //    If live is in-order and current is out-of-order, WAW and RAW have the same effect.
5871 //    If live is out-of-order and current is in-order, WAW and RAW have the same effect.
5872 // If RAR is done before WAR, WAR will not be missed:
5873 //    If both live and current instructions are in-order instructions, neither RAR nor WAR is required.
5874 //    If both live and current instructions are out-of-order instructions:
5875 //      same pipeline, neither RAR nor WAR is required;
5876 //      different pipelines, both reads are kept for RAR and a WAR dependence is required, so RAR will not cause a WAR miss.
5877 //    If live is in-order and current is out-of-order, WAW and RAW have the same effect.
5878 //    If live is out-of-order and current is in-order, WAW and RAW have the same effect.
5879 //      Both reads will be kept, so RAR will not cause a WAR miss.
5880 // For WAW and RAW, once an explicit dependence is required, kill the liveness of the instruction.
5881 // For WAR, once an explicit dependence is required, kill the source operands.
5882 // Otherwise, only kill the operand.
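// Hypothetical example of the ordering requirement: if a live in-order instruction in the same
// pipeline writes r10 and the current in-order instruction both reads and writes r10, scanning
// the dst bucket first would kill the live node through the WAW rule (implicit, no distance set),
// and the RAW distance required for the read of r10 would never be recorded.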
5883 bool instKill = false;
5884
5885 // For all bucket descriptors of curInst
5886 for (const SBBucketDesc& BD : BDvec) {
5887 const int& curBucket = BD.bucket;
5888 const Gen4_Operand_Number& curOpnd = BD.opndNum;
5889 const SBFootprint* curFootprint = BD.footprint;
5890
5891 // Check liveness for each live curBucket node.
5892 // Add explicit dependence if liveness is killed and there is no implicit dependence
5893 for (LiveGRFBuckets::BN_iterator bn_it = LB->begin(curBucket);
5894 bn_it != LB->end(curBucket);)
5895 {
5896 SBBucketNode* liveBN = (*bn_it);
5897 SBNode* liveNode = liveBN->node;
5898
5899 if (liveNode->isInstKilled() ||
5900 (liveNode->isSourceKilled() &&
5901 liveBN->opndNum >= Opnd_src0 &&
5902 liveBN->opndNum <= Opnd_src3))
5903 {
5904 ++bn_it;
5905 continue;
5906 }
5907
5908 unsigned short internalOffset = 0;
5909 Gen4_Operand_Number liveOpnd = liveBN->opndNum;
5910 const SBFootprint* liveFootprint = liveBN->footprint;
5911 G4_INST* liveInst = liveFootprint->inst;
5912
5913 bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);
5914 bool hasRMWOverlap = false;
5915 if (builder.hasFourALUPipes() && distanceHonourInstruction(liveInst) &&
5916 distanceHonourInstruction(curInst))
5917 {
5918 hasOverlap = curFootprint->hasOverlap(liveFootprint, hasRMWOverlap, internalOffset);
5919 }
5920
5921 //RAW: R kill W R-->live explicit dependence
5922 //WAW: same pipeline and inorder W2 kill W1 W2-->live implicit dependence
5923 //WAW: different pipelines or OOO W2 kill W1 W2-->live explicit dependence
5924 //WAR: different pipelines W kill R W-->live explicit dependence
5925 //WAR: same pipeline W kill R W-->live implicit dependence
5926 //RAR: same pipeline R2 kill R1 R2-->live no dependence
5927 //RAR: different pipelines no kill R1,R2-->live no dependence
5928 //Find DEP type
5929 DepType dep = getDepForOpnd(liveOpnd, curOpnd);
5930
5931 //W/A for the issue caused by read suppression:
5932 //1)(~f0.0.anyv) math.cos(2 | M0) r23.7<2>:hf r11.7<4; 2, 2> : hf{ $14 }
5933 //2) mul(8 | M0) acc0.0<1>:ud r35.3<8; 8, 0> : ud r23.0<8; 4, 0> : uw //With execution mask, only r23.0~r23.3 are read
5934 //3) mach(8 | M0) r52.0<1>:ud r35.3<8; 8, 0> : ud r23.0<4; 4, 0> : ud{ $14.dst }
5935 //FIXME: for performance, we need to check the 3rd instruction as well
5936
5937 if (!hasOverlap &&
5938 !builder.hasFixedCycleMathPipe() &&
5939 dep == RAW &&
5940 liveInst->isMath() && !curInst->isMath() &&
5941 builder.hasRSForSpecificPlatform() &&
5942 (!hasSamePredicator(liveInst, curInst) || builder.hasMathRSIsuue()))
5943 {
5944 hasOverlap = curFootprint->hasGRFGrainOverlap(liveFootprint);
5945 }
5946
5947 if (!hasOverlap)
5948 {
5949 ++bn_it;
5950 continue;
5951 }
5952
5953 if (tokenHonourInstruction(liveInst))
5954 {
5955 if (dep == RAW || dep == WAW) {
5956 if (builder.getOption(vISA_EnableDPASTokenReduction) &&
5957 node->getLastInstruction()->isDpas() &&
5958 liveNode->getLastInstruction()->isDpas() &&
5959 curFootprint->isWholeOverlap(liveFootprint))
5960 {
5961 if ((node->getDPASID() + curFootprint->offset - (liveNode->getDPASID() + internalOffset) < tokenAfterDPASCycle))
5962 {
5963 LB->killOperand(bn_it);
5964 createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
5965 liveNode->setInstKilled(true); //Instruction level kill
5966 instKill = true;
5967 continue;
5968 }
5969 else if (dep == WAW) //For RAW, we cannot kill the live node even when the distance is large enough
5970 {
5971 LB->killOperand(bn_it);
5972 continue;
5973 }
5974 }
5975 else
5976 {
5977 LB->killOperand(bn_it);
5978 createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
5979 liveNode->setInstKilled(true); //Instruction level kill
5980 instKill = true;
5981 continue;
5982 }
5983 }
5984
5985 if (dep == WAR) {
5986 bool killed = false;
5987
5988 //Killed if region overlap
5989 if (curFootprint->isWholeOverlap(liveFootprint))
5990 {
5991 LB->killOperand(bn_it);
5992 liveNode->setAR();
5993 if (WARDepRequired(liveInst, curInst))
5994 {
5995 liveNode->setSourceKilled(true);
5996 }
5997 killed = true;
5998 }
5999
6000 //Different pipeline/functionID: add an edge.
6001 //Even without a whole-region overlap, the live node is still killed.
6002 if (WARDepRequired(liveInst, curInst))
6003 {
6004 if (!killed)
6005 {
6006 LB->killOperand(bn_it);
6007 liveNode->setAR();
6008 liveNode->setSourceKilled(true);
6009 killed = true;
6010 }
6011
6012 if (builder.getOption(vISA_EnableDPASTokenReduction) &&
6013 node->getLastInstruction()->isDpas() &&
6014 liveNode->getLastInstruction()->isDpas() &&
6015 curFootprint->isWholeOverlap(liveFootprint))
6016 {
6017 //
6018 // dpasw.8x7(8 | M0) r84 : f r84 : f r52 : bf r14.0 : bf{ Atomic }
6019 // dpasw.8x7(8 | M0) r92 : f r92 : f r52 : bf r22.0 : bf{ Atomic }
6020 // dpasw.8x7(8 | M0) r100 : f r100 : f r52 : bf r30.0 : bf{ Atomic }
6021 // dpasw.8x7(8 | M0) r108 : f r108 : f r52 : bf r38.0 : bf{ Atomic }
6022 // dpasw.8x7(8 | M0) r116 : f r116 : f r52 : bf r46.0 : bf{ $5 }
6023 // sync.nop null{ Compacted, $5.src }
6024 // (W)send.dc0(16 | M0) r52 r6 null 0x0 0x28805FE {$0}
6025 //
6026 // Although there is a WAR dependence on r52, the sync.nop is not required because of read suppression.
6027 // The DPAS in-order GRF read cycles can cover the GRF read of r52 to r58.
6028
6029 if (liveOpnd == Opnd_src1)
6030 {
6031 if (node->getDPASID() + curFootprint->offset - liveNode->getDPASID() <= TOKEN_AFTER_READ_DPAS_CYCLE)
6032 {
6033 createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
6034 } //else do nothing, the previous whole-region check killed the bucket node already.
6035 }
6036 else //src0, src2
6037 {
6038 if (node->getDPASID() + curFootprint->offset - (liveNode->getDPASID() + internalOffset) <= TOKEN_AFTER_READ_DPAS_CYCLE)
6039 {
6040 createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
6041 } //else do nothing
6042 }
6043 }
6044 else
6045 {
6046 createAddGRFEdge(liveNode, node, dep, DEP_EXPLICT);
6047 }
6048 } //else, same pipeline, there is no need to set the dependence.
6049
6050 if (killed)
6051 {
6052 continue;
6053 }
6054 }
6055
6056 if (dep == NODEP &&
6057 hasSameFunctionID(liveInst, curInst) &&
6058 hasSamePredicator(liveInst, curInst) &&
6059 hasSameExecMask(liveInst, curInst))
6060 {
6061 if (curFootprint->isWholeOverlap(liveFootprint))
6062 {
6063 LB->killOperand(bn_it);
6064 continue;
6065 }
6066 }
6067 assert(dep != DEPTYPE_MAX && "dep unassigned?");
6068 }
6069
6070 if (distanceHonourInstruction(liveInst))
6071 {
6072 if (dep == RAW &&
6073 (curBucket < (totalGRFNum + (int)builder.getNumScalarRegisters())))
6074 {//Only GRF RAW dependences need to be tracked
6075 LB->killOperand(bn_it);
6076 setDistance(curFootprint, node, liveNode, false);
6077 liveNode->setInstKilled(true); //Instruction level kill
6078 instKill = true;
6079 continue;
6080 }
6081
6082 if (dep == WAW) {
6083 bool killed = false;
6084 //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
6085 if (curFootprint->isWholeOverlap(liveFootprint))
6086 {
6087 LB->killOperand(bn_it);
6088 killed = true;
6089 }
6090
6091 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6092 {
6093 if (!distanceHonourInstruction(curInst) ||
6094 node->ALUPipe != liveNode->ALUPipe ||
6095 (node->ALUPipe == liveNode->ALUPipe && hasRMWOverlap))
6096 {
6097 if (!killed)
6098 {
6099 LB->killOperand(bn_it);
6100 killed = true;
6101 }
6102
6103 setDistance(curFootprint, node, liveNode, true);
6104 liveNode->setInstKilled(true); //Instruction level kill
6105 instKill = true;
6106 }
6107 }
6108 else if (!curInst->distanceHonourInstruction()
6109 || (liveInst->isLongPipeInstructionXe() && !curInst->isLongPipeInstructionXe())
6110 )
6111 {
6112 if (!killed)
6113 {
6114 LB->killOperand(bn_it);
6115 killed = true;
6116 }
6117 setDistance(curFootprint, node, liveNode, true);
6118 liveNode->setInstKilled(true); //Instruction level kill
6119 instKill = true;
6120 }
6121
6122 if (killed)
6123 {
6124 continue;
6125 }
6126 }
6127
6128 if (dep == WAR) {
6129 bool killed = false;
6130 //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
6131 if (curFootprint->isWholeOverlap(liveFootprint))
6132 {
6133 LB->killOperand(bn_it);
6134 killed = true;
6135 }
6136
6137 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6138 {
6139 if (!curInst->distanceHonourInstruction() || node->ALUPipe != liveNode->ALUPipe)
6140 {
6141 if (!killed)
6142 {
6143 LB->killOperand(bn_it);
6144 killed = true;
6145 }
6146 setDistance(curFootprint, node, liveNode, true);
6147 liveNode->setInstKilled(true); //Instruction level kill
6148 }
6149 }
6150 else if (!hasSameFunctionID(liveInst, curInst))
6151 {
6152 if (!killed)
6153 {
6154 LB->killOperand(bn_it);
6155 killed = true;
6156 }
6157 setDistance(curFootprint, node, liveNode, true);
6158 liveNode->setSourceKilled(true);
6159 }
6160
6161 if (killed)
6162 {
6163 continue;
6164 }
6165 }
6166
6167 if (dep == NODEP && hasSameFunctionID(liveInst, curInst))
6168 {
6169 if (curFootprint->isWholeOverlap(liveFootprint))
6170 {
6171 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6172 {
6173 if (node->ALUPipe == liveNode->ALUPipe)
6174 {
6175 LB->killOperand(bn_it);
6176 continue;
6177 }
6178 }
6179 else
6180 {
6181 LB->killOperand(bn_it);
6182 continue;
6183 }
6184 }
6185 }
6186 assert(dep != DEPTYPE_MAX && "dep unassigned?");
6187 }
6188
6189 ++bn_it;
6190 }
6191 }
6192
6193 if (node->distDep.size())
6194 {
6195 if (builder.hasFiveALUPipes())
6196 {
6197 node->finalizeDistanceType2(builder, latestInstID);
6198 }
6199 else if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6200 {
6201 node->finalizeDistanceType1(builder, latestInstID);
6202 }
6203 }
6204
6205 if ((builder.getOption(vISA_EnableSwitch) && node->GetInstruction()->isYieldInst()) ||
6206 (node->GetInstruction()->isCall() || node->GetInstruction()->isFCall()) ||
6207 (VISA_WA_CHECK(builder.getPWaTable(), Wa_14013672992) && node->GetInstruction()->isEOT()))
6208 {
6209 node->setDistance(1);
6210 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6211 {
6212 node->instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
6213 }
6214 }
6215
6216 //Simplify the LB according to the ALU distance, or when an instruction has been killed
6217 if (instKill ||
6218 (ALUID >= SWSB_MAX_ALU_DEPENDENCE_DISTANCE && ALUID != node->getALUID()))
6219 {
6220 if (builder.hasThreeALUPipes() || builder.hasFourALUPipes())
6221 {
6222 clearKilledBucketNodeXeHP(LB, integerID, floatID, longID, mathID);
6223 }
6224 else
6225 {
6226 clearKilledBucketNodeXeLP(LB, ALUID);
6227 }
6228 }
6229
6230 if (builder.hasSLMWARIssue() && curInst->isSend() &&
6231 (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
6232 {
6233 clearSLMWARWAissue(node, LB);
6234 }
6235
6236 // Add buckets of current instruction to bucket list
6237 if (node->instVec.size() > 1)
6238 {
6239 std::map<const SBFootprint*, std::vector<SBBucketNode*>> bucketNodes;
6240 for (const SBBucketDesc& BD : liveBDvec)
6241 {
6242 auto iter = std::find_if(bucketNodes[BD.footprint].begin(), bucketNodes[BD.footprint].end(),
6243 [&BD](SBBucketNode* node) {return BD.opndNum == node->opndNum; });
6244 if (iter != bucketNodes[BD.footprint].end())
6245 {
6246 LB->add((*iter), BD.bucket);
6247 }
6248 else
6249 {
6250 void* allocedMem = mem.alloc(sizeof(SBBucketNode));
6251 SBBucketNode* newNode = new (allocedMem)SBBucketNode(node, BD.opndNum, BD.footprint);
6252 bucketNodes[BD.footprint].push_back(newNode);
6253 LB->add(newNode, BD.bucket);
6254 }
6255 }
6256 }
6257 else
6258 {
6259 std::vector<SBBucketNode*> bucketNodes(Opnd_total_num, nullptr); //The coarse grained footprint of operands
6260 for (const SBBucketDesc& BD : BDvec)
6261 {
6262 if (bucketNodes[BD.opndNum] == nullptr)
6263 {
6264 void* allocedMem = mem.alloc(sizeof(SBBucketNode));
6265 SBBucketNode* newNode = new (allocedMem)SBBucketNode(node, BD.opndNum, BD.footprint);
6266 bucketNodes[BD.opndNum] = newNode;
6267 }
6268
6269 LB->add(bucketNodes[BD.opndNum], BD.bucket);
6270 }
6271 }
6272
6273 // Record token sensitive nodes.
6274 if (tokenHonourInstruction(curInst))
6275 {
6276 if (first_send_node == -1)
6277 {
6278 first_send_node = SBSendNodes->size();
6279 }
6280 last_send_node = SBSendNodes->size();
6281 node->setSendID(int(SBSendNodes->size()));
6282 // The dep delay of the node should be constant, so we can
6283 // calculate and save it for future uses.
6284 node->setDepDelay(swsb.calcDepDelayForNode(node));
6285 SBSendNodes->push_back(node);
6286 }
6287 }
6288
6289 //Check the live out token nodes after the scan of current BB.
6290 //Record the nodes and the buckets for global analysis.
6291 for (int curBucket = 0; curBucket < LB->getNumOfBuckets(); curBucket++)
6292 {
6293 for (auto it = LB->begin(curBucket); it != LB->end(curBucket);)
6294 {
6295 SBBucketNode* liveBN = (*it);
6296 SBNode* node = liveBN->node;
6297
6298 //Only the live outs from current BB
6299 if (tokenHonourInstruction(node->GetInstruction()) &&
6300 (int)node->getNodeID() >= first_node &&
6301 (int)node->getNodeID() <= last_node)
6302 {
6303 if (liveBN->getSendID() == -1)
6304 {
6305 if (send_start == -1)
6306 {
6307 send_start = (int)globalSendOpndList->size();
6308 }
6309
6310 //Record all send operands that live out of the current BB.
6311 globalSendOpndList->push_back(liveBN);
6312 send_end = (int)globalSendOpndList->size() - 1;
6313
6314 //Record the position of the node in global send operands list.
6315 liveBN->setSendID(send_end);
6316 }
6317
6318 //Set global send instruction ID
6319 if (liveBN->node->globalID == -1)
6320 {
6321 liveBN->node->globalID = globalSendNum;
6322 globalSendNum++;
6323 }
6324
6325 //Record all buckets of the send operand
6326 globalSendsLB->add(liveBN, curBucket);
6327 LB->killSingleOperand(it);
6328 continue;
6329 }
6330 ++it;
6331 }
6332 }
6333
6334 //Return the node ID and ALU ID for the following BB
6335 indexes->ALUIndex = ALUID;
6336 indexes->instIndex = nodeID;
6337 indexes->integerIndex = integerID;
6338 indexes->floatIndex = floatID;
6339 indexes->longIndex = longID;
6340 indexes->DPASIndex = DPASID;
6341 indexes->mathIndex = mathID;
6342 last_DPASID = DPASID;
6343
6344 for (int i = 0; i < PIPE_DPAS; i++)
6345 {
6346 indexes->latestDepALUID[i] = latestDepALUID[i];
6347 }
6348
6349 #ifdef DEBUG_VERBOSE_ON
6350 std::cerr << "\nLIVE OUT: \n";
6351 LB->dumpLives();
6352 #endif
6353
6354 return;
6355 }
6356
6357 //#ifdef DEBUG_VERBOSE_ON
6358
6359 void G4_BB_SB::dumpLiveInfo(const SBBUCKET_VECTOR* globalSendOpndList, unsigned globalSendNum, const SBBitSets* send_kill) const
6360 {
6361 std::cerr << "\nBB" << bb->getId() << ":" << first_node << "-" << last_node << ", succ<";
6362 for (const G4_BB* succ : bb->Succs)
6363 {
6364 std::cerr << succ->getId() << ",";
6365 }
6366 std::cerr << "> pred<";
6367 for (const G4_BB* pred : bb->Preds)
6368 {
6369 std::cerr << pred->getId() << ",";
6370 }
6371
6372 std::cerr << "> JIPSucc <";
6373 for (const G4_BB_SB* succ : Succs)
6374 {
6375 std::cerr << succ->getBB()->getId() << ",";
6376 }
6377 std::cerr << "> JIPPred <";
6378 for (const G4_BB_SB* pred : Preds)
6379 {
6380 std::cerr << pred->getBB()->getId() << ",";
6381 }
6382 std::cerr << ">";
6383 if (bb->getBBType() & G4_BB_CALL_TYPE)
6384 {
6385 std::cerr << ":CALL";
6386 }
6387 if (bb->getBBType() & G4_BB_INIT_TYPE)
6388 {
6389 std::cerr << ":INIT";
6390 }
6391 if (bb->getBBType() & G4_BB_EXIT_TYPE)
6392 {
6393 std::cerr << ":EXIT";
6394 }
6395 if (bb->getBBType() & G4_BB_RETURN_TYPE)
6396 {
6397 std::cerr << ":RETURN";
6398 }
6399 std::cerr << std::endl;
6400
6401 for (size_t i = 0; i < globalSendOpndList->size(); i++)
6402 {
6403 const SBBucketNode* sNode = (*globalSendOpndList)[i];
6404 std::cerr << i << ": ";
6405 sNode->dump();
6406 }
6407 std::cerr << std::endl;
6408
6409 std::cerr << "Live In: ";
6410 std::cerr << std::endl;
6411 if (send_live_in.getSize() != 0)
6412 {
6413 std::cerr << "\tdst: ";
6414 for (const SBBucketNode* sNode : *globalSendOpndList)
6415 {
6416 if (sNode->opndNum == Opnd_dst &&
6417 send_live_in.isDstSet(sNode->node->globalID))
6418 {
6419 sNode->dump();
6420 }
6421 }
6422 std::cerr << std::endl;
6423
6424 std::cerr << "\tsrc: ";
6425 for (const SBBucketNode* sNode : *globalSendOpndList)
6426 {
6427 if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6428 send_live_in.isSrcSet(sNode->node->globalID))
6429 {
6430 sNode->dump();
6431 }
6432 }
6433 std::cerr << std::endl;
6434 }
6435 std::cerr << std::endl;
6436
6437 std::cerr << "May Kill: ";
6438 std::cerr << std::endl;
6439 if (send_may_kill.getSize() != 0)
6440 {
6441 std::cerr << "\tdst: ";
6442 for (const SBBucketNode* sNode : *globalSendOpndList)
6443 {
6444 if (sNode->opndNum == Opnd_dst &&
6445 send_may_kill.isDstSet(sNode->node->globalID))
6446 {
6447 sNode->dump();
6448 }
6449 }
6450 std::cerr << std::endl;
6451 std::cerr << "\tsrc: ";
6452 for (const SBBucketNode* sNode : *globalSendOpndList)
6453 {
6454 if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6455 send_may_kill.isSrcSet(sNode->node->globalID))
6456 {
6457 sNode->dump();
6458 }
6459 }
6460 std::cerr << std::endl;
6461 }
6462 std::cerr << std::endl;
6463
6464 std::cerr << "WAW May Kill: ";
6465 std::cerr << std::endl;
6466 if (send_WAW_may_kill.getSize() != 0)
6467 {
6468 std::cerr << "\tdst: ";
6469 for (const SBBucketNode* sNode : *globalSendOpndList)
6470 {
6471 if (sNode->opndNum == Opnd_dst &&
6472 send_WAW_may_kill.isSet(sNode->node->globalID))
6473 {
6474 sNode->dump();
6475 }
6476 }
6477 std::cerr << std::endl;
6478 }
6479 std::cerr << std::endl;
6480
6481 std::cerr << "Killed: ";
6482 std::cerr << std::endl;
6483 if (send_kill != nullptr)
6484 {
6485 std::cerr << "\tdst: ";
6486 for (const SBBucketNode* sNode : *globalSendOpndList)
6487 {
6488 if (sNode->opndNum == Opnd_dst &&
6489 send_kill->isDstSet(sNode->node->globalID))
6490 {
6491 sNode->dump();
6492 }
6493 }
6494 std::cerr << std::endl;
6495 std::cerr << "\tsrc: ";
6496 for (const SBBucketNode* sNode : *globalSendOpndList)
6497 {
6498 if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6499 send_kill->isSrcSet(sNode->node->globalID))
6500 {
6501 sNode->dump();
6502 }
6503 }
6504 std::cerr << std::endl;
6505 }
6506 std::cerr << std::endl;
6507
6508 std::cerr << "Scalar Killed: ";
6509 std::cerr << std::endl;
6510 if (send_live_out.getSize() != 0)
6511 {
6512 std::cerr << "\tdst: ";
6513 for (const SBBucketNode* sNode : *globalSendOpndList)
6514 {
6515 if (sNode->opndNum == Opnd_dst &&
6516 send_kill_scalar.isDstSet(sNode->node->globalID))
6517 {
6518 sNode->dump();
6519 }
6520 }
6521 std::cerr << std::endl;
6522 std::cerr << "\tsrc: ";
6523 for (const SBBucketNode* sNode : *globalSendOpndList)
6524 {
6525 if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6526 send_kill_scalar.isSrcSet(sNode->node->globalID))
6527 {
6528 sNode->dump();
6529 }
6530 }
6531 std::cerr << std::endl;
6532 }
6533 std::cerr << std::endl;
6534
6535 std::cerr << "Live Out: ";
6536 std::cerr << std::endl;
6537 if (send_live_out.getSize() != 0)
6538 {
6539 std::cerr << "\tdst: ";
6540 for (const SBBucketNode* sNode : *globalSendOpndList)
6541 {
6542 if (sNode->opndNum == Opnd_dst &&
6543 send_live_out.isDstSet(sNode->node->globalID))
6544 {
6545 sNode->dump();
6546 }
6547 }
6548 std::cerr << std::endl;
6549 std::cerr << "\tsrc: ";
6550 for (const SBBucketNode* sNode : *globalSendOpndList)
6551 {
6552 if (sNode->opndNum >= Opnd_src0 && sNode->opndNum <= Opnd_src3 &&
6553 send_live_out.isSrcSet(sNode->node->globalID))
6554 {
6555 sNode->dump();
6556 }
6557 }
6558 std::cerr << std::endl;
6559 }
6560 std::cerr << std::endl;
6561
6562 }
6563 //#endif
6564
6565 void SWSB::dumpTokenLiveInfo()
6566 {
6567 for (size_t i = 0; i < BBVector.size(); i++)
6568 {
6569 G4_BB* bb = BBVector[i]->getBB();
6570
6571 std::cerr << "\nBB" << bb->getId() << ":" << BBVector[i]->first_node << "-" << BBVector[i]->last_node << ", succ<";
6572 for (std::list<G4_BB*>::iterator sit = bb->Succs.begin(); sit != bb->Succs.end(); ++sit)
6573 {
6574 std::cerr << (*sit)->getId() << ",";
6575 }
6576 std::cerr << "> pred<";
6577 for (std::list<G4_BB*>::iterator pit = bb->Preds.begin(); pit != bb->Preds.end(); ++pit)
6578 {
6579 std::cerr << (*pit)->getId() << ",";
6580 }
6581
6582 std::cerr << "> JIPSucc <";
6583 for (std::list<G4_BB_SB*>::iterator pit = BBVector[i]->Succs.begin(); pit != BBVector[i]->Succs.end(); ++pit)
6584 {
6585 std::cerr << (*pit)->getBB()->getId() << ",";
6586 }
6587 std::cerr << "> JIPPred <";
6588 for (std::list<G4_BB_SB*>::iterator pit = BBVector[i]->Preds.begin(); pit != BBVector[i]->Preds.end(); ++pit)
6589 {
6590 std::cerr << (*pit)->getBB()->getId() << ",";
6591 }
6592 std::cerr << ">";
6593 if (bb->getBBType() & G4_BB_CALL_TYPE)
6594 {
6595 std::cerr << ":CALL";
6596 }
6597 if (bb->getBBType() & G4_BB_INIT_TYPE)
6598 {
6599 std::cerr << ":INIT";
6600 }
6601 if (bb->getBBType() & G4_BB_EXIT_TYPE)
6602 {
6603 std::cerr << ":EXIT";
6604 }
6605 if (bb->getBBType() & G4_BB_RETURN_TYPE)
6606 {
6607 std::cerr << ":RETURN";
6608 }
6609 std::cerr << std::endl;
6610
6611 if (fg.builder->getOptions()->getOption(vISA_GlobalTokenAllocation) ||
6612 fg.builder->getOptions()->getOption(vISA_DistPropTokenAllocation))
6613 {
6614 std::cerr << "Doms: ";
6615
6616 for (size_t k = 0; k < BBVector.size(); k++)
6617 {
6618 if (k != i &&
6619 BBVector[i]->dominators.isSet(k))
6620 {
6621 std::cerr << "#BB" << k << ", ";
6622 }
6623 }
6624 std::cerr << std::endl;
6625 }
6626
6627 std::cerr << "Live Out: ";
6628 std::cerr << std::endl;
6629 if (BBVector[i]->liveOutTokenNodes.getSize() != 0)
6630 {
6631 for (SBNODE_VECT_ITER node_it = SBSendNodes.begin();
6632 node_it != SBSendNodes.end();
6633 node_it++)
6634 {
6635 SBNode* node = (*node_it);
6636 if (BBVector[i]->liveOutTokenNodes.isSet(node->sendID))
6637 {
6638 std::cerr << " #" << node->getNodeID() << ":" << node->sendID << ":" << node->GetInstruction()->getSetToken();
6639 }
6640 }
6641 std::cerr << std::endl;
6642 }
6643
6644 std::cerr << "Killed Tokens: ";
6645 std::cerr << std::endl;
6646 if (BBVector[i]->killedTokens.getSize() != 0)
6647 {
6648 uint32_t totalTokenNum = kernel.getNumSWSBTokens();
6649 for (uint32_t k = 0; k < totalTokenNum; k++)
6650 {
6651 if (BBVector[i]->killedTokens.isSet(k))
6652 {
6653 std::cerr << " #" << k << ", ";
6654 }
6655 }
6656 }
6657 std::cerr << std::endl;
6658
6659 }
6660
6661 return;
6662 }
6663
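// Map each GRF footprint of a live-out send operand to its GRF buckets and add it to
// send_use_kills. With a GRF of numEltPerGRF<Type_UB>() bytes, a footprint covering bytes
// [LeftB, RightB] lands in buckets LeftB / GRFbytes through RightB / GRFbytes.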
6664 void G4_BB_SB::getLiveBucketsFromFootprint(const SBFootprint* firstFootprint, SBBucketNode* sBucketNode, LiveGRFBuckets* send_use_kills) const
6665 {
6666 const SBFootprint* footprint = firstFootprint;
6667
6668 while (footprint)
6669 {
6670 int startBucket = footprint->LeftB / numEltPerGRF<Type_UB>();
6671 int endBucket = footprint->RightB / numEltPerGRF<Type_UB>();
6672
6673 //We only track the global dependence for GRF
6674 if (footprint->fType != GRF_T)
6675 {
6676 footprint = footprint->next;
6677 continue;
6678 }
6679
6680 for (int j = startBucket; j < endBucket + 1; j++)
6681 {
6682 send_use_kills->add(sBucketNode, j);
6683 }
6684 footprint = footprint->next;
6685 }
6686
6687 return;
6688 }
6689
6690 /*
6691 * Note that the fall through dependencies are captured in the SBDDD linear scan already
6692 */
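// addGlobalDependence re-scans every BB against the send operands that are live into the BB and
// may be killed by it (send_kill = send_live_in & send_may_kill), and adds the explicit cross-BB
// edges that the local scan in SBDDD cannot see.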
6693 void SWSB::addGlobalDependence(unsigned globalSendNum, SBBUCKET_VECTOR* globalSendOpndList, SBNODE_VECT* SBNodes, PointsToAnalysis& p, bool afterWrite)
6694 {
6695 for (size_t i = 0; i < BBVector.size(); i++)
6696 {
6697 //Get global send operands killed by current BB
6698 SBBitSets send_kill(globalSendNum);
6699 send_kill |= BBVector[i]->send_live_in;
6700 send_kill &= BBVector[i]->send_may_kill;
6701
6702 #ifdef DEBUG_VERBOSE_ON
6703 BBVector[i]->dumpLiveInfo(globalSendOpndList, globalSendNum, &send_kill);
6704 #endif
6705 //Convert the global send operands into live buckets for the liveness scan.
6706 //Instruction level liveness kill:
6707 //    For token dependence, there are only implicit RAR and WAR dependencies,
6708 //    so the order in which the operands are scanned is not an issue anymore,
6709 //    i.e. explicit RAW and WAW can cover all other dependences.
6710 LiveGRFBuckets send_use_kills(mem, kernel.getNumRegTotal(), BBVector[i]->getBB()->getKernel());
6711 for (SBBucketNode* sBucketNode : *globalSendOpndList)
6712 {
6713 SBNode* sNode = sBucketNode->node;
6714 if (send_kill.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
6715 sBucketNode->opndNum == Opnd_src1 ||
6716 sBucketNode->opndNum == Opnd_src2 ||
6717 sBucketNode->opndNum == Opnd_src3))
6718 {
6719 BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
6720 }
6721 if (send_kill.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
6722 {
6723 BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
6724 }
6725 sNode->setInstKilled(false);
6726 sNode->setSourceKilled(false);
6727 }
6728
6729 if (BBVector[i]->first_node == -1)
6730 {
6731 continue;
6732 }
6733
6734 //Scan BB again to figure out the dependence caused by global send operands
6735 std::vector<SBBucketDesc> BDvec;
6736 for (int j = BBVector[i]->first_node; j <= BBVector[i]->last_node; j++)
6737 {
6738 SBNode* node = (*SBNodes)[j];
6739 G4_INST* curInst = node->getLastInstruction();
6740
6741 BDvec.clear();
6742 BBVector[i]->getGRFBucketDescs(node, BDvec, true);
6743 if (!BDvec.size())
6744 {
6745 continue;
6746 }
6747
6748 bool instKill = false;
6749 // For all bucket descriptors of curInst
6750 for (const SBBucketDesc& BD : BDvec)
6751 {
6752 const int& curBucket = BD.bucket;
6753 const Gen4_Operand_Number& curOpnd = BD.opndNum;
6754 const SBFootprint* curFootprint = BD.footprint;
6755
6756 for (LiveGRFBuckets::BN_iterator bn_it = send_use_kills.begin(curBucket);
6757 bn_it != send_use_kills.end(curBucket);)
6758 {
6759 SBBucketNode* liveBN = (*bn_it);
6760 SBNode* curLiveNode = liveBN->node;
6761 Gen4_Operand_Number liveOpnd = liveBN->opndNum;
6762 const SBFootprint* liveFootprint = liveBN->footprint;
6763 G4_INST* liveInst = liveFootprint->inst;
6764 unsigned short internalOffset = 0;
6765 bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);
6766
6767 //Find DEP type
6768 DepType dep = getDepForOpnd(liveOpnd, curOpnd);
6769
6770 //RAW: R kill W R-->live explicit dependence
6771 //WAW: W2 kill W1 W2-->live explicit dependence
6772 //WAW: same pipeline/inorder W2 kill W1 W2-->live implicit dependence
6773 //WAR: different pipelines W kill R W-->live explicit dependence
6774 //WAR: same pipeline W kill R W-->live implicit dependence
6775 //RAR: same pipeline R2 kill R1 R2-->live implicit dependence
6776 //RAR: different pipelines no kill R1,R2-->live no dependence
6777 if (hasOverlap)
6778 {
6779 assert(tokenHonourInstruction(liveInst));
6780 if (dep == RAW || dep == WAW)
6781 {
6782 if (BBVector[i]->isGRFEdgeAdded(curLiveNode, node, dep, DEP_EXPLICT))
6783 {
6784 send_use_kills.killOperand(bn_it);
6785 curLiveNode->setInstKilled(true); //Instruction level kill
6786 instKill = true;
6787 continue;
6788 }
6789 //WAW needs to be tracked in both scalar and SIMD control flow.
6790 //The reason is that:
6791 // 1. RA tracks liveness in a use-->define way.
6792 // 2. SWSB tracks it in a define-->use way.
6793 // For the case like following
6794 //
6795 // if
6796 // v1 <--  //v1 is never used
6797 // if
6798 // <--v1
6799 // endif
6800 // endif
6801 // v2 <--
6802 //RA may assign the same register to v1 and v2.
6803 //The scalar CFG cannot capture the dependence v1-->v2 when they are assigned the same register.
6804 if (afterWrite || dep == WAW) //There is no RAW kill for SIMDCF
6805 {
6806 if (fg.builder->getOption(vISA_EnableDPASTokenReduction) &&
6807 node->getLastInstruction()->isDpas() &&
6808 curLiveNode->getLastInstruction()->isDpas() &&
6809 curFootprint->isWholeOverlap(liveFootprint))
6810 {
6811 if (node->getDPASID() > curLiveNode->getDPASID())
6812 {
6813 if ((node->getDPASID() + curFootprint->offset - (curLiveNode->getDPASID() + internalOffset) < tokenAfterDPASCycle))
6814 {
6815 send_use_kills.killOperand(bn_it);
6816 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6817 curLiveNode->setInstKilled(true); //Instruction level kill
6818 instKill = true;
6819 continue;
6820 }
6821 else if (dep == WAW)
6822 {
6823 send_use_kills.killOperand(bn_it);
6824 continue;
6825 }
6826 }
6827
6828 if (node->getDPASID() <= curLiveNode->getDPASID())
6829 {
6830 unsigned loopStartBB = BBVector[node->getBBID()]->getLoopStartBBID();
6831 unsigned loopEndBB = BBVector[curLiveNode->getBBID()]->getLoopEndBBID();
6832 if (loopStartBB != -1 && loopEndBB != -1)
6833 {
6834 unsigned frontDist = node->getDPASID() - BBVector[loopStartBB]->first_DPASID;
6835 unsigned endDist = BBVector[loopEndBB]->last_DPASID - curLiveNode->getDPASID();
6836
6837 //Note that if the node and the live node are in different but nested loops, the calculation will be conservative
6838 if ((int)(frontDist + endDist + curFootprint->offset - internalOffset) < tokenAfterDPASCycle)
6839 {
6840 send_use_kills.killOperand(bn_it);
6841 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6842 curLiveNode->setInstKilled(true); //Instruction level kill
6843 instKill = true;
6844 continue;
6845 }
6846 else if (dep == WAW)
6847 {
6848 send_use_kills.killOperand(bn_it);
6849 continue;
6850 }
6851 }
6852 else
6853 {
6854 send_use_kills.killOperand(bn_it);
6855 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6856 curLiveNode->setInstKilled(true);
6857 instKill = true;
6858 continue;
6859 }
6860 }
6861 }
6862 else
6863 {
6864 send_use_kills.killOperand(bn_it);
6865 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6866 curLiveNode->setInstKilled(true); //Instruction level kill
6867 instKill = true;
6868 continue;
6869 }
6870 }
6871 }
6872
6873 if (dep == WAR)
6874 {
6875 bool killed = false;
6876 //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
6877 if (curFootprint->isWholeOverlap(liveFootprint))
6878 {
6879 send_use_kills.killOperand(bn_it);
6880 if (WARDepRequired(liveInst, curInst))
6881 //Implicit dependence cannot block the following instruction from issue.
6882 {
6883 curLiveNode->setSourceKilled(true);
6884 }
6885 curLiveNode->setAR();
6886 killed = true;
6887 }
6888
6889 if (WARDepRequired(liveInst, curInst))
6890 {
6891 if (!killed)
6892 {
6893 send_use_kills.killOperand(bn_it);
6894 curLiveNode->setSourceKilled(true);
6895 curLiveNode->setAR();
6896 killed = true;
6897 }
6898 instKill = true;
6899 if (!afterWrite) //After read dependence is more comprehensive in SIMDCF, so add edge only in SIMDCF pass
6900 {
6901 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
6902 }
6903 }
6904 else
6905 {
6906 if (!afterWrite) //After read dependence is more comprehensive in SIMDCF, so add edge only in SIMDCF pass
6907 {
6908 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_IMPLICIT);
6909 }
6910 }
6911
6912 if (killed)
6913 {
6914 continue;
6915 }
6916 }
6917
6918 if (dep == NODEP &&
6919 hasSameFunctionID(liveInst, curInst) &&
6920 hasSamePredicator(liveInst, curInst) &&
6921 hasSameExecMask(liveInst, curInst))
6922 {
6923 if (curFootprint->isWholeOverlap(liveFootprint))
6924 {
6925 send_use_kills.killOperand(bn_it);
6926 continue;
6927 }
6928 }
6929 }
6930
6931 assert(dep != DEPTYPE_MAX && "dep unassigned?");
6932 ++bn_it;
6933 }
6934 }
6935
6936 if (instKill)
6937 {
6938 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
6939 {
6940 BBVector[i]->clearKilledBucketNodeXeHP(&send_use_kills, 0, 0, 0, 0);
6941 }
6942 else
6943 {
6944 BBVector[i]->clearKilledBucketNodeXeLP(&send_use_kills, 0);
6945 }
6946 }
6947 if (fg.builder->hasSLMWARIssue() && curInst->isSend() &&
6948 (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
6949 {
6950 BBVector[i]->clearSLMWARWAissue(node, &send_use_kills);
6951 }
6952 }
6953 }
6954
6955 return;
6956 }
6957
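// Record, for the given node, the set of send nodes whose results may reach it: the union of the
// sends live into the BB (globalLiveSet) and the sends defined earlier in the same BB (localLiveSet).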
6958 void SWSB::addReachingDefineSet(SBNode* node, SBBitSets* globalLiveSet, SBBitSets* localLiveSet)
6959 {
6960 if (node->reachingSends.getSize() == 0)
6961 {
6962 node->reachingSends = SBBitSets(SBSendNodes.size());
6963 }
6964
6965 node->reachingSends |= *globalLiveSet;
6966
6967 node->reachingSends |= *localLiveSet;
6968
6969 return;
6970 }
6971
6972 void SWSB::addReachingUseSet(SBNode* node, SBNode* use)
6973 {
6974 if (use->getSendUseID() != -1)
6975 {
6976 if (node->reachedUses.getSize() == 0)
6977 {
6978 node->reachedUses = SBBitSets(SBSendUses.size());
6979 }
6980
6981 node->reachedUses.setDst(use->getSendUseID(), true);
6982 }
6983
6984 return;
6985 }
6986
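// Same cross-BB dependence scan as addGlobalDependence, but it additionally builds the
// reaching-definition information (reachingSends / reachedUses and the local send live ranges)
// that the later token allocation phases consume.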
6987 void SWSB::addGlobalDependenceWithReachingDef(unsigned globalSendNum, SBBUCKET_VECTOR* globalSendOpndList, SBNODE_VECT* SBNodes, PointsToAnalysis& p, bool afterWrite)
6988 {
6989 for (size_t i = 0; i < BBVector.size(); i++)
6990 {
6991 //Get global send operands killed by current BB
6992 SBBitSets send_kill(globalSendNum);
6993 //send_live records the sends that are live from outside the BB but killed by the BB
6994 SBBitSets send_live(SBSendNodes.size());
6995
6996 SBBitSets send_live_through(globalSendNum);
6997 //send_reach_all records all the global sends that are live through the BB
6998 SBBitSets send_reach_all(SBSendNodes.size());
6999
7000 send_kill |= BBVector[i]->send_live_in;
7001 send_kill &= BBVector[i]->send_may_kill;
7002 send_live_through |= BBVector[i]->send_live_in;
7003 send_live_through -= send_kill;
7004
7005 #ifdef DEBUG_VERBOSE_ON
7006 BBVector[i]->dumpLiveInfo(globalSendOpndList, globalSendNum, &send_kill);
7007 #endif
7008 //Convert the global send operands into live buckets for the liveness scan.
7009 //Instruction level liveness kill:
7010 //    For token dependence, there are only implicit RAR and WAR dependencies,
7011 //    so the order in which the operands are scanned is not an issue anymore,
7012 //    i.e. explicit RAW and WAW can cover all other dependences.
7013 LiveGRFBuckets send_use_kills(mem, kernel.getNumRegTotal(), BBVector[i]->getBB()->getKernel());
7014 for (size_t j = 0; j < globalSendOpndList->size(); j++)
7015 {
7016 SBBucketNode* sBucketNode = (*globalSendOpndList)[j];
7017 SBNode* sNode = sBucketNode->node;
7018 if (send_kill.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
7019 sBucketNode->opndNum == Opnd_src1 ||
7020 sBucketNode->opndNum == Opnd_src2 ||
7021 sBucketNode->opndNum == Opnd_src3))
7022 {
7023 BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
7024 send_live.setSrc(sNode->getSendID(), true);
7025 }
7026 if (send_kill.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
7027 {
7028 BBVector[i]->getLiveBucketsFromFootprint(sNode->getFirstFootprint(sBucketNode->opndNum), sBucketNode, &send_use_kills);
7029 send_live.setDst(sNode->getSendID(), true);
7030 }
7031
7032 if (send_live_through.isSrcSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_src0 ||
7033 sBucketNode->opndNum == Opnd_src1 ||
7034 sBucketNode->opndNum == Opnd_src2 ||
7035 sBucketNode->opndNum == Opnd_src3))
7036 {
7037 send_reach_all.setSrc(sNode->getSendID(), true);
7038 }
7039 if (send_live_through.isDstSet(sNode->globalID) && (sBucketNode->opndNum == Opnd_dst))
7040 {
7041 send_reach_all.setDst(sNode->getSendID(), true);
7042 }
7043 sNode->setInstKilled(false);
7044 sNode->setSourceKilled(false);
7045 }
7046
7047 if (BBVector[i]->first_node == -1)
7048 {
7049 continue;
7050 }
7051
7052 BBVector[i]->localReachingSends = SBBitSets(SBSendNodes.size());
7053
7054 if (BBVector[i]->first_send_node != -1)
7055 {
7056 for (int j = BBVector[i]->first_send_node; j <= BBVector[i]->last_send_node; j++)
7057 {
7058 SBNode* node = SBSendNodes[j];
7059
7060 //Get the live range for the local ones
7061 if (node->globalID == -1)
7062 {
7063 assert(node->getBBID() == i);
7064
7065 node->setLiveEarliestID(node->getNodeID());
7066 node->setLiveLatestID(node->getNodeID());
7067 if (node->succs.size())
7068 {
7069 for (int k = 0; k < (int)(node->succs.size()); k++)
7070 {
7071 SBDEP_ITEM& curSucc = node->succs[k];
7072 SBNode* succ = curSucc.node;
7073
7074 node->setLiveLatestID(succ->getNodeID(), succ->getBBID());
7075 }
7076 }
7077 else
7078 {
7079 node->setLiveLatestID(BBVector[i]->last_node);
7080 }
7081 }
7082 else
7083 {
7084 node->setLiveEarliestID(node->getNodeID());
7085 node->setLiveLatestID(BBVector[i]->last_node);
7086 }
7087 }
7088 }
7089 localTokenUsage.clear(); //Reset the list of live token nodes for this BB
7090
7091 //Scan BB again to figure out the dependence caused by global send operands
7092 std::vector<SBBucketDesc> BDvec;
7093 for (int j = BBVector[i]->first_node; j <= BBVector[i]->last_node; j++)
7094 {
7095 SBNode* node = (*SBNodes)[j];
7096 G4_INST* curInst = (*SBNodes)[j]->getLastInstruction();
7097
7098 BDvec.clear();
7099 BBVector[i]->getGRFBucketDescs(node, BDvec, true);
7100 if (!BDvec.size())
7101 {
7102 continue;
7103 }
7104
7105 //Track all the token nodes defined in the current BB
7106 if (tokenHonourInstruction(node->GetInstruction()))
7107 {
7108 addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
7109 node->reachingSends |= send_reach_all;
7110
7111 expireLocalIntervals(node->getNodeID(), i);
7112 if (node->GetInstruction()->getDst() != nullptr &&
7113 !node->GetInstruction()->getDst()->isNullReg())
7114 {
7115 BBVector[i]->localReachingSends.setDst(node->sendID, true);
7116 }
7117 else
7118 {
7119 BBVector[i]->localReachingSends.setSrc(node->sendID, true);
7120 }
7121 localTokenUsage.push_back(node); //Add to the live node
7122 }
7123
7124 bool instKill = false;
7125 // For all bucket descriptors of curInst
7126 for (const SBBucketDesc& BD : BDvec)
7127 {
7128 const int& curBucket = BD.bucket;
7129 const Gen4_Operand_Number& curOpnd = BD.opndNum;
7130 const SBFootprint* curFootprint = BD.footprint;
7131
7132 for (LiveGRFBuckets::BN_iterator bn_it = send_use_kills.begin(curBucket);
7133 bn_it != send_use_kills.end(curBucket);)
7134 {
7135 SBBucketNode* liveBN = (*bn_it);
7136 SBNode* curLiveNode = liveBN->node;
7137 Gen4_Operand_Number liveOpnd = liveBN->opndNum;
7138 const SBFootprint* liveFootprint = liveBN->footprint;
7139 G4_INST* liveInst = liveFootprint->inst;
7140 unsigned short internalOffset = 0;
7141 bool hasOverlap = curFootprint->hasOverlap(liveFootprint, internalOffset);
7142
7143 //Find DEP type
7144 DepType dep = getDepForOpnd(liveOpnd, curOpnd);
7145
7146 //RAW: R kill W R-->live explicit dependence
7147 //WAW: W2 kill W1 W2-->live explicit dependence
7148 //WAW: same pipeline/inorder W2 kill W1 W2-->live implicit dependence
7149 //WAR: different pipelines W kill R W-->live explicit dependence
7150 //WAR: same pipeline W kill R W-->live implicit dependence
7151 //RAR: same pipeline R2 kill R1 R2-->live implicit dependence
7152 //RAR: different pipelines no kill R1,R2-->live no dependence
7153 if (hasOverlap)
7154 {
7155 assert(tokenHonourInstruction(liveInst));
7156 if (dep == RAW || dep == WAW)
7157 {
7158 if (BBVector[i]->isGRFEdgeAdded(curLiveNode, node, dep, DEP_EXPLICT))
7159 {
7160 send_use_kills.killOperand(bn_it);
7161 curLiveNode->setInstKilled(true); //Instruction level kill
7162 instKill = true;
7163 addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
7164 send_live.setDst(curLiveNode->getSendID(), false);
7165 continue;
7166 }
7167 //WAW needs to be tracked in both scalar and SIMD control flow.
7168 //The reason is that:
7169 // 1. RA tracks liveness in a use-->define way.
7170 // 2. SWSB tracks it in a define-->use way.
7171 // For the case like following
7172 //
7173 // if
7174 // v1 <--  //v1 is never used
7175 // if
7176 // <--v1
7177 // endif
7178 // endif
7179 // v2 <--
7180 //RA may assign the same register to v1 and v2.
7181 //The scalar CFG cannot capture the dependence v1-->v2 when they are assigned the same register.
7182 if (afterWrite || dep == WAW) //There is no RAW kill for SIMDCF
7183 {
7184 if (fg.builder->getOption(vISA_EnableDPASTokenReduction) &&
7185 node->getLastInstruction()->isDpas() &&
7186 curLiveNode->getLastInstruction()->isDpas() &&
7187 curFootprint->isWholeOverlap(liveFootprint))
7188 {
7189 if (node->getDPASID() > curLiveNode->getDPASID())
7190 {
7191 if ((node->getDPASID() + curFootprint->offset - (curLiveNode->getDPASID() + internalOffset) < tokenAfterDPASCycle))
7192 {
7193 send_use_kills.killOperand(bn_it);
7194 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
7195 curLiveNode->setInstKilled(true); //Instruction level kill
7196 instKill = true;
7197 continue;
7198 }
7199 else if (dep == WAW)
7200 {
7201 send_use_kills.killOperand(bn_it);
7202 continue;
7203 }
7204 }
7205
7206 if (node->getDPASID() <= curLiveNode->getDPASID())
7207 {
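//The current DPAS does not come later in DPAS issue order, so this is a loop-carried
//dependence: estimate the distance around the loop back edge (from the loop start to the
//current DPAS plus from the live DPAS to the loop end).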
7208 unsigned loopStartBB = BBVector[node->getBBID()]->getLoopStartBBID();
7209 unsigned loopEndBB = BBVector[curLiveNode->getBBID()]->getLoopEndBBID();
7210
7211 if (loopStartBB != -1 && loopEndBB != -1)
7212 {
7213 unsigned frontDist = node->getDPASID() - BBVector[loopStartBB]->first_DPASID;
7214 unsigned endDist = BBVector[loopEndBB]->last_DPASID - curLiveNode->getDPASID();
7215
7216 if ((int)(frontDist + endDist + curFootprint->offset - internalOffset) < tokenAfterDPASCycle)
7217 {
7218 send_use_kills.killOperand(bn_it);
7219 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
7220 curLiveNode->setInstKilled(true); //Instruction level kill
7221 instKill = true;
7222 continue;
7223 }
7224 else if (dep == WAW)
7225 {
7226 send_use_kills.killOperand(bn_it);
7227 continue;
7228 }
7229 }
7230 else
7231 {
7232 send_use_kills.killOperand(bn_it);
7233 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
7234 curLiveNode->setInstKilled(true); //Instruction level kill
7235 instKill = true;
7236 continue;
7237 }
7238 }
7239 }
7240 else
7241 {
7242 send_use_kills.killOperand(bn_it);
7243 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
7244 curLiveNode->setInstKilled(true); //Instruction level kill
7245 instKill = true;
7246
7247 //Remove the killed definition from the live send set
7248 addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
7249 send_live.setDst(curLiveNode->getSendID(), false);
7250 continue;
7251 }
7252 }
7253 }
7254
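//WAR: the live read is removed from tracking when it is wholly covered by the current write
//or when an explicit WAR dependence is required; the edge itself is only added in the
//after-read (SIMD CF) pass.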
7255 if (dep == WAR)
7256 {
7257 bool killed = false;
7258 //For implicit dependence, the previous node can be killed only when it's wholly overlapped by the following one
7259 if (curFootprint->isWholeOverlap(liveFootprint))
7260 {
7261 send_use_kills.killOperand(bn_it);
7262 if (WARDepRequired(liveInst, curInst))
7263 //Implicit dependence cannot block the following instruction from issue.
7264 {
7265 curLiveNode->setSourceKilled(true);
7266 }
7267 curLiveNode->setAR();
7268 killed = true;
7269 }
7270
7271 if (WARDepRequired(liveInst, curInst))
7272 {
7273 if (!killed)
7274 {
7275 send_use_kills.killOperand(bn_it);
7276 curLiveNode->setSourceKilled(true);
7277 curLiveNode->setAR();
7278 killed = true;
7279 }
7280 instKill = true;
7281 if (!afterWrite) //After-read dependence analysis is more comprehensive in the SIMD CF pass, so add the edge only in that pass
7282 {
7283 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_EXPLICT);
7284 }
7285 }
7286 else
7287 {
7288 if (!afterWrite) //After-read dependence analysis is more comprehensive in the SIMD CF pass, so add the edge only in that pass
7289 {
7290 BBVector[i]->createAddGRFEdge(curLiveNode, node, dep, DEP_IMPLICIT);
7291 }
7292 }
7293 if (killed)
7294 {
7295 addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
7296 send_live.setSrc(curLiveNode->getSendID(), false);
7297 continue;
7298 }
7299 }
7300
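//Same-pipeline reads (RAR) with identical predicate and execution mask: the older read no
//longer needs to be tracked once it is wholly covered by the current read.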
7301 if (dep == NODEP &&
7302 hasSameFunctionID(liveInst, curInst) &&
7303 hasSamePredicator(liveInst, curInst) &&
7304 hasSameExecMask(liveInst, curInst))
7305 {
7306 if (curFootprint->isWholeOverlap(liveFootprint))
7307 {
7308 send_use_kills.killOperand(bn_it);
7309 continue;
7310 }
7311 }
7312 }
7313
7314 assert(dep != DEPTYPE_MAX && "dep unassigned?");
7315 ++bn_it;
7316 }
7317 }
7318
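//Record this node as a send use and remember which sends reach it, so that cross-BB token
//dependences can be resolved later.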
7319 if (node->preds.size() != 0)
7320 {
7321 addReachingDefineSet(node, &send_live, &BBVector[i]->localReachingSends);
7322 node->reachingSends |= send_reach_all;
7323 node->setSendUseID(SBSendUses.size());
7324 SBSendUses.push_back(node);
7325 }
7326
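//If any instruction-level kill happened above, clean up the bucket nodes whose instructions
//have been killed.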
7327 if (instKill)
7328 {
7329 if (fg.builder->hasThreeALUPipes() || fg.builder->hasFourALUPipes())
7330 {
7331 BBVector[i]->clearKilledBucketNodeXeHP(&send_use_kills, 0, 0, 0, 0);
7332 }
7333 else
7334 {
7335 BBVector[i]->clearKilledBucketNodeXeLP(&send_use_kills, 0);
7336 }
7337 }
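//Workaround for the SLM WAR issue: SLM messages with no destination (prefetch-like) and
//fences need extra handling of the tracked buckets.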
7338 if (fg.builder->hasSLMWARIssue() && curInst->isSend() &&
7339 (isSLMMsg(curInst) && (curInst->getDst() == nullptr || isFence(curInst))))
7340 {
7341 BBVector[i]->clearSLMWARWAissue(node, &send_use_kills);
7342 }
7343 }
7344 }
7345
7346 return;
7347 }
7348
7349 //
7350 //Works only for RAW and WAW.
7351 //Check whether an edge was already added during the data dependence analysis for SIMD control flow.
7352 //If it was added, the tracking for the corresponding bucket will be killed.
7353 //
7354 bool G4_BB_SB::isGRFEdgeAdded(const SBNode* pred, const SBNode* succ, DepType d, SBDependenceAttr a)
7355 {
7356 // When there are multiple dependence edges between two instructions,
7357 // RAW and WAW take priority over WAR: if a WAR edge co-exists with either of them, the WAR edge is dropped.
7358 // This is especially important for send instructions, where multiple dependencies can come from the same send instruction.
7359 // For a case like the following, only the dependence on the send's dst is kept:
7360 //1. Send r2-r5, r8, .... $1
7361 //  ...
7362 //7. Add r8, r2, r10 test $1D
7363 // RAW and WAW are treated as having equal priority.
7364 for (const SBDEP_ITEM& curSucc : pred->succs)
7365 {
7366 if (curSucc.node == succ)
7367 {
7368 //If a dependence edge already exists, the current edge will be ignored if it is WAR.
7369 //If the existing dependence is RAW or WAW, there is no need to add a new edge.
7370 if (curSucc.type == RAW || curSucc.type == WAW)
7371 {
7372 return true;
7373 }
7374 }
7375 }
7376
7377 return false;
7378 }
7379
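//Remove every predecessor edge of 'node' that originates from 'pred'.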
7380 void SWSB::removePredsEdges(SBNode* node, SBNode* pred)
7381 {
7382 for (auto pred_it = node->preds.begin();
7383 pred_it != node->preds.end();)
7384 {
7385 if ((*pred_it).node == pred)
7386 {
7387 pred_it = node->preds.erase(pred_it);
7388 continue;
7389 }
7390 pred_it++;
7391 }
7392
7393 return;
7394 }
7395
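//Add a GRF dependence edge from 'pred' to 'succ'. If an edge between the two nodes already
//exists, a WAR edge may be upgraded to RAW/WAW; otherwise the new edge is appended to both
//the successor and predecessor lists.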
7396 void G4_BB_SB::createAddGRFEdge(SBNode* pred, SBNode* succ, DepType d, SBDependenceAttr a)
7397 {
7398 // When there are multiple dependence edges between two instructions,
7399 // RAW and WAW take priority over WAR: if a WAR edge co-exists with either of them, the WAR edge is dropped.
7400 // This is especially important for send instructions, where multiple dependencies can come from the same send instruction.
7401 // For a case like the following, only the dependence on the send's dst is kept:
7402 //1. Send r2-r5, r8, .... $1
7403 //  ...
7404 //7. Add r8, r2, r10 test $1D
7405 // RAW and WAW are treated as having equal priority.
7406
7407 for (int i = 0; i < (int)(pred->succs.size()); i++)
7408 {
7409 SBDEP_ITEM& curSucc = pred->succs[i];
7410 if (curSucc.node == succ)
7411 {
7412 //If a dependence edge already exists, the current edge will be ignored if it is WAR.
7413 //If the existing dependence is RAW or WAW, there is no need to add a new edge.
7414 if (d == WAR || curSucc.type == RAW || curSucc.type == WAW)
7415 {
7416 return;
7417 }
7418 //Otherwise, d == RAW or d == WAW, but curSucc.type == WAR
7419 //Change the dependency type to d
7420 curSucc.type = d;
7421 curSucc.attr = a;
7422 bool findPred = false;
7423 for (int j = 0; j < (int)(succ->preds.size()); j++)
7424 {
7425 SBDEP_ITEM& curPred = succ->preds[j];
7426
7427 if (curPred.node == pred)
7428 {
7429 curPred.type = d;
7430 curPred.attr = a;
7431 findPred = true;
7432 }
7433 }
7434 assert(findPred);
7435 return;
7436 }
7437 }
7438
7439 // No edge with the same successor exists. Append this edge.
7440 SBDEP_ITEM newEdge = SBDEP_ITEM(succ, d, a);
7441 pred->succs.emplace_back(newEdge);
7442 newEdge = SBDEP_ITEM(pred, d, a);
7443 succ->preds.emplace_back(newEdge);
7444 return;
7445 }
7446
7447
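//Debug dump: print the lexical ID, offset, and physical GRF numbers of the dst and the
//directly accessed GRF src operands of 'inst'.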
7448 void G4_BB::emitRegInfo(std::ostream& output, G4_INST* inst, int offset)
7449 {
7450 output << "#" << inst->getLexicalId() << "|" << offset << ":";
7451 G4_DstRegRegion* dstOpnd = inst->getDst();
7452
7453 if (dstOpnd &&
7454 !dstOpnd->isIndirect() &&
7455 dstOpnd->isGreg())
7456 {
7457 uint32_t byteAddress = dstOpnd->getLinearizedStart();
7458 unsigned dstReg0 = byteAddress / numEltPerGRF<Type_UB>();
7459 output << " {";
7460 output << "D:" << dstReg0;
7461 output << "}";
7462 }
7463
7464 for (int i = 0; i < inst->getNumSrc(); i++)
7465 {
7466 G4_Operand* srcOpnd = inst->getSrc(i);
7467 if (srcOpnd)
7468 {
7469 if (srcOpnd->isSrcRegRegion() &&
7470 srcOpnd->asSrcRegRegion()->getBase() &&
7471 !srcOpnd->asSrcRegRegion()->isIndirect() &&
7472 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
7473 {
7474 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
7475 if (baseVar->isGreg()) {
7476 uint32_t byteAddress = srcOpnd->getLinearizedStart();
7477 unsigned srcReg = byteAddress / numEltPerGRF<Type_UB>();
7478 output << " {";
7479 output << "S" << i;
7480 output << ":" << srcReg;
7481 output << "}";
7482 }
7483 }
7484 }
7485 }
7486
7487 output << std::endl;
7488 return;
7489 }
7490
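//Return true if the instruction needs an SWSB annotation: it accesses a GRF, the address
//register a0, or, on platforms with three or four ALU pipes, any architectural register
//operand.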
7491 static bool isSWSBRequired(IR_Builder* builder, G4_INST* inst)
7492 {
7493 // Iterate over all operands and create buckets.
7494 for (Gen4_Operand_Number opndNum
7495 : {Opnd_src0, Opnd_src1, Opnd_src2, Opnd_src3, Opnd_dst}) {
7496 G4_Operand* opnd = inst->getOperand(opndNum);
7497 // Skip if no operand or the operand is not touched by the instruction
7498 if (!opnd || !opnd->getBase()) {
7499 continue;
7500 }
7501 if (opnd->isLabel() || opnd->isImm())
7502 {
7503 continue;
7504 }
7505
7506 G4_VarBase* base = opnd->getBase();
7507 assert(base && "If no base, then the operand is not touched by the instr.");
7508 G4_VarBase* phyReg = (base->isRegVar()) ? base->asRegVar()->getPhyReg() : base;
7509
7510 if (phyReg->getKind() == G4_VarBase::VK_phyGReg)
7511 {
7512 return true;
7513 }
7514 if (phyReg->getKind() == G4_VarBase::VK_phyAReg)
7515 {
7516 if (phyReg->getAreg()->getArchRegType() == AREG_A0)
7517 {
7518 return true;
7519 }
7520 if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7521 {
7522 return true;
7523 }
7524 }
7525
7526 }
7527
7528 return false;
7529 }
7530
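//Force conservative SWSB on a single instruction for debugging: give it (or a sync.nop
//inserted before it) an ALL-pipe distance of 1, and for token instructions assign token 0
//and return a sync.nop that waits on that token (after-write if the instruction has a
//destination, after-read otherwise).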
7531 static G4_INST* setForceDebugSWSB(IR_Builder* builder, G4_BB* bb, INST_LIST_ITER inst_it)
7532 {
7533 G4_INST* inst = (*inst_it);
7534 G4_INST* syncInst = nullptr;
7535
7536 if (!isSWSBRequired(builder, inst))
7537 {
7538 return nullptr;
7539 }
7540
7541 if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7542 {
7543 if (!inst->tokenHonourInstruction())
7544 {
7545 inst->setDistance(1);
7546 inst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7547 }
7548 else
7549 {
7550 G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7551 G4_INST* extraSyncInst = builder->createSync(G4_sync_nop, src0);
7552 extraSyncInst->setDistance(1);
7553 extraSyncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7554 bb->insertBefore(inst_it, extraSyncInst);
7555 }
7556 }
7557 else
7558 {
7559 inst->setDistance(1);
7560 }
7561
7562 if (inst->tokenHonourInstruction())
7563 {
7564 inst->setSetToken(0);
7565 if (inst->isEOT())
7566 {
7567 inst->setDistance(1);
7568 if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7569 {
7570 inst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7571 }
7572 }
7573 G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7574 syncInst = builder->createSync(G4_sync_nop, src0);
7575 G4_Operand* opnd = inst->getOperand(Opnd_dst);
7576 SWSBTokenType tokenType = SWSBTokenType::TOKEN_NONE;
7577 if (!opnd || !opnd->getBase() || opnd->isNullReg())
7578 {
7579 tokenType = SWSBTokenType::AFTER_READ;
7580 }
7581 else
7582 {
7583 tokenType = SWSBTokenType::AFTER_WRITE;
7584 }
7585 syncInst->setToken(0);
7586 syncInst->setTokenType(tokenType);
7587 }
7588
7589 return syncInst;
7590 }
7591
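//Debug mode: assign lexical IDs and force conservative SWSB on every instruction in the
//kernel; any sync.nop returned for an instruction is inserted right after it.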
7592 void vISA::forceDebugSWSB(G4_Kernel* kernel)
7593 {
7594 BB_LIST_ITER bbEnd = kernel->fg.end();
7595 int instID = 0;
7596
7597 for (BB_LIST_ITER bb_it = kernel->fg.begin();
7598 bb_it != bbEnd;
7599 bb_it++)
7600 {
7601 G4_BB* bb = (*bb_it);
7602 if (bb->size() > 0)
7603 {
7604 INST_LIST_ITER inst_end = bb->end();
7605 for (INST_LIST_ITER inst_it = bb->begin();
7606 inst_it != inst_end;
7607 inst_it++)
7608 {
7609 G4_INST* inst = (*inst_it);
7610 G4_INST* newInst = nullptr;
7611
7612 newInst = setForceDebugSWSB(kernel->fg.builder, bb, inst_it);
7613 inst->setLexicalId(instID);
7614 instID++;
7615
7616 if (newInst)
7617 {
7618 INST_LIST_ITER new_it = inst_it;
7619 new_it++;
7620 bb->insertBefore(new_it, newInst);
7621 newInst->setLexicalId(instID);
7622 instID++;
7623 if (new_it == bb->end())
7624 {
7625 break;
7626 }
7627 inst_it++;
7628 }
7629 }
7630 }
7631 }
7632 }
7633
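//Stall the given instruction for debugging: insert a distance-1 sync.nop before it, and for
//non-EOT token instructions also insert a sync.nop after it that waits on the instruction's
//token.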
7634 static void setInstructionStallSWSB(IR_Builder* builder,
7635 G4_BB* bb,
7636 INST_LIST_ITER& inst_it)
7637 {
7638 G4_INST* inst = *inst_it;
7639 INST_LIST_ITER next_it = inst_it;
7640 next_it++;
7641
7642 if (!inst->distanceHonourInstruction() &&
7643 !inst->tokenHonourInstruction())
7644 {
7645 return;
7646 }
7647
7648 if (inst->distanceHonourInstruction())
7649 {
7650 G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7651 G4_INST* extraSyncInst = builder->createSync(G4_sync_nop, src0);
7652 extraSyncInst->setDistance(1);
7653 if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7654 {
7655 extraSyncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7656 }
7657 bb->insertBefore(inst_it, extraSyncInst);
7658
7659 return;
7660 }
7661
7662 if (inst->tokenHonourInstruction())
7663 {
7664 G4_SrcRegRegion* src0_1 = builder->createNullSrc(Type_UD);
7665 G4_INST* extraSyncInst = builder->createSync(G4_sync_nop, src0_1);
7666 extraSyncInst->setDistance(1);
7667 if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7668 {
7669 extraSyncInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7670 }
7671 bb->insertBefore(inst_it, extraSyncInst);
7672
7673 if (!inst->isEOT())
7674 {
7675 G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7676 G4_INST* syncInst = builder->createSync(G4_sync_nop, src0);
7677
7678 unsigned short token = inst->getSetToken();
7679 SWSBTokenType tokenType = SWSBTokenType::TOKEN_NONE;
7680 G4_Operand* opnd = inst->getOperand(Opnd_dst);
7681 if (!opnd || !opnd->getBase() || opnd->isNullReg())
7682 {
7683 tokenType = SWSBTokenType::AFTER_READ;
7684 }
7685 else
7686 {
7687 tokenType = SWSBTokenType::AFTER_WRITE;
7688 }
7689 syncInst->setToken(token);
7690 syncInst->setTokenType(tokenType);
7691 inst_it = bb->insertBefore(next_it, syncInst);
7692 }
7693 }
7694
7695 return;
7696 }
7697
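//Insert a full scoreboard barrier after the given instruction: a distance-1 sync.allrd
//followed by a sync.allwr.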
7698 static void setInstructionBarrierSWSB(IR_Builder* builder,
7699 G4_BB* bb,
7700 INST_LIST_ITER& inst_it)
7701 {
7702
7703 G4_INST* syncAllRdInst = nullptr;
7704 G4_SrcRegRegion* src0 = builder->createNullSrc(Type_UD);
7705 syncAllRdInst = builder->createSync(G4_sync_allrd, src0);
7706 syncAllRdInst->setDistance(1);
7707 if (builder->hasThreeALUPipes() || builder->hasFourALUPipes())
7708 {
7709 syncAllRdInst->setDistanceTypeXe(G4_INST::DistanceType::DISTALL);
7710 }
7711 INST_LIST_ITER next_it = inst_it;
7712 next_it++;
7713 inst_it = bb->insertBefore(next_it, syncAllRdInst);
7714
7715 G4_INST* syncAllWrInst = nullptr;
7716 src0 = builder->createNullSrc(Type_UD);
7717 syncAllWrInst = builder->createSync(G4_sync_allwr, src0);
7718
7719 next_it = inst_it;
7720 next_it++;
7721 inst_it = bb->insertBefore(next_it, syncAllWrInst);
7722 }
7723
7724
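//Debug utility: either insert a full barrier after the instruction with lexical ID 'instID'
//(is_barrier), or insert stall synchronization for every instruction whose lexical ID falls
//in [instID, endInstID].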
7725 void vISA::singleInstStallSWSB(G4_Kernel* kernel, uint32_t instID, uint32_t endInstID, bool is_barrier)
7726 {
7727 BB_LIST_ITER bbEnd = kernel->fg.end();
7728
7729 for (BB_LIST_ITER bb_it = kernel->fg.begin();
7730 bb_it != bbEnd;
7731 bb_it++)
7732 {
7733 G4_BB* bb = (*bb_it);
7734
7735 if (bb->size() > 0)
7736 {
7737 INST_LIST_ITER inst_end = bb->end();
7738 for (INST_LIST_ITER inst_it = bb->begin();
7739 inst_it != inst_end;
7740 inst_it++)
7741 {
7742 G4_INST* inst = (*inst_it);
7743
7744 if (is_barrier && inst->getLexicalId() == instID)
7745 {
7746 setInstructionBarrierSWSB(kernel->fg.builder, bb, inst_it);
7747 }
7748 else
7749 {
7750
7751 if ((inst->getLexicalId() <= (int)endInstID &&
7752 inst->getLexicalId() >= (int)instID) ||
7753 (inst->getLexicalId() == instID))
7754 {
7755 setInstructionStallSWSB(kernel->fg.builder, bb, inst_it);
7756 }
7757 }
7758 }
7759 }
7760 }
7761 }
7762
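//Debug dump of the CFG with immediate dominator information: each BB's successors,
//predecessors, immediate dominator, and dominator-tree successors.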
7763 void SWSB::dumpImmDom(ImmDominator* dom) const
7764 {
7765 for (auto bb : fg)
7766 {
7767 printf("BB%d %d:%d - SUCC:", bb->getId(), BBVector[bb->getId()]->first_node, BBVector[bb->getId()]->last_node);
7768 for (auto succ : bb->Succs)
7769 {
7770 printf("BB%d, ", succ->getId());
7771 }
7772 printf("--PRED:");
7773 for (auto pred : bb->Preds)
7774 {
7775 printf("BB%d, ", pred->getId());
7776 }
7777 auto& idomBB = dom->getIDoms()[bb->getId()];
7778 assert(idomBB != nullptr);
7779 printf("\n\t iDOM: BB%d -- DOM SUCC: ", dom->getIDoms()[bb->getId()]->getId());
7780 for (const G4_BB_SB* succ : BBVector[bb->getId()]->domSuccs)
7781 {
7782 printf("BB%d, ", succ->getBB()->getId());
7783 }
7784 printf("\n");
7785 }
7786 }
7787