1 /*========================== begin_copyright_notice ============================ 2 3 Copyright (C) 2017-2021 Intel Corporation 4 5 SPDX-License-Identifier: MIT 6 7 ============================= end_copyright_notice ===========================*/ 8 9 #ifndef _SWSB_H_ 10 #define _SWSB_H_ 11 #include <string> 12 #include <set> 13 #include <bitset> 14 #include "../Mem_Manager.h" 15 #include "../FlowGraph.h" 16 #include "../G4_IR.hpp" 17 #include "../Timer.h" 18 #include "../RegAlloc.h" 19 #include <vector> 20 #include "../BitSet.h" 21 #include "LocalScheduler_G4IR.h" 22 #include <utility> 23 24 namespace vISA 25 { 26 class SBNode; 27 struct SBBucketNode; 28 class SBDDD; 29 class G4_BB_SB; 30 class SWSB; 31 } 32 33 //FIXME, build a table for different platforms 34 #define SWSB_MAX_ALU_DEPENDENCE_DISTANCE 11 35 #define SWSB_MAX_ALU_DEPENDENCE_DISTANCE_64BIT 15 36 #define SWSB_MAX_MATH_DEPENDENCE_DISTANCE 18 37 #define SWSB_MAX_ALU_DEPENDENCE_DISTANCE_VALUE 7u 38 39 #define TOKEN_AFTER_READ_DPAS_CYCLE 8 40 #define SWSB_MAX_DPAS_DEPENDENCE_DISTANCE 4 41 #define FOUR_THREAD_TOKEN_NUM 32 42 43 #define DEPENCENCE_ATTR(DO) \ 44 DO(DEP_EXPLICT) \ 45 DO(DEP_IMPLICIT) 46 47 enum SBDependenceAttr 48 { 49 DEPENCENCE_ATTR(MAKE_ENUM) 50 }; 51 52 #define DPAS_BLOCK_SIZE 8 53 #define DPAS_8x8_CYCLE 28 54 #define DPAS_8x4_CYCLE 24 55 #define DPAS_8x2_CYCLE 22 56 #define DPAS_8x1_CYCLE 21 57 58 #define DPAS_CYCLES_BEFORE_GRF_READ 8 59 #define DPAS_8x8_GRFREAD_CYCLE 8 60 #define DPAS_8x4_GRFREAD_CYCLE 4 61 #define DPAS_8x2_GRFREAD_CYCLE 2 62 #define DPAS_8x1_GRFREAD_CYCLE 1 63 64 //1 GRF map to 1 flag: bytes per bit 65 #define FLAG_TO_GRF_MAP (numEltPerGRF<Type_UB>() / 16) 66 67 typedef enum _DPAS_DEP 68 { 69 REP_1 = 1, 70 REP_2 = 2, 71 REP_4 = 4, 72 REP_8 = 8 73 } DPAS_DEP; 74 75 #define UNKNOWN_TOKEN -1 76 #define UNINIT_BUCKET -1 77 78 namespace vISA { 79 struct SBDep { SBDepSBDep80 SBDep(SBNode *SBNode, DepType Type, SBDependenceAttr Attr) { 81 node = SBNode; 82 type = Type; 83 attr = Attr; 84 exclusiveNodes.clear(); 85 } 86 SBNode* node; 87 DepType type; 88 SBDependenceAttr attr; 89 std::vector<vISA::SBNode*> exclusiveNodes; 90 }; 91 92 struct SBDISTDep { 93 SB_INST_PIPE liveNodePipe; 94 SB_INST_PIPE nodePipe; 95 SB_INST_PIPE operandType; 96 bool dstDep; 97 }; 98 99 using SWSBTokenType = G4_INST::SWSBTokenType; 100 void singleInstStallSWSB(G4_Kernel *kernel, uint32_t instID, uint32_t endInstID, bool is_barrier); 101 void forceDebugSWSB(G4_Kernel *kernel); 102 void forceFCSWSB(G4_Kernel *kernel); 103 } 104 105 typedef vISA::SBDep SBDEP_ITEM; 106 typedef std::vector< SBDEP_ITEM> SBDEP_VECTOR; 107 typedef SBDEP_VECTOR::iterator SBDEP_VECTOR_ITER; 108 109 typedef vISA::SBDISTDep SBDISTDEP_ITEM; 110 typedef std::vector< SBDISTDEP_ITEM> SBDISTDEP_VECTOR; 111 112 namespace vISA 113 { 114 typedef enum _FOOTPRINT_TYPE 115 { 116 GRF_T = 1, 117 ACC_T = 2, 118 FLAG_T = 4, 119 } FOOTPRINT_TYPE; 120 121 struct SBFootprint { 122 const FOOTPRINT_TYPE fType; 123 const G4_Type type; 124 const unsigned short LeftB; 125 const unsigned short RightB; 126 unsigned short offset = 0; 127 G4_INST* inst; 128 struct SBFootprint *next = nullptr; //The ranges linked together to represent the possible register ranges may be occupied by an operand. 129 //For indirect access, non-contiguous ranges exist. 130 SBFootprintSBFootprint131 SBFootprint() : fType(GRF_T), type (Type_UNDEF), LeftB(0), RightB(0), inst(nullptr) { ; } SBFootprintSBFootprint132 SBFootprint(FOOTPRINT_TYPE ft, G4_Type t, unsigned short LB, unsigned short RB, G4_INST *i) 133 : fType(ft), type(t), LeftB(LB), RightB(RB), inst(i) { 134 ; 135 } ~SBFootprintSBFootprint136 ~SBFootprint() 137 { 138 } 139 setOffsetSBFootprint140 void setOffset(unsigned short o) { offset = o; } 141 hasOverlapSBFootprint142 bool hasOverlap(const SBFootprint *liveFootprint, unsigned short &internalOffset) const 143 { 144 for (const SBFootprint *curFootprint2Ptr = liveFootprint; curFootprint2Ptr; curFootprint2Ptr = curFootprint2Ptr->next) 145 { 146 // Negative of no overlap: !(LeftB > curFootprint2Ptr->RightB || RightB < curFootprint2Ptr->LeftB) 147 if (fType == curFootprint2Ptr->fType && 148 LeftB <= curFootprint2Ptr->RightB && RightB >= curFootprint2Ptr->LeftB) 149 { 150 internalOffset = curFootprint2Ptr->offset; 151 return true; 152 } 153 } 154 155 //Overlap with other ranges. 156 157 for (const SBFootprint *curFootprintPtr = next; curFootprintPtr; curFootprintPtr = curFootprintPtr->next) 158 { 159 FOOTPRINT_TYPE curFType = curFootprintPtr->fType; 160 for (const SBFootprint *curFootprint2Ptr = liveFootprint; curFootprint2Ptr; curFootprint2Ptr = curFootprint2Ptr->next) 161 { 162 if (curFType == curFootprint2Ptr->fType && 163 curFootprintPtr->LeftB <= curFootprint2Ptr->RightB && curFootprintPtr->RightB >= curFootprint2Ptr->LeftB) 164 { 165 internalOffset = curFootprint2Ptr->offset; 166 return true; 167 } 168 } 169 } 170 171 return false; 172 } 173 hasOverlapSBFootprint174 bool hasOverlap(const SBFootprint *liveFootprint, bool &isRMWOverlap, unsigned short &internalOffset) const 175 { 176 for (const SBFootprint *curFootprint2Ptr = liveFootprint; curFootprint2Ptr; curFootprint2Ptr = curFootprint2Ptr->next) 177 { 178 // Negative of no overlap: !(LeftB > curFootprint2Ptr->RightB || RightB < curFootprint2Ptr->LeftB) 179 if (fType == curFootprint2Ptr->fType) 180 { 181 if (LeftB <= curFootprint2Ptr->RightB && RightB >= curFootprint2Ptr->LeftB) 182 { 183 internalOffset = curFootprint2Ptr->offset; 184 if (fType == GRF_T 185 && (type == Type_UB || type == Type_B)) 186 { 187 isRMWOverlap = true; 188 } 189 return true; 190 } 191 else if (fType == GRF_T 192 && (type == Type_UB || type == Type_B)) 193 { 194 unsigned short w_LeftB = LeftB/2; 195 unsigned short w_RightB = RightB/2; 196 unsigned short w_curLeftB = curFootprint2Ptr->LeftB/2; 197 unsigned short w_curRightB = curFootprint2Ptr->RightB/2; 198 if (w_LeftB <= w_curRightB && w_RightB >= w_curLeftB) 199 { 200 isRMWOverlap = true; 201 return true; 202 } 203 } 204 } 205 } 206 207 //Overlap with other ranges. 208 for (const SBFootprint *curFootprintPtr = next; curFootprintPtr; curFootprintPtr = curFootprintPtr->next) 209 { 210 FOOTPRINT_TYPE curFType = curFootprintPtr->fType; 211 for (const SBFootprint *curFootprint2Ptr = liveFootprint; curFootprint2Ptr; curFootprint2Ptr = curFootprint2Ptr->next) 212 { 213 if (curFType == curFootprint2Ptr->fType) 214 { 215 if (curFootprintPtr->LeftB <= curFootprint2Ptr->RightB && curFootprintPtr->RightB >= curFootprint2Ptr->LeftB) 216 { 217 internalOffset = curFootprint2Ptr->offset; 218 if (curFType == GRF_T 219 && (type == Type_UB || type == Type_B)) 220 { 221 isRMWOverlap = true; 222 } 223 return true; 224 } 225 else if (curFType == GRF_T 226 && (type == Type_UB || type == Type_B)) 227 { 228 unsigned short w_LeftB = curFootprintPtr->LeftB/2; 229 unsigned short w_RightB = curFootprintPtr->RightB/2; 230 unsigned short w_curLeftB = curFootprint2Ptr->LeftB/2; 231 unsigned short w_curRightB = curFootprint2Ptr->RightB/2; 232 if (w_LeftB <= w_curRightB && w_RightB >= w_curLeftB) 233 { 234 isRMWOverlap = true; 235 return true; 236 } 237 } 238 } 239 } 240 } 241 242 return false; 243 } 244 hasGRFGrainOverlapSBFootprint245 bool hasGRFGrainOverlap(const SBFootprint *liveFootprint) const 246 { 247 for (const SBFootprint *curFootprint2Ptr = liveFootprint; curFootprint2Ptr; curFootprint2Ptr = curFootprint2Ptr->next) 248 { 249 if (fType == curFootprint2Ptr->fType && 250 ((LeftB / numEltPerGRF<Type_UB>()) <= (curFootprint2Ptr->RightB / numEltPerGRF<Type_UB>())) && 251 ((RightB / numEltPerGRF<Type_UB>()) >= (curFootprint2Ptr->LeftB / numEltPerGRF<Type_UB>()))) 252 { 253 return true; 254 } 255 } 256 257 //Overlap with other ranges. 258 for (const SBFootprint *curFootprintPtr = next; curFootprintPtr; curFootprintPtr = curFootprintPtr->next) 259 { 260 for (const SBFootprint *curFootprint2Ptr = liveFootprint; curFootprint2Ptr; curFootprint2Ptr = curFootprint2Ptr->next) 261 { 262 if (fType == curFootprint2Ptr->fType && 263 ((curFootprintPtr->LeftB / numEltPerGRF<Type_UB>()) <= (curFootprint2Ptr->RightB / numEltPerGRF<Type_UB>())) && 264 ((curFootprintPtr->RightB / numEltPerGRF<Type_UB>()) >= (curFootprint2Ptr->LeftB / numEltPerGRF<Type_UB>()))) 265 { 266 return true; 267 } 268 } 269 } 270 271 return false; 272 } 273 274 275 //Check if current footprint overlaps footprint2 276 //FIXME: it's conservative. Because for the indirect, the ranges may be contiguous? isWholeOverlapSBFootprint277 bool isWholeOverlap(const SBFootprint *liveFootprint) const 278 { 279 bool findOverlap = false; 280 281 for (const SBFootprint *footprint2Ptr = liveFootprint; footprint2Ptr; footprint2Ptr = footprint2Ptr->next) 282 { 283 findOverlap = false; 284 285 if (fType == footprint2Ptr->fType && 286 LeftB <= footprint2Ptr->LeftB && RightB >= footprint2Ptr->RightB) 287 { 288 findOverlap = true; 289 continue; 290 } 291 for (const SBFootprint *curFootprintPtr = next; curFootprintPtr; curFootprintPtr = curFootprintPtr->next) 292 { 293 FOOTPRINT_TYPE curFType = curFootprintPtr->fType; 294 if (curFType == footprint2Ptr->fType && 295 curFootprintPtr->LeftB <= footprint2Ptr->LeftB && curFootprintPtr->RightB >= footprint2Ptr->RightB) 296 { 297 findOverlap = true; 298 break; 299 } 300 } 301 302 if (!findOverlap) 303 { 304 return false; 305 } 306 } 307 308 return findOverlap; 309 } 310 311 }; 312 313 // Bit set which is used for global dependence analysis for SBID. 314 // Since dependencies may come from dst and src and there may be dependence kill between dst and src dependencies, 315 // we use internal bit set to track the live of dst and src separately. 316 // Each bit map to one global SBID node according to the node's global ID. 317 struct SBBitSets { 318 BitSet dst; 319 BitSet src; 320 SBBitSetsSBBitSets321 SBBitSets() 322 { 323 } 324 SBBitSetsSBBitSets325 SBBitSets(unsigned size) : dst(size, false), src(size, false) 326 { 327 } 328 ~SBBitSetsSBBitSets329 ~SBBitSets() 330 { 331 } 332 setDstSBBitSets333 void setDst(int ID, bool value) 334 { 335 dst.set(ID, value); 336 } setSrcSBBitSets337 void setSrc(int ID, bool value) 338 { 339 src.set(ID, value); 340 } 341 getSizeSBBitSets342 unsigned getSize() const 343 { 344 assert(dst.getSize() == src.getSize()); 345 return dst.getSize(); 346 } 347 isEmptySBBitSets348 bool isEmpty() const 349 { 350 return dst.isEmpty() && src.isEmpty(); 351 } 352 isDstEmptySBBitSets353 bool isDstEmpty() const 354 { 355 return dst.isEmpty(); 356 } 357 isSrcEmptySBBitSets358 bool isSrcEmpty() const 359 { 360 return src.isEmpty(); 361 } 362 isDstSetSBBitSets363 bool isDstSet(unsigned i) const 364 { 365 return dst.isSet(i); 366 } 367 isSrcSetSBBitSets368 bool isSrcSet(unsigned i) const 369 { 370 return src.isSet(i); 371 } 372 373 SBBitSets& operator= (const SBBitSets& other) 374 { 375 dst = other.dst; 376 src = other.src; 377 return *this; 378 } 379 380 SBBitSets& operator= (SBBitSets&& other) noexcept = default; 381 382 SBBitSets& operator|= (const SBBitSets& other) 383 { 384 dst |= other.dst; 385 src |= other.src; 386 return *this; 387 } 388 389 SBBitSets& operator&= (const SBBitSets& other) 390 { 391 dst &= other.dst; 392 src &= other.src; 393 return *this; 394 } 395 396 SBBitSets& operator-= (const SBBitSets& other) 397 { 398 dst -= other.dst; 399 src -= other.src; 400 return *this; 401 } 402 403 bool operator!= (const SBBitSets& other) const 404 { 405 return (dst != other.dst) || (src != other.src); 406 } 407 newSBBitSets408 void* operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); } 409 }; 410 411 class SBNode { 412 private: 413 std::vector<SBFootprint*> footprints; // The coarse grained footprint of operands 414 unsigned nodeID = -1; // Unique ID of the node 415 unsigned BBID = 0; // ID of basic block 416 int ALUID = 0; // The ID for in-order instructions. The out-of-order instructions are not counted. 417 unsigned liveStartID = 0; // The start ID of live range 418 unsigned liveEndID = 0; // The end ID of live range 419 unsigned liveStartBBID = -1; // The start BB ID of live range 420 unsigned liveEndBBID = -1; // The start BB ID of live range 421 int integerID = -1; 422 int floatID = -1; 423 int longID = -1; 424 int mathID = -1; 425 int DPASID = -1; 426 unsigned short DPASSize = 0; 427 bool instKilled = false; // Used for global analysis, if the node generated dependencies have all been resolved. 428 bool sourceKilled = false; // If the dependencies caused by source operands have all been resolved. 429 bool hasAW = false; // Used for global analysis, if has AW (RAW or WAW) dependencies from the node 430 bool hasAR = false; // Used for global analysis, if has AR (WAR) dependencies from the node 431 bool hasFollowDistOneAReg = false; 432 bool followDistOneAReg = false; 433 unsigned depDelay = 0; 434 435 struct DepToken { 436 unsigned short token; 437 SWSBTokenType type; 438 SBNode* depNode; 439 }; 440 std::vector <DepToken> depTokens; 441 442 public: 443 std::vector<G4_INST *> instVec; 444 SBDEP_VECTOR succs; // A list of node's successors in dependence graph 445 SBDEP_VECTOR preds; // A list of node's predecessors in dependence graph 446 int globalID = -1; // ID of global send instructions 447 int sendID = -1; 448 int sendUseID = -1; 449 SBNode * tokenReusedNode = nullptr; // For global token reuse optimization, the node whose token is reused by current one. 450 SB_INST_PIPE ALUPipe; 451 SBDISTDEP_VECTOR distDep; 452 SBBitSets reachingSends; 453 SBBitSets reachedUses; 454 unsigned reuseOverhead = 0; 455 456 /* Constructor */ SBNode()457 SBNode() 458 { 459 460 } 461 SBNode(uint32_t id,int ALUId,uint32_t BBId,G4_INST * i)462 SBNode(uint32_t id, int ALUId, uint32_t BBId, G4_INST *i) 463 : nodeID(id), ALUID(ALUId), BBID(BBId), 464 footprints(Opnd_total_num, nullptr) 465 { 466 ALUPipe = PIPE_NONE; 467 if (i->isDpas()) 468 { 469 G4_InstDpas* dpasInst = i->asDpasInst(); 470 DPASSize = dpasInst->getRepeatCount(); //Set the size of the current dpas inst, if there is a macro, following will be added. 471 } 472 473 instVec.push_back(i); 474 } 475 ~SBNode()476 ~SBNode() 477 { 478 for (SBFootprint *sm : footprints) 479 { 480 sm->~SBFootprint(); 481 } 482 } 483 setSendID(int ID)484 void setSendID(int ID) 485 { 486 sendID = ID; 487 } 488 setSendUseID(int ID)489 void setSendUseID(int ID) 490 { 491 sendUseID = ID; 492 } 493 getSendID()494 int getSendID() const 495 { 496 return sendID; 497 } 498 getSendUseID()499 int getSendUseID() const 500 { 501 return sendUseID; 502 } 503 504 /* Member functions */ GetInstruction()505 G4_INST* GetInstruction() const { return instVec.front(); } addInstruction(G4_INST * i)506 void addInstruction(G4_INST *i) { instVec.push_back(i); } getLastInstruction()507 G4_INST* getLastInstruction() const { return instVec.back(); } setDepDelay(unsigned val)508 void setDepDelay(unsigned val) { depDelay = val; } getDepDelay()509 unsigned getDepDelay() const { return depDelay; } 510 getALUID()511 int getALUID() const { return ALUID; } getNodeID()512 unsigned getNodeID() const { return nodeID; }; getBBID()513 unsigned getBBID() const { return BBID; }; getIntegerID()514 int getIntegerID() const { return integerID; } getFloatID()515 int getFloatID() const { return floatID; } getLongID()516 int getLongID() const { return longID; } getMathID()517 int getMathID() const {return mathID;} getDPASID()518 int getDPASID() const { return DPASID; } getDPASSize()519 unsigned short getDPASSize() const { return DPASSize; } 520 setIntegerID(int id)521 void setIntegerID(int id) { integerID = id; } setFloatID(int id)522 void setFloatID(int id) { floatID = id; } setLongID(int id)523 void setLongID(int id) { longID = id; } setMathID(int id)524 void setMathID(int id) { mathID = id; } setDPASID(int id)525 void setDPASID(int id) { DPASID = id; } addDPASSize(unsigned short size)526 void addDPASSize(unsigned short size) { DPASSize += size; } 527 getLiveStartID()528 unsigned getLiveStartID() const { return liveStartID; } getLiveEndID()529 unsigned getLiveEndID() const { return liveEndID; } 530 getLiveStartBBID()531 unsigned getLiveStartBBID() const { return liveStartBBID; } getLiveEndBBID()532 unsigned getLiveEndBBID() const { return liveEndBBID; } 533 setLiveEarliestID(unsigned id,unsigned startBBID)534 void setLiveEarliestID(unsigned id, unsigned startBBID) 535 { 536 if (!liveStartID) 537 { 538 liveStartID = id; 539 liveStartBBID = startBBID; 540 } 541 else if (liveStartID > id) 542 { 543 liveStartID = id; 544 liveStartBBID = startBBID; 545 } 546 } setLiveLatestID(unsigned id,unsigned endBBID)547 void setLiveLatestID(unsigned id, unsigned endBBID) 548 { 549 if (!liveEndID) 550 { 551 liveEndID = id; 552 liveEndBBID = endBBID; 553 } 554 else if (liveEndID < id) 555 { 556 liveEndID = id; 557 liveEndBBID = endBBID; 558 } 559 } 560 setLiveEarliestID(unsigned id)561 void setLiveEarliestID(unsigned id) 562 { 563 liveStartID = id; 564 } 565 setLiveLatestID(unsigned id)566 void setLiveLatestID(unsigned id) 567 { 568 liveEndID = id; 569 } 570 setInstKilled(bool value)571 void setInstKilled(bool value) 572 { 573 instKilled = value; 574 if (value) 575 { 576 hasAW = true; 577 } 578 } 579 isInstKilled()580 bool isInstKilled() const 581 { 582 return instKilled; 583 } 584 setSourceKilled(bool value)585 void setSourceKilled(bool value) 586 { 587 sourceKilled = value; 588 if (value) 589 { 590 hasAR = true; 591 } 592 } 593 isSourceKilled()594 bool isSourceKilled() const 595 { 596 return sourceKilled; 597 } 598 setAW()599 void setAW() 600 { 601 hasAW = true; 602 } 603 setAR()604 void setAR() 605 { 606 hasAR = true; 607 } 608 setDistOneAReg()609 void setDistOneAReg() 610 { 611 hasFollowDistOneAReg = true; 612 } setFollowDistOneAReg()613 void setFollowDistOneAReg() 614 { 615 followDistOneAReg = true; 616 } 617 hasAWDep()618 bool hasAWDep() const { return hasAW; } hasARDep()619 bool hasARDep() const { return hasAR; } hasDistOneAreg()620 bool hasDistOneAreg() const { return hasFollowDistOneAReg; } followDistOneAreg()621 bool followDistOneAreg() const { return followDistOneAReg; } 622 setFootprint(SBFootprint * footprint,Gen4_Operand_Number opndNum)623 void setFootprint(SBFootprint *footprint, Gen4_Operand_Number opndNum) 624 { 625 if (footprints[opndNum] == nullptr) 626 { 627 footprints[opndNum] = footprint; 628 } 629 else 630 { 631 footprint->next = footprints[opndNum]; 632 footprints[opndNum] = footprint; 633 } 634 } 635 getFirstFootprint(Gen4_Operand_Number opndNum)636 SBFootprint* getFirstFootprint(Gen4_Operand_Number opndNum) const 637 { 638 return footprints[opndNum]; 639 } 640 setDistance(unsigned distance)641 void setDistance(unsigned distance) 642 { 643 distance = std::min(distance, SWSB_MAX_ALU_DEPENDENCE_DISTANCE_VALUE); 644 unsigned curDistance = (unsigned)instVec.front()->getDistance(); 645 if (curDistance == 0 || 646 curDistance > distance) 647 { 648 instVec.front()->setDistance((unsigned char)distance); 649 } 650 } 651 setAccurateDistType(SB_INST_PIPE depPipe)652 void setAccurateDistType(SB_INST_PIPE depPipe) 653 { 654 switch (depPipe) 655 { 656 case PIPE_INT: 657 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTINT); 658 break; 659 case PIPE_FLOAT: 660 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTFLOAT); 661 break; 662 case PIPE_LONG: 663 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTLONG); 664 break; 665 case PIPE_MATH: 666 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTMATH); 667 break; 668 case PIPE_SEND: 669 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 670 break; 671 default: 672 assert(0 && "Wrong ALU PIPE"); 673 break; 674 } 675 } 676 677 //This is to reduce the sync instructions. 678 //When A@1 is used to replace I@1, make sure it will not cause the stall for float or long pipelines isClosedALUType(std::vector<unsigned> ** latestInstID,unsigned distance,SB_INST_PIPE type)679 bool isClosedALUType(std::vector<unsigned>** latestInstID, unsigned distance, SB_INST_PIPE type) 680 { 681 int instDist = SWSB_MAX_MATH_DEPENDENCE_DISTANCE; 682 SB_INST_PIPE curType = PIPE_NONE; 683 684 for (int i = 1; i < PIPE_DPAS; i++) 685 { 686 int dist = SWSB_MAX_MATH_DEPENDENCE_DISTANCE + 1; 687 if (i == PIPE_INT || i == PIPE_FLOAT) 688 { 689 if (latestInstID[i]->size() >= distance) 690 { 691 dist = (int)nodeID - (*latestInstID[i])[latestInstID[i]->size() - distance] - SWSB_MAX_ALU_DEPENDENCE_DISTANCE; 692 } 693 } 694 if (i == PIPE_MATH) 695 { 696 if (latestInstID[i]->size() >= distance) 697 { 698 dist = (int)nodeID - (*latestInstID[i])[latestInstID[i]->size() - distance] - SWSB_MAX_MATH_DEPENDENCE_DISTANCE; 699 } 700 } 701 if (i == PIPE_LONG) 702 { 703 if (latestInstID[i]->size() >= distance) 704 { 705 dist = (int)nodeID - (*latestInstID[i])[latestInstID[i]->size() - distance] - SWSB_MAX_ALU_DEPENDENCE_DISTANCE_64BIT; 706 } 707 } 708 709 if (dist < instDist) 710 { 711 instDist = dist; 712 curType = (SB_INST_PIPE)i; 713 } 714 } 715 716 return type == curType; 717 } 718 finalizeDistanceType1(IR_Builder & builder,std::vector<unsigned> ** latestInstID)719 void finalizeDistanceType1(IR_Builder& builder, std::vector<unsigned>** latestInstID) 720 { 721 if (instVec.front()->getDistanceTypeXe() != G4_INST::DistanceType::DIST_NONE) 722 { //Forced to a type already 723 return; 724 } 725 726 if (builder.loadThreadPayload() && nodeID <= 8 && 727 GetInstruction()->isSend() && distDep.size()) 728 { 729 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 730 return; 731 } 732 733 if (builder.hasA0WARHWissue() && builder.hasFourALUPipes()) 734 { 735 G4_INST* inst = GetInstruction(); 736 737 if (inst->getDst() && inst->getDst()->isA0()) 738 { 739 instVec.front()->setDistance(1); 740 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 741 742 return; 743 } 744 } 745 746 unsigned curDistance = (unsigned)instVec.front()->getDistance(); 747 if (distDep.size()) 748 { 749 SB_INST_PIPE depPipe = PIPE_NONE; 750 SB_INST_PIPE sameOpndTypeDepPipe = PIPE_NONE; 751 bool sameOperandType = true; 752 int diffPipes = 0; 753 bool depLongPipe = false; 754 755 //Check if there is type conversion 756 for (int i = 0; i < (int)(distDep.size()); i++) 757 { 758 SBDISTDEP_ITEM& it = distDep[i]; 759 760 if (it.operandType != it.liveNodePipe || it.dstDep) 761 { 762 sameOperandType = false; 763 } 764 } 765 766 //If the dependent instructions belong to different pipelines 767 for (int i = 0; i < (int)(distDep.size()); i++) 768 { 769 SBDISTDEP_ITEM& it = distDep[i]; 770 771 if (it.liveNodePipe != it.nodePipe) 772 { 773 diffPipes++; 774 depPipe = it.liveNodePipe; 775 } 776 777 depLongPipe = it.liveNodePipe == PIPE_LONG; 778 } 779 780 //In some platforms, A@ need be used for following case when the regDist cannot be used (due to HW issue), and inst depends on different pipes 781 //mov(8 | M0) r63.0 < 2 > :ud r15.0 < 1; 1, 0 > : ud{ I@7 } 782 //... 783 //add(8 | M0) r95.0 < 1 > : q r87.0 < 1; 1, 0 > : q r10.0 < 0; 1, 0 > : q{ I@7 } 784 //... 785 //add(8 | M0) r55.0 < 2 > : d r95.0 < 1; 1, 0 > : q r63.0 < 2; 1, 0 > : ud{ A@4 } 786 //In some platforms I@ accurate dependence can used for following case when the regDist cannot be used (due to HW issue), because inst depends on same pipe 787 //The example code piece : 788 //mov(16 | M0) r88.0 < 2 > : d r84.0 < 1; 1, 0 > : d{ F@2 } // $79&103 789 //mov(16 | M16) r90.0 < 2 > : d r85.0 < 1; 1, 0 > : d // $80&104 790 //mov(16 | M0) r88.1 < 2 > : d r86.0 < 1; 1, 0 > : d{ F@1 } // $81&105 791 //mov(16 | M16) r90.1 < 2 > : d r87.0 < 1; 1, 0 > : d // $82&106 792 //mov(16 | M0) r32.0 < 1 > : df r88.0 < 1; 1, 0 > : q{ @2/I@2 } // $83&107 793 //Since:q and : d all go to integer pipeline in, @2 should work.However, due to HW bug, I@2 is good 794 bool mulitpleSamePipe = false; 795 if (distDep.size() > 1 && sameOperandType) 796 { 797 sameOpndTypeDepPipe = distDep[0].liveNodePipe; 798 for (int i = 1; i < (int)(distDep.size()); i++) 799 { 800 SBDISTDEP_ITEM& it = distDep[i]; 801 802 if (it.liveNodePipe != sameOpndTypeDepPipe) 803 { 804 mulitpleSamePipe = true; 805 } 806 } 807 } 808 809 if (builder.hasLongOperandTypeDepIssue() && depLongPipe) 810 { 811 sameOperandType = false; 812 } 813 814 if (!builder.getOptions()->getOption(vISA_disableRegDistDep) && 815 !builder.hasRegDistDepIssue()) 816 { 817 instVec.front()->setOperandTypeIndicated(sameOperandType); 818 } 819 820 //Multiple dependences 821 if (distDep.size() > 1) 822 { 823 if (diffPipes) //Dependence instructions belong to different ALU pipelines. 824 { 825 if (sameOperandType) //But the operand type is from same instruction pipeline. 826 { 827 assert(!GetInstruction()->isSend()); //Send operand has no type, sameOperandType is always false 828 if (mulitpleSamePipe) 829 { 830 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 831 } 832 else 833 { 834 setAccurateDistType(depPipe); 835 } 836 instVec.front()->setOperandTypeIndicated(false); //DPAS distance will be in sync inst 837 } 838 else // For send, we will set it to DISTALL if there are multiple dependences. For non-send, DIST 839 { 840 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 841 instVec.front()->setOperandTypeIndicated(false); 842 } 843 } 844 else //From same pipeline 845 { 846 setAccurateDistType(ALUPipe); 847 instVec.front()->setIsClosestALUType(isClosedALUType(latestInstID, curDistance, ALUPipe)); 848 } 849 } 850 else //Only one dependency 851 { 852 if (depPipe != PIPE_NONE) // From different pipeline 853 { 854 setAccurateDistType(depPipe); 855 if (sameOperandType || GetInstruction()->isSend()) 856 { 857 instVec.front()->setIsClosestALUType(isClosedALUType(latestInstID, curDistance, depPipe)); 858 } 859 } 860 else // From same pipeline 861 { 862 setAccurateDistType(ALUPipe); 863 if (sameOperandType) 864 { 865 instVec.front()->setIsClosestALUType(isClosedALUType(latestInstID, curDistance, ALUPipe)); 866 } 867 } 868 } 869 } 870 } 871 finalizeDistanceType2(IR_Builder & builder,std::vector<unsigned> ** latestInstID)872 void finalizeDistanceType2(IR_Builder& builder, std::vector<unsigned> **latestInstID) 873 { 874 if (instVec.front()->getDistanceTypeXe() != G4_INST::DistanceType::DIST_NONE) 875 { //Forced to a type already 876 return; 877 } 878 879 if (builder.loadThreadPayload() && nodeID <= 8 && 880 GetInstruction()->isSend() && distDep.size()) 881 { 882 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 883 return; 884 } 885 886 if (builder.hasA0WARHWissue() && builder.hasFourALUPipes()) 887 { 888 G4_INST* inst = GetInstruction(); 889 890 if (inst->getDst() && inst->getDst()->isA0()) 891 { 892 instVec.front()->setDistance(1); 893 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 894 895 return; 896 } 897 } 898 899 unsigned curDistance = (unsigned)instVec.front()->getDistance(); 900 if (distDep.size()) 901 { 902 SB_INST_PIPE depPipe = PIPE_NONE; 903 SB_INST_PIPE sameOpndTypeDepPipe = PIPE_NONE; 904 bool sameOperandType = true; 905 bool depSamePipe = true; 906 bool depLongPipe = false; 907 908 //Check the potential to use regDist 909 for (int i = 0; i < (int)(distDep.size()); i++) 910 { 911 SBDISTDEP_ITEM& it = distDep[i]; 912 913 //For regDist, the operand type is same as pipeline type of dependent instruction 914 if (it.operandType != it.liveNodePipe || it.dstDep) 915 { 916 sameOperandType = false; 917 } 918 } 919 920 //If the dependent instructions belong to different pipelines 921 for (int i = 0; i < (int)(distDep.size()); i++) 922 { 923 SBDISTDEP_ITEM& it = distDep[i]; 924 925 if (it.liveNodePipe != it.nodePipe) 926 { 927 depSamePipe = false; //Current instruction and dependence instruction belong to different pipeline 928 } 929 930 if (depPipe == PIPE_NONE) 931 { 932 depPipe = it.liveNodePipe; 933 } 934 935 depLongPipe = it.liveNodePipe == PIPE_LONG; 936 } 937 938 //In some platforms, A@ need be used for following case when the regDist cannot be used (due to HW issue), and inst depends on different pipes 939 //mov(8 | M0) r63.0 < 2 > :ud r15.0 < 1; 1, 0 > : ud{ I@7 } 940 //... 941 //add(8 | M0) r95.0 < 1 > : q r87.0 < 1; 1, 0 > : q r10.0 < 0; 1, 0 > : q{ I@7 } 942 //... 943 //add(8 | M0) r55.0 < 2 > : d r95.0 < 1; 1, 0 > : q r63.0 < 2; 1, 0 > : ud{ A@4 } 944 //In some platforms I@ accurate dependence can used for following case when the regDist cannot be used (due to HW issue), because inst depends on same pipe 945 //The example code piece : 946 //mov(16 | M0) r88.0 < 2 > : d r84.0 < 1; 1, 0 > : d{ F@2 } // $79&103 947 //mov(16 | M16) r90.0 < 2 > : d r85.0 < 1; 1, 0 > : d // $80&104 948 //mov(16 | M0) r88.1 < 2 > : d r86.0 < 1; 1, 0 > : d{ F@1 } // $81&105 949 //mov(16 | M16) r90.1 < 2 > : d r87.0 < 1; 1, 0 > : d // $82&106 950 //mov(16 | M0) r32.0 < 1 > : df r88.0 < 1; 1, 0 > : q{ @2/I@2 } // $83&107 951 //Since:q and : d all go to integer pipeline in, @2 should work.However, due to HW bug, we can only set to I@2 952 //Depends on multiple pipelines but same pipeline 953 bool mulitpleSamePipe = true; 954 if (distDep.size() > 1) 955 { 956 sameOpndTypeDepPipe = distDep[0].liveNodePipe; 957 for (int i = 1; i < (int)(distDep.size()); i++) 958 { 959 SBDISTDEP_ITEM& it = distDep[i]; 960 961 if (it.liveNodePipe != sameOpndTypeDepPipe) 962 { 963 mulitpleSamePipe = false; 964 } 965 } 966 } 967 968 if (builder.hasLongOperandTypeDepIssue() && depLongPipe) 969 { 970 sameOperandType = false; 971 } 972 973 if (!builder.getOptions()->getOption(vISA_disableRegDistDep) && 974 !builder.hasRegDistDepIssue()) 975 { 976 instVec.front()->setOperandTypeIndicated(sameOperandType); 977 } 978 979 //Multiple dependences 980 if (distDep.size() > 1) 981 { 982 if (!depSamePipe) //Dependent instruction and current instruction belong to different ALU pipelines. 983 { 984 if (sameOperandType) //But the operand type and dependence instruction are from same instruction pipeline. 985 { 986 assert(!GetInstruction()->isSend()); //Send operand has no type, sameOperandType is always false 987 if (mulitpleSamePipe) 988 { 989 setAccurateDistType(depPipe); 990 } 991 else 992 { 993 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 994 } 995 instVec.front()->setOperandTypeIndicated(false); //DPAS distance will be in sync inst 996 } 997 else if (mulitpleSamePipe) 998 { 999 setAccurateDistType(depPipe); 1000 instVec.front()->setOperandTypeIndicated(false); 1001 } 1002 else // set to DISTALL 1003 { 1004 instVec.front()->setDistanceTypeXe(G4_INST::DistanceType::DISTALL); 1005 instVec.front()->setOperandTypeIndicated(false); 1006 } 1007 } 1008 else //From same pipeline 1009 { 1010 setAccurateDistType(depPipe); 1011 instVec.front()->setIsClosestALUType(isClosedALUType(latestInstID, curDistance, depPipe)); 1012 } 1013 } 1014 else //Only one dependency 1015 { 1016 if (depPipe != PIPE_NONE) // From different pipeline 1017 { 1018 setAccurateDistType(depPipe); 1019 if (sameOperandType || GetInstruction()->isSend()) 1020 { 1021 instVec.front()->setIsClosestALUType(isClosedALUType(latestInstID, curDistance, depPipe)); 1022 } 1023 } 1024 else // From same pipeline 1025 { 1026 setAccurateDistType(ALUPipe); 1027 if (sameOperandType) 1028 { 1029 instVec.front()->setIsClosestALUType(isClosedALUType(latestInstID, curDistance, ALUPipe)); 1030 } 1031 } 1032 } 1033 } 1034 } 1035 getMaxDepDistance()1036 int getMaxDepDistance() const 1037 { 1038 auto inst = GetInstruction(); 1039 if (inst->getDst() && inst->getDst()->getTypeSize() == 8) 1040 { // Note that for XeLP, there are no 8 bytes ALU instruction. 1041 return SWSB_MAX_ALU_DEPENDENCE_DISTANCE_64BIT; 1042 } 1043 else 1044 { 1045 return SWSB_MAX_ALU_DEPENDENCE_DISTANCE; 1046 } 1047 } 1048 setDepToken(unsigned short token,SWSBTokenType type,SBNode * node)1049 void setDepToken(unsigned short token, SWSBTokenType type, SBNode *node) 1050 { 1051 for (DepToken& depToken : depTokens) 1052 { 1053 if (depToken.token == token) 1054 { 1055 if (depToken.type == SWSBTokenType::AFTER_WRITE) 1056 { 1057 return; 1058 } 1059 if (type == SWSBTokenType::AFTER_WRITE) 1060 { 1061 depToken.type = type; 1062 depToken.depNode = node; 1063 } 1064 return; 1065 } 1066 } 1067 1068 struct DepToken dt; 1069 dt.token = token; 1070 dt.type = type; 1071 dt.depNode = node; 1072 depTokens.push_back(dt); 1073 } eraseDepToken(unsigned i)1074 void eraseDepToken(unsigned i) 1075 { 1076 depTokens.erase(depTokens.begin() + i); 1077 } 1078 getDepTokenNum()1079 size_t getDepTokenNum() const { return depTokens.size(); } getDepToken(unsigned int i,SWSBTokenType & type)1080 unsigned short getDepToken(unsigned int i, SWSBTokenType& type) const 1081 { 1082 type = depTokens[i].type; 1083 return depTokens[i].token; 1084 } 1085 getDepTokenNodeID(unsigned int i)1086 unsigned short getDepTokenNodeID(unsigned int i) const 1087 { 1088 return depTokens[i].depNode->getNodeID();; 1089 } 1090 setTokenReuseNode(SBNode * node)1091 void setTokenReuseNode(SBNode *node) 1092 { 1093 tokenReusedNode = node; 1094 } 1095 new(size_t sz,vISA::Mem_Manager & m)1096 void *operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); } 1097 dumpInterval()1098 void dumpInterval() const 1099 { 1100 std::cerr << "#" << nodeID << ": " << liveStartID << "-" << liveEndID << "\n"; 1101 } dumpAssignedTokens()1102 void dumpAssignedTokens() const 1103 { 1104 std::cerr << "#" << nodeID << ": " << liveStartID << "-" << liveEndID << "\n"; 1105 std::cerr << "Token: " << getLastInstruction()->getToken(); 1106 if (tokenReusedNode != nullptr) 1107 { 1108 std::cerr << "\t\tReuse:" << tokenReusedNode->getNodeID(); 1109 } 1110 std::cerr << "\n"; 1111 } 1112 }; 1113 } 1114 typedef std::vector<vISA::SBNode*> SBNODE_VECT; 1115 typedef std::vector<vISA::SBNode*>::iterator SBNODE_VECT_ITER; 1116 typedef std::list<vISA::SBNode*> SBNODE_LIST; 1117 typedef std::list<vISA::SBNode*>::iterator SBNODE_LIST_ITER; 1118 1119 namespace vISA 1120 { 1121 //Similar as SBBucketNode, but it's used for the bucket descriptions got from each operands. 1122 struct SBBucketDesc { 1123 const int bucket; 1124 const Gen4_Operand_Number opndNum; 1125 SBNode* const node; 1126 const SBFootprint* footprint; 1127 SBBucketDescSBBucketDesc1128 SBBucketDesc(int Bucket, Gen4_Operand_Number opnd_num, SBNode *sNode, const SBFootprint *f) 1129 : bucket(Bucket), opndNum(opnd_num), node(sNode), footprint(f) { 1130 ; 1131 } 1132 }; 1133 1134 //The node in the node vector of each bucket. 1135 struct SBBucketNode 1136 { 1137 SBNode* node; 1138 Gen4_Operand_Number opndNum; 1139 int sendID = -1; 1140 const SBFootprint* footprint; 1141 SBBucketNodeSBBucketNode1142 SBBucketNode(SBNode *node1, Gen4_Operand_Number opndNum1, const SBFootprint *f) 1143 : node(node1), opndNum(opndNum1), footprint(f) 1144 { 1145 } 1146 ~SBBucketNodeSBBucketNode1147 ~SBBucketNode() 1148 { 1149 } 1150 setSendIDSBBucketNode1151 void setSendID(int ID) 1152 { 1153 sendID = ID; 1154 } 1155 getSendIDSBBucketNode1156 int getSendID() const 1157 { 1158 return sendID; 1159 } 1160 dumpSBBucketNode1161 void dump() const 1162 { 1163 std::cerr << "#" << node->getNodeID() << "-" << opndNum << ","; 1164 } 1165 }; 1166 1167 typedef std::vector<SBBucketNode *> SBBUCKET_VECTOR; 1168 typedef SBBUCKET_VECTOR::iterator SBBUCKET_VECTOR_ITER; 1169 1170 // This class hides the internals of dependence tracking using buckets 1171 class LiveGRFBuckets 1172 { 1173 std::vector<SBBUCKET_VECTOR *> nodeBucketsArray; 1174 G4_Kernel &k; 1175 vISA::Mem_Manager &mem; 1176 const int numOfBuckets; 1177 1178 public: LiveGRFBuckets(vISA::Mem_Manager & m,int TOTAL_BUCKETS,G4_Kernel & k)1179 LiveGRFBuckets(vISA::Mem_Manager& m, int TOTAL_BUCKETS, G4_Kernel& k) 1180 : nodeBucketsArray(TOTAL_BUCKETS), k(k), mem(m), numOfBuckets(TOTAL_BUCKETS) 1181 { 1182 // Initialize a vector for each bucket 1183 for (auto& bucket : nodeBucketsArray) 1184 { 1185 void* allocedMem = mem.alloc(sizeof(SBBUCKET_VECTOR)); 1186 bucket = new (allocedMem) SBBUCKET_VECTOR(); 1187 } 1188 } 1189 ~LiveGRFBuckets()1190 ~LiveGRFBuckets() 1191 { 1192 for (int i = 0; i < numOfBuckets; i++) 1193 { 1194 SBBUCKET_VECTOR *bucketVec = nodeBucketsArray[i]; 1195 if (bucketVec) 1196 { 1197 bucketVec->~SBBUCKET_VECTOR(); 1198 } 1199 } 1200 } 1201 getNumOfBuckets()1202 int getNumOfBuckets() const 1203 { 1204 return numOfBuckets; 1205 } 1206 1207 //The iterator which is used to scan the node vector of each bucket 1208 class BN_iterator 1209 { 1210 public: 1211 const LiveGRFBuckets *LB; 1212 SBBUCKET_VECTOR_ITER node_it; 1213 int bucket; 1214 BN_iterator(const LiveGRFBuckets * LB1,SBBUCKET_VECTOR_ITER It,int Bucket)1215 BN_iterator(const LiveGRFBuckets *LB1, SBBUCKET_VECTOR_ITER It, int Bucket) 1216 : LB(LB1), node_it(It), bucket(Bucket) 1217 { 1218 } 1219 1220 BN_iterator &operator++() 1221 { 1222 ++node_it; 1223 return *this; 1224 } 1225 1226 bool operator!=(const BN_iterator &it2) const 1227 { 1228 assert(LB == it2.LB); 1229 return (!(bucket == it2.bucket && node_it == it2.node_it)); 1230 } 1231 1232 SBBucketNode *operator*() 1233 { 1234 assert(node_it != LB->nodeBucketsArray[bucket]->end()); 1235 return *node_it; 1236 } 1237 }; 1238 begin(int bucket)1239 BN_iterator begin(int bucket) const 1240 { 1241 return BN_iterator(this, nodeBucketsArray[bucket]->begin(), bucket); 1242 } 1243 end(int bucket)1244 BN_iterator end(int bucket) const 1245 { 1246 return BN_iterator(this, nodeBucketsArray[bucket]->end(), bucket); 1247 } 1248 1249 //Scan the node vector of the bucket, and kill the bucket node with the specified node and operand bucketKill(int bucket,SBNode * node,Gen4_Operand_Number opnd)1250 void bucketKill(int bucket, SBNode *node, Gen4_Operand_Number opnd) 1251 { 1252 SBBUCKET_VECTOR &vec = *nodeBucketsArray[bucket]; 1253 for (unsigned int i = 0; i < vec.size(); i++) 1254 { 1255 SBBucketNode *bNode = vec[i]; 1256 1257 //Same node and same operand 1258 if (bNode->node == node && 1259 bNode->opndNum == opnd) 1260 { 1261 vec.erase(vec.begin() + i); 1262 break; 1263 } 1264 } 1265 } 1266 //Kill the bucket node specified by bn_it killSingleOperand(BN_iterator & bn_it)1267 void killSingleOperand(BN_iterator &bn_it) 1268 { 1269 SBBUCKET_VECTOR &vec = *nodeBucketsArray[bn_it.bucket]; 1270 SBBUCKET_VECTOR_ITER &node_it = bn_it.node_it; 1271 1272 //Kill current node 1273 if (*node_it == vec.back()) 1274 { 1275 vec.pop_back(); 1276 node_it = vec.end(); 1277 } 1278 else 1279 { 1280 //Current node is assigned with the last one 1281 //For caller, same iterator position need be handled again, 1282 //Because a new node is copied here 1283 *node_it = vec.back(); 1284 vec.pop_back(); 1285 } 1286 } 1287 1288 //Kill the bucket node specified by bn_it, also kill the same node in other buckets killOperand(BN_iterator & bn_it)1289 void killOperand(BN_iterator &bn_it) 1290 { 1291 SBBUCKET_VECTOR &vec = *nodeBucketsArray[bn_it.bucket]; 1292 SBBUCKET_VECTOR_ITER &node_it = bn_it.node_it; 1293 SBBucketNode *bucketNode = *node_it; //Get the node before it is destroyed 1294 int aregOffset = k.getNumRegTotal(); 1295 1296 //Kill current node 1297 if (*node_it == vec.back()) 1298 { 1299 vec.pop_back(); 1300 node_it = vec.end(); 1301 } 1302 else 1303 { 1304 //Current node is assigned with the last one 1305 //For caller, same iterator position need be handled again, 1306 //Because a new node is copied here 1307 *node_it = vec.back(); 1308 vec.pop_back(); 1309 } 1310 1311 //Kill the same node in other bucket. 1312 for (const SBFootprint *footprint = bucketNode->node->getFirstFootprint(bucketNode->opndNum); footprint; footprint = footprint->next) 1313 { 1314 unsigned int startBucket = footprint->LeftB / numEltPerGRF<Type_UB>(); 1315 unsigned int endBucket = footprint->RightB / numEltPerGRF<Type_UB>(); 1316 if (footprint->fType == ACC_T) 1317 { 1318 startBucket = startBucket + aregOffset; 1319 endBucket = endBucket + aregOffset; 1320 } 1321 1322 if (footprint->inst == bucketNode->footprint->inst) 1323 { 1324 for (unsigned int i = startBucket; (i < endBucket + 1) && (i < nodeBucketsArray.size()); i++) 1325 { 1326 if (i == bn_it.bucket) 1327 { 1328 continue; 1329 } 1330 bucketKill(i, bucketNode->node, bucketNode->opndNum); 1331 } 1332 } 1333 } 1334 } 1335 1336 //Add a node into bucket add(SBBucketNode * bucketNode,int bucket)1337 void add(SBBucketNode *bucketNode, int bucket) 1338 { 1339 assert(nodeBucketsArray[bucket] != nullptr); 1340 SBBUCKET_VECTOR& nodeVec = *(nodeBucketsArray[bucket]); 1341 if (std::find(nodeVec.begin(), nodeVec.end(), bucketNode) == nodeVec.end()) 1342 { 1343 nodeVec.push_back(bucketNode); 1344 } 1345 } 1346 new(size_t sz,vISA::Mem_Manager & m)1347 void *operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); } 1348 dumpLives()1349 void dumpLives() const 1350 { 1351 for (int curBucket = 0; curBucket < numOfBuckets; curBucket++) 1352 { 1353 if (nodeBucketsArray[curBucket]->size()) 1354 { 1355 std::cerr << " GRF" << curBucket << ":"; 1356 for (SBBucketNode *liveBN : *nodeBucketsArray[curBucket]) 1357 { 1358 std::cerr << " " << liveBN->node->getNodeID() << "(" << liveBN->opndNum << ")"; 1359 } 1360 std::cerr << "\t"; 1361 if ((curBucket + 1) % 8 == 0) 1362 { 1363 std::cerr << "\n"; 1364 } 1365 } 1366 } 1367 std::cerr << "\n"; 1368 } 1369 }; 1370 1371 typedef std::list<G4_BB_SB *> BB_SWSB_LIST; 1372 typedef BB_SWSB_LIST::iterator BB_SWSB_LIST_ITER; 1373 1374 typedef struct _SWSB_INDEXES { 1375 int instIndex = 0; 1376 int ALUIndex = 0; 1377 int integerIndex = 0; 1378 int floatIndex = 0; 1379 int longIndex = 0; 1380 int DPASIndex = 0; 1381 int mathIndex = 0; 1382 unsigned latestDepALUID[PIPE_DPAS] = {0}; 1383 std::vector<unsigned> latestInstID[PIPE_DPAS]; 1384 } SWSB_INDEXES; 1385 1386 typedef struct _SWSB_LOOP { 1387 int entryBBID = -1 ; 1388 int endBBID = -1; 1389 } SWSB_LOOP; 1390 1391 class G4_BB_SB { 1392 private: 1393 const SWSB& swsb; 1394 IR_Builder& builder; 1395 vISA::Mem_Manager &mem; 1396 G4_BB *bb; 1397 G4_Label* BBLabel; 1398 int nodeID; 1399 int ALUID; 1400 1401 SBNode* dpasNode = nullptr; 1402 SBNode* lastDpasNode = nullptr; 1403 int integerID; 1404 int floatID; 1405 int longID; 1406 int mathID; 1407 int DPASID; 1408 unsigned latestDepALUID[PIPE_DPAS]; 1409 std::vector<unsigned> *latestInstID[PIPE_DPAS]; 1410 1411 int totalGRFNum; 1412 int tokenAfterDPASCycle; 1413 1414 public: 1415 LiveGRFBuckets *send_use_kills; 1416 BB_SWSB_LIST Preds; 1417 BB_SWSB_LIST Succs; 1418 1419 BB_SWSB_LIST domPreds; 1420 BB_SWSB_LIST domSuccs; 1421 1422 int first_node = -1; 1423 int last_node = -1; 1424 1425 int first_send_node; 1426 int last_send_node; 1427 1428 bool tokenAssigned = false; 1429 1430 int send_start = -1; 1431 int send_end = -1; 1432 1433 unsigned loopStartBBID = -1; // The start BB ID of live range 1434 unsigned loopEndBBID = -1; // The start BB ID of live range 1435 1436 SBBitSets send_def_out; 1437 SBBitSets send_live_in; 1438 SBBitSets send_live_out; 1439 SBBitSets send_may_kill; 1440 SBBitSets send_live_in_scalar; 1441 SBBitSets send_live_out_scalar; 1442 SBBitSets send_kill_scalar; 1443 1444 BitSet dominators; 1445 BitSet send_WAW_may_kill; 1446 1447 //For token reduction 1448 BitSet liveInTokenNodes; 1449 BitSet liveOutTokenNodes; 1450 BitSet killedTokens; 1451 std::vector<BitSet> tokeNodesMap; 1452 int first_DPASID = 0; 1453 int last_DPASID = 0; 1454 unsigned *tokenLiveInDist; 1455 unsigned *tokenLiveOutDist; 1456 SBBitSets localReachingSends; 1457 1458 //BB local data dependence analysis G4_BB_SB(const SWSB & sb,IR_Builder & b,Mem_Manager & m,G4_BB * block,SBNODE_VECT * SBNodes,SBNODE_VECT * SBSendNodes,SBBUCKET_VECTOR * globalSendOpndList,SWSB_INDEXES * indexes,uint32_t & globalSendNum,LiveGRFBuckets * lb,LiveGRFBuckets * globalLB,PointsToAnalysis & p,std::map<G4_Label *,G4_BB_SB * > * LabelToBlockMap,const unsigned dpasLatency)1459 G4_BB_SB(const SWSB& sb, IR_Builder& b, Mem_Manager &m, G4_BB *block, SBNODE_VECT *SBNodes, SBNODE_VECT* SBSendNodes, 1460 SBBUCKET_VECTOR *globalSendOpndList, SWSB_INDEXES *indexes, uint32_t &globalSendNum, LiveGRFBuckets *lb, 1461 LiveGRFBuckets *globalLB, PointsToAnalysis& p, 1462 std::map<G4_Label*, G4_BB_SB*> *LabelToBlockMap, const unsigned dpasLatency) : swsb(sb), builder(b), mem(m), bb(block), tokenAfterDPASCycle(dpasLatency) 1463 { 1464 for (int i = 0; i < PIPE_DPAS; i++) 1465 { 1466 latestDepALUID[i] = 0; 1467 latestInstID[i] = nullptr; 1468 } 1469 first_send_node = -1; 1470 last_send_node = -1; 1471 totalGRFNum = block->getKernel().getNumRegTotal(); 1472 SBDDD(bb, lb, globalLB, SBNodes, SBSendNodes, globalSendOpndList, indexes, globalSendNum, p, LabelToBlockMap); 1473 } 1474 ~G4_BB_SB()1475 ~G4_BB_SB() 1476 { 1477 } 1478 getBB()1479 G4_BB* getBB() const { return bb; } getLabel()1480 G4_Label* getLabel() const { return BBLabel; } 1481 1482 static bool isGRFEdgeAdded(const SBNode* pred, const SBNode* succ, DepType d, SBDependenceAttr a); 1483 void createAddGRFEdge(SBNode* pred, SBNode* succ, DepType d, SBDependenceAttr a); 1484 1485 //Bucket and range analysis 1486 SBFootprint* getFootprintForGRF(G4_Operand * opnd, 1487 Gen4_Operand_Number opnd_num, 1488 G4_INST *inst, 1489 int startingBucket, 1490 bool mustBeWholeGRF); 1491 SBFootprint * getFootprintForACC(G4_Operand * opnd, 1492 Gen4_Operand_Number opnd_num, 1493 G4_INST *inst); 1494 SBFootprint* getFootprintForFlag(G4_Operand* opnd, 1495 Gen4_Operand_Number opnd_num, 1496 G4_INST* inst); 1497 bool isLongPipeType(G4_Type type) const; 1498 bool isIntegerPipeType(G4_Type type) const; 1499 bool isLongPipeInstructionXe(const G4_INST* inst) const; 1500 bool isIntegerPipeInstructionXe(const G4_INST* inst) const; 1501 bool isFloatPipeInstructionXe(const G4_INST* inst) const; 1502 SB_INST_PIPE getInstructionPipeXe(const G4_INST* inst); 1503 bool getFootprintForOperand(SBNode *node, 1504 G4_INST *inst, 1505 G4_Operand* opnd, 1506 Gen4_Operand_Number opnd_num); 1507 void getGRFBuckets(SBNode* node, const SBFootprint* footprint, Gen4_Operand_Number opndNum, std::vector<SBBucketDesc>& BDvec, bool GRFOnly); 1508 bool getGRFFootPrintOperands(SBNode *node, 1509 G4_INST *inst, 1510 Gen4_Operand_Number first_opnd, 1511 Gen4_Operand_Number last_opnd, 1512 PointsToAnalysis& p); 1513 void getGRFFootprintForIndirect(SBNode* node, 1514 Gen4_Operand_Number opnd_num, 1515 G4_Operand* opnd, 1516 PointsToAnalysis& p); 1517 void getGRFBucketsForOperands(SBNode *node, 1518 Gen4_Operand_Number first_opnd, 1519 Gen4_Operand_Number last_opnd, 1520 std::vector<SBBucketDesc>& BDvec, 1521 bool GRFOnly); 1522 1523 bool getGRFFootPrint(SBNode *node, 1524 PointsToAnalysis &p); 1525 1526 void getGRFBucketDescs(SBNode *node, 1527 std::vector<SBBucketDesc>& BDvec, 1528 bool GRFOnly); 1529 1530 void clearSLMWARWAissue(SBNode* curNode, LiveGRFBuckets* LB); 1531 1532 void setDistance(const SBFootprint * footprint, SBNode *node, SBNode *liveNode, bool dstDep); 1533 void setSpecialDistance(SBNode* node); 1534 static void footprintMerge(SBNode * node, const SBNode * nextNode); 1535 1536 void pushItemToQueue(std::vector<unsigned>* nodeIDQueue, unsigned nodeID); 1537 1538 bool hasInternalDependence(SBNode* nodeFirst, SBNode* nodeNext); 1539 1540 bool is2xDPBlockCandidate(G4_INST* inst, bool accDST); 1541 //Local distance dependence analysis and assignment 1542 void SBDDD(G4_BB* bb, 1543 LiveGRFBuckets* &LB, 1544 LiveGRFBuckets* &globalSendsLB, 1545 SBNODE_VECT *SBNodes, 1546 SBNODE_VECT *SBSendNodes, 1547 SBBUCKET_VECTOR *globalSendOpndList, 1548 SWSB_INDEXES *indexes, 1549 uint32_t &globalSendNum, 1550 PointsToAnalysis& p, 1551 std::map<G4_Label*, G4_BB_SB*> *LabelToBlockMap); 1552 1553 //Global SBID dependence analysis 1554 void setSendOpndMayKilled(LiveGRFBuckets *globalSendsLB, SBNODE_VECT *SBNodes, PointsToAnalysis &p); 1555 void dumpTokenLiveInfo(SBNODE_VECT * SBSendNodes); 1556 void getLiveBucketsFromFootprint(const SBFootprint *firstFootprint, SBBucketNode* sBucketNode, LiveGRFBuckets *send_use_kills) const; 1557 void addGlobalDependence(unsigned globalSendNum, SBBUCKET_VECTOR *globalSendOpndList, SBNODE_VECT *SBNodes, PointsToAnalysis &p, bool afterWrite); 1558 void clearKilledBucketNodeXeLP(LiveGRFBuckets * LB, int ALUID); 1559 1560 void clearKilledBucketNodeXeHP(LiveGRFBuckets* LB, int integerID, int floatID, int longID, int mathID); 1561 1562 bool hasInternalDependenceWithinDPAS(SBNode *node); 1563 bool hasDependenceBetweenDPASNodes(SBNode * node, SBNode * nextNode); 1564 bool src2FootPrintCachePVC(SBNode* curNode, SBNode* nextNode) const; 1565 bool src2SameFootPrintDiffType(SBNode* curNode, SBNode* nextNode) const; 1566 bool isLastDpas(SBNode * curNode, SBNode * nextNode); 1567 1568 1569 void getLiveOutToken(unsigned allSendNum, const SBNODE_VECT *SBNodes); 1570 1571 getLoopStartBBID()1572 unsigned getLoopStartBBID() const { return loopStartBBID; } getLoopEndBBID()1573 unsigned getLoopEndBBID() const { return loopEndBBID; } 1574 setLoopStartBBID(unsigned id)1575 void setLoopStartBBID(unsigned id) { loopStartBBID = id; } setLoopEndBBID(unsigned id)1576 void setLoopEndBBID(unsigned id) { loopEndBBID = id; } 1577 new(size_t sz,vISA::Mem_Manager & m)1578 void *operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); } 1579 1580 void dumpLiveInfo(const SBBUCKET_VECTOR *globalSendOpndList, unsigned globalSendNum, const SBBitSets *send_kill) const; 1581 }; 1582 1583 typedef std::vector<G4_BB_SB *> BB_SWSB_VECTOR; 1584 1585 class SWSB_TOKEN_PROFILE { 1586 uint32_t tokenInstructionCount = 0; 1587 uint32_t tokenReuseCount = 0; 1588 uint32_t AWTokenReuseCount = 0; 1589 uint32_t ARTokenReuseCount = 0; 1590 uint32_t AATokenReuseCount = 0; 1591 uint32_t mathInstCount = 0; 1592 uint32_t syncInstCount = 0; 1593 uint32_t mathReuseCount = 0; 1594 uint32_t ARSyncInstCount = 0; 1595 uint32_t AWSyncInstCount = 0; 1596 uint32_t ARSyncAllCount = 0; 1597 uint32_t AWSyncAllCount = 0; 1598 uint32_t prunedDepEdges = 0; 1599 uint32_t prunedGlobalEdgeNum = 0; 1600 uint32_t prunedDiffBBEdgeNum = 0; 1601 uint32_t prunedDiffBBSameTokenEdgeNum = 0; 1602 public: SWSB_TOKEN_PROFILE()1603 SWSB_TOKEN_PROFILE() { 1604 ; 1605 } 1606 ~SWSB_TOKEN_PROFILE()1607 ~SWSB_TOKEN_PROFILE() 1608 { 1609 } new(size_t sz,vISA::Mem_Manager & m)1610 void* operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); } 1611 setTokenInstructionCount(int count)1612 void setTokenInstructionCount(int count) { tokenInstructionCount = count; } getTokenInstructionCount()1613 uint32_t getTokenInstructionCount() const { return tokenInstructionCount; } 1614 setTokenReuseCount(int count)1615 void setTokenReuseCount(int count) { tokenReuseCount = count; } getTokenReuseCount()1616 uint32_t getTokenReuseCount() const { return tokenReuseCount; } 1617 setAWTokenReuseCount(int count)1618 void setAWTokenReuseCount(int count) { AWTokenReuseCount = count; } getAWTokenReuseCount()1619 uint32_t getAWTokenReuseCount() const { return AWTokenReuseCount; } 1620 setARTokenReuseCount(int count)1621 void setARTokenReuseCount(int count) { ARTokenReuseCount = count; } getARTokenReuseCount()1622 uint32_t getARTokenReuseCount() const { return ARTokenReuseCount; } 1623 setAATokenReuseCount(int count)1624 void setAATokenReuseCount(int count) { AATokenReuseCount = count; } getAATokenReuseCount()1625 uint32_t getAATokenReuseCount() const { return AATokenReuseCount; } 1626 setMathInstCount(int count)1627 void setMathInstCount(int count) { mathInstCount = count; } getMathInstCount()1628 uint32_t getMathInstCount() const { return mathInstCount; } 1629 setSyncInstCount(int count)1630 void setSyncInstCount(int count) { syncInstCount = count; } getSyncInstCount()1631 uint32_t getSyncInstCount() const { return syncInstCount; } 1632 setMathReuseCount(int count)1633 void setMathReuseCount(int count) { mathReuseCount = count; } getMathReuseCount()1634 uint32_t getMathReuseCount() const { return mathReuseCount; } 1635 setARSyncInstCount(int count)1636 void setARSyncInstCount(int count) { ARSyncInstCount = count; } getARSyncInstCount()1637 uint32_t getARSyncInstCount() const { return ARSyncInstCount; } 1638 setAWSyncInstCount(int count)1639 void setAWSyncInstCount(int count) { AWSyncInstCount = count; } getAWSyncInstCount()1640 uint32_t getAWSyncInstCount() const { return AWSyncInstCount; } 1641 setARSyncAllCount(int count)1642 void setARSyncAllCount(int count) { ARSyncAllCount = count; } getARSyncAllCount()1643 uint32_t getARSyncAllCount() const { return ARSyncAllCount; } 1644 setAWSyncAllCount(int count)1645 void setAWSyncAllCount(int count) { AWSyncAllCount = count; } getAWSyncAllCount()1646 uint32_t getAWSyncAllCount() const { return AWSyncAllCount; } 1647 setPrunedEdgeNum(int num)1648 void setPrunedEdgeNum(int num) { prunedDepEdges = num; } getPrunedEdgeNum()1649 uint32_t getPrunedEdgeNum() const { return prunedDepEdges; } 1650 setPrunedGlobalEdgeNum(int num)1651 void setPrunedGlobalEdgeNum(int num) { prunedGlobalEdgeNum = num; } getPrunedGlobalEdgeNum()1652 uint32_t getPrunedGlobalEdgeNum() const { return prunedGlobalEdgeNum; } 1653 setPrunedDiffBBEdgeNum(int num)1654 void setPrunedDiffBBEdgeNum(int num) { prunedDiffBBEdgeNum = num; } getPrunedDiffBBEdgeNum()1655 uint32_t getPrunedDiffBBEdgeNum() const { return prunedDiffBBEdgeNum; } 1656 setPrunedDiffBBSameTokenEdgeNum(int num)1657 void setPrunedDiffBBSameTokenEdgeNum(int num) { prunedDiffBBSameTokenEdgeNum = num; } getPrunedDiffBBSameTokenEdgeNum()1658 uint32_t getPrunedDiffBBSameTokenEdgeNum() const { return prunedDiffBBSameTokenEdgeNum; } 1659 }; 1660 1661 class SWSB { 1662 G4_Kernel &kernel; 1663 FlowGraph& fg; 1664 vISA::Mem_Manager& mem; 1665 1666 BB_SWSB_VECTOR BBVector; // The basic block vector, ordered with ID of the BB 1667 SBNODE_VECT SBNodes; // All instruction nodes 1668 SBNODE_VECT SBSendNodes; // All out-of-order instruction nodes 1669 SBNODE_VECT SBSendUses; // All out-of-order instruction nodes 1670 #ifdef DEBUG_VERBOSE_ON 1671 SBNODE_VECT globalSBNodes; // All instruction nodes 1672 #endif 1673 SWSB_INDEXES indexes; // To pass ALU ID from previous BB to current. 1674 uint32_t globalSendNum = 0; // The number of out-of-order instructions which generate global dependencies. 1675 SBBUCKET_VECTOR globalSendOpndList; //All send operands which live out their instructions' BBs. No redundant. 1676 const uint32_t totalTokenNum; 1677 static constexpr unsigned TOKEN_AFTER_READ_CYCLE = 4; 1678 const unsigned tokenAfterWriteMathCycle; 1679 const unsigned tokenAfterWriteSendSlmCycle; 1680 const unsigned tokenAfterWriteSendMemoryCycle; 1681 const unsigned tokenAfterWriteSendSamplerCycle; 1682 int tokenAfterDPASCycle; 1683 1684 //For profiling 1685 uint32_t syncInstCount = 0; 1686 uint32_t AWSyncInstCount = 0; 1687 uint32_t ARSyncInstCount = 0; 1688 uint32_t mathReuseCount = 0; 1689 uint32_t ARSyncAllCount = 0; 1690 uint32_t AWSyncAllCount = 0; 1691 uint32_t tokenReuseCount = 0; 1692 1693 bool hasFCall = false; 1694 //Linear scan data structures for token allocation 1695 SBNODE_LIST linearScanLiveNodes; 1696 1697 std::vector<SBNode *> freeTokenList; 1698 1699 std::vector<SBNODE_VECT *> reachTokenArray; 1700 std::vector<SBNODE_VECT *> reachUseArray; 1701 SBNODE_VECT localTokenUsage; 1702 1703 int topIndex = -1; 1704 1705 std::map<G4_Label*, G4_BB_SB*> labelToBlockMap; 1706 1707 // TokenAllocation uses a BitSet to track nodes assigned by marking the 1708 // send IDs of nodes, so that it's possible to get a SBNode using the 1709 // send ID to index into SBSendNodes. 1710 // Also add a filed to record the max send ID being assigned. 1711 struct TokenAllocation 1712 { 1713 BitSet bitset; 1714 int maxSendID = 0; setTokenAllocation1715 void set(int sendID) 1716 { 1717 bitset.set(sendID, true); 1718 maxSendID = std::max(sendID, maxSendID); 1719 } 1720 }; 1721 std::vector<TokenAllocation> allTokenNodesMap; 1722 SWSB_TOKEN_PROFILE tokenProfile; 1723 1724 //Global dependence analysis 1725 bool globalDependenceDefReachAnalysis(G4_BB* bb); 1726 bool globalDependenceUseReachAnalysis(G4_BB* bb); 1727 void addGlobalDependence(unsigned globalSendNum, SBBUCKET_VECTOR *globalSendOpndList, SBNODE_VECT *SBNodes, PointsToAnalysis &p, bool afterWrite); 1728 void tokenEdgePrune(unsigned& prunedEdgeNum, unsigned& prunedGlobalEdgeNum, unsigned& prunedDiffBBEdgeNum, unsigned& prunedDiffBBSameTokenEdgeNum); 1729 void dumpTokenLiveInfo(); 1730 1731 void removePredsEdges(SBNode * node, SBNode * pred); 1732 1733 void dumpImmDom(ImmDominator* dom) const; 1734 1735 void setDefaultDistanceAtFirstInstruction(); 1736 1737 void quickTokenAllocation(); 1738 1739 //Token allocation 1740 void tokenAllocation(); 1741 void buildLiveIntervals(); 1742 void expireIntervals(unsigned startID); 1743 void addToLiveList(SBNode *node); 1744 1745 std::pair<int /*reuseDelay*/, int /*curDistance*/> examineNodeForTokenReuse(unsigned nodeID, unsigned nodeDelay, const SBNode *curNode, unsigned char nestLoopLevel, unsigned curLoopStartBB, unsigned curLoopEndBB) const; 1746 SBNode * reuseTokenSelection(const SBNode * node) const; 1747 unsigned short reuseTokenSelectionGlobal(SBNode* node, G4_BB* bb, SBNode*& candidateNode, bool& fromUse); 1748 void addReachingDefineSet(SBNode* node, SBBitSets* globalLiveSet, SBBitSets* localLiveSet); 1749 void addReachingUseSet(SBNode* node, SBNode* use); 1750 void addGlobalDependenceWithReachingDef(unsigned globalSendNum, SBBUCKET_VECTOR* globalSendOpndList, SBNODE_VECT* SBNodes, PointsToAnalysis& p, bool afterWrite); 1751 void expireLocalIntervals(unsigned startID, unsigned BBID); 1752 void assignTokenToPred(SBNode* node, SBNode* pred, G4_BB* bb); 1753 bool assignTokenWithPred(SBNode* node, G4_BB* bb); 1754 void allocateToken(G4_BB* bb); 1755 void tokenAllocationBB(G4_BB* bb); 1756 void buildExclusiveForCoalescing(); 1757 void tokenAllocationGlobalWithPropogation(); 1758 void tokenAllocationGlobal(); 1759 bool propogateDist(G4_BB* bb); 1760 void tokenAllocationWithDistPropogationPerBB(G4_BB* bb); 1761 void tokenAllocationWithDistPropogation(); 1762 void calculateDist(); 1763 1764 1765 //Assign Token 1766 void assignToken(SBNode *node, unsigned short token, uint32_t &AWTokenReuseCount, uint32_t &ARTokenReuseCount, uint32_t &AATokenReuseCount); 1767 void assignDepToken(SBNode *node); 1768 void assignDepTokens(); 1769 bool insertSyncTokenPVC(G4_BB * bb, SBNode * node, G4_INST * inst, INST_LIST_ITER inst_it, int newInstID, BitSet * dstTokens, BitSet * srcTokens, bool removeAllToken); 1770 bool insertSyncPVC(G4_BB * bb, SBNode * node, G4_INST * inst, INST_LIST_ITER inst_it, int newInstID, BitSet * dstTokens, BitSet * srcTokens); 1771 bool insertSyncXe(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens); 1772 void insertSync(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens); 1773 void insertTest(); 1774 1775 //Insert sync instructions 1776 G4_INST *insertTestInstruction(G4_BB *bb, INST_LIST_ITER nextIter, int CISAOff, int lineNo, bool countSyns); //FIXME: Please remove it when meta is not needed anymore 1777 G4_INST *insertSyncInstruction(G4_BB *bb, INST_LIST_ITER nextIter, int CISAOff, int lineNo); 1778 G4_INST *insertSyncInstructionAfter(G4_BB * bb, INST_LIST_ITER nextIter, int CISAOff, int lineNo); 1779 G4_INST* insertSyncAllRDInstruction(G4_BB *bb, unsigned int SBIDs, INST_LIST_ITER nextIter, int CISAOff, int lineNo); 1780 G4_INST *insertSyncAllWRInstruction(G4_BB *bb, unsigned int SBIDs, INST_LIST_ITER nextIter, int CISAOff, int lineNo); 1781 1782 bool insertSyncToken(G4_BB* bb, SBNode* node, G4_INST* inst, INST_LIST_ITER inst_it, int newInstID, BitSet* dstTokens, BitSet* srcTokens, bool& keepDst, bool removeAllToken); 1783 1784 void SWSBDepDistanceGenerator(PointsToAnalysis& p, LiveGRFBuckets &LB, LiveGRFBuckets &globalSendsLB); 1785 void handleFuncCall(); 1786 void SWSBGlobalTokenGenerator(PointsToAnalysis& p, LiveGRFBuckets &LB, LiveGRFBuckets &globalSendsLB); 1787 void SWSBBuildSIMDCFG(); 1788 void addSIMDEdge(G4_BB_SB *pred, G4_BB_SB* succ); 1789 void SWSBGlobalScalarCFGReachAnalysis(); 1790 void SWSBGlobalSIMDCFGReachAnalysis(); 1791 1792 void setTopTokenIndex(); 1793 1794 //Optimizations 1795 void tokenDepReduction(SBNode* node1, SBNode *node2); 1796 bool cycleExpired(const SBNode * node, int currentID) const; 1797 1798 void shareToken(const SBNode *node, const SBNode *succ, unsigned short token); 1799 1800 void SWSBGlobalTokenAnalysis(); 1801 bool globalTokenReachAnalysis(G4_BB *bb); 1802 1803 1804 //Dump 1805 void dumpDepInfo() const; 1806 void dumpLiveIntervals() const; 1807 void dumpTokeAssignResult() const; 1808 void dumpSync(const SBNode * tokenNode, const SBNode * syncNode, unsigned short token, SWSBTokenType type) const; 1809 1810 // Fast-composite support. 1811 void genSWSBPatchInfo(); 1812 1813 void getDominators(ImmDominator* dom); 1814 1815 1816 public: SWSB(G4_Kernel & k,vISA::Mem_Manager & m)1817 SWSB(G4_Kernel &k, vISA::Mem_Manager& m) 1818 : kernel(k), fg(k.fg), mem(m), 1819 totalTokenNum(k.fg.builder->kernel.getNumSWSBTokens()), 1820 tokenAfterWriteMathCycle(k.fg.builder->isXeLP() ? 20u : 17u), 1821 tokenAfterWriteSendSlmCycle(k.fg.builder->isXeLP() ? 33u : 25u), //unlocaled 25 1822 tokenAfterWriteSendMemoryCycle(k.fg.builder->getOptions()->getOption(vISA_USEL3HIT) 1823 ? (k.fg.builder->isXeLP() ? 106u : 150u) // TOKEN_AFTER_WRITE_SEND_L3_MEMORY_CYCLE 1824 : (k.fg.builder->isXeLP() ? 65u : 50u)), // TOKEN_AFTER_WRITE_SEND_L1_MEMORY_CYCLE 1825 tokenAfterWriteSendSamplerCycle(k.fg.builder->getOptions()->getOption(vISA_USEL3HIT) 1826 ? (k.fg.builder->isXeLP() ? 175u : 210u) // TOKEN_AFTER_WRITE_SEND_L3_SAMPLER_CYCLE 1827 : 60u) // TOKEN_AFTER_WRITE_SEND_L1_SAMPLER_CYCLE 1828 { 1829 indexes.instIndex = 0; 1830 indexes.ALUIndex = 0; 1831 indexes.integerIndex = 0; 1832 indexes.floatIndex = 0; 1833 indexes.longIndex = 0; 1834 indexes.DPASIndex = 0; 1835 indexes.mathIndex = 0; 1836 LatencyTable LT(k.fg.builder); 1837 tokenAfterDPASCycle = LT.getDPAS8x8Latency(); 1838 } ~SWSB()1839 ~SWSB() 1840 { 1841 for (SBNode *node : SBNodes) 1842 { 1843 node->~SBNode(); 1844 } 1845 1846 for (G4_BB_SB *bb : BBVector) 1847 { 1848 bb->~G4_BB_SB(); 1849 } 1850 1851 for (int i = 0; i != (int)reachTokenArray.size(); ++i) 1852 { 1853 reachTokenArray[i]->~SBNODE_VECT(); 1854 reachUseArray[i]->~SBNODE_VECT(); 1855 } 1856 } 1857 void SWSBGenerator(); 1858 unsigned calcDepDelayForNode(const SBNode *node) const; 1859 }; 1860 } 1861 #endif // _SWSB_H_ 1862