/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Rematerialization.h"

namespace vISA
{
    // Walk the whole kernel once and build the 'operations' table:
    // for each root G4_Declare record its defs (inst, BB), number of uses,
    // which GRF rows its uses touch, and the lexical id of its last use.
    // Also assigns a monotonically increasing lexical id to every
    // instruction and collects declares that already carry a physical
    // register assignment into preDefinedVars.
    void Rematerialization::populateRefs()
    {
        unsigned int id = 0;
        for (auto bb : kernel.fg)
        {
            for (auto inst : *bb)
            {
                inst->setLexicalId(id++);

                // Pseudo-kills are bookkeeping only; they are neither real
                // defs nor uses for remat purposes.
                if (inst->isPseudoKill())
                    continue;

                auto dst = inst->getDst();

                if (dst && !dst->isNullReg())
                {
                    auto topdcl = dst->getTopDcl();

                    if (topdcl)
                    {
                        operations[topdcl].def.push_back(std::make_pair(inst, bb));
                    }
                }

                for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
                {
                    auto srcOpnd = inst->getSrc(i);
                    if (srcOpnd &&
                        srcOpnd->isSrcRegRegion())
                    {
                        auto topdcl = srcOpnd->asSrcRegRegion()->getTopDcl();
                        // GRF rows spanned by this use of topdcl.
                        unsigned int startRow = srcOpnd->getLeftBound() / numEltPerGRF<Type_UB>();
                        unsigned int endRow = srcOpnd->getRightBound() / numEltPerGRF<Type_UB>();
                        if (topdcl)
                        {
                            auto dclIt = operations.find(topdcl);
                            if (dclIt == operations.end())
                            {
                                // First reference seen for this declare.
                                References r;
                                r.numUses = 1;
                                for (unsigned int k = startRow; k <= endRow; k++)
                                {
                                    r.rowsUsed.insert(k);
                                }
                                //r.uses.push_back(std::make_pair(inst, bb));
                                r.lastUseLexId = inst->getLexicalId();
                                operations.insert(std::make_pair(topdcl, r));
                            }
                            else
                            {
                                // Accumulate into the existing record.
                                (*dclIt).second.numUses++;
                                for (unsigned int k = startRow; k <= endRow; k++)
                                {
                                    (*dclIt).second.rowsUsed.insert(k);
                                }
                                (*dclIt).second.lastUseLexId = inst->getLexicalId();
                                //(*dclIt).second.uses.push_back(std::make_pair(inst, bb));
                            }
                        }
                    }
                }
            }

            // Update lastUseLexId based on BB live-out set: a variable that
            // is live at BB exit is treated as used at least as late as the
            // BB's last instruction.
            for (unsigned int i = 0; i < liveness.getNumSelectedVar(); i++)
            {
                if (bb->size() > 0 && liveness.isLiveAtExit(bb, i))
                {
                    auto lr = coloring.getLiveRanges()[i];
                    auto dclIt = operations.find(lr->getDcl()->getRootDeclare());
                    if (dclIt != operations.end())
                    {
                        (*dclIt).second.lastUseLexId = bb->back()->getLexicalId();
                    }
                }
            }
        }

        // Collect declares that already have a physical register; these are
        // consulted later (see isPartGRFBusyInput) when deciding whether
        // extending a pre-assigned input range is acceptable.
        for (auto& ref : operations)
        {
            auto dcl = ref.first;
            if (dcl->getRegVar() &&
                dcl->getRegVar()->getPhyReg())
                preDefinedVars.push_back(dcl);
        }
    }

    // Associate every header-present sampler split-send with the
    // samplerHeader mov that lexically precedes it in the same BB.
    // The map is used by deLVNSamplers()/rematerialize() to clone the
    // correct header setup next to a remat'd sampler.
    void Rematerialization::populateSamplerHeaderMap()
    {
        samplerHeaderMapPopulated = true;

        if (!samplerHeader)
            return;

        for (auto bb : kernel.fg)
        {
            // Most recent mov writing samplerHeader seen in this BB.
            G4_INST* samplerHeaderMov = nullptr;
            for (auto inst : *bb)
            {
                if (inst->getDst() &&
                    inst->getDst()->getTopDcl() == samplerHeader)
                {
                    samplerHeaderMov = inst;
                    continue;
                }

                if (samplerHeaderMov &&
                    inst->isSplitSend() &&
                    inst->getMsgDesc()->isSampler() &&
                    inst->getMsgDescRaw() &&
                    inst->getMsgDescRaw()->isHeaderPresent())
                {
                    MUST_BE_TRUE(samplerHeaderMov->getExecSize() == 1, "Unexpected sampler header");
                    samplerHeaderMap.insert(std::make_pair(inst, samplerHeaderMov));
                }
            }
        }
    }

    void Rematerialization::deLVNSamplers(G4_BB* bb)
    {
        // LVN pass removes redundant samplerHeader movs. This way
        // several consecutive samplers can use same samplerHeader
        // instruction. However, when remat is done, extra care
        // needs to be taken so that all samplers still use same
        // header as before. Consider this snippet:
        //
        // samplerHeader(0,2) = a
        // send (16) ... samplerHeader ...
        // = V1
        // send (16) ... samplerHeader ...
        //
        // After remating V1:
        //
        // samplerHeader(0,2) = a
        // send (16) ... samplerHeader ...
        // samplerHeader(0,2) = b
        // send (16) REMAT_V1 samplerHeader ...
        // send (16) ... samplerHeader ... <-- Uses incorrect samplerHeader!
        //
        // This function deLVNs all samplerHeaders in the program and later
        // we LVN them back after remating is done. This ensures correctness.
        if (!samplerHeader)
            return;

        for (auto instIt = bb->begin();
            instIt != bb->end();
            )
        {
            auto inst = (*instIt);

            if (inst->isSplitSend() &&
                inst->getMsgDesc()->isSampler())
            {
                auto samplerHeaderInstIt = samplerHeaderMap.find(inst);

                if (samplerHeaderInstIt != samplerHeaderMap.end())
                {
                    auto samplerHeaderMov = (*samplerHeaderInstIt).second;

                    // Give this sampler its own private copy of the header
                    // setup mov, immediately before the send.
                    auto dupOp = samplerHeaderMov->cloneInst();

                    bb->insertBefore(instIt, dupOp);
                }
            }

            instIt++;
        }
    }

    // Return true iff both BBs belong to the same subroutine, or both
    // belong to the main kernel (i.e. neither appears in BBPerSubroutine).
    bool Rematerialization::inSameSubroutine(G4_BB* use, G4_BB* def)
    {
        // Return true if both BBs belong to same sub
        auto defBBIt = BBPerSubroutine.find(def);
        auto useBBIt = BBPerSubroutine.find(use);

        // Neither BBs found in map means both are part of main kernel
        if (defBBIt == BBPerSubroutine.end() &&
            useBBIt == BBPerSubroutine.end())
            return true;

        if (defBBIt != BBPerSubroutine.end() &&
            useBBIt != BBPerSubroutine.end())
        {
            // Both BBs part of same subroutine
            if ((*defBBIt).second == (*useBBIt).second)
                return true;
        }

        // BBs not part of same subroutine
        return false;
    }

    // bb1 should block defining original computation and
    // bb2 should be the block where remat is expected.
    // Sets bb1OutsideLoop to true when bb1 is not a member of any natural
    // loop. Returns false only when bb1 and bb2 disagree on membership in
    // some natural loop (i.e. they are in different loops/nesting).
    bool Rematerialization::areInSameLoop(G4_BB* bb1, G4_BB* bb2, bool& bb1OutsideLoop)
    {
        bool bb1InAnyLoop = false;
        bb1OutsideLoop = false;

        // Check whether bb1 is in any loop at all. If not,
        // then we can allow remat even if bb2 is in a loop.
        // The case that is disallowed is where bb1 and bb2
        // are both in loops, but in different ones.
        for (auto&& be : kernel.fg.backEdges)
        {
            auto loopIt = kernel.fg.naturalLoops.find(be);

            if (loopIt != kernel.fg.naturalLoops.end())
            {
                auto&& bbsInLoop = (*loopIt).second;

                auto bb1InLoop = bbsInLoop.find(bb1);
                if (bb1InLoop != bbsInLoop.end())
                {
                    bb1InAnyLoop = true;
                    break;
                }
            }
        }

        if (!bb1InAnyLoop)
            bb1OutsideLoop = true;

        for (auto&& be : kernel.fg.backEdges)
        {
            auto loopIt = kernel.fg.naturalLoops.find(be);

            if (loopIt != kernel.fg.naturalLoops.end())
            {
                auto&& bbsInLoop = (*loopIt).second;

                auto bb1InLoop = bbsInLoop.find(bb1);
                auto bb2InLoop = bbsInLoop.find(bb2);

                // Both BBs must be present in all nested loops
                if ((bb1InLoop == bbsInLoop.end() && bb2InLoop != bbsInLoop.end()) ||
                    (bb1InLoop != bbsInLoop.end() && bb2InLoop == bbsInLoop.end()))
                {
                    return false;
                }
            }
        }

        return true;
    }

    // True when dcl exists and has been marked spilled by the allocator.
    bool Rematerialization::isRangeSpilled(G4_Declare* dcl)
    {
        if (dcl)
            return dcl->isSpilled();

        return false;
    }

    // True when every recorded def of dcl lies in bb and at/before lexId.
    // Returns false if dcl has no record in the operations table.
    bool Rematerialization::areAllDefsInBB(G4_Declare* dcl, G4_BB* bb, unsigned int lexId)
    {
        auto defsIt = operations.find(dcl);
        if (defsIt == operations.end())
            return false;

        auto&& refs = (*defsIt).second;
        // Each def must be in same BB as sampler header must appear lexically before sampler
        for (auto&& d : refs.def)
        {
            if (d.second != bb)
                return false;

            if (d.first->getLexicalId() > lexId)
                return false;
        }


        return true;
    }

    // Lexical id of dcl's last recorded use; 0 when dcl has no record.
    unsigned int Rematerialization::getLastUseLexId(G4_Declare* dcl)
    {
        unsigned int lastLexId = 0;
        auto it = operations.find(dcl);
        if (it != operations.end())
            lastLexId = (*it).second.lastUseLexId;

        return lastLexId;
    }
cleanRedundantSamplerHeaders()299 void Rematerialization::cleanRedundantSamplerHeaders() 300 { 301 if (!samplerHeader) 302 return; 303 304 for (auto bb : kernel.fg) 305 { 306 std::list<G4_INST*> lastMov; 307 308 INST_LIST_ITER toErase = bb->end(); 309 310 if (deLVNedBBs.find(bb) == deLVNedBBs.end()) 311 continue; 312 313 for (auto instIt = bb->begin(), instItEnd = bb->end(); 314 instIt != instItEnd; 315 ) 316 { 317 auto inst = (*instIt); 318 319 if (toErase != bb->end()) 320 { 321 for (unsigned int i = 0; i != inst->getNumSrc(); ++i) 322 { 323 auto src = inst->getSrc(i); 324 if (src && src->isSrcRegRegion()) 325 { 326 auto topdcl = src->getTopDcl(); 327 if (topdcl == samplerHeader) 328 { 329 // samplerHeader is used, so can't erase it 330 toErase = bb->end(); 331 } 332 } 333 } 334 } 335 336 if (inst->isMov() && inst->getDst() && inst->getExecSize() == 1) 337 { 338 // mov (1|NM) samplerHeader(0,2)<1>:ud imm 339 auto dstTopDcl = inst->getDst()->getTopDcl(); 340 341 if (dstTopDcl == samplerHeader) 342 { 343 if (toErase != bb->end()) 344 { 345 lastMov.remove(*toErase); 346 bb->erase(toErase); 347 toErase = instIt; 348 } 349 350 if (lastMov.size() > 0) 351 { 352 auto lastMovSrc0 = lastMov.back()->getSrc(0); 353 auto instSrc0 = inst->getSrc(0); 354 355 if (inst->getDst()->getSubRegOff() == 2 && 356 lastMovSrc0->isImm() == instSrc0->isImm() && 357 lastMovSrc0->asImm()->getImm() == instSrc0->asImm()->getImm() && 358 lastMovSrc0->getType() == instSrc0->getType()) 359 { 360 // Remove current instruction 361 #if 0 362 printf("Removing sampler header mov at $%d\n", inst->getCISAOff()); 363 #endif 364 instIt = bb->erase(instIt); 365 toErase = bb->end(); 366 continue; 367 } 368 } 369 370 toErase = instIt; 371 372 lastMov.push_back(inst); 373 } 374 } 375 376 instIt++; 377 } 378 379 if (toErase != bb->end()) 380 bb->erase(toErase); 381 } 382 } 383 checkLocalWAR(G4_INST * defInst,G4_BB * bb,INST_LIST_ITER useIter)384 bool Rematerialization::checkLocalWAR(G4_INST* defInst, G4_BB* 
bb, INST_LIST_ITER useIter) 385 { 386 INST_LIST_ITER currIter = useIter; 387 while (currIter != bb->begin()) 388 { 389 currIter--; 390 auto currInst = *currIter; 391 if (currInst == defInst) 392 break; 393 394 auto currDst = currInst->getDst(); 395 if (currDst && !currDst->isNullReg()) 396 { 397 auto dstDcl = currDst->getTopDcl(); 398 unsigned int curLb = currDst->getLeftBound(); 399 unsigned int curRb = currDst->getRightBound(); 400 401 for (unsigned int i = 0; i < G4_MAX_SRCS; i++) 402 { 403 auto srcOpnd = defInst->getSrc(i); 404 if (srcOpnd && 405 !(srcOpnd->isNullReg()) && 406 srcOpnd->isSrcRegRegion()) 407 { 408 G4_SrcRegRegion* srcRegion = srcOpnd->asSrcRegRegion(); 409 auto srcDcl = srcRegion->getTopDcl(); 410 unsigned int srcLb = srcRegion->getLeftBound(), srcRb = srcRegion->getRightBound(); 411 412 if (dstDcl == srcDcl && 413 curRb >= srcLb && 414 curLb <= srcRb) 415 { 416 return false; 417 } 418 } 419 } 420 } 421 } 422 423 MUST_BE_TRUE(*currIter == defInst, "Cannot find defInst for Remat candidate!"); 424 425 return true; 426 } 427 usesNoMaskWA(const Reference * uniqueDef)428 bool Rematerialization::usesNoMaskWA(const Reference* uniqueDef) 429 { 430 auto defInst = uniqueDef->first; 431 432 // look for pattern like: 433 // (W&fx.y.anyh) inst 434 // 435 // where fx.y is: 436 // cmp.eq.fx.y (..) null rega rega 437 if (!defInst->isWriteEnableInst()) 438 return false; 439 440 if (!defInst->getPredicate()) 441 return false; 442 443 auto predCtrl = defInst->getPredicate()->getControl(); 444 if (predCtrl != PRED_ANY8H && 445 predCtrl != PRED_ANY16H && 446 predCtrl != PRED_ANY32H) 447 return false; 448 449 return defInst->getPredicate()->isSameAsNoMask(); 450 } 451 isPartGRFBusyInput(G4_Declare * inputDcl,unsigned int atLexId)452 bool Rematerialization::isPartGRFBusyInput(G4_Declare* inputDcl, unsigned int atLexId) 453 { 454 // inputDcl is an input G4_Declare that has pre-defined assignment. 
455 // Extending a pre-assigned assignment can be bad if its a scalar 456 // and no other part of that GRF is busy. OTOH, it may be beneficial 457 // to extend inputDcl if there is another pre-defined G4_Declare 458 // sharing physical register assignment (different sub-register) 459 // with inputDcl and is live beyond where we want to extend inputDcl. 460 461 // This function checks whether there is any other G4_Declare that 462 // shares same GRF assignment as inputDcl. If there is then check 463 // whether last use of that assignment is beyond atLexId. If one 464 // if found then return true. Return false otherwise. 465 466 if (!inputDcl->getRegVar()->getPhyReg() || 467 !inputDcl->getRegVar()->getPhyReg()->isGreg()) 468 { 469 return false; 470 } 471 472 auto inputRegNum = inputDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum(); 473 474 for (auto dcl : preDefinedVars) 475 { 476 auto ref = operations.find(dcl); 477 if (ref == operations.end()) 478 continue; 479 480 if (!dcl->getRegVar()->getPhyReg() || 481 !dcl->getRegVar()->getPhyReg()->isGreg()) 482 continue; 483 484 auto regNum = dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum(); 485 if (regNum == inputRegNum) 486 { 487 if ((*ref).second.lastUseLexId >= atLexId) 488 return true; 489 } 490 } 491 492 return false; 493 } 494 495 canRematerialize(G4_SrcRegRegion * src,G4_BB * bb,const Reference * & ref,INST_LIST_ITER instIter)496 bool Rematerialization::canRematerialize(G4_SrcRegRegion* src, G4_BB* bb, const Reference*& ref, INST_LIST_ITER instIter) 497 { 498 // op1 (8) A B C 499 // ... 500 // op2 (8) D A X 501 // 502 // This function will check whether rematerialize an operand, 503 // eg A in op2 is possible. 
504 // 505 auto topdcl = src->getTopDcl(); 506 if (!topdcl) 507 return false; 508 509 if (src->getInst()->isSplitIntrinsic()) 510 return false; 511 512 // ADDRESS/FLAG spilled declare 513 if (topdcl->getSpilledDeclare()) 514 return false; 515 516 if (topdcl->getAddressed()) 517 return false; 518 519 if (topdcl->getRegVar()->getPhyReg()) 520 return false; 521 522 // Src must belong to GRF file 523 if ((topdcl->getRegFile() & 524 (G4_RegFileKind::G4_GRF | G4_RegFileKind::G4_INPUT)) == 0x0) 525 return false; 526 527 // Skip remat if src opnd uses special acc registers 528 if (src->getAccRegSel() != ACC_UNDEFINED) 529 return false; 530 531 // Lookup defs of src in program 532 auto opIt = operations.find(topdcl); 533 if (opIt == operations.end()) 534 return false; 535 536 auto&& refs = (*opIt).second; 537 auto uniqueDef = findUniqueDef(refs, src); 538 539 if (!uniqueDef) 540 return false; 541 542 if (gra.isNoRemat(uniqueDef->first)) 543 return false; 544 545 // Def has a lot of uses so we will need lots of remat to make this profitable 546 if (refs.numUses > MAX_USES_REMAT) 547 return false; 548 549 if (uniqueDef->first->getCondMod()) 550 return false; 551 552 if (uniqueDef->first->getPredicate() && 553 !usesNoMaskWA(uniqueDef)) 554 return false; 555 556 // It is illegal to rematerialize intrinsic.split instruction as it 557 // is dependent on an earlier send. 
558 if (uniqueDef->first->isSplitIntrinsic()) 559 return false; 560 561 ref = uniqueDef; 562 563 // Check whether op1 can be recomputed 564 auto srcInst = src->getInst(); 565 auto uniqueDefInst = uniqueDef->first; 566 auto uniqueDefBB = uniqueDef->second; 567 568 if (!isRematCandidateOp(uniqueDefInst)) 569 return false; 570 571 unsigned int srcLexId = srcInst->getLexicalId(); 572 unsigned int origOpLexId = uniqueDefInst->getLexicalId(); 573 574 if (origOpLexId > srcLexId) 575 return false; 576 577 // Def-use must be far away 578 unsigned int minDefUseDist = MIN_DEF_USE_DISTANCE; 579 580 // If def is a scalar and its def/use lie entirely in a BB, 581 // then increase min def use distance heuristic as remating 582 // closeby is unlikely to provide perf benefit. 583 if (uniqueDefInst->getExecSize() == 1) 584 { 585 if(uniqueDefBB->back()->getLexicalId() >= refs.lastUseLexId) 586 minDefUseDist *= 2; 587 } 588 589 if ((srcLexId - origOpLexId) < minDefUseDist) 590 return false; 591 592 if (!inSameSubroutine(bb, uniqueDefBB)) 593 return false; 594 595 // If uniqueDefBB is not under SIMD CF, current BB is under SIMD CF 596 // and use has NoMask set, then we can remat only if def has NoMask 597 // option set. 598 if (!uniqueDefBB->isDivergent() && 599 bb->isDivergent() && 600 !uniqueDefInst->isWriteEnableInst() && 601 srcInst->isWriteEnableInst()) 602 { 603 return false; 604 } 605 606 // Check whether they are in a loop. If yes, they should be in same loop. 607 bool uniqueDefOutsideLoop = false; 608 bool srcDclSpilled = isRangeSpilled(topdcl); 609 bool inSameLoop = areInSameLoop(uniqueDefBB, bb, uniqueDefOutsideLoop); 610 bool onlyUseInLoop = uniqueDefOutsideLoop && !inSameLoop; 611 bool doNumRematCheck = false; 612 613 // Decide whether it is profitable to push def inside loop before each use 614 if (onlyUseInLoop && !srcDclSpilled) 615 { 616 // If topdcl does not interfere with other spilled 617 // range then skip remating this operation. 
618 // Be less aggressive if this is SIMD8 since we run the 619 // chance of perf penalty with this. 620 if ((kernel.getSimdSize() == 8 && rpe.getRegisterPressure(srcInst) < (float)rematLoopRegPressure * 1.6f) || 621 rematCandidates[topdcl->getRegVar()->getId()] == false || 622 rpe.getRegisterPressure(srcInst) < rematLoopRegPressure) 623 return false; 624 625 if (getNumRematsInLoop() > 0) 626 { 627 // Restrict non-SIMD1 remats to a low percent of loop instructions. 628 float loopInstToTotalInstRatio = (float)getNumRematsInLoop() / (float)loopInstsBeforeRemat*100.0f; 629 if (rpe.getMaxRP() < rematRegPressure * 1.4f) 630 { 631 // If max RPE is not very high, dont sink too many instructions in loop 632 if(loopInstToTotalInstRatio > 1.75f) 633 return false; 634 } 635 else if (loopInstToTotalInstRatio > 3.89f) 636 return false; 637 } 638 } 639 640 if (!inSameLoop) 641 { 642 if (!uniqueDefOutsideLoop) 643 return false; 644 else 645 { 646 // When op1 is outside loop and op2 is indside loop, 647 // allow remat if op1 dst dcl is marked spilled. 648 // Because that means a load will be inserted in the 649 // loop and remat might be more efficient here. 650 if (!srcDclSpilled) 651 { 652 // If src dcl is not spilled, check whether all 653 // src opnds of defInst have been remat'd atleast once. 654 // This heuristic helps decide if remat will be worthwhile 655 // in a loop. 656 doNumRematCheck = true; 657 } 658 } 659 } 660 661 if (inSameLoop && !uniqueDefOutsideLoop) 662 { 663 // Remat is done in loop only if declare 664 // is marked as spill, so remat will 665 // benefit it. Otherwise, if var has a 666 // single use within the loop then remat 667 // can be done as it doesnt contribute to 668 // increase in inst count. 
669 if (!srcDclSpilled && refs.numUses > 1) 670 return false; 671 } 672 673 // Check liveness of each src operand in original op 674 bool srcLive[G4_MAX_SRCS]; 675 bool anySrcNotLive = false; 676 for (unsigned int i = 0; i < G4_MAX_SRCS; i++) 677 { 678 srcLive[i] = true; 679 auto srcOpnd = uniqueDefInst->getSrc(i); 680 if (!srcOpnd || srcOpnd->isImm() || srcOpnd->isNullReg()) 681 continue; 682 683 if (srcOpnd->isSrcRegRegion()) 684 { 685 // If src operand base is non-regvar (eg, architecture 686 // register) then dont remat. Moving around such 687 // registers could be dangerous. 688 if (!srcOpnd->getBase()->isRegVar()) 689 return false; 690 691 // Check whether this src has a single unique def 692 auto srcOpndRgn = srcOpnd->asSrcRegRegion(); 693 auto srcOpndTopDcl = srcOpndRgn->getTopDcl(); 694 695 if (doNumRematCheck && getNumRemats(srcOpndTopDcl) == 0) 696 { 697 return false; 698 } 699 700 const auto &pointsToSet = liveness.getPointsToAnalysis().getIndrUseVectorForBB(bb->getId()); 701 G4_RegVar* srcVar = srcOpndTopDcl->getRegVar(); 702 auto it = std::find_if(pointsToSet.begin(), pointsToSet.end(), 703 [&srcVar](const pointInfo& element) {return element.var == srcVar && element.off == 0; }); 704 705 if (srcOpndTopDcl->getAddressed() && 706 ((uniqueDefBB != bb) || 707 it != pointsToSet.end())) 708 { 709 // Indirectly addressed src opnd should not be extended 710 return false; 711 } 712 713 if ((srcOpndTopDcl->getRegFile() & 714 (G4_RegFileKind::G4_GRF | G4_RegFileKind::G4_INPUT)) == 0x0) 715 return false; 716 717 // If an instruction has physical registers allocated then 718 // dont optimize it. 
719 if (srcOpndRgn->getBase()->asRegVar()->getPhyReg() && 720 !srcOpndTopDcl->isInput()) 721 return false; 722 723 if (srcOpndTopDcl->isInput()) 724 { 725 auto opIt = operations.find(srcOpndTopDcl); 726 if (opIt != operations.end()) 727 { 728 // Check whether input variable has explicit def in function 729 if ((*opIt).second.def.size() > 0) 730 return false; 731 } 732 733 if ((*opIt).second.lastUseLexId < srcLexId && 734 (!isPartGRFBusyInput((*opIt).first, srcLexId) || 735 !inSameLoop)) 736 { 737 // Inputs are pre-assigned and extending such ranges 738 // could lead to worse RA results, unless the input 739 // already extends beyond where we intend to remat. 740 return false; 741 } 742 } 743 744 // Run separate checks for sampler 745 if (uniqueDefInst->isSplitSend() && 746 uniqueDefInst->getMsgDesc()->isSampler() && 747 uniqueDefInst->getSrc(2)->isImm() && 748 uniqueDefInst->getSrc(3)->isImm()) 749 { 750 if (!kernel.getOptions()->getOption(vISA_cacheSamplerHeader)) 751 return false; 752 753 // Sampler definition to be rematerialized 754 // sends (8) V54(0,0):f samplerHeader(0,0) V53(0,0) 0x42:ud 0x24a7002:ud{Align1, Q1} 755 // resLen = 4, msgLen = 1, extMsgLen = 1 756 // samplerHeader can be rematerialized as it is r0.0 with modified r0.2. 757 // V53 above will simply be extended since it requires extra computation to rematerialize. 758 // Above sampler inst has a header. Some sampler instructions may not have a header. 759 // For such headerless samplers we need to check whether it is profitable to extend 760 // both src operands. 761 762 // Ensure resLen > extMsgLen to make rematerialization profitable. 763 unsigned len = uniqueDefInst->getMsgDesc()->getSrc1LenRegs(); 764 765 // For Sanity, just verify V53 has defs before sampler send only. 
766 auto extMsgOpnd = uniqueDefInst->getSrc(1); 767 MUST_BE_TRUE(extMsgOpnd->isSrcRegRegion() == true, "Unexpected src opnd for sampler"); 768 769 // Dont remat if sampler def is outside loop and use inside loop 770 if (onlyUseInLoop) 771 return false; 772 773 if (!areAllDefsInBB(extMsgOpnd->asSrcRegRegion()->getTopDcl(), uniqueDefBB, uniqueDefInst->getLexicalId())) 774 return false; 775 776 bool samplerHeaderNotUsed = uniqueDefInst->getSrc(0)->asSrcRegRegion()->getTopDcl() != kernel.fg.builder->getBuiltinSamplerHeader(); 777 778 if (!uniqueDefInst->getMsgDescRaw() || 779 !uniqueDefInst->getMsgDescRaw()->isHeaderPresent() || 780 samplerHeaderNotUsed) 781 { 782 len += uniqueDefInst->getMsgDesc()->getSrc0LenRegs(); 783 784 auto msgOpnd = uniqueDefInst->getSrc(0); 785 if (!areAllDefsInBB(msgOpnd->asSrcRegRegion()->getTopDcl(), uniqueDefBB, uniqueDefInst->getLexicalId())) 786 return false; 787 788 if (liveness.isLiveAtExit(bb, msgOpnd->getTopDcl()->getRegVar()->getId()) || 789 getLastUseLexId(msgOpnd->getTopDcl()) >= srcLexId) 790 len -= uniqueDefInst->getMsgDesc()->getSrc0LenRegs(); 791 } 792 793 if (samplerHeaderNotUsed) 794 { 795 // Ensure header creation instructions are used only by sampler 796 auto msgOpndTopDcl = uniqueDefInst->getSrc(0)->asSrcRegRegion()->getTopDcl(); 797 auto topDclOpsIt = operations.find(msgOpndTopDcl); 798 if (topDclOpsIt == operations.end()) 799 return false; 800 801 if ((*topDclOpsIt).second.numUses > 1) 802 return false; 803 804 for (auto& def : (*topDclOpsIt).second.def) 805 { 806 for (unsigned int i = 0; i != G4_MAX_SRCS; i++) 807 { 808 auto src = def.first->getSrc(i); 809 if (!src) 810 continue; 811 812 if (src->isImm()) 813 continue; 814 815 if (src->isSrcRegRegion() && 816 (src->asSrcRegRegion()->getTopDcl() == kernel.fg.builder->getBuiltinSamplerHeader() || 817 src->asSrcRegRegion()->getTopDcl() == kernel.fg.builder->getBuiltinR0())) 818 continue; 819 820 // Using some other var in payload src requires extra checks to remat, so skip 
it 821 return false; 822 } 823 } 824 } 825 826 if (liveness.isLiveAtExit(bb, extMsgOpnd->getTopDcl()->getRegVar()->getId()) || 827 getLastUseLexId(extMsgOpnd->getTopDcl()) >= srcLexId) 828 len -= uniqueDefInst->getMsgDesc()->getSrc1LenRegs(); 829 830 if (refs.rowsUsed.size() <= len) 831 return false; 832 833 return true; 834 } 835 else 836 { 837 // Non-sampler definition to be rematerialized 838 if (uniqueDefInst->isSend()) 839 return false; 840 841 auto opIt = operations.find(srcOpndTopDcl); 842 if (opIt == operations.end()) 843 return false; 844 845 auto&& srcOpndRefs = (*opIt).second; 846 auto srcOpndUniqueDef = findUniqueDef(srcOpndRefs, srcOpndRgn); 847 848 bool isSrcAvailble = false; 849 if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM && 850 uniqueDefBB == bb) 851 { 852 isSrcAvailble = checkLocalWAR(uniqueDefInst, bb, instIter); 853 } 854 855 if (!srcOpndUniqueDef && 856 !isSrcAvailble && 857 !srcOpndTopDcl->isInput()) 858 return false; 859 860 if (srcOpndUniqueDef && 861 !inSameSubroutine(bb, srcOpndUniqueDef->second)) 862 return false; 863 864 // Check if its live in/live out to/of current BB 865 unsigned int id = srcOpndTopDcl->getRegVar()->getId(); 866 if (!liveness.isLiveAtExit(bb, id) && 867 // Even if a var is not live-out, its live-range 868 // might extend till inst of interest. 869 srcOpndRefs.lastUseLexId < srcInst->getLexicalId()) 870 { 871 // Opnd may not be live, but it is still possible to 872 // extend its live-range to remat it. For scalars, this 873 // could be profitable too. 874 srcLive[i] = false; 875 anySrcNotLive = true; 876 } 877 } 878 } 879 } 880 881 if (anySrcNotLive) 882 { 883 // Apply cost heuristic. It may be profitable to extend 884 // scalars sometimes. 
885 for (unsigned int i = 0; i < G4_MAX_SRCS; i++) 886 { 887 if (!srcLive[i]) 888 { 889 G4_SrcRegRegion* srcRgn = uniqueDefInst->getSrc(i)->asSrcRegRegion(); 890 891 if (srcRgn->getTopDcl()->getNumElems() > 1 && 892 getNumUses(srcRgn->getTopDcl()) < 20) 893 { 894 // Extending non-scalar operands can be expensive 895 return false; 896 } 897 } 898 } 899 } 900 901 // Record remats in loop only for non-scalar operations. This is a heuristic used 902 // to not remat excessively in loops. 903 if (!inSameLoop && 904 uniqueDefInst->getExecSize() > 1) 905 incNumRematsInLoop(); 906 907 if (cr0DefBB && IS_TYPE_FLOAT_ALL(uniqueDefInst->getExecType())) 908 { 909 return false; 910 } 911 912 return true; 913 } 914 rematerialize(G4_SrcRegRegion * src,G4_BB * bb,const Reference * uniqueDef,std::list<G4_INST * > & newInst,G4_INST * & cacheInst)915 G4_SrcRegRegion* Rematerialization::rematerialize( 916 G4_SrcRegRegion* src, G4_BB* bb, const Reference* uniqueDef, 917 std::list<G4_INST*>& newInst, G4_INST*& cacheInst) 918 { 919 // op1 (8) A B C 920 // ... 921 // op2 (8) D A E 922 // 923 // => 924 // op1 (8) A B C 925 // ... 
926 // op1_dup (8) A1 B C 927 // op2 (8) D A1 E 928 929 G4_SrcRegRegion* rematSrc = nullptr; 930 931 auto dstInst = uniqueDef->first; 932 auto dst = dstInst->getDst(); 933 bool isSampler = dstInst->isSplitSend() && dstInst->getMsgDesc()->isSampler(); 934 935 for (unsigned int i = 0; i < G4_MAX_SRCS; i++) 936 { 937 G4_Operand* src = dstInst->getSrc(i); 938 if (src && 939 src->isSrcRegRegion()) 940 { 941 incNumRemat(src->asSrcRegRegion()->getTopDcl()); 942 } 943 } 944 945 if (!isSampler) 946 { 947 unsigned int diffBound = dst->getRightBound() - (dst->getRegOff() * numEltPerGRF<Type_UB>()); 948 unsigned numElems = (diffBound + 1) / dst->getTypeSize(); 949 auto newTemp = kernel.fg.builder->createTempVar(numElems, dst->getType(), Any, "REMAT_"); 950 newTemp->copyAlign(dst->getTopDcl()); 951 gra.copyAlignment(newTemp, dst->getTopDcl()); 952 G4_DstRegRegion* newDst = kernel.fg.builder->createDst(newTemp->getRegVar(), 0, 953 (dst->getLeftBound() % numEltPerGRF<Type_UB>()) / dst->getTypeSize(), 954 dst->getHorzStride(), dst->getType()); 955 G4_INST* dupOp = dstInst->cloneInst(); 956 dupOp->setDest(newDst); 957 dupOp->inheritDIFrom(dstInst); 958 959 rematSrc = createSrcRgn(src, dst, newTemp); 960 961 newInst.push_back(dupOp); 962 963 cacheInst = newInst.back(); 964 } 965 else 966 { 967 G4_Operand* src0 = nullptr; 968 // Look up samplerHeader(0,2) definition 969 auto sampleHeaderTopDcl = uniqueDef->first->getSrc(0)->asSrcRegRegion()->getTopDcl(); 970 if (sampleHeaderTopDcl == kernel.fg.builder->getBuiltinSamplerHeader()) 971 { 972 samplerHeader = sampleHeaderTopDcl; 973 if (!samplerHeaderMapPopulated) 974 { 975 populateSamplerHeaderMap(); 976 } 977 978 if (deLVNedBBs.find(bb) == deLVNedBBs.end()) 979 { 980 // DeLVN one bb at a time when required 981 deLVNSamplers(bb); 982 deLVNedBBs.insert(bb); 983 } 984 985 auto samplerDefIt = samplerHeaderMap.find(uniqueDef->first); 986 auto prevHeaderMov = (*samplerDefIt).second; 987 988 src0 = dstInst->getSrc(0); 989 990 // Duplicate 
// ...sampler header setup instruction
// (The line above is the tail of a comment that begins before this chunk.
// This is the continuation of Rematerialization::rematerialize(); the
// function's head and the enclosing sampler-header if/else are above.)
                // Reuse the previously seen sampler header setup mov by
                // cloning it ahead of the remat'd send.
                auto dupOp = prevHeaderMov->cloneInst();
                newInst.push_back(dupOp);
            }
            else
            {
                // Handle sampler when src0 is not builtin sampler header
                auto src0Rgn = uniqueDef->first->getSrc(0)->asSrcRegRegion();
                auto src0TopDcl = src0Rgn->getTopDcl();
                auto ops = operations.find(src0TopDcl);
                MUST_BE_TRUE(ops != operations.end(), "Didnt find record in map");
                MUST_BE_TRUE((*ops).second.numUses == 1, "Expecting src0 to be used only in sampler");

                G4_Declare* newSrc0Dcl = nullptr;
                if (src0TopDcl->getRegVar()->isPhyRegAssigned())
                {
                    // Header already landed in a physical register; reuse it
                    // directly instead of recomputing it.
                    newSrc0Dcl = src0TopDcl;
                }
                else
                {
                    newSrc0Dcl = kernel.fg.builder->createTempVar(src0TopDcl->getTotalElems(),
                        src0TopDcl->getElemType(), gra.getSubRegAlign(src0TopDcl));

                    // Clone all defining instructions for sampler's msg header
                    // and retarget their dst to the fresh temp.
                    for (unsigned int i = 0; i != (*ops).second.def.size(); i++)
                    {
                        auto& headerDefInst = (*ops).second.def[i].first;

                        auto dupOp = headerDefInst->cloneInst();
                        auto headerDefDst = headerDefInst->getDst();
                        assert(!headerDefDst->isIndirect()); // we dont allow send header to be defined indirectly
                        dupOp->setDest(kernel.fg.builder->createDst(
                            newSrc0Dcl->getRegVar(), headerDefDst->getRegOff(), headerDefDst->getSubRegOff(),
                            headerDefDst->getHorzStride(), headerDefDst->getType()));
                        newInst.push_back(dupOp);
                    }
                }

                // Rebuild src0 over the (possibly new) header dcl, keeping the
                // original region parameters and type.
                auto rd = kernel.fg.builder->createRegionDesc(src0Rgn->getRegion()->vertStride,
                    src0Rgn->getRegion()->width, src0Rgn->getRegion()->horzStride);

                src0 = kernel.fg.builder->createSrc(
                    newSrc0Dcl->getRegVar(), src0Rgn->getRegOff(), src0Rgn->getSubRegOff(),
                    rd, src0Rgn->getType());
            }

            // Fresh destination temp for the remat'd sampler send, aligned
            // like the original destination's top dcl.
            auto samplerDst = kernel.fg.builder->createTempVar(dst->getTopDcl()->getTotalElems(), dst->getTopDcl()->getElemType(),
                gra.getSubRegAlign(dst->getTopDcl()), "REMAT_SAMPLER_");
            auto samplerDstRgn = kernel.fg.builder->createDst(samplerDst->getRegVar(), 0,
                0, 1, samplerDst->getElemType());

            auto dstMsgDesc = dstInst->getMsgDescRaw();
            // TODO: this may not hold when we start using load/store descriptors
            MUST_BE_TRUE(dstMsgDesc, "expected raw descriptor");

            // Duplicate the message descriptor so the clone owns independent
            // surface/sampler operands.
            auto newMsgDesc = kernel.fg.builder->createGeneralMsgDesc(
                dstMsgDesc->getDesc(),
                dstMsgDesc->getExtendedDesc(), dstMsgDesc->getAccess(),
                kernel.fg.builder->duplicateOperand(dstMsgDesc->getSurface()),
                kernel.fg.builder->duplicateOperand(dstMsgDesc->getSti()));

            // Clone the split send itself with duplicated operands, then carry
            // over CISA offset and debug info from the original.
            auto dupOp = kernel.fg.builder->createSplitSendInst(nullptr, dstInst->opcode(), dstInst->getExecSize(), samplerDstRgn,
                kernel.fg.builder->duplicateOperand(src0)->asSrcRegRegion(),
                kernel.fg.builder->duplicateOperand(dstInst->getSrc(1))->asSrcRegRegion(),
                kernel.fg.builder->duplicateOperand(dstInst->asSendInst()->getMsgDescOperand()), dstInst->getOption(),
                newMsgDesc, kernel.fg.builder->duplicateOperand(dstInst->getSrc(3)), true);
            dupOp->setCISAOff(dstInst->getCISAOff());
            dupOp->inheritDIFrom(dstInst);

            newInst.push_back(dupOp);

            rematSrc = createSrcRgn(src, dst, samplerDst);

            cacheInst = newInst.back();
        }

        // Fix for NoMaskWA
        // NOTE(review): strips predicates equivalent to NoMask from the
        // cloned instructions; presumably the WA predicate must not be
        // duplicated on remat'd copies -- confirm against the NoMaskWA pass.
        for (auto inst : newInst)
            if (inst->getPredicate() && inst->getPredicate()->isSameAsNoMask())
                inst->setPredicate(nullptr);

        return rematSrc;
    }

    // Build a src region reading the remat'd temp at the same row/subreg
    // offset the original use occupied relative to its unique definition.
    // Modifier, region description and type of the original use are kept.
    G4_SrcRegRegion* Rematerialization::createSrcRgn(G4_SrcRegRegion* srcToRemat, G4_DstRegRegion* uniqueDef, G4_Declare* rematTemp)
    {
        G4_SrcRegRegion* rematSrc = nullptr;

        // Row is measured in GRFs from the start of the def; subReg in
        // elements of the use's type within its GRF.
        unsigned row = (srcToRemat->getLeftBound() / numEltPerGRF<Type_UB>()) - (uniqueDef->getLeftBound() / numEltPerGRF<Type_UB>());
        unsigned subReg = (srcToRemat->getLeftBound() % numEltPerGRF<Type_UB>()) / srcToRemat->getTypeSize();

        rematSrc = kernel.fg.builder->createSrcRegRegion(srcToRemat->getModifier(), Direct,
            rematTemp->getRegVar(), (short)row, (short)subReg, srcToRemat->getRegion(), srcToRemat->getType());

        return rematSrc;
    }

    const Reference* Rematerialization::findUniqueDef(References & refs, G4_SrcRegRegion *src)
    {
        // This function looks up list of definitions for a topdcl (src->getTopDcl()) and
        // returns a single dst region that defines that src region. If more than 1 def
        // match lb/rb of src then nullptr is returned. If a partial unique def is found
        // even then nullptr is returned.

        Reference* uniqueDef = nullptr;

        unsigned int lb = src->getLeftBound(), rb = src->getRightBound();
        for (auto&& r : refs.def)
        {
            auto curdst = r.first->getDst();
            unsigned int curlb = curdst->getLeftBound();
            unsigned int currb = curdst->getRightBound();

            if (curlb <= lb && currb >= rb)
            {
                // Def fully covers the use. A second covering def makes the
                // def non-unique, so bail out.
                if (uniqueDef)
                {
                    uniqueDef = nullptr;
                    break;
                }
                else
                {
                    uniqueDef = &r;
                }
            }
            else if ((curlb <= lb && currb >= lb) ||
                (curlb <= rb && currb >= lb))
            {
                // Partial overlap
                uniqueDef = nullptr;
                break;
            }
        }

        if (uniqueDef)
        {
            // A variable that is both a kernel input and written by a def
            // cannot be remat'd from that def alone.
            G4_RegFileKind rf = refs.def.front().first->getDst()->getTopDcl()->getRegFile();
            if (rf == G4_RegFileKind::G4_INPUT)
            {
                // Variable is an input as well as has a def
                uniqueDef = nullptr;
            }
        }

        return uniqueDef;
    }

    // Count split-send sampler messages in the kernel (used to decide
    // whether sampler-specific handling is worthwhile).
    unsigned int getNumSamplers(G4_Kernel& kernel)
    {
        unsigned int numSampler = 0;

        for (auto bb : kernel.fg)
        {
            for (auto inst : *bb)
            {
                if (inst->isSplitSend() &&
                    inst->getMsgDesc()->isSampler())
                {
                    numSampler++;
                }
            }
        }

        return numSampler;
    }

    void
// Driver of the rematerialization pass: walks every instruction, and where a
// source operand is spilled (or register pressure is high) replaces the use
// with a freshly recomputed value, inserting the cloned computation just
// before the use. Recently remat'd values are cached per-BB so nearby uses
// can share one recomputation. (Return type 'void' precedes this line.)
    Rematerialization::run()
    {
        populateRefs();

        auto firstProgInst = kernel.fg.getEntryBB()->getFirstInst();

        for (auto bb : kernel.fg)
        {
            if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISATarget::VISA_3D)
            {
                // For Cm, assume cr0 def is live across BBs
                // For IGC, assume cr0 is reset at each BB entry
                cr0DefBB = false;
            }
            // Store cache of rematerialized operations so nearby instructions
            // can reuse them.
            // <Unique def, <Remat'd def, Lexical id of last ref>>
            std::map<const Reference*, std::pair<G4_INST*, unsigned int>> rematValues;
            for (auto instIt = bb->begin();
                instIt != bb->end();
                instIt++)
            {
                auto inst = (*instIt);
                auto dst = inst->getDst();
                bool runRemat = false;

                // Track whether this BB writes cr0 (other than the kernel's
                // very first instruction).
                cr0DefBB |= dst &&
                    dst->isCrReg() && (inst != firstProgInst);

                // Run remat if any src opnd is spilled
                for (unsigned int opnd = 0; opnd < G4_MAX_SRCS; opnd++)
                {
                    auto src = inst->getSrc(opnd);

                    if (src &&
                        src->isSrcRegRegion())
                    {
                        auto srcTopDcl = src->getTopDcl();
                        if (srcTopDcl && srcTopDcl->getRegVar()->isRegAllocPartaker() &&
                            (isRangeSpilled(srcTopDcl) ||
                                rematCandidates[srcTopDcl->getRegVar()->getId()] == true))
                        {
                            // Run remat for spilled src opnd even if
                            // register pressure is low.
                            runRemat = true;
                            break;
                        }
                    }
                }

                if (!runRemat)
                {
                    // No spilled source: only remat when pressure at this
                    // instruction reaches the configured threshold.
                    auto regPressure = rpe.getRegisterPressure(inst);

                    if (regPressure < rematRegPressure)
                    {
                        continue;
                    }
                }

                // High register pressure found at current instruction so try to remat
                for (unsigned int opnd = 0; opnd < G4_MAX_SRCS; opnd++)
                {
                    auto src = inst->getSrc(opnd);

                    if (src &&
                        src->isSrcRegRegion())
                    {
                        const Reference* uniqueDef = nullptr;
                        G4_SrcRegRegion* rematSrc = nullptr;

                        bool canRemat = canRematerialize(src->asSrcRegRegion(), bb, uniqueDef, instIt);
                        if (canRemat)
                        {
                            bool reUseRemat = false;
                            auto prevRematIt = rematValues.find(uniqueDef);
                            if (prevRematIt != rematValues.end())
                            {
                                // Reuse an earlier remat of the same def if it
                                // is lexically close enough.
                                if ((inst->getLexicalId() - (*prevRematIt).second.second) <=
                                    MAX_LOCAL_REMAT_REUSE_DISTANCE)
                                {
                                    reUseRemat = true;
                                    rematSrc = createSrcRgn(src->asSrcRegRegion(), uniqueDef->first->getDst(),
                                        (*prevRematIt).second.first->getDst()->getTopDcl());

                                    reduceNumUses(src->getTopDcl());

#if 0
                                    printf("Reusing rematerialized value %s in src%d of $%d from %s\n",
                                        src->getTopDcl()->getName(), opnd, inst->getCISAOff(),
                                        (*prevRematIt).second.first->getDst()->getTopDcl()->getName());
#endif
                                }
                                // NOTE(review): the cache timestamp is
                                // refreshed even when reuse was rejected for
                                // distance; the stale cached inst then stays
                                // (std::map::insert below does not overwrite
                                // an existing key) -- confirm this is the
                                // intended aging policy.
                                (*prevRematIt).second.second = inst->getLexicalId();
                            }

                            if (!reUseRemat)
                            {
#if 0
                                printf("Will rematerialize %s in src%d of $%d. Source computation at $%d\n",
                                    src->getTopDcl()->getName(), opnd, inst->getCISAOff(), uniqueDef->first->getCISAOff());
#endif
                                // Clone the defining computation and insert it
                                // immediately before the current instruction.
                                std::list<G4_INST*> newInsts;
                                G4_INST* cacheInst = nullptr;
                                rematSrc = rematerialize(src->asSrcRegRegion(), bb, uniqueDef, newInsts, cacheInst);
                                while (!newInsts.empty())
                                {
                                    bb->insertBefore(instIt, newInsts.front());
                                    newInsts.pop_front();
                                }

                                rematValues.insert(std::make_pair(uniqueDef, std::make_pair(cacheInst, src->getInst()->getLexicalId())));

                                reduceNumUses(src->getTopDcl());

                                IRChanged = true;
                            }

                            // Redirect the use to the remat'd (or reused) value.
                            inst->setSrc(rematSrc, opnd);
                        }
                    }
                }
            }
        }

        cleanRedundantSamplerHeaders();

        kernel.dumpToFile("after.remat");
    }
}