1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2020-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "AccSubstitution.hpp"
10
11 #include <cmath>
12
13 using namespace vISA;
14
15 struct AccInterval
16 {
17 G4_INST* inst;
18 int lastUse;
19 bool mustBeAcc0 = false;
20 bool isAllFloat = false;
21 bool isPreAssigned = false;
22 int assignedAcc = -1;
23 int spilledAcc = -1;
24 int bundleConflictTimes = 0;
25 int bankConflictTimes = 0;
26 int suppressionTimes = 0;
27
AccIntervalAccInterval28 AccInterval(G4_INST* inst_, int lastUse_, bool preAssigned = false) :
29 inst(inst_), lastUse(lastUse_), isPreAssigned(preAssigned)
30 {
31 if (isPreAssigned)
32 {
33 mustBeAcc0 = true;
34 assignedAcc = 0;
35 }
36 }
37
getSpillCostAccInterval38 double getSpillCost() const {
39 if (isPreAssigned)
40 {
41 // don't spill pre-assigned
42 return (double)1000000;
43 }
44 int dist = lastUse - inst->getLocalId();
45
46 //Bundle conflict has higher priority than bank conflict. Because bundle conflict means bank conflict at the same time.
47 return (std::pow((double)(bundleConflictTimes + 1), 3) + std::pow((double)(bankConflictTimes + 1), 2) + std::pow((double)inst->use_size(), 3) / dist) / (suppressionTimes + 1);
48 }
49
50 // see if this interval needs both halves of the acc
needBothAccAccInterval51 bool needBothAcc(IR_Builder& builder) const
52 {
53 switch (inst->getDst()->getType())
54 {
55 case Type_F:
56 return inst->getExecSize() == G4_ExecSize(builder.getNativeExecSize() * 2);
57 case Type_HF:
58 case Type_BF:
59 return false;
60 case Type_DF:
61 return inst->getExecSize() > G4_ExecSize(builder.getNativeExecSize() / 2);
62 default:
63 return true;
64 }
65 }
66
dumpAccInterval67 void dump()
68 {
69 std::cerr << "[" << inst->getLocalId() << ", " << lastUse << "] : ";
70 if (assignedAcc != -1)
71 {
72 std::cerr << "\tAcc" << assignedAcc << "\n";
73 }
74 else
75 {
76 std::cerr << "\n";
77 }
78 std::cerr << "\t";
79 inst->dump();
80 }
81 };
82
// Register-number sentinel helpers: -1 means "no register".
// The argument is parenthesized so that compound expressions such as
// isValidReg(a & b) expand with the intended precedence.
#define setInValidReg(x) ((x) = -1)
#define isValidReg(x) ((x) != -1)
85
// Marks source operand i as having a bundle conflict: sets bit 0 of its 3-bit field in BC.
static void setBundleConflict(int i, unsigned short& BC)
{
    const int fieldShift = i * 3;
    BC |= static_cast<unsigned short>(0x1 << fieldShift);
}
91
// Marks source operand i as having a bank conflict: sets bit 1 of its 3-bit field in BC.
static void setBankConflict(int i, unsigned short& BC)
{
    const int fieldShift = i * 3;
    BC |= static_cast<unsigned short>(0x2 << fieldShift);
}
97
// Marks source operand i as read-suppressed: sets bit 2 of its 3-bit field in BC.
static void setSuppression(int i, unsigned short& BC)
{
    const int fieldShift = i * 3;
    BC |= static_cast<unsigned short>(0x4 << fieldShift);
}
103
104 /*
105 * Bank conflict types:
106 * 1. any two from same bundle and same bank
107 * 2. all three from same bank
108 */
getConflictTimesForTGL(int * firstRegCandidate,unsigned int & sameBankConflicts,unsigned short & BC)109 static void getConflictTimesForTGL(int* firstRegCandidate, unsigned int& sameBankConflicts, unsigned short& BC)
110 {
111 int bundles[G4_MAX_SRCS];
112 int bankSrcs[G4_MAX_SRCS];
113
114 for (int i = 0; i < G4_MAX_SRCS; i++)
115 {
116 bundles[i] = -1;
117 bankSrcs[i] = -1;
118 if (isValidReg(firstRegCandidate[i]))
119 {
120 bundles[i] = (firstRegCandidate[i] % 64) / 4;
121 bankSrcs[i] = (firstRegCandidate[i] % 4) / 2;
122 }
123 }
124
125 int sameBankNum = 0;
126 bool setBundle = false;
127 for (int i = 0; i < G4_MAX_SRCS; i++)
128 {
129 if (bundles[i] != -1)
130 {
131 for (int j = i + 1; j < G4_MAX_SRCS; j++)
132 {
133 if (bundles[j] != -1)
134 {
135 if (bundles[i] == bundles[j] && bankSrcs[i] == bankSrcs[j]) //same bank and same bundle
136 {
137 //setBankConflict(i, BC);
138 setBundleConflict(i, BC);
139 setBundleConflict(j, BC);
140 setBundle = true;
141 }
142 else if (bankSrcs[i] == bankSrcs[j]) //Different bundle and same bank
143 {
144 if (!sameBankNum)
145 {
146 sameBankNum += 2;
147 }
148 else
149 {
150 sameBankNum++;
151 }
152 }
153 }
154 }
155 }
156 }
157
158 if (!setBundle && sameBankNum > 2)
159 {
160 for (int i = 0; i < G4_MAX_SRCS; i++)
161 {
162 if (bundles[i] != -1)
163 {
164 setBankConflict(i, BC);
165 }
166 }
167 }
168
169 return;
170 }
171
bankConflictAnalysisTGL(G4_INST * inst,int * suppressRegs,std::map<G4_INST *,unsigned int> * BCInfo)172 void bankConflictAnalysisTGL(G4_INST* inst, int* suppressRegs, std::map<G4_INST*, unsigned int>* BCInfo)
173 {
174 if (inst->isSend() || inst->isMath() ||
175 inst->isSWSBSync() || inst->isLabel() ||
176 inst->isWait() ||
177 inst->isReturn() || inst->isCall())
178 {
179 for (int i = 0; i < 3; i++)
180 {
181 setInValidReg(suppressRegs[i]);
182 }
183 setInValidReg(suppressRegs[3]);
184
185 return;
186 }
187
188 int dstRegs[2];
189 int dstExecSize = 0;
190 int srcRegs[2][G4_MAX_SRCS];
191 int srcExecSize[G4_MAX_SRCS];
192 bool isScalar[G4_MAX_SRCS];
193
194 int firstRegCandidate[G4_MAX_SRCS];
195 int secondRegCandidate[G4_MAX_SRCS];
196
197 int candidateNum = 0;
198 unsigned int sameBankConflictTimes = 0;
199
200 //Initialization
201 for (int i = 0; i < G4_MAX_SRCS; i++)
202 {
203 setInValidReg(firstRegCandidate[i]);
204 setInValidReg(secondRegCandidate[i]);
205 setInValidReg(srcRegs[0][i]);
206 setInValidReg(srcRegs[1][i]);
207 isScalar[i] = false;
208 }
209 setInValidReg(dstRegs[0]);
210 setInValidReg(dstRegs[1]);
211
212 bool instSplit = false;
213
214 //Get Dst registers
215 G4_DstRegRegion* dstOpnd = inst->getDst();
216 if (dstOpnd && !dstOpnd->isIndirect() && dstOpnd->isGreg())
217 {
218 dstExecSize = dstOpnd->getLinearizedEnd() - dstOpnd->getLinearizedStart() + 1;
219 uint32_t byteAddress = dstOpnd->getLinearizedStart();
220 dstRegs[0] = byteAddress / numEltPerGRF<Type_UB>();
221 if (dstExecSize > 32)
222 {
223 dstRegs[1] = dstRegs[0] + (dstExecSize + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
224 instSplit = true;
225 }
226 }
227
228 //Get src
229 for (unsigned i = 0, size = inst->getNumSrc(); i < size; i++)
230 {
231 G4_Operand* srcOpnd = inst->getSrc(i);
232 if (srcOpnd)
233 {
234 if (srcOpnd->isSrcRegRegion() &&
235 srcOpnd->asSrcRegRegion()->getBase() &&
236 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
237 {
238 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
239 srcExecSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart() + 1;
240 if (baseVar->isGreg()) {
241 uint32_t byteAddress = srcOpnd->getLinearizedStart();
242 srcRegs[0][i] = byteAddress / numEltPerGRF<Type_UB>();
243
244 if (srcExecSize[i] > 32)
245 {
246 srcRegs[1][i] = srcRegs[0][i] + (srcExecSize[i] + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
247 instSplit = true;
248 }
249 else if (srcOpnd->asSrcRegRegion()->isScalar()) //No Read suppression for SIMD 16/scalar src
250 {
251 srcRegs[1][i] = srcRegs[0][i];
252 isScalar[i] = true;
253 }
254 else
255 {
256 setInValidReg(srcRegs[1][i]);
257 }
258 }
259 }
260 }
261 }
262
263 //Read Suppression for current instruction
264 for (int i = 0; i < 3; i++)
265 {
266 unsigned short BC = 0;
267
268 if (isValidReg(suppressRegs[i]) &&
269 srcRegs[0][i] == suppressRegs[i] && !isScalar[i])
270 {
271 if (inst->opcode() == G4_mad && i == 1)
272 {
273 setSuppression(i, BC);
274 (*BCInfo)[inst] |= BC;
275 }
276 setInValidReg(srcRegs[0][i]);
277 }
278 else
279 {
280 suppressRegs[i] = srcRegs[0][i];
281 }
282
283 if (i == 1) //src1
284 {
285 if (isValidReg(suppressRegs[3]) &&
286 srcRegs[1][i] == suppressRegs[3] && !isScalar[i])
287 {
288 setInValidReg(srcRegs[1][i]);
289 }
290 else
291 {
292 suppressRegs[3] = srcRegs[1][i];
293 }
294 }
295
296 }
297
298 //Kill all previous read suppression candiadte if it wrote in DST
299 if (isValidReg(dstRegs[0]))
300 {
301 for (int i = 0; i < 4; i++)
302 {
303 if (suppressRegs[i] == dstRegs[0])
304 {
305 setInValidReg(suppressRegs[i]);
306 }
307 }
308 }
309
310 if (isValidReg(dstRegs[1]))
311 {
312 for (int i = 0; i < 4; i++)
313 {
314 if (suppressRegs[i] == dstRegs[0])
315 {
316 setInValidReg(suppressRegs[i]);
317 }
318 }
319 }
320
321
322 for (int i = 0; i < G4_MAX_SRCS; i++)
323 {
324 if (isValidReg(srcRegs[0][i]))
325 {
326 firstRegCandidate[i] = srcRegs[0][i];
327 candidateNum++;
328 }
329 }
330
331 unsigned short BC0 = 0;
332 if (candidateNum > 1)
333 {
334 getConflictTimesForTGL(firstRegCandidate, sameBankConflictTimes, BC0);
335 (*BCInfo)[inst] |= BC0;
336 }
337
338 if (instSplit)
339 {
340 candidateNum = 0;
341 for (int i = 0; i < G4_MAX_SRCS; i++)
342 {
343 if (isValidReg(srcRegs[1][i]))
344 {
345 secondRegCandidate[i] = srcRegs[1][i];
346 candidateNum++;
347 }
348 }
349
350 if (candidateNum > 1)
351 {
352 unsigned short BC = 0;
353 getConflictTimesForTGL(secondRegCandidate, sameBankConflictTimes, BC);
354 if (BC != 0)
355 {
356 (*BCInfo)[inst] |= ((unsigned int)BC) << 16;
357 }
358 }
359 }
360
361 return;
362 }
363
/*
 * Layout of the unsigned int conflict info (BC) kept per instruction:
 * The low unsigned short holds the info for the GRF of a 1-GRF operand, or the first GRF of a 2-GRF operand.
 * The high unsigned short holds the info for the second GRF of a 2-GRF operand.
 * Each source operand (index 0 to 3) uses 3 bits, starting at bit (index * 3):
 *   bit 0 (0x1) is the bundle conflict, bit 1 (0x2) is the bank conflict,
 *   bit 2 (0x4) is the read suppression.
 */
// Counts in how many halves of BC (low and high 16 bits) the read-suppression
// bit (0x4) is set for source operand srcOpndIdx. Result is 0, 1 or 2.
static unsigned getSuppression(int srcOpndIdx, unsigned int BC)
{
    const int fieldShift = srcOpndIdx * 3;
    const unsigned short lowHalf = (unsigned short)(BC & 0x0000FFFF);
    const unsigned short highHalf = (unsigned short)(BC >> 16);

    unsigned hits = 0;
    hits += ((lowHalf >> fieldShift) & 0x4) ? 1u : 0u;
    hits += ((highHalf >> fieldShift) & 0x4) ? 1u : 0u;
    return hits;
}
387
// Counts in how many halves of BC (low and high 16 bits) the bundle-conflict
// bit (0x1) is set for source operand srcOpndIdx. Result is 0, 1 or 2.
static unsigned getBundleConflicts(int srcOpndIdx, unsigned int BC)
{
    const int fieldShift = srcOpndIdx * 3;
    const unsigned short lowHalf = (unsigned short)(BC & 0x0000FFFF);
    const unsigned short highHalf = (unsigned short)(BC >> 16);

    unsigned hits = 0;
    hits += ((lowHalf >> fieldShift) & 0x1) ? 1u : 0u;
    hits += ((highHalf >> fieldShift) & 0x1) ? 1u : 0u;
    return hits;
}
404
// Counts in how many halves of BC (low and high 16 bits) the bank-conflict
// bit (0x2) is set for source operand srcOpndIdx. Result is 0, 1 or 2.
static unsigned getBankConflicts(int srcOpndIdx, unsigned int BC)
{
    const int fieldShift = srcOpndIdx * 3;
    const unsigned short lowHalf = (unsigned short)(BC & 0x0000FFFF);
    const unsigned short highHalf = (unsigned short)(BC >> 16);

    unsigned hits = 0;
    hits += ((lowHalf >> fieldShift) & 0x2) ? 1u : 0u;
    hits += ((highHalf >> fieldShift) & 0x2) ? 1u : 0u;
    return hits;
}
421
422 // returns true if the inst is a candidate for acc substitution
423 // lastUse is also update to point to the last use id of the inst
isAccCandidate(G4_INST * inst,int & lastUse,bool & mustBeAcc0,bool & isAllFloat,int & readSuppressionSrcs,int & bundleBC,int & bankBC,std::map<G4_INST *,unsigned int> * BCInfo)424 bool AccSubPass::isAccCandidate(G4_INST* inst, int& lastUse, bool& mustBeAcc0, bool& isAllFloat, int& readSuppressionSrcs, int& bundleBC, int& bankBC, std::map<G4_INST*, unsigned int>* BCInfo)
425 {
426 mustBeAcc0 = false;
427 isAllFloat = true;
428 G4_DstRegRegion* dst = inst->getDst();
429 if (!dst || kernel.fg.globalOpndHT.isOpndGlobal(dst) || !inst->canDstBeAcc())
430 {
431 return false;
432 }
433
434 if (!IS_TYPE_FLOAT_FOR_ACC(dst->getType()))
435 {
436 isAllFloat = false;
437 }
438
439 if (inst->getCondMod() && inst->opcode() != G4_sel)
440 {
441 // since our du-chain is on inst instead of operand, the presence of conditional modifier complicates the checks later.
442 // This is somewhat conservative but shouldn't matter too much as inst with both dst and conditional modifiers are rare.
443 // Exception is for sel as flag register is not updated.
444 return false;
445 }
446
447 // check that every use may be replaced with acc
448 int lastUseId = 0;
449 std::vector<G4_INST*> madSrc0Use;
450 std::vector<G4_INST*> threeSrcUses; //3src inst that use this dst
451 for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
452 {
453 auto&& use = *I;
454 G4_INST* useInst = use.first;
455 Gen4_Operand_Number opndNum = use.second;
456 lastUseId = std::max(lastUseId, useInst->getLocalId());
457 // acc may be src0 of two-source inst or src1 of three-source inst
458 // ToDo: may swap source here
459 if (useInst->getNumSrc() == 3)
460 {
461 unsigned int BC = 0;
462 if (BCInfo != nullptr)
463 {
464 auto itR = BCInfo->find(useInst);
465 if (itR != BCInfo->end())
466 BC = itR->second;
467 }
468
469 if (!kernel.fg.builder->relaxedACCRestrictions() &&
470 std::find(threeSrcUses.begin(), threeSrcUses.end(), useInst) != threeSrcUses.end())
471 {
472 // don't allow acc to appear twice in a 3-src inst
473 return false;
474 }
475 threeSrcUses.push_back(useInst);
476 switch (opndNum)
477 {
478 case Opnd_src2:
479 if (!kernel.fg.builder->relaxedACCRestrictions3())
480 {
481 return false;
482 }
483 if (!IS_TYPE_FLOAT_FOR_ACC(useInst->getSrc(2)->getType()) ||
484 (useInst->getDst() && !IS_TYPE_FLOAT_FOR_ACC(useInst->getDst()->getType())))
485 {
486 return false;
487 }
488 break;
489 case Opnd_src1:
490 if (BC)
491 {
492 bundleBC += getBundleConflicts(1, BC);
493 bankBC += getBankConflicts(1, BC);
494 readSuppressionSrcs += getSuppression(1, BC);
495 }
496 break; //OK
497
498 case Opnd_src0:
499 if (BC)
500 {
501 bundleBC += getBundleConflicts(0, BC);
502 bankBC += getBankConflicts(0, BC);
503 readSuppressionSrcs += getSuppression(0, BC);
504 }
505
506 if (kernel.fg.builder->canMadHaveSrc0Acc())
507 {
508 // OK
509 }
510 else if (useInst->opcode() == G4_mad)
511 {
512 // we can turn this mad into a mac
513 mustBeAcc0 = true;
514 if (useInst->getSrc(0)->getType() == Type_HF && useInst->getMaskOffset() == 16)
515 {
516 // we must use acc1, and need to check that inst does not have an acc0 source
517 // so that dst and src won't have different acc source
518 if (inst->isAccSrcInst())
519 {
520 bool hasAcc0Src = false;
521 auto isAcc0 = [](G4_SrcRegRegion* src)
522 {
523 return src->getBase()->asAreg()->getArchRegType() == AREG_ACC0;
524 };
525 if (inst->getSrc(0)->isSrcRegRegion() &&
526 inst->getSrc(0)->asSrcRegRegion()->getBase()->isAccReg())
527 {
528 hasAcc0Src = isAcc0(inst->getSrc(0)->asSrcRegRegion());
529 }
530 else if (inst->getSrc(1)->isSrcRegRegion() &&
531 inst->getSrc(1)->asSrcRegRegion()->getBase()->isAccReg())
532 {
533 hasAcc0Src = isAcc0(inst->getSrc(1)->asSrcRegRegion());
534 }
535 if (hasAcc0Src)
536 {
537 return false;
538 }
539 }
540 }
541 madSrc0Use.push_back(useInst);
542 }
543 else
544 {
545 return false;
546 }
547 break;
548 default:
549 return false;
550 }
551 }
552 else if (!builder.relaxedACCRestrictions() && opndNum != Opnd_src0)
553 {
554 return false;
555 }
556
557 if (useInst->getSingleDef(opndNum) == nullptr)
558 {
559 // def must be the only define for this use
560 return false;
561 }
562
563 int srcId = useInst->getSrcNum(opndNum);
564 G4_Operand* src = useInst->getSrc(srcId);
565
566 if (dst->getType() != src->getType() || kernel.fg.globalOpndHT.isOpndGlobal(src) ||
567 dst->compareOperand(src) != Rel_eq)
568 {
569 return false;
570 }
571 if (!useInst->canSrcBeAcc(opndNum))
572 {
573 return false;
574 }
575 if (!IS_TYPE_FLOAT_FOR_ACC(src->getType()))
576 {
577 isAllFloat = false;
578 }
579 }
580
581 // we have to avoid the case where the dst is used as both src0 and src1 of a mad
582 for (auto madUse : madSrc0Use)
583 {
584 for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
585 {
586 auto&& use = *I;
587 G4_INST* useInst = use.first;
588 Gen4_Operand_Number opndNum = use.second;
589 if (madUse == useInst && opndNum == Opnd_src1)
590 {
591 return false;
592 }
593 }
594 }
595
596 if (lastUseId == 0)
597 {
598 // no point using acc for a dst without local uses
599 return false;
600 }
601
602 lastUse = lastUseId;
603 return true;
604 }
605
606 // replace an inst's dst and all of its (local) uses with acc
607 // note that this may fail due to HW restrictions on acc
replaceDstWithAcc(G4_INST * inst,int accNum)608 bool AccSubPass::replaceDstWithAcc(G4_INST* inst, int accNum)
609 {
610 G4_DstRegRegion* dst = inst->getDst();
611 bool useAcc1 = (accNum & 0x1) != 0;
612 accNum &= ~0x1;
613
614 if (!builder.relaxedACCRestrictions())
615 {
616 auto myAcc = useAcc1 ? AREG_ACC1 : AREG_ACC0;
617 // check that dst and src do not have different accumulator
618 for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
619 {
620 if (inst->getSrc(i)->isAccReg())
621 {
622 auto base = inst->getSrc(i)->asSrcRegRegion()->getBase();
623 if (base->isPhyAreg())
624 {
625 if (base->asAreg()->getArchRegType() != myAcc)
626 {
627 return false;
628 }
629 }
630 }
631 }
632 }
633
634 for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
635 {
636 auto&& use = *I;
637 G4_INST* useInst = use.first;
638 if (!builder.canMadHaveSrc0Acc() && useInst->opcode() == G4_mad && use.second == Opnd_src0)
639 {
640 // if we are replacing mad with mac, additionally check if acc1 needs to be used
641 if (useInst->getMaskOffset() == 16 && dst->getType() == Type_HF)
642 {
643 if (builder.doMultiAccSub())
644 {
645 // this is not legal since acc1 may be taken by another interval already
646 return false;
647 }
648 useAcc1 = true;
649 }
650 }
651
652 if (builder.relaxedACCRestrictions())
653 {
654 // mul/mac can't have both sources be acc
655 // Note that we only need to check for explicit mac here since we will not change mad to mac
656 if (!builder.relaxedACCRestrictions3() && (useInst->opcode() == G4_mul || useInst->opcode() == G4_mac))
657 {
658 if (useInst->getSrc(0)->isAccReg() || useInst->getSrc(1)->isAccReg() ||
659 useInst->getSrc(0)->compareOperand(useInst->getSrc(1)) == G4_CmpRelation::Rel_eq)
660 {
661 return false;
662 }
663 }
664 else if (builder.relaxedACCRestrictions3() && useInst->opcode() == G4_mul)
665 {
666 if (!IS_TYPE_FLOAT_FOR_ACC(useInst->getDst()->getType()) ||
667 !IS_TYPE_FLOAT_FOR_ACC(useInst->getSrc(0)->getType()) ||
668 !IS_TYPE_FLOAT_FOR_ACC(useInst->getSrc(1)->getType()))
669 {
670 return false;
671 }
672 }
673 }
674 else
675 {
676 // do not allow an inst to have multiple acc source operands
677 if (useInst->getNumSrc() == 3)
678 {
679 if (useInst->getSrc(0)->isAccReg() || useInst->getSrc(1)->isAccReg())
680 {
681 return false;
682 }
683 }
684 else if (useInst->opcode() == G4_mac)
685 {
686 // this can happen if we have to convert mad into mac (some platforms don't allow
687 // src0 acc for mad), and the mad's src1 is also an acc candidate.
688 return false;
689 }
690 }
691 }
692
693 // at this point acc substitution must succeed
694
695 G4_Areg* accReg = useAcc1 ? builder.phyregpool.getAcc1Reg() : builder.phyregpool.getAcc0Reg();
696 G4_DstRegRegion* accDst = builder.createDst(accReg,
697 (short)accNum, 0, 1, dst->getType());
698 accDst->setAccRegSel(inst->getDst()->getAccRegSel());
699 inst->setDest(accDst);
700 for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
701 {
702 auto&& use = *I;
703 G4_INST* useInst = use.first;
704 int srcId = useInst->getSrcNum(use.second);
705 G4_SrcRegRegion* oldSrc = useInst->getSrc(srcId)->asSrcRegRegion();
706 G4_SrcRegRegion* accSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct,
707 accReg, (short)accNum, 0, builder.getRegionStride1(), dst->getType());
708 accSrc->setAccRegSel(oldSrc->getAccRegSel());
709
710 bool canReplaceToMac = useInst->opcode() == G4_mad && srcId == 0 && !builder.canMadHaveSrc0Acc();
711 if (canReplaceToMac && builder.noDFTypeMac()) {
712 // dst and all src cannot be DF
713 if ((useInst->getDst() && IS_DFTYPE(useInst->getDst()->getType())) ||
714 (useInst->getSrc(0) && IS_DFTYPE(useInst->getSrc(0)->getType())) ||
715 (useInst->getSrc(1) && IS_DFTYPE(useInst->getSrc(1)->getType())) ||
716 (useInst->getSrc(2) && IS_DFTYPE(useInst->getSrc(2)->getType())))
717 canReplaceToMac = false;
718 }
719
720 if (canReplaceToMac)
721 {
722 // change mad to mac as src0 of 3-src does not support acc
723 auto updateDefSrcPos = [](G4_INST* useInst, Gen4_Operand_Number origPos)
724 {
725 for (auto DI = useInst->def_begin(), DE = useInst->def_end(); DI != DE; ++DI)
726 {
727 auto&& def = *DI;
728 if (def.second == origPos)
729 {
730 for (auto UI = def.first->use_begin(), UE = def.first->use_end(); UI != UE; ++UI)
731 {
732 auto& use = *UI;
733 if (use.first == useInst && use.second == origPos)
734 {
735 switch (use.second)
736 {
737 case Opnd_src1:
738 use.second = Opnd_src0;
739 break;
740 case Opnd_src2:
741 use.second = Opnd_src1;
742 break;
743 default:
744 assert(false && "unexpectd src pos");
745 }
746 }
747 }
748 }
749 }
750 };
751 assert(accNum == 0 && "mad src0 may only use acc0");
752 G4_Operand* macSrc0 = useInst->getSrc(1);
753 updateDefSrcPos(useInst, Opnd_src1);
754 G4_Operand* macSrc1 = useInst->getSrc(2);
755 updateDefSrcPos(useInst, Opnd_src2);
756 useInst->setSrc(macSrc0, 0);
757 useInst->setSrc(macSrc1, 1);
758 useInst->setOpcode(G4_mac);
759 useInst->setImplAccSrc(accSrc);
760 }
761 else
762 {
763 useInst->setSrc(accSrc, srcId);
764 }
765 }
766
767 return true;
768 }
769
770 struct AccAssignment
771 {
772 std::vector<bool> freeAccs;
773 std::list<AccInterval*> activeIntervals;
774 IR_Builder& builder;
775
AccAssignmentAccAssignment776 AccAssignment(int numGeneralAcc, IR_Builder& m_builder, bool initToTrue) : builder(m_builder)
777 {
778 freeAccs.resize(numGeneralAcc, initToTrue);
779 }
780
781 // expire all intervals that end before the given interval
expireIntervalsAccAssignment782 void expireIntervals(AccInterval* interval)
783 {
784 for (auto iter = activeIntervals.begin(), iterEnd = activeIntervals.end(); iter != iterEnd;)
785 {
786 AccInterval* active = *iter;
787 if (active->lastUse <= interval->inst->getLocalId())
788 {
789 assert(!freeAccs[active->assignedAcc] && "active interval's acc should not be free");
790 freeAccs[active->assignedAcc] = true;
791 if (active->needBothAcc(builder))
792 {
793 assert(!freeAccs[active->assignedAcc + 1] && "active interval's acc should not be free");
794 freeAccs[active->assignedAcc + 1] = true;
795 }
796 iter = activeIntervals.erase(iter);
797 #ifdef DEBUG_VERBOSE_ON
798 std::cerr << "Expire: \t";
799 active->dump();
800 #endif
801 }
802 else
803 {
804 ++iter;
805 }
806 }
807 }
808
809 // spill interval that is assigned to accID and remove it from active list
spillIntervalAccAssignment810 void spillInterval(int accID)
811 {
812 auto acc0Iter = std::find_if(activeIntervals.begin(), activeIntervals.end(),
813 [accID](AccInterval* interval) { return interval->assignedAcc == accID; });
814 assert(acc0Iter != activeIntervals.end() && "expect to find interval with acc0");
815 auto spillInterval = *acc0Iter;
816 assert(!spillInterval->isPreAssigned && "overlapping pre-assigned acc0");
817 spillInterval->assignedAcc = -1;
818 activeIntervals.erase(acc0Iter);
819 freeAccs[accID] = true;
820 if (spillInterval->needBothAcc(builder))
821 {
822 assert(accID % 2 == 0 && "accID must be even-aligned in this case");
823 freeAccs[accID + 1] = true;
824 }
825 }
826
827 // pre-assigned intervals (e.g., mach, addc) must use acc0 (and acc1 depending on inst type/size)
828 // we have to spill active intervals that occupy acc0/acc1.
829 // the pre-assigned interavl is also pushed to active list
handlePreAssignedIntervalAccAssignment830 void handlePreAssignedInterval(AccInterval* interval)
831 {
832 if (!freeAccs[interval->assignedAcc])
833 {
834 spillInterval(interval->assignedAcc);
835 }
836 freeAccs[interval->assignedAcc] = false;
837
838 if (interval->needBothAcc(builder))
839 {
840 assert(interval->assignedAcc == 0 && "Total 2 acc support right now");
841 if (!freeAccs[interval->assignedAcc + 1]) // && activeIntervals.size()
842 {
843 spillInterval(interval->assignedAcc + 1);
844 }
845 freeAccs[interval->assignedAcc + 1] = false;
846 }
847
848 activeIntervals.push_back(interval);
849 }
850
851
assignAccAccAssignment852 bool assignAcc(AccInterval* interval, int startReg, int endReg, int step, unsigned forbidden)
853 {
854 for (int i = startReg; i < endReg; i += step)
855 {
856 if (forbidden & (1<< i))
857 {
858 continue;
859 }
860 if (freeAccs[i] && (!interval->needBothAcc(builder) || freeAccs[i + 1]))
861 {
862 interval->assignedAcc = i;
863 freeAccs[i] = false;
864 if (interval->needBothAcc(builder))
865 {
866 freeAccs[i + 1] = false;
867 }
868
869 activeIntervals.push_back(interval);
870 return true;
871 }
872 }
873
874 return false;
875 }
876
877 // pick a free acc for this interval
878 // returns true if a free acc is found, false otherwise
assignAccAccAssignment879 bool assignAcc(AccInterval* interval)
880 {
881 if (interval->isPreAssigned)
882 {
883 handlePreAssignedInterval(interval);
884 return true;
885 }
886
887 int step = interval->needBothAcc(builder) ? 2 : 1;
888 int startReg = 0;
889 int endReg = 0;
890 unsigned forbidden = 0;
891
892 if (interval->mustBeAcc0)
893 {
894 endReg = 1;
895 }
896 else if (builder.hasDoubleAcc())
897 {
898 // 8 thread mode 4 thread mode
899 //DF acc0-acc3,acc8-acc11 acc0-acc15
900 //F acc0-acc3,acc8-acc11 acc0-acc15
901 //HF acc0-acc3,acc8-acc11 acc0-acc15
902 //Q(UQ) acc0-acc3 acc0-acc7
903 //D(UD) acc0/acc2 acc0/acc2/acc4/acc6
904 //W(UW) acc0/acc2 acc0/acc2/acc4/acc6
905 if (!interval->isAllFloat)
906 {
907 if (builder.kernel.getNumThreads() == 8)
908 {
909 forbidden = 0xFFF0;
910 }
911 else
912 {
913 forbidden = 0xFF00;
914 }
915 }
916 else
917 {
918 if (builder.kernel.getNumThreads() == 8)
919 {
920 forbidden = 0xF0F0;
921 }
922 }
923 endReg = (int)freeAccs.size();
924 }
925 else
926 {
927 endReg = (int)freeAccs.size();
928 }
929
930 if (assignAcc(interval, startReg, endReg, step, forbidden))
931 {
932 return true;
933 }
934
935 return false;
936 }
937 };
938
939
multiAccSub(G4_BB * bb)940 void AccSubPass::multiAccSub(G4_BB* bb)
941 {
942 int numGeneralAcc = kernel.getNumAcc();
943
944 std::vector<AccInterval*> intervals;
945 std::vector<AccInterval*> failIntervals;
946 std::vector<AccInterval*> spillIntervals;
947
948 std::map<G4_INST*, unsigned int> BCInfo;
949
950 if (builder.getPlatform() == XeHP_SDV)
951 {
952 int suppressRegs[4];
953 for (int i = 0; i < 3; i++)
954 {
955 suppressRegs[i] = -1;
956 }
957 suppressRegs[3] = -1;
958
959 //Do bank conflict analysis for the BB
960 for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
961 {
962 G4_INST* inst = *instIter;
963 bankConflictAnalysisTGL(inst, suppressRegs, &BCInfo);
964 }
965 }
966
967 //build intervals for potential acc candidates as well as pre-existing acc uses from mac/mach/addc/etc
968 for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
969 {
970 G4_INST* inst = *instIter;
971 if (inst->defAcc())
972 {
973 // we should only have single def/use acc at this point, so any use would kill the def
974 auto iter = instIter;
975 auto useIter = std::find_if(++iter, instEnd, [](G4_INST* inst) { return inst->useAcc(); });
976 int lastUseId = useIter == instEnd ? bb->back()->getLocalId() : (*useIter)->getLocalId();
977 AccInterval* newInterval = new AccInterval(inst, lastUseId, true);
978 intervals.push_back(newInterval);
979 }
980 else
981 {
982 int lastUseId = 0;
983 bool mustBeAcc0 = false;
984 bool isAllFloat = true;
985 int bundleBCTimes = 0;
986 int bankBCTimes = 0;
987 int readSuppressionSrcs = 0;
988 if (isAccCandidate(inst, lastUseId, mustBeAcc0, isAllFloat, readSuppressionSrcs, bundleBCTimes, bankBCTimes, &BCInfo))
989 {
990 // this is a potential candidate for acc substitution
991 AccInterval* newInterval = new AccInterval(inst, lastUseId);
992 newInterval->mustBeAcc0 = mustBeAcc0;
993 newInterval->isAllFloat = isAllFloat;
994 newInterval->bankConflictTimes = bankBCTimes;
995 newInterval->bundleConflictTimes = bundleBCTimes;
996 newInterval->suppressionTimes = readSuppressionSrcs;
997
998 intervals.push_back(newInterval);
999 }
1000 }
1001 }
1002
1003 //modified linear scan to assign free accs to intervals
1004 AccAssignment accAssign(numGeneralAcc, builder, true);
1005
1006 for (auto interval : intervals)
1007 {
1008 // expire intervals
1009 accAssign.expireIntervals(interval);
1010
1011 // assign interval
1012 bool foundFreeAcc = accAssign.assignAcc(interval);
1013
1014 //Spill
1015 if (!foundFreeAcc && accAssign.activeIntervals.size() != 0)
1016 {
1017 // check if we should spill one of the active intervals
1018 auto spillCostCmp = [interval](AccInterval* intv1, AccInterval* intv2)
1019 {
1020 if (!interval->mustBeAcc0)
1021 {
1022 return intv1->getSpillCost() < intv2->getSpillCost();
1023 }
1024
1025 // different compr function if interval must use acc0
1026 if (intv1->assignedAcc == 0 && intv2->assignedAcc == 0)
1027 {
1028 return intv1->getSpillCost() < intv2->getSpillCost();
1029 }
1030 else if (intv1->assignedAcc == 0)
1031 {
1032 return true;
1033 }
1034 return false;
1035 };
1036 auto spillIter = std::min_element(accAssign.activeIntervals.begin(), accAssign.activeIntervals.end(),
1037 spillCostCmp);
1038 auto spillCandidate = *spillIter;
1039 if (interval->getSpillCost() > spillCandidate->getSpillCost() &&
1040 !spillCandidate->isPreAssigned &&
1041 !(interval->mustBeAcc0 && spillCandidate->assignedAcc != 0))
1042 {
1043 bool tmpAssignValue[2];
1044
1045 tmpAssignValue[0] = accAssign.freeAccs[spillCandidate->assignedAcc];
1046 accAssign.freeAccs[spillCandidate->assignedAcc] = true;
1047 if (spillCandidate->needBothAcc(builder))
1048 {
1049 tmpAssignValue[1] = accAssign.freeAccs[spillCandidate->assignedAcc + 1];
1050 accAssign.freeAccs[spillCandidate->assignedAcc + 1] = true;
1051 }
1052
1053 if (accAssign.assignAcc(interval))
1054 {
1055 #ifdef DEBUG_VERBOSE_ON
1056 std::cerr << "Kicked out: \t";
1057 spillCandidate->dump();
1058 #endif
1059 spillIntervals.push_back(spillCandidate);
1060 spillCandidate->spilledAcc = spillCandidate->assignedAcc;
1061 spillCandidate->lastUse = interval->inst->getLocalId();
1062
1063 spillCandidate->assignedAcc = -1;
1064 accAssign.activeIntervals.erase(spillIter);
1065 }
1066 else
1067 {
1068 accAssign.freeAccs[spillCandidate->assignedAcc] = tmpAssignValue[0];
1069 if (spillCandidate->needBothAcc(builder))
1070 {
1071 accAssign.freeAccs[spillCandidate->assignedAcc + 1] = tmpAssignValue[1];
1072 }
1073 }
1074 }
1075 }
1076
1077 if (interval->assignedAcc == -1)
1078 {
1079 failIntervals.push_back(interval);
1080 }
1081 #ifdef DEBUG_VERBOSE_ON
1082 if (interval->assignedAcc == -1)
1083 {
1084 std::cerr << "Failed: \t";
1085 }
1086 else
1087 {
1088 std::cerr << "Assigned: \t";
1089 }
1090 interval->dump();
1091 #endif
1092 }
1093
1094 //Rescan the spilled and failed cases to do ACC substitution in peephole.
1095 if (failIntervals.size() && spillIntervals.size())
1096 {
1097 for (auto spillInterval : spillIntervals)
1098 {
1099 AccAssignment accAssign(numGeneralAcc, builder, false);
1100 accAssign.freeAccs[spillInterval->spilledAcc] = true;
1101 if (spillInterval->needBothAcc(builder))
1102 {
1103 accAssign.freeAccs[spillInterval->spilledAcc + 1] = true;
1104 }
1105
1106 for (auto failInterval : failIntervals)
1107 {
1108 if (!((spillInterval->inst->getLocalId() <= failInterval->inst->getLocalId()) &&
1109 (failInterval->lastUse <= spillInterval->lastUse)) ||
1110 failInterval->assignedAcc != -1)
1111 {
1112 continue;
1113 }
1114 accAssign.expireIntervals(failInterval);
1115 accAssign.assignAcc(failInterval);
1116 }
1117 }
1118 }
1119
1120 for (auto interval : intervals)
1121 {
1122 if (!interval->isPreAssigned && interval->assignedAcc != -1)
1123 {
1124 G4_INST* inst = interval->inst;
1125 replaceDstWithAcc(inst, interval->assignedAcc);
1126
1127 numAccSubDef++;
1128 numAccSubUse += (int)inst->use_size();
1129 #if 0
1130 std::cout << "Acc sub def inst: \n";
1131 inst->emit(std::cout);
1132 std::cout << "[" << inst->getLocalId() << "]\n";
1133 std::cout << "Uses:\n";
1134 for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
1135 {
1136 auto&& use = *I;
1137 std::cout << "\t";
1138 use.first->emit(std::cout);
1139 std::cout << "[" << use.first->getLocalId() << "]\n";
1140 }
1141 #endif
1142 }
1143 }
1144
1145
1146 for (int i = 0, end = (int)intervals.size(); i < end; ++i)
1147 {
1148 delete intervals[i];
1149 }
1150
1151 return;
1152 }
1153
1154 // substitute local operands with acc when possible
accSub(G4_BB * bb)1155 void AccSubPass::accSub(G4_BB* bb)
1156 {
1157 bb->resetLocalIds();
1158
1159 if (builder.doMultiAccSub())
1160 {
1161 multiAccSub(bb);
1162 return;
1163 }
1164
1165 for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
1166 {
1167 bool canDoAccSub = true;
1168 G4_INST* inst = *instIter;
1169
1170 if (inst->defAcc())
1171 {
1172 // skip ahead till its single use
1173 // we should only have single def/use acc at this point, so any use would
1174 // kill the def
1175 auto iter = instIter;
1176 auto useIter = std::find_if(++iter, instEnd, [](G4_INST* inst) { return inst->useAcc(); });
1177 if (useIter == instEnd)
1178 {
1179 return;
1180 }
1181 instIter = --useIter; // start at the use inst next time
1182 continue;
1183 }
1184
1185 int lastUseId = 0;
1186 bool mustBeAcc0 = false; //ignored
1187 bool isAllFloat = false;
1188 int bundleC = 0;
1189 int bankC = 0;
1190 int suppression = 0;
1191 if (!isAccCandidate(inst, lastUseId, mustBeAcc0, isAllFloat, suppression, bundleC, bankC, nullptr))
1192 {
1193 continue;
1194 }
1195
1196 // don't attempt acc sub if def and last use are too far apart
1197 // this is a crude way to avoid a long running life range from blocking
1198 // other acc sub opportunities
1199 const int accWindow = 25;
1200 if (lastUseId == 0 || lastUseId - inst->getLocalId() > accWindow)
1201 {
1202 continue;
1203 }
1204
1205 // check for intervening acc usage between inst and its last use
1206 auto subIter = instIter;
1207 ++subIter;
1208 for (int instId = inst->getLocalId() + 1; instId != lastUseId; ++subIter, ++instId)
1209 {
1210 G4_INST* anInst = *subIter;
1211 if (anInst->useAcc() || anInst->mayExpandToAccMacro())
1212 {
1213 canDoAccSub = false;
1214 break;
1215 }
1216 }
1217
1218 if (!canDoAccSub)
1219 {
1220 continue;
1221 }
1222 else
1223 {
1224 replaceDstWithAcc(inst, 0);
1225 // advance iter to the last use of the acc
1226 instIter = subIter;
1227 --instIter;
1228
1229 numAccSubDef++;
1230 numAccSubUse += (int)inst->use_size();
1231
1232 #if 0
1233 std::cout << "Acc sub def inst: \n";
1234 inst->emit(std::cout);
1235 std::cout << "[" << inst->getLocalId() << "]\n";
1236 std::cout << "Uses:\n";
1237 for (auto&& use : inst->useInstList)
1238 {
1239 std::cout << "\t";
1240 use.first->emit(std::cout);
1241 std::cout << "[" << use.first->getLocalId() << "]\n";
1242 }
1243 #endif
1244 }
1245 }
1246 }
1247