1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2020-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "AccSubstitution.hpp"
10 
11 #include <cmath>
12 
13 using namespace vISA;
14 
15 struct AccInterval
16 {
17     G4_INST* inst;
18     int lastUse;
19     bool mustBeAcc0 = false;
20     bool isAllFloat = false;
21     bool isPreAssigned = false;
22     int assignedAcc = -1;
23     int spilledAcc = -1;
24     int bundleConflictTimes = 0;
25     int bankConflictTimes = 0;
26     int suppressionTimes = 0;
27 
AccIntervalAccInterval28     AccInterval(G4_INST* inst_, int lastUse_, bool preAssigned = false) :
29         inst(inst_), lastUse(lastUse_), isPreAssigned(preAssigned)
30     {
31         if (isPreAssigned)
32         {
33             mustBeAcc0 = true;
34             assignedAcc = 0;
35         }
36     }
37 
getSpillCostAccInterval38     double getSpillCost() const {
39         if (isPreAssigned)
40         {
41             // don't spill pre-assigned
42             return (double)1000000;
43         }
44         int dist = lastUse - inst->getLocalId();
45 
46         //Bundle conflict has higher priority than bank conflict. Because bundle conflict means bank conflict at the same time.
47         return (std::pow((double)(bundleConflictTimes + 1), 3) + std::pow((double)(bankConflictTimes + 1), 2) + std::pow((double)inst->use_size(), 3) / dist) / (suppressionTimes + 1);
48     }
49 
50     // see if this interval needs both halves of the acc
needBothAccAccInterval51     bool needBothAcc(IR_Builder& builder) const
52     {
53         switch (inst->getDst()->getType())
54         {
55         case Type_F:
56             return inst->getExecSize() == G4_ExecSize(builder.getNativeExecSize() * 2);
57         case Type_HF:
58         case Type_BF:
59             return false;
60         case Type_DF:
61             return inst->getExecSize() > G4_ExecSize(builder.getNativeExecSize() / 2);
62         default:
63             return true;
64         }
65     }
66 
dumpAccInterval67     void dump()
68     {
69         std::cerr << "[" << inst->getLocalId() << ", " << lastUse << "] : ";
70         if (assignedAcc != -1)
71         {
72             std::cerr << "\tAcc" << assignedAcc << "\n";
73         }
74         else
75         {
76             std::cerr << "\n";
77         }
78         std::cerr << "\t";
79         inst->dump();
80     }
81 };
82 
// Register-number sentinel helpers: -1 marks "no register".
// The argument and the whole expansion are parenthesized so that the macros
// behave like a single expression for any argument (e.g. isValidReg(a | b)).
#define setInValidReg(x)   ((x) = -1)
#define isValidReg(x)  ((x) != -1)
85 
// Marks a bundle conflict for source operand i in BC.
// Each operand owns a 3-bit field starting at bit (i * 3); the bundle-conflict
// flag is the lowest bit (0x1) of that field.
static void setBundleConflict(int i, unsigned short& BC)
{
    BC |= static_cast<unsigned short>(0x1 << (i * 3));
}
91 
// Marks a bank conflict for source operand i in BC.
// Each operand owns a 3-bit field starting at bit (i * 3); the bank-conflict
// flag is the middle bit (0x2) of that field.
static void setBankConflict(int i, unsigned short& BC)
{
    BC |= static_cast<unsigned short>(0x2 << (i * 3));
}
97 
// Marks read suppression for source operand i in BC.
// Each operand owns a 3-bit field starting at bit (i * 3); the suppression
// flag is the highest bit (0x4) of that field.
static void setSuppression(int i, unsigned short& BC)
{
    BC |= static_cast<unsigned short>(0x4 << (i * 3));
}
103 
104 /*
105  * Bank conflict types:
106  *  1. any two from same bundle and same bank
107  *  2. all three from same bank
108  */
getConflictTimesForTGL(int * firstRegCandidate,unsigned int & sameBankConflicts,unsigned short & BC)109 static void getConflictTimesForTGL(int* firstRegCandidate, unsigned int& sameBankConflicts, unsigned short& BC)
110 {
111     int bundles[G4_MAX_SRCS];
112     int bankSrcs[G4_MAX_SRCS];
113 
114     for (int i = 0; i < G4_MAX_SRCS; i++)
115     {
116         bundles[i] = -1;
117         bankSrcs[i] = -1;
118         if (isValidReg(firstRegCandidate[i]))
119         {
120             bundles[i] = (firstRegCandidate[i] % 64) / 4;
121             bankSrcs[i] = (firstRegCandidate[i] % 4) / 2;
122         }
123     }
124 
125     int sameBankNum = 0;
126     bool setBundle = false;
127     for (int i = 0; i < G4_MAX_SRCS; i++)
128     {
129         if (bundles[i] != -1)
130         {
131             for (int j = i + 1; j < G4_MAX_SRCS; j++)
132             {
133                 if (bundles[j] != -1)
134                 {
135                     if (bundles[i] == bundles[j] && bankSrcs[i] == bankSrcs[j])  //same bank and same bundle
136                     {
137                         //setBankConflict(i, BC);
138                         setBundleConflict(i, BC);
139                         setBundleConflict(j, BC);
140                         setBundle = true;
141                     }
142                     else if (bankSrcs[i] == bankSrcs[j])  //Different bundle and same bank
143                     {
144                         if (!sameBankNum)
145                         {
146                             sameBankNum += 2;
147                         }
148                         else
149                         {
150                             sameBankNum++;
151                         }
152                     }
153                 }
154             }
155         }
156     }
157 
158     if (!setBundle && sameBankNum > 2)
159     {
160         for (int i = 0; i < G4_MAX_SRCS; i++)
161         {
162             if (bundles[i] != -1)
163             {
164                 setBankConflict(i, BC);
165             }
166         }
167     }
168 
169     return;
170 }
171 
bankConflictAnalysisTGL(G4_INST * inst,int * suppressRegs,std::map<G4_INST *,unsigned int> * BCInfo)172 void bankConflictAnalysisTGL(G4_INST* inst, int* suppressRegs, std::map<G4_INST*, unsigned int>* BCInfo)
173 {
174     if (inst->isSend() || inst->isMath() ||
175         inst->isSWSBSync() || inst->isLabel() ||
176         inst->isWait() ||
177         inst->isReturn() || inst->isCall())
178     {
179         for (int i = 0; i < 3; i++)
180         {
181             setInValidReg(suppressRegs[i]);
182         }
183         setInValidReg(suppressRegs[3]);
184 
185         return;
186     }
187 
188     int dstRegs[2];
189     int dstExecSize = 0;
190     int srcRegs[2][G4_MAX_SRCS];
191     int srcExecSize[G4_MAX_SRCS];
192     bool isScalar[G4_MAX_SRCS];
193 
194     int firstRegCandidate[G4_MAX_SRCS];
195     int secondRegCandidate[G4_MAX_SRCS];
196 
197     int candidateNum = 0;
198     unsigned int sameBankConflictTimes = 0;
199 
200     //Initialization
201     for (int i = 0; i < G4_MAX_SRCS; i++)
202     {
203         setInValidReg(firstRegCandidate[i]);
204         setInValidReg(secondRegCandidate[i]);
205         setInValidReg(srcRegs[0][i]);
206         setInValidReg(srcRegs[1][i]);
207         isScalar[i] = false;
208     }
209     setInValidReg(dstRegs[0]);
210     setInValidReg(dstRegs[1]);
211 
212     bool instSplit = false;
213 
214     //Get Dst registers
215     G4_DstRegRegion* dstOpnd = inst->getDst();
216     if (dstOpnd && !dstOpnd->isIndirect() && dstOpnd->isGreg())
217     {
218         dstExecSize = dstOpnd->getLinearizedEnd() - dstOpnd->getLinearizedStart() + 1;
219         uint32_t byteAddress = dstOpnd->getLinearizedStart();
220         dstRegs[0] = byteAddress / numEltPerGRF<Type_UB>();
221         if (dstExecSize > 32)
222         {
223             dstRegs[1] = dstRegs[0] + (dstExecSize + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
224             instSplit = true;
225         }
226     }
227 
228     //Get src
229     for (unsigned i = 0, size = inst->getNumSrc(); i < size; i++)
230     {
231         G4_Operand* srcOpnd = inst->getSrc(i);
232         if (srcOpnd)
233         {
234             if (srcOpnd->isSrcRegRegion() &&
235                 srcOpnd->asSrcRegRegion()->getBase() &&
236                 srcOpnd->asSrcRegRegion()->getBase()->isRegVar())
237             {
238                 G4_RegVar* baseVar = static_cast<G4_RegVar*>(srcOpnd->asSrcRegRegion()->getBase());
239                 srcExecSize[i] = srcOpnd->getLinearizedEnd() - srcOpnd->getLinearizedStart() + 1;
240                 if (baseVar->isGreg()) {
241                     uint32_t byteAddress = srcOpnd->getLinearizedStart();
242                     srcRegs[0][i] = byteAddress / numEltPerGRF<Type_UB>();
243 
244                     if (srcExecSize[i] > 32)
245                     {
246                         srcRegs[1][i] = srcRegs[0][i] + (srcExecSize[i] + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>() - 1;
247                         instSplit = true;
248                     }
249                     else if (srcOpnd->asSrcRegRegion()->isScalar()) //No Read suppression for SIMD 16/scalar src
250                     {
251                         srcRegs[1][i] = srcRegs[0][i];
252                         isScalar[i] = true;
253                     }
254                     else
255                     {
256                         setInValidReg(srcRegs[1][i]);
257                     }
258                 }
259             }
260         }
261     }
262 
263     //Read Suppression for current instruction
264     for (int i = 0; i < 3; i++)
265     {
266         unsigned short BC = 0;
267 
268         if (isValidReg(suppressRegs[i]) &&
269             srcRegs[0][i] == suppressRegs[i] && !isScalar[i])
270         {
271             if (inst->opcode() == G4_mad && i == 1)
272             {
273                 setSuppression(i, BC);
274                 (*BCInfo)[inst] |= BC;
275             }
276             setInValidReg(srcRegs[0][i]);
277         }
278         else
279         {
280             suppressRegs[i] = srcRegs[0][i];
281         }
282 
283         if (i == 1) //src1
284         {
285             if (isValidReg(suppressRegs[3]) &&
286                 srcRegs[1][i] == suppressRegs[3] && !isScalar[i])
287             {
288                 setInValidReg(srcRegs[1][i]);
289             }
290             else
291             {
292                 suppressRegs[3] = srcRegs[1][i];
293             }
294         }
295 
296     }
297 
298     //Kill all previous read suppression candiadte if it wrote in DST
299     if (isValidReg(dstRegs[0]))
300     {
301         for (int i = 0; i < 4; i++)
302         {
303             if (suppressRegs[i] == dstRegs[0])
304             {
305                 setInValidReg(suppressRegs[i]);
306             }
307         }
308     }
309 
310     if (isValidReg(dstRegs[1]))
311     {
312         for (int i = 0; i < 4; i++)
313         {
314             if (suppressRegs[i] == dstRegs[0])
315             {
316                 setInValidReg(suppressRegs[i]);
317             }
318         }
319     }
320 
321 
322     for (int i = 0; i < G4_MAX_SRCS; i++)
323     {
324         if (isValidReg(srcRegs[0][i]))
325         {
326             firstRegCandidate[i] = srcRegs[0][i];
327             candidateNum++;
328         }
329     }
330 
331     unsigned short BC0 = 0;
332     if (candidateNum > 1)
333     {
334         getConflictTimesForTGL(firstRegCandidate, sameBankConflictTimes, BC0);
335         (*BCInfo)[inst] |= BC0;
336     }
337 
338     if (instSplit)
339     {
340         candidateNum = 0;
341         for (int i = 0; i < G4_MAX_SRCS; i++)
342         {
343             if (isValidReg(srcRegs[1][i]))
344             {
345                 secondRegCandidate[i] = srcRegs[1][i];
346                 candidateNum++;
347             }
348         }
349 
350         if (candidateNum > 1)
351         {
352             unsigned short BC = 0;
353             getConflictTimesForTGL(secondRegCandidate, sameBankConflictTimes, BC);
354             if (BC != 0)
355             {
356                 (*BCInfo)[inst] |= ((unsigned int)BC) << 16;
357             }
358         }
359     }
360 
361     return;
362 }
363 
/*
 *   Layout of the unsigned int conflict info BC:
 *   The low unsigned short holds the conflict info for the GRF of a 1-GRF operand, or for the first GRF of a 2-GRF operand.
 *   The high unsigned short holds the conflict info for the second GRF of a 2-GRF operand.
 *   Each source operand i uses a 3-bit field starting at bit (i * 3) of its unsigned short:
 *   bit 0 (0x1) marks a bundle conflict, bit 1 (0x2) marks a bank conflict and bit 2 (0x4) marks read suppression.
 */
// Counts how many halves of BC (low 16 bits, high 16 bits) have the
// read-suppression flag (0x4 of the 3-bit field) set for operand srcOpndIdx.
// The result is 0, 1 or 2.
static unsigned getSuppression(int srcOpndIdx, unsigned int BC)
{
    const int fieldShift = srcOpndIdx * 3;
    const unsigned short halves[2] = {
        static_cast<unsigned short>(BC & 0x0000FFFF),
        static_cast<unsigned short>(BC >> 16)
    };

    unsigned hits = 0;
    for (unsigned short half : halves)
    {
        if ((half >> fieldShift) & 0x4)
        {
            ++hits;
        }
    }
    return hits;
}
387 
// Counts how many halves of BC (low 16 bits, high 16 bits) have the
// bundle-conflict flag (0x1 of the 3-bit field) set for operand srcOpndIdx.
// The result is 0, 1 or 2.
static unsigned getBundleConflicts(int srcOpndIdx, unsigned int BC)
{
    const int fieldShift = srcOpndIdx * 3;
    const unsigned short lowHalf = static_cast<unsigned short>(BC & 0x0000FFFF);
    const unsigned short highHalf = static_cast<unsigned short>(BC >> 16);

    auto flagged = [fieldShift](unsigned short half)
    {
        return ((half >> fieldShift) & 0x1) != 0;
    };

    return (flagged(lowHalf) ? 1u : 0u) + (flagged(highHalf) ? 1u : 0u);
}
404 
// Counts how many halves of BC (low 16 bits, high 16 bits) have the
// bank-conflict flag (0x2 of the 3-bit field) set for operand srcOpndIdx.
// The result is 0, 1 or 2.
static unsigned getBankConflicts(int srcOpndIdx, unsigned int BC)
{
    const int fieldShift = srcOpndIdx * 3;
    unsigned total = 0;

    // first GRF: low 16 bits
    const unsigned short firstGrfBits = static_cast<unsigned short>(BC & 0x0000FFFF);
    total += ((firstGrfBits >> fieldShift) & 0x2) ? 1u : 0u;

    // second GRF: high 16 bits
    const unsigned short secondGrfBits = static_cast<unsigned short>(BC >> 16);
    total += ((secondGrfBits >> fieldShift) & 0x2) ? 1u : 0u;

    return total;
}
421 
422 // returns true if the inst is a candidate for acc substitution
423 // lastUse is also update to point to the last use id of the inst
isAccCandidate(G4_INST * inst,int & lastUse,bool & mustBeAcc0,bool & isAllFloat,int & readSuppressionSrcs,int & bundleBC,int & bankBC,std::map<G4_INST *,unsigned int> * BCInfo)424 bool AccSubPass::isAccCandidate(G4_INST* inst, int& lastUse, bool& mustBeAcc0, bool& isAllFloat, int& readSuppressionSrcs, int& bundleBC, int& bankBC, std::map<G4_INST*, unsigned int>* BCInfo)
425 {
426     mustBeAcc0 = false;
427     isAllFloat =  true;
428     G4_DstRegRegion* dst = inst->getDst();
429     if (!dst || kernel.fg.globalOpndHT.isOpndGlobal(dst) || !inst->canDstBeAcc())
430     {
431         return false;
432     }
433 
434     if (!IS_TYPE_FLOAT_FOR_ACC(dst->getType()))
435     {
436         isAllFloat = false;
437     }
438 
439     if (inst->getCondMod() && inst->opcode() != G4_sel)
440     {
441         // since our du-chain is on inst instead of operand, the presence of conditional modifier complicates the checks later.
442         // This is somewhat conservative but shouldn't matter too much as inst with both dst and conditional modifiers are rare.
443         // Exception is for sel as flag register is not updated.
444         return false;
445     }
446 
447     // check that every use may be replaced with acc
448     int lastUseId = 0;
449     std::vector<G4_INST*> madSrc0Use;
450     std::vector<G4_INST*> threeSrcUses; //3src inst that use this dst
451     for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
452     {
453         auto&& use = *I;
454         G4_INST* useInst = use.first;
455         Gen4_Operand_Number opndNum = use.second;
456         lastUseId = std::max(lastUseId, useInst->getLocalId());
457         // acc may be src0 of two-source inst or src1 of three-source inst
458         // ToDo: may swap source here
459         if (useInst->getNumSrc() == 3)
460         {
461             unsigned int BC = 0;
462             if (BCInfo != nullptr)
463             {
464                 auto itR = BCInfo->find(useInst);
465                 if (itR != BCInfo->end())
466                     BC = itR->second;
467             }
468 
469             if (!kernel.fg.builder->relaxedACCRestrictions() &&
470                 std::find(threeSrcUses.begin(), threeSrcUses.end(), useInst) != threeSrcUses.end())
471             {
472                 // don't allow acc to appear twice in a 3-src inst
473                 return false;
474             }
475             threeSrcUses.push_back(useInst);
476             switch (opndNum)
477             {
478             case Opnd_src2:
479                 if (!kernel.fg.builder->relaxedACCRestrictions3())
480                 {
481                     return false;
482                 }
483                 if (!IS_TYPE_FLOAT_FOR_ACC(useInst->getSrc(2)->getType()) ||
484                    (useInst->getDst() && !IS_TYPE_FLOAT_FOR_ACC(useInst->getDst()->getType())))
485                 {
486                     return false;
487                 }
488                 break;
489             case Opnd_src1:
490                 if (BC)
491                 {
492                     bundleBC += getBundleConflicts(1, BC);
493                     bankBC += getBankConflicts(1, BC);
494                     readSuppressionSrcs += getSuppression(1, BC);
495                 }
496                 break;  //OK
497 
498             case Opnd_src0:
499                 if (BC)
500                 {
501                     bundleBC += getBundleConflicts(0, BC);
502                     bankBC += getBankConflicts(0, BC);
503                     readSuppressionSrcs += getSuppression(0, BC);
504                 }
505 
506                 if (kernel.fg.builder->canMadHaveSrc0Acc())
507                 {
508                     // OK
509                 }
510                 else if (useInst->opcode() == G4_mad)
511                 {
512                     // we can turn this mad into a mac
513                     mustBeAcc0 = true;
514                     if (useInst->getSrc(0)->getType() == Type_HF && useInst->getMaskOffset() == 16)
515                     {
516                         // we must use acc1, and need to check that inst does not have an acc0 source
517                         // so that dst and src won't have different acc source
518                         if (inst->isAccSrcInst())
519                         {
520                             bool hasAcc0Src = false;
521                             auto isAcc0 = [](G4_SrcRegRegion* src)
522                             {
523                                 return src->getBase()->asAreg()->getArchRegType() == AREG_ACC0;
524                             };
525                             if (inst->getSrc(0)->isSrcRegRegion() &&
526                                 inst->getSrc(0)->asSrcRegRegion()->getBase()->isAccReg())
527                             {
528                                 hasAcc0Src = isAcc0(inst->getSrc(0)->asSrcRegRegion());
529                             }
530                             else if (inst->getSrc(1)->isSrcRegRegion() &&
531                                 inst->getSrc(1)->asSrcRegRegion()->getBase()->isAccReg())
532                             {
533                                 hasAcc0Src = isAcc0(inst->getSrc(1)->asSrcRegRegion());
534                             }
535                             if (hasAcc0Src)
536                             {
537                                 return false;
538                             }
539                         }
540                     }
541                     madSrc0Use.push_back(useInst);
542                 }
543                 else
544                 {
545                     return false;
546                 }
547                 break;
548             default:
549                 return false;
550             }
551         }
552         else if (!builder.relaxedACCRestrictions() && opndNum != Opnd_src0)
553         {
554             return false;
555         }
556 
557         if (useInst->getSingleDef(opndNum) == nullptr)
558         {
559             // def must be the only define for this use
560             return false;
561         }
562 
563         int srcId = useInst->getSrcNum(opndNum);
564         G4_Operand* src = useInst->getSrc(srcId);
565 
566         if (dst->getType() != src->getType() || kernel.fg.globalOpndHT.isOpndGlobal(src) ||
567             dst->compareOperand(src) != Rel_eq)
568         {
569             return false;
570         }
571         if (!useInst->canSrcBeAcc(opndNum))
572         {
573             return false;
574         }
575         if (!IS_TYPE_FLOAT_FOR_ACC(src->getType()))
576         {
577             isAllFloat = false;
578         }
579     }
580 
581     // we have to avoid the case where the dst is used as both src0 and src1 of a mad
582     for (auto madUse : madSrc0Use)
583     {
584         for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
585         {
586             auto&& use = *I;
587             G4_INST* useInst = use.first;
588             Gen4_Operand_Number opndNum = use.second;
589             if (madUse == useInst && opndNum == Opnd_src1)
590             {
591                 return false;
592             }
593         }
594     }
595 
596     if (lastUseId == 0)
597     {
598         // no point using acc for a dst without local uses
599         return false;
600     }
601 
602     lastUse = lastUseId;
603     return true;
604 }
605 
606 // replace an inst's dst and all of its (local) uses with acc
607 // note that this may fail due to HW restrictions on acc
replaceDstWithAcc(G4_INST * inst,int accNum)608 bool AccSubPass::replaceDstWithAcc(G4_INST* inst, int accNum)
609 {
610     G4_DstRegRegion* dst = inst->getDst();
611     bool useAcc1 = (accNum & 0x1) != 0;
612     accNum &= ~0x1;
613 
614     if (!builder.relaxedACCRestrictions())
615     {
616         auto myAcc = useAcc1 ? AREG_ACC1 : AREG_ACC0;
617         // check that dst and src do not have different accumulator
618         for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
619         {
620             if (inst->getSrc(i)->isAccReg())
621             {
622                 auto base = inst->getSrc(i)->asSrcRegRegion()->getBase();
623                 if (base->isPhyAreg())
624                 {
625                     if (base->asAreg()->getArchRegType() != myAcc)
626                     {
627                         return false;
628                     }
629                 }
630             }
631         }
632     }
633 
634     for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
635     {
636         auto&& use = *I;
637         G4_INST* useInst = use.first;
638         if (!builder.canMadHaveSrc0Acc() && useInst->opcode() == G4_mad && use.second == Opnd_src0)
639         {
640             // if we are replacing mad with mac, additionally check if acc1 needs to be used
641             if (useInst->getMaskOffset() == 16 && dst->getType() == Type_HF)
642             {
643                 if (builder.doMultiAccSub())
644                 {
645                     // this is not legal since acc1 may be taken by another interval already
646                     return false;
647                 }
648                 useAcc1 = true;
649             }
650         }
651 
652         if (builder.relaxedACCRestrictions())
653         {
654             // mul/mac can't have both sources be acc
655             // Note that we only need to check for explicit mac here since we will not change mad to mac
656             if (!builder.relaxedACCRestrictions3() && (useInst->opcode() == G4_mul || useInst->opcode() == G4_mac))
657             {
658                 if (useInst->getSrc(0)->isAccReg() || useInst->getSrc(1)->isAccReg() ||
659                     useInst->getSrc(0)->compareOperand(useInst->getSrc(1)) == G4_CmpRelation::Rel_eq)
660                 {
661                     return false;
662                 }
663             }
664             else if (builder.relaxedACCRestrictions3() && useInst->opcode() == G4_mul)
665             {
666                 if (!IS_TYPE_FLOAT_FOR_ACC(useInst->getDst()->getType()) ||
667                     !IS_TYPE_FLOAT_FOR_ACC(useInst->getSrc(0)->getType()) ||
668                     !IS_TYPE_FLOAT_FOR_ACC(useInst->getSrc(1)->getType()))
669                 {
670                     return false;
671                 }
672             }
673         }
674         else
675         {
676             // do not allow an inst to have multiple acc source operands
677             if (useInst->getNumSrc() == 3)
678             {
679                 if (useInst->getSrc(0)->isAccReg() || useInst->getSrc(1)->isAccReg())
680                 {
681                     return false;
682                 }
683             }
684             else if (useInst->opcode() == G4_mac)
685             {
686                 // this can happen if we have to convert mad into mac (some platforms don't allow
687                 // src0 acc for mad), and the mad's src1 is also an acc candidate.
688                 return false;
689             }
690         }
691     }
692 
693     // at this point acc substitution must succeed
694 
695     G4_Areg* accReg = useAcc1 ? builder.phyregpool.getAcc1Reg() : builder.phyregpool.getAcc0Reg();
696     G4_DstRegRegion* accDst = builder.createDst(accReg,
697         (short)accNum, 0, 1, dst->getType());
698     accDst->setAccRegSel(inst->getDst()->getAccRegSel());
699     inst->setDest(accDst);
700     for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
701     {
702         auto&& use = *I;
703         G4_INST* useInst = use.first;
704         int srcId = useInst->getSrcNum(use.second);
705         G4_SrcRegRegion* oldSrc = useInst->getSrc(srcId)->asSrcRegRegion();
706         G4_SrcRegRegion* accSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct,
707             accReg, (short)accNum, 0, builder.getRegionStride1(), dst->getType());
708         accSrc->setAccRegSel(oldSrc->getAccRegSel());
709 
710         bool canReplaceToMac = useInst->opcode() == G4_mad && srcId == 0 && !builder.canMadHaveSrc0Acc();
711         if (canReplaceToMac && builder.noDFTypeMac()) {
712             // dst and all src cannot be DF
713             if ((useInst->getDst() && IS_DFTYPE(useInst->getDst()->getType())) ||
714                 (useInst->getSrc(0) && IS_DFTYPE(useInst->getSrc(0)->getType())) ||
715                 (useInst->getSrc(1) && IS_DFTYPE(useInst->getSrc(1)->getType())) ||
716                 (useInst->getSrc(2) && IS_DFTYPE(useInst->getSrc(2)->getType())))
717                 canReplaceToMac = false;
718         }
719 
720         if (canReplaceToMac)
721         {
722             // change mad to mac as src0 of 3-src does not support acc
723             auto updateDefSrcPos = [](G4_INST* useInst, Gen4_Operand_Number origPos)
724             {
725                 for (auto DI = useInst->def_begin(), DE = useInst->def_end(); DI != DE; ++DI)
726                 {
727                     auto&& def = *DI;
728                     if (def.second == origPos)
729                     {
730                         for (auto UI = def.first->use_begin(), UE = def.first->use_end(); UI != UE; ++UI)
731                         {
732                             auto& use = *UI;
733                             if (use.first == useInst && use.second == origPos)
734                             {
735                                 switch (use.second)
736                                 {
737                                 case Opnd_src1:
738                                     use.second = Opnd_src0;
739                                     break;
740                                 case Opnd_src2:
741                                     use.second = Opnd_src1;
742                                     break;
743                                 default:
744                                     assert(false && "unexpectd src pos");
745                                 }
746                             }
747                         }
748                     }
749                 }
750             };
751             assert(accNum == 0 && "mad src0 may only use acc0");
752             G4_Operand* macSrc0 = useInst->getSrc(1);
753             updateDefSrcPos(useInst, Opnd_src1);
754             G4_Operand* macSrc1 = useInst->getSrc(2);
755             updateDefSrcPos(useInst, Opnd_src2);
756             useInst->setSrc(macSrc0, 0);
757             useInst->setSrc(macSrc1, 1);
758             useInst->setOpcode(G4_mac);
759             useInst->setImplAccSrc(accSrc);
760         }
761         else
762         {
763             useInst->setSrc(accSrc, srcId);
764         }
765     }
766 
767     return true;
768 }
769 
770 struct AccAssignment
771 {
772     std::vector<bool> freeAccs;
773     std::list<AccInterval*> activeIntervals;
774     IR_Builder& builder;
775 
AccAssignmentAccAssignment776     AccAssignment(int numGeneralAcc, IR_Builder& m_builder, bool initToTrue) : builder(m_builder)
777     {
778         freeAccs.resize(numGeneralAcc, initToTrue);
779     }
780 
781     // expire all intervals that end before the given interval
expireIntervalsAccAssignment782     void expireIntervals(AccInterval* interval)
783     {
784         for (auto iter = activeIntervals.begin(), iterEnd = activeIntervals.end(); iter != iterEnd;)
785         {
786             AccInterval* active = *iter;
787             if (active->lastUse <= interval->inst->getLocalId())
788             {
789                 assert(!freeAccs[active->assignedAcc] && "active interval's acc should not be free");
790                 freeAccs[active->assignedAcc] = true;
791                 if (active->needBothAcc(builder))
792                 {
793                     assert(!freeAccs[active->assignedAcc + 1] && "active interval's acc should not be free");
794                     freeAccs[active->assignedAcc + 1] = true;
795                 }
796                 iter = activeIntervals.erase(iter);
797 #ifdef DEBUG_VERBOSE_ON
798                 std::cerr << "Expire:     \t";
799                 active->dump();
800 #endif
801             }
802             else
803             {
804                 ++iter;
805             }
806         }
807     }
808 
809     // spill interval that is assigned to accID and remove it from active list
spillIntervalAccAssignment810     void spillInterval(int accID)
811     {
812         auto acc0Iter = std::find_if(activeIntervals.begin(), activeIntervals.end(),
813             [accID](AccInterval* interval) { return interval->assignedAcc == accID; });
814         assert(acc0Iter != activeIntervals.end() && "expect to find interval with acc0");
815         auto spillInterval = *acc0Iter;
816         assert(!spillInterval->isPreAssigned && "overlapping pre-assigned acc0");
817         spillInterval->assignedAcc = -1;
818         activeIntervals.erase(acc0Iter);
819         freeAccs[accID] = true;
820         if (spillInterval->needBothAcc(builder))
821         {
822             assert(accID % 2 == 0 && "accID must be even-aligned in this case");
823             freeAccs[accID + 1] = true;
824         }
825     }
826 
827     // pre-assigned intervals (e.g., mach, addc) must use acc0 (and acc1 depending on inst type/size)
828     // we have to spill active intervals that occupy acc0/acc1.
829     // the pre-assigned interavl is also pushed to active list
handlePreAssignedIntervalAccAssignment830     void handlePreAssignedInterval(AccInterval* interval)
831     {
832         if (!freeAccs[interval->assignedAcc])
833         {
834             spillInterval(interval->assignedAcc);
835         }
836         freeAccs[interval->assignedAcc] = false;
837 
838         if (interval->needBothAcc(builder))
839         {
840             assert(interval->assignedAcc == 0 && "Total 2 acc support right now");
841             if (!freeAccs[interval->assignedAcc + 1]) // && activeIntervals.size()
842             {
843                 spillInterval(interval->assignedAcc + 1);
844             }
845             freeAccs[interval->assignedAcc + 1] = false;
846         }
847 
848         activeIntervals.push_back(interval);
849     }
850 
851 
assignAccAccAssignment852     bool assignAcc(AccInterval* interval, int startReg, int endReg, int step, unsigned forbidden)
853     {
854         for (int i = startReg; i < endReg; i += step)
855         {
856             if (forbidden & (1<< i))
857             {
858                 continue;
859             }
860             if (freeAccs[i] && (!interval->needBothAcc(builder) || freeAccs[i + 1]))
861             {
862                 interval->assignedAcc = i;
863                 freeAccs[i] = false;
864                 if (interval->needBothAcc(builder))
865                 {
866                     freeAccs[i + 1] = false;
867                 }
868 
869                 activeIntervals.push_back(interval);
870                 return true;
871             }
872         }
873 
874         return false;
875     }
876 
877     // pick a free acc for this interval
878     // returns true if a free acc is found, false otherwise
assignAccAccAssignment879     bool assignAcc(AccInterval* interval)
880     {
881         if (interval->isPreAssigned)
882         {
883             handlePreAssignedInterval(interval);
884             return true;
885         }
886 
887         int step = interval->needBothAcc(builder) ? 2 : 1;
888         int startReg = 0;
889         int endReg = 0;
890         unsigned forbidden = 0;
891 
892         if (interval->mustBeAcc0)
893         {
894             endReg = 1;
895         }
896         else if (builder.hasDoubleAcc())
897         {
898             //      8 thread mode	        4 thread mode
899             //DF    acc0-acc3,acc8-acc11  acc0-acc15
900             //F     acc0-acc3,acc8-acc11  acc0-acc15
901             //HF    acc0-acc3,acc8-acc11  acc0-acc15
902             //Q(UQ) acc0-acc3             acc0-acc7
903             //D(UD) acc0/acc2	          acc0/acc2/acc4/acc6
904             //W(UW) acc0/acc2             acc0/acc2/acc4/acc6
905             if (!interval->isAllFloat)
906             {
907                 if (builder.kernel.getNumThreads() == 8)
908                 {
909                     forbidden = 0xFFF0;
910                 }
911                 else
912                 {
913                     forbidden = 0xFF00;
914                 }
915             }
916             else
917             {
918                 if (builder.kernel.getNumThreads() == 8)
919                 {
920                     forbidden = 0xF0F0;
921                 }
922             }
923             endReg = (int)freeAccs.size();
924         }
925         else
926         {
927             endReg = (int)freeAccs.size();
928         }
929 
930         if (assignAcc(interval, startReg, endReg, step, forbidden))
931         {
932             return true;
933         }
934 
935         return false;
936     }
937 };
938 
939 
multiAccSub(G4_BB * bb)940 void AccSubPass::multiAccSub(G4_BB* bb)
941 {
942     int numGeneralAcc = kernel.getNumAcc();
943 
944     std::vector<AccInterval*> intervals;
945     std::vector<AccInterval*> failIntervals;
946     std::vector<AccInterval*> spillIntervals;
947 
948     std::map<G4_INST*, unsigned int> BCInfo;
949 
950     if (builder.getPlatform() == XeHP_SDV)
951     {
952         int suppressRegs[4];
953         for (int i = 0; i < 3; i++)
954         {
955             suppressRegs[i] = -1;
956         }
957         suppressRegs[3] = -1;
958 
959         //Do bank conflict analysis for the BB
960         for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
961         {
962             G4_INST* inst = *instIter;
963             bankConflictAnalysisTGL(inst, suppressRegs, &BCInfo);
964         }
965     }
966 
967     //build intervals for potential acc candidates as well as pre-existing acc uses from mac/mach/addc/etc
968     for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
969     {
970         G4_INST* inst = *instIter;
971         if (inst->defAcc())
972         {
973             // we should only have single def/use acc at this point, so any use would kill the def
974             auto iter = instIter;
975             auto useIter = std::find_if(++iter, instEnd, [](G4_INST* inst) { return inst->useAcc(); });
976             int lastUseId = useIter == instEnd ? bb->back()->getLocalId() : (*useIter)->getLocalId();
977             AccInterval* newInterval = new AccInterval(inst, lastUseId, true);
978             intervals.push_back(newInterval);
979         }
980         else
981         {
982             int lastUseId = 0;
983             bool mustBeAcc0 = false;
984             bool isAllFloat = true;
985             int bundleBCTimes = 0;
986             int bankBCTimes = 0;
987             int readSuppressionSrcs = 0;
988             if (isAccCandidate(inst, lastUseId, mustBeAcc0, isAllFloat, readSuppressionSrcs, bundleBCTimes, bankBCTimes, &BCInfo))
989             {
990                 // this is a potential candidate for acc substitution
991                 AccInterval* newInterval = new AccInterval(inst, lastUseId);
992                 newInterval->mustBeAcc0 = mustBeAcc0;
993                 newInterval->isAllFloat = isAllFloat;
994                 newInterval->bankConflictTimes = bankBCTimes;
995                 newInterval->bundleConflictTimes = bundleBCTimes;
996                 newInterval->suppressionTimes = readSuppressionSrcs;
997 
998                 intervals.push_back(newInterval);
999             }
1000         }
1001     }
1002 
1003     //modified linear scan to assign free accs to intervals
1004     AccAssignment accAssign(numGeneralAcc, builder, true);
1005 
1006     for (auto interval : intervals)
1007     {
1008         // expire intervals
1009         accAssign.expireIntervals(interval);
1010 
1011         // assign interval
1012         bool foundFreeAcc = accAssign.assignAcc(interval);
1013 
1014         //Spill
1015         if (!foundFreeAcc && accAssign.activeIntervals.size() != 0)
1016         {
1017             // check if we should spill one of the active intervals
1018             auto spillCostCmp = [interval](AccInterval* intv1, AccInterval* intv2)
1019             {
1020                 if (!interval->mustBeAcc0)
1021                 {
1022                     return intv1->getSpillCost() < intv2->getSpillCost();
1023                 }
1024 
1025                 // different compr function if interval must use acc0
1026                 if (intv1->assignedAcc == 0 && intv2->assignedAcc == 0)
1027                 {
1028                     return intv1->getSpillCost() < intv2->getSpillCost();
1029                 }
1030                 else if (intv1->assignedAcc == 0)
1031                 {
1032                     return true;
1033                 }
1034                 return false;
1035             };
1036             auto spillIter = std::min_element(accAssign.activeIntervals.begin(), accAssign.activeIntervals.end(),
1037                 spillCostCmp);
1038             auto spillCandidate = *spillIter;
1039             if (interval->getSpillCost() > spillCandidate->getSpillCost() &&
1040                 !spillCandidate->isPreAssigned &&
1041                 !(interval->mustBeAcc0 && spillCandidate->assignedAcc != 0))
1042             {
1043                 bool tmpAssignValue[2];
1044 
1045                 tmpAssignValue[0] = accAssign.freeAccs[spillCandidate->assignedAcc];
1046                 accAssign.freeAccs[spillCandidate->assignedAcc] = true;
1047                 if (spillCandidate->needBothAcc(builder))
1048                 {
1049                     tmpAssignValue[1] = accAssign.freeAccs[spillCandidate->assignedAcc + 1];
1050                     accAssign.freeAccs[spillCandidate->assignedAcc + 1] = true;
1051                 }
1052 
1053                 if (accAssign.assignAcc(interval))
1054                 {
1055 #ifdef DEBUG_VERBOSE_ON
1056                     std::cerr << "Kicked out:  \t";
1057                     spillCandidate->dump();
1058 #endif
1059                     spillIntervals.push_back(spillCandidate);
1060                     spillCandidate->spilledAcc = spillCandidate->assignedAcc;
1061                     spillCandidate->lastUse = interval->inst->getLocalId();
1062 
1063                     spillCandidate->assignedAcc = -1;
1064                     accAssign.activeIntervals.erase(spillIter);
1065                 }
1066                 else
1067                 {
1068                     accAssign.freeAccs[spillCandidate->assignedAcc] = tmpAssignValue[0];
1069                     if (spillCandidate->needBothAcc(builder))
1070                     {
1071                         accAssign.freeAccs[spillCandidate->assignedAcc + 1] = tmpAssignValue[1];
1072                     }
1073                 }
1074             }
1075         }
1076 
1077         if (interval->assignedAcc == -1)
1078         {
1079             failIntervals.push_back(interval);
1080         }
1081 #ifdef DEBUG_VERBOSE_ON
1082         if (interval->assignedAcc == -1)
1083         {
1084             std::cerr << "Failed:    \t";
1085         }
1086         else
1087         {
1088             std::cerr << "Assigned:   \t";
1089         }
1090         interval->dump();
1091 #endif
1092     }
1093 
1094     //Rescan the spilled and failed cases to do ACC substitution in peephole.
1095     if (failIntervals.size() && spillIntervals.size())
1096     {
1097         for (auto spillInterval : spillIntervals)
1098         {
1099             AccAssignment accAssign(numGeneralAcc, builder, false);
1100             accAssign.freeAccs[spillInterval->spilledAcc] = true;
1101             if (spillInterval->needBothAcc(builder))
1102             {
1103                 accAssign.freeAccs[spillInterval->spilledAcc + 1] = true;
1104             }
1105 
1106             for (auto failInterval : failIntervals)
1107             {
1108                 if (!((spillInterval->inst->getLocalId() <= failInterval->inst->getLocalId()) &&
1109                     (failInterval->lastUse <= spillInterval->lastUse)) ||
1110                     failInterval->assignedAcc != -1)
1111                 {
1112                     continue;
1113                 }
1114                 accAssign.expireIntervals(failInterval);
1115                 accAssign.assignAcc(failInterval);
1116             }
1117         }
1118     }
1119 
1120     for (auto interval : intervals)
1121     {
1122         if (!interval->isPreAssigned && interval->assignedAcc != -1)
1123         {
1124             G4_INST* inst = interval->inst;
1125             replaceDstWithAcc(inst, interval->assignedAcc);
1126 
1127             numAccSubDef++;
1128             numAccSubUse += (int)inst->use_size();
1129 #if 0
1130             std::cout << "Acc sub def inst: \n";
1131             inst->emit(std::cout);
1132             std::cout << "[" << inst->getLocalId() << "]\n";
1133             std::cout << "Uses:\n";
1134             for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
1135             {
1136                 auto&& use = *I;
1137                 std::cout << "\t";
1138                 use.first->emit(std::cout);
1139                 std::cout << "[" << use.first->getLocalId() << "]\n";
1140             }
1141 #endif
1142         }
1143     }
1144 
1145 
1146     for (int i = 0, end = (int)intervals.size(); i < end; ++i)
1147     {
1148         delete intervals[i];
1149     }
1150 
1151     return;
1152 }
1153 
1154 // substitute local operands with acc when possible
accSub(G4_BB * bb)1155 void AccSubPass::accSub(G4_BB* bb)
1156 {
1157     bb->resetLocalIds();
1158 
1159     if (builder.doMultiAccSub())
1160     {
1161         multiAccSub(bb);
1162         return;
1163     }
1164 
1165     for (auto instIter = bb->begin(), instEnd = bb->end(); instIter != instEnd; ++instIter)
1166     {
1167         bool canDoAccSub = true;
1168         G4_INST* inst = *instIter;
1169 
1170         if (inst->defAcc())
1171         {
1172             // skip ahead till its single use
1173             // we should only have single def/use acc at this point, so any use would
1174             // kill the def
1175             auto iter = instIter;
1176             auto useIter = std::find_if(++iter, instEnd, [](G4_INST* inst) { return inst->useAcc(); });
1177             if (useIter == instEnd)
1178             {
1179                 return;
1180             }
1181             instIter = --useIter; // start at the use inst next time
1182             continue;
1183         }
1184 
1185         int lastUseId = 0;
1186         bool mustBeAcc0 = false; //ignored
1187         bool isAllFloat = false;
1188         int bundleC = 0;
1189         int bankC = 0;
1190         int suppression = 0;
1191         if (!isAccCandidate(inst, lastUseId, mustBeAcc0, isAllFloat, suppression, bundleC, bankC, nullptr))
1192         {
1193             continue;
1194         }
1195 
1196         // don't attempt acc sub if def and last use are too far apart
1197         // this is a crude way to avoid a long running life range from blocking
1198         // other acc sub opportunities
1199         const int accWindow = 25;
1200         if (lastUseId == 0 || lastUseId - inst->getLocalId() > accWindow)
1201         {
1202             continue;
1203         }
1204 
1205         // check for intervening acc usage between inst and its last use
1206         auto subIter = instIter;
1207         ++subIter;
1208         for (int instId = inst->getLocalId() + 1; instId != lastUseId; ++subIter, ++instId)
1209         {
1210             G4_INST* anInst = *subIter;
1211             if (anInst->useAcc() || anInst->mayExpandToAccMacro())
1212             {
1213                 canDoAccSub = false;
1214                 break;
1215             }
1216         }
1217 
1218         if (!canDoAccSub)
1219         {
1220             continue;
1221         }
1222         else
1223         {
1224             replaceDstWithAcc(inst, 0);
1225             // advance iter to the last use of the acc
1226             instIter = subIter;
1227             --instIter;
1228 
1229             numAccSubDef++;
1230             numAccSubUse += (int)inst->use_size();
1231 
1232 #if 0
1233             std::cout << "Acc sub def inst: \n";
1234             inst->emit(std::cout);
1235             std::cout << "[" << inst->getLocalId() << "]\n";
1236             std::cout << "Uses:\n";
1237             for (auto&& use : inst->useInstList)
1238             {
1239                 std::cout << "\t";
1240                 use.first->emit(std::cout);
1241                 std::cout << "[" << use.first->getLocalId() << "]\n";
1242             }
1243 #endif
1244         }
1245     }
1246 }
1247