/*========================== begin_copyright_notice ============================

Copyright (C) 2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "SplitAlignedScalars.h"
#include "G4_Opcode.h"

using namespace vISA;
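
// SplitAlignedScalars replaces GRF-aligned scalar dcls with temporaries whose
// sub-register alignment is relaxed (see run() below), the intent being to
// reduce GRF fragmentation during register allocation. Wherever an
// instruction still needs the aligned form, a fresh GRF-aligned temporary is
// used for that operand and a scalar mov is inserted to copy between it and
// the replacement dcl.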

bool SplitAlignedScalars::canReplaceDst(G4_INST* inst)
{
    // acc has special rules, so skip optimizing instructions using acc
    if (inst->useAcc())
        return false;

    // whitelist of supported instructions
    if (inst->isMov() ||
        inst->isMath() ||
        inst->isArithmetic() ||
        inst->isLogic() ||
        inst->isCompare() ||
        inst->isPseudoKill())
    {
        return true;
    }

    return false;
}

bool SplitAlignedScalars::canReplaceSrc(G4_INST* inst, unsigned int idx)
{
    auto opcode = inst->opcode();

    if (inst->useAcc())
        return false;

    if (inst->isMov() ||
        inst->isMath() ||
        inst->isArithmetic() ||
        inst->isLogic() ||
        inst->isCompare() ||
        opcode == G4_mad ||
        inst->isPseudoUse())
    {
        return true;
    }

    return false;
}

std::vector<G4_Declare*> SplitAlignedScalars::gatherCandidates()
{
    std::vector<G4_Declare*> candidates;

    unsigned int lexId = 0;
    for (auto bb : kernel.fg)
    {
        for (auto inst : *bb)
        {
            inst->setLexicalId(lexId++);

            auto dst = inst->getDst();
            if (dst && dst->isDstRegRegion())
            {
                auto dstTopDcl = dst->getTopDcl();
                if (dstTopDcl &&
                    isDclCandidate(dstTopDcl))
                {
                    auto& Data = dclData[dstTopDcl];
                    Data.defs.push_back(std::pair(bb, inst));
                    if (Data.firstDef == 0)
                        Data.firstDef = lexId;

                    // disallow cases where scalar def has a predicate
                    if (inst->getPredicate())
                    {
                        Data.allowed = false;
                    }

                    // Disallow splitting any variable if it's used as send
                    // dst because adding a copy after the send causes a back-to-back
                    // dependency. If the scheduler cannot hide it, it can lead to
                    // a performance penalty.
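                    // Illustrative (hypothetical) pattern this check avoids:
                    //   send (8) V10 ...                     // def by send
                    //   mov (1)  TMP<1>:d V10(0,0)<0;1,0>:d  // copy right after send
                    // The mov consumes the send result immediately, creating a
                    // dependency the scheduler may not be able to hide.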
                    if (inst->isSend())
                    {
                        Data.allowed = false;
                    }

                    if (dst->getTypeSize() != dstTopDcl->getByteSize())
                    {
                        // we require that the dst opnd writes the complete topdcl
                        Data.allowed = false;
                    }

                    if (dst->getTypeSize() < 4)
                    {
                        // byte, word, half-float types may have many hw restrictions
                        // and these are usually not set as GRF aligned
                        Data.allowed = false;
                    }

                    auto dstDcl = dst->asDstRegRegion()->getBase()->asRegVar()->getDeclare();
                    if (dstDcl->getAliasDeclare())
                    {
                        // disallow case where topdcl is scalar, but alias dcl
                        // is not a scalar as it may be smaller in size, e.g.
                        // topdcl may be :uq and alias may be of type :ud.
                        if (dstDcl->getByteSize() != dstTopDcl->getByteSize())
                            Data.allowed = false;
                    }

                    if (dst->getHorzStride() != 1)
                    {
                        // dst hstride != 1 usually exists to accommodate some
                        // alignment restriction, so skip such dsts
                        Data.allowed = false;
                    }

                    if (!inst->isSend())
                    {
                        // check whether dst type size != src type size;
                        // disallow the optimization if dst type is different
                        // from src type as that entails alignment requirements
                        // that this pass may not honor.
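                        // (Illustrative example: a mov with a :q dst and a :d
                        // src mixes 8-byte and 4-byte types and would be
                        // rejected here.)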
                        for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
                        {
                            auto src = inst->getSrc(i);
                            if (!src || !src->isSrcRegRegion())
                                continue;
                            if (dst->getTypeSize() != src->getTypeSize())
                                Data.allowed = false;
                        }
                    }
                }
            }

            for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
            {
                auto src = inst->getSrc(i);
                if (src && src->isSrcRegRegion())
                {
                    auto srcTopDcl = src->getTopDcl();
                    if (srcTopDcl &&
                        isDclCandidate(srcTopDcl))
                    {
                        auto& Data = dclData[srcTopDcl];
                        Data.uses.push_back(std::make_tuple(bb, inst, i));
                        if (Data.lastUse < lexId)
                            Data.lastUse = lexId;

                        if (!inst->isSend())
                        {
                            // mixed types have alignment requirements
                            if (dst && dst->getTypeSize() != src->getTypeSize())
                                Data.allowed = false;
                        }
                    }
                }
            }
        }
    }

    for (auto dcl : kernel.Declares)
    {
        if (dcl != dcl->getRootDeclare())
            continue;
        auto dclDataIt = dclData.find(dcl);
        if (dclDataIt != dclData.end())
        {
            if ((*dclDataIt).second.allowed)
                candidates.push_back(dcl);
        }
    }

    return candidates;
}

// return number of movs needed when applying the optimization on dcl.
// this method returns an estimate of dynamic movs by taking loops into
// account; each loop nesting level is assumed to run a constant number of
// iterations (EstimatedLoopTripCount).
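// Worked example (illustrative numbers only, assuming EstimatedLoopTripCount
// is 4): a def or use that cannot be replaced in place and that sits in a
// basic block at loop nesting level 2 contributes 2 * 4 = 8 movs to the
// estimate, whereas the same def or use outside any loop contributes 1 mov.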
unsigned int SplitAlignedScalars::computeNumMovs(G4_Declare* dcl)
{
    unsigned int numMovsNeeded = 0;
    auto& Data = dclData[dcl];
    for (auto def : Data.defs)
    {
        if (!canReplaceDst(def.second))
        {
            auto bb = def.first;
            auto innerMostLoop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (innerMostLoop)
                numMovsNeeded += innerMostLoop->getNestingLevel() * EstimatedLoopTripCount;
            else
                numMovsNeeded++;
        }
    }
    for (auto use : Data.uses)
    {
        if (!canReplaceSrc(std::get<1>(use), std::get<2>(use)))
        {
            auto bb = std::get<0>(use);
            auto innerMostLoop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (innerMostLoop)
                numMovsNeeded += innerMostLoop->getNestingLevel() * EstimatedLoopTripCount;
            else
                numMovsNeeded++;
        }
    }

    return numMovsNeeded;
}

void SplitAlignedScalars::pruneCandidates(std::vector<G4_Declare*>& candidates)
{
    // This method is a cost function that decides which aligned scalar variables from the
    // candidates list get split. It returns the list of final candidates in the passed-by-ref
    // argument.
    // We first sort candidates in descending order of spill cost.
    // Next, we determine the number of movs needed if a given candidate is split. This is an
    // estimate that takes loop nesting levels into account. If the accumulated mov count is
    // below a threshold then splitting is allowed. Note that we always split a candidate if it
    // was spilled by the current RA iteration or if the variable was marked as callee saved
    // (in case it crossed an fcall boundary).
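    // Worked example (hypothetical numbers): with an estimated dynamic instruction
    // count of 1000 and BloatAllowed set to 0.05, roughly 50 extra movs may be added
    // across all split candidates; spilled or callee-save-biased candidates (and
    // candidates needing no movs) are accepted regardless of this budget.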

    auto compareSpillCost = [&](G4_Declare* dcl1, G4_Declare* dcl2)
    {
        auto it1 = dclSpillCost.find(dcl1);
        auto it2 = dclSpillCost.find(dcl2);

        float cost1 = 0, cost2 = 0;
        if (it1 != dclSpillCost.end())
            cost1 = (*it1).second;
        if (it2 != dclSpillCost.end())
            cost2 = (*it2).second;

        return cost1 < cost2;
    };

    std::list<G4_Declare*> candidateList;
    std::for_each(candidates.begin(), candidates.end(), [&](G4_Declare* dcl) { candidateList.push_back(dcl); });

    // First re-order candidates based on spill cost in descending order
    candidateList.sort(compareSpillCost);
    std::reverse(candidateList.begin(), candidateList.end());

    for (auto it = candidateList.begin(); it != candidateList.end();)
    {
        auto dcl = *it;
        auto dclDataIt = dclData.find(dcl);
        bool erase = true;
        if (dclDataIt != dclData.end())
        {
            auto& Data = dclData[dcl];
            if (heuristic(*it, Data))
                erase = false;
        }
        if (erase)
        {
            it = candidateList.erase(it);
            continue;
        }
        ++it;
    }

    auto estimateInstCount = [&]()
    {
        auto instCount = 0;
        for (auto bb : kernel.fg.getBBList())
        {
            auto loop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (loop)
                instCount += (loop->getNestingLevel() * EstimatedLoopTripCount) * bb->size();
            else
                instCount += bb->size();
        }
        return instCount;
    };

    candidates.clear();
    unsigned int totalMovsNeeded = 0;
    unsigned int estimatedInstCount = estimateInstCount();
    for (auto candidate : candidateList)
    {
        bool isCandidate = false;
        auto numMovsNeeded = computeNumMovs(candidate);

        // Allow any candidate that was marked as spilled, that needs no movs, or
        // that was marked as requiring callee-save allocation.
        if (spilledDclSet.find(candidate) != spilledDclSet.end() ||
            calleeSaveBiased.find(candidate) != calleeSaveBiased.end() ||
            numMovsNeeded == 0)
            isCandidate = true;
        else
        {
            // Mark as a candidate for splitting only if doing so doesn't increase
            // the mov count above a set threshold.
            if ((totalMovsNeeded + numMovsNeeded) < (unsigned int)(((float)estimatedInstCount * BloatAllowed)))
                isCandidate = true;
        }

        if (isCandidate)
        {
            candidates.push_back(candidate);
            totalMovsNeeded += numMovsNeeded;
        }
    }
}

bool SplitAlignedScalars::isDclCandidate(G4_Declare* dcl)
{
    if (dcl->getRegFile() == G4_RegFileKind::G4_GRF &&
        dcl->getNumElems() == 1 &&
        !dcl->getAddressed() &&
        !dcl->getIsPartialDcl() &&
        !dcl->getRegVar()->isPhyRegAssigned() &&
        !dcl->getAliasDeclare() &&
        !dcl->isInput() &&
        !dcl->isOutput() &&
        !dcl->isPayloadLiveOut() &&
        !dcl->isDoNotSpill() &&
        gra.getSubRegAlign(dcl) == GRFALIGN)
        return true;
    return false;
}

bool SplitAlignedScalars::heuristic(G4_Declare* dcl, Data& d)
{
    if (d.getDUMaxDist() < MinOptDist)
        return false;

    if (!d.allowed)
    {
        return false;
    }

    return true;
}

// T is either G4_SrcRegRegion or G4_DstRegRegion
template<class T>
G4_Declare* SplitAlignedScalars::getDclForRgn(T* rgn, G4_Declare* newTopDcl)
{
    // newTopDcl is the replacement dcl computed by this pass to replace rgn.
    // If rgn's dcl is not aliased then return newTopDcl as the new rgn can
    // use it directly.
    // If rgn's dcl is aliased, create a new alias dcl based off newTopDcl
    // with the correct offset and return it.
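    // (Hypothetical example: if rgn's base dcl is a :ud alias covering part of
    // a :uq top dcl, a fresh :ud temp is created and aliased at the same byte
    // offset into newTopDcl, so the region keeps its original type and offset.)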
    auto rgnDcl = rgn->getBase()->asRegVar()->getDeclare();

    if (rgn->getTopDcl() == rgnDcl)
        return newTopDcl;

    auto newAliasDcl = kernel.fg.builder->createTempVar(rgnDcl->getNumElems(),
        rgnDcl->getElemType(), rgnDcl->getSubRegAlign());
    newAliasDcl->setAliasDeclare(newTopDcl, rgnDcl->getOffsetFromBase());

    return newAliasDcl;
}

void SplitAlignedScalars::run()
{
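    // Return the unaligned replacement dcl for oldDcl, creating it lazily on
    // first request (alignment is relaxed to G4_SubReg_Align::Any).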
    auto getNewDcl = [&](G4_Declare* oldDcl)
    {
        auto newTopDcl = oldNewDcls[oldDcl];
        if (newTopDcl == nullptr)
        {
            newTopDcl = kernel.fg.builder->createTempVar(oldDcl->getNumElems(),
                oldDcl->getElemType(), G4_SubReg_Align::Any);
            oldNewDcls[oldDcl] = newTopDcl;
        }
        return newTopDcl;
    };

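    // Choose the (execSize, type) pair used for the sync movs: 64-bit values
    // are copied as two :ud elements when the platform lacks native int64 movs.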
    auto getTypeExecSize = [&](G4_Type type)
    {
        if (TypeSize(type) < 8)
            return std::make_tuple(1, type);

        if (kernel.fg.builder->noInt64())
            return std::make_tuple(2, Type_UD);

        return std::make_tuple(1, Type_UQ);
    };

    auto candidates = gatherCandidates();

    pruneCandidates(candidates);

    for (auto dcl : candidates)
    {
        auto dclDataIt = dclData.find(dcl);
        if (dclDataIt != dclData.end())
        {
            oldNewDcls[dcl] = nullptr;
            numDclsReplaced++;
        }
    }

    for (auto bb : kernel.fg)
    {
        for (auto instIt = bb->begin(), instItEnd = bb->end(); instIt != instItEnd;)
        {
            auto newInstIt = instIt;
            auto inst = (*instIt);

            auto dst = inst->getDst();
            if (dst &&
                oldNewDcls.find(dst->getTopDcl()) != oldNewDcls.end())
            {
                auto oldTopDcl = dst->getTopDcl();
                auto newTopDcl = getNewDcl(oldTopDcl);
                auto newDcl = getDclForRgn(dst, newTopDcl);

                if (canReplaceDst(inst))
                {
                    auto newDstRgn = kernel.fg.builder->createDst(newDcl->getRegVar(), dst->getRegOff(), dst->getSubRegOff(),
                        dst->getHorzStride(), dst->getType());
                    inst->setDest(newDstRgn);
                }
                else
                {
                    // found an instruction whose dst has to stay GRF aligned, so
                    // write to a fresh GRF-aligned tmp here and insert a copy into
                    // the new (unaligned) dcl right after
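                    // Sketch of the rewrite (illustrative operand syntax):
                    //   before:  op  (1) OLD(0,0)<1>:d ...
                    //   after:   op  (1) ALIGNED_TMP(0,0)<1>:d ...
                    //            (p) mov (1) NEW(0,0)<1>:d ALIGNED_TMP(0,0)<0;1,0>:d {NoMask}
                    // where NEW is the relaxed-alignment replacement for OLD and the
                    // mov inherits the predicate of the original instruction, if any.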
                    auto newAlignedTmpTopDcl = kernel.fg.builder->createTempVar(oldTopDcl->getNumElems(), oldTopDcl->getElemType(),
                        oldTopDcl->getSubRegAlign());
                    newAlignedTmpTopDcl->copyAlign(oldTopDcl);
                    auto newAlignedVar = getDclForRgn(dst, newAlignedTmpTopDcl);
                    auto dstRgn = kernel.fg.builder->createDst(newAlignedVar->getRegVar(), dst->getRegOff(), dst->getSubRegOff(),
                        dst->getHorzStride(), dst->getType());
                    inst->setDest(dstRgn);

                    // emit a copy to move the result into the new non-aligned scalar
                    unsigned int execSize = 1;
                    G4_Type typeToUse = Type_UD;
                    std::tie(execSize, typeToUse) = getTypeExecSize(dst->getType());

                    auto src = kernel.fg.builder->createSrc(dstRgn->getBase(), dstRgn->getRegOff(),
                        dstRgn->getSubRegOff(), execSize == g4::SIMD1 ? kernel.fg.builder->getRegionScalar() :
                        kernel.fg.builder->getRegionStride1(), typeToUse);
                    auto dstRgnOfCopy = kernel.fg.builder->createDst(newDcl->getRegVar(),
                        dst->getRegOff(), dst->getSubRegOff(), dst->getHorzStride(), typeToUse);

                    G4_Predicate* dupPred = nullptr;
                    if (inst->getPredicate())
                        dupPred = kernel.fg.builder->createPredicate(*inst->getPredicate());
                    auto newInst = kernel.fg.builder->createInternalInst(dupPred, G4_mov, nullptr,
                        g4::NOSAT, G4_ExecSize(execSize), dstRgnOfCopy, src, nullptr, InstOpt_WriteEnable);
                    newInstIt = bb->insertAfter(instIt, newInst);

                    gra.addNoRemat(newInst);

                    numMovsAdded++;
                }
                changesMade = true;
            }

            for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
            {
                auto src = inst->getSrc(i);
                if (!src || !src->isSrcRegRegion())
                    continue;

                auto srcRgn = src->asSrcRegRegion();

                if (oldNewDcls.find(srcRgn->getTopDcl()) != oldNewDcls.end())
                {
                    auto oldTopDcl = srcRgn->getTopDcl();
                    auto newTopDcl = getNewDcl(oldTopDcl);
                    auto newDcl = getDclForRgn(srcRgn, newTopDcl);

                    if (canReplaceSrc(inst, i))
                    {
                        auto newSrcRgn = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(),
                            srcRgn->getRegAccess(), newDcl->getRegVar(),
                            srcRgn->getRegOff(), srcRgn->getSubRegOff(), srcRgn->getRegion(), srcRgn->getType());
                        inst->setSrc(newSrcRgn, i);
                    }
                    else
                    {
                        // this use requires the aligned form; create a new aligned tmp
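                        // Sketch of the rewrite (illustrative operand syntax):
                        //   mov (1) ALIGNED_TMP(0,0)<1>:ud NEW(0,0)<0;1,0>:ud {NoMask}
                        //   <original inst> ... ALIGNED_TMP ...
                        // where NEW is the relaxed-alignment replacement dcl and the
                        // mov is inserted right before the original instruction.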
                        auto newAlignedTmpTopDcl = kernel.fg.builder->createTempVar(oldTopDcl->getNumElems(), oldTopDcl->getElemType(),
                            oldTopDcl->getSubRegAlign());
                        newAlignedTmpTopDcl->copyAlign(oldTopDcl);

                        unsigned int execSize = 1;
                        G4_Type typeToUse = Type_UD;
                        std::tie(execSize, typeToUse) = getTypeExecSize(srcRgn->getType());

                        // copy the value (now living in newTopDcl) into newAlignedTmpTopDcl
                        auto tmpDst = kernel.fg.builder->createDst(newAlignedTmpTopDcl->getRegVar(), typeToUse);
                        auto src = kernel.fg.builder->createSrc(newTopDcl->getRegVar(), 0, 0,
                            execSize == 1 ? kernel.fg.builder->getRegionScalar() : kernel.fg.builder->getRegionStride1(),
                            typeToUse);
                        auto copy = kernel.fg.builder->createMov(G4_ExecSize(execSize), tmpDst, src, InstOpt_WriteEnable, false);

                        // now create the src operand out of tmpDst
                        auto dclToUse = getDclForRgn(srcRgn, newAlignedTmpTopDcl);

                        auto newAlignedSrc = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(), srcRgn->getRegAccess(),
                            dclToUse->getRegVar(), srcRgn->getRegOff(), srcRgn->getSubRegOff(), srcRgn->getRegion(), srcRgn->getType());
                        inst->setSrc(newAlignedSrc, i);
                        bb->insertBefore(instIt, copy);

                        // this copy shouldn't be rematerialized
                        gra.addNoRemat(copy);

                        numMovsAdded++;
                    }
                    changesMade = true;
                }
            }

            instIt = ++newInstIt;
        }
    }
}

void SplitAlignedScalars::dump(std::ostream& os)
{
    os << "# GRF aligned scalar dcls replaced: " << numDclsReplaced << std::endl;
    os << "# movs added: " << numMovsAdded << std::endl;
}