/*========================== begin_copyright_notice ============================

Copyright (C) 2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "SplitAlignedScalars.h"
#include "G4_Opcode.h"

using namespace vISA;

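// Return true if inst can have its dst operand rewritten in place to use a
// replacement (non-GRF-aligned) variable, i.e., the instruction imposes no
// special alignment rules on its dst.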
bool SplitAlignedScalars::canReplaceDst(G4_INST* inst)
{
    // acc has special rules, so skip optimizing instructions using acc
    if (inst->useAcc())
        return false;

    // whitelist of supported instructions
    if (inst->isMov() ||
        inst->isMath() ||
        inst->isArithmetic() ||
        inst->isLogic() ||
        inst->isCompare() ||
        inst->isPseudoKill())
    {
        return true;
    }

    return false;
}

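// Return true if the src operand at index idx of inst can be rewritten in
// place to use a replacement variable.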
bool SplitAlignedScalars::canReplaceSrc(G4_INST* inst, unsigned int idx)
{
    auto opcode = inst->opcode();

    if (inst->useAcc())
        return false;

    if (inst->isMov() ||
        inst->isMath() ||
        inst->isArithmetic() ||
        inst->isLogic() ||
        inst->isCompare() ||
        opcode == G4_mad ||
        inst->isPseudoUse())
    {
        return true;
    }

    return false;
}

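// Scan all instructions once, recording defs and uses of every GRF-aligned
// scalar dcl in dclData. A dcl is marked as disallowed when any def or use
// appears in a context this pass cannot handle (predicated def, send dst,
// partial write, mixed operand types, etc.).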
std::vector<G4_Declare*> SplitAlignedScalars::gatherCandidates()
{
    std::vector<G4_Declare*> candidates;

    unsigned int lexId = 0;
    for (auto bb : kernel.fg)
    {
        for (auto inst : *bb)
        {
            inst->setLexicalId(lexId++);

            auto dst = inst->getDst();
            if (dst && dst->isDstRegRegion())
            {
                auto dstTopDcl = dst->getTopDcl();
                if (dstTopDcl &&
                    isDclCandidate(dstTopDcl))
                {
                    auto& Data = dclData[dstTopDcl];
                    Data.defs.push_back(std::pair(bb, inst));
                    if (Data.firstDef == 0)
                        Data.firstDef = lexId;

                    // disallow cases where scalar def has predicate
                    if (inst->getPredicate())
                    {
                        Data.allowed = false;
                    }

                    // Disallow splitting any variable if it's used as send
                    // dst because adding a copy after send causes a back-to-back
                    // dependency. If scheduler cannot hide it, it can lead to
                    // performance penalty.
                    if (inst->isSend())
                    {
                        Data.allowed = false;
                    }

                    if (dst->getTypeSize() != dstTopDcl->getByteSize())
                    {
                        // we require that dst opnd writes complete topdcl
                        Data.allowed = false;
                    }

                    if (dst->getTypeSize() < 4)
                    {
                        // byte, word, half floats may have many hw restrictions
                        // and these are usually not set as GRF aligned
                        Data.allowed = false;
                    }

                    auto dstDcl = dst->asDstRegRegion()->getBase()->asRegVar()->getDeclare();
                    if (dstDcl->getAliasDeclare())
                    {
                        // disallow the case where topdcl is scalar but the alias
                        // dcl is not, as the alias may be smaller in size; e.g.,
                        // topdcl may be :uq and the alias may be of type :ud
                        if (dstDcl->getByteSize() != dstTopDcl->getByteSize())
                            Data.allowed = false;
                    }

                    if (dst->getHorzStride() != 1)
                    {
                        // dst hstride != 1 usually exists to accommodate an
                        // alignment restriction, so don't optimize such dcls
                        Data.allowed = false;
                    }

                    if (!inst->isSend())
                    {
                        // check whether dst type size != src type size;
                        // disallow the optimization if the dst type differs
                        // from the src type, as that entails alignment
                        // requirements that this pass may not honor
                        for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
                        {
                            auto src = inst->getSrc(i);
                            if (!src->isSrcRegRegion())
                                continue;
                            if (dst->getTypeSize() != src->getTypeSize())
                                Data.allowed = false;
                        }
                    }
                }
            }

            for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
            {
                auto src = inst->getSrc(i);
                if (src && src->isSrcRegRegion())
                {
                    auto srcTopDcl = src->getTopDcl();
                    if (srcTopDcl &&
                        isDclCandidate(srcTopDcl))
                    {
                        auto& Data = dclData[srcTopDcl];
                        Data.uses.push_back(std::make_tuple(bb, inst, i));
                        if (Data.lastUse < lexId)
                            Data.lastUse = lexId;

                        if (!inst->isSend())
                        {
                            // mixed types have alignment requirements;
                            // guard against instructions with no dst
                            if (dst && dst->getTypeSize() != src->getTypeSize())
                                Data.allowed = false;
                        }
                    }
                }
            }
        }
    }

    for (auto dcl : kernel.Declares)
    {
        if (dcl != dcl->getRootDeclare())
            continue;
        auto dclDataIt = dclData.find(dcl);
        if (dclDataIt != dclData.end())
        {
            if ((*dclDataIt).second.allowed)
                candidates.push_back(dcl);
        }
    }

    return candidates;
}

// Return the number of movs needed when applying the optimization on dcl.
// This method returns an estimate of dynamic movs by considering loops:
// each loop is assumed to run a constant number of iterations
// (EstimatedLoopTripCount) per nesting level.
unsigned int SplitAlignedScalars::computeNumMovs(G4_Declare* dcl)
{
    unsigned int numMovsNeeded = 0;
    auto& Data = dclData[dcl];
    for (auto def : Data.defs)
    {
        if (!canReplaceDst(def.second))
        {
            auto bb = def.first;
            auto innerMostLoop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (innerMostLoop)
                numMovsNeeded += innerMostLoop->getNestingLevel() * EstimatedLoopTripCount;
            else
                numMovsNeeded++;
        }
    }
    for (auto use : Data.uses)
    {
        if (!canReplaceSrc(std::get<1>(use), std::get<2>(use)))
        {
            auto bb = std::get<0>(use);
            auto innerMostLoop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (innerMostLoop)
                numMovsNeeded += innerMostLoop->getNestingLevel() * EstimatedLoopTripCount;
            else
                numMovsNeeded++;
        }
    }

    return numMovsNeeded;
}

void SplitAlignedScalars::pruneCandidates(std::vector<G4_Declare*>& candidates)
{
    // This method is a cost function that decides which aligned scalar variables from
    // the candidates list get split. It returns the list of final candidates in the
    // passed-by-reference argument.
    // We first sort candidates in descending order of spill cost.
    // Next, we estimate the number of movs needed if a given candidate is split, taking
    // loop nesting levels into account. If the accumulated mov count is below a threshold
    // then splitting is allowed. Note that we always split a candidate if it was spilled
    // by the current RA iteration or if the variable was marked as callee saved (in case
    // it crossed an fcall boundary).

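    // ascending comparator on RA spill cost; dcls without a recorded cost
    // are treated as having zero cost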
    auto compareSpillCost = [&](G4_Declare* dcl1, G4_Declare* dcl2)
    {
        auto it1 = dclSpillCost.find(dcl1);
        auto it2 = dclSpillCost.find(dcl2);

        float cost1 = 0, cost2 = 0;
        if (it1 != dclSpillCost.end())
            cost1 = (*it1).second;
        if (it2 != dclSpillCost.end())
            cost2 = (*it2).second;

        return cost1 < cost2;
    };

    std::list<G4_Declare*> candidateList;
    std::for_each(candidates.begin(), candidates.end(), [&](G4_Declare* dcl) { candidateList.push_back(dcl); });

    // First re-order candidates based on spill cost in descending order
    candidateList.sort(compareSpillCost);
    std::reverse(candidateList.begin(), candidateList.end());

    for (auto it = candidateList.begin(); it != candidateList.end();)
    {
        auto dcl = *it;
        auto dclDataIt = dclData.find(dcl);
        bool erase = true;
        if (dclDataIt != dclData.end())
        {
            auto& Data = dclData[dcl];
            if (heuristic(*it, Data))
                erase = false;
        }
        if (erase)
        {
            it = candidateList.erase(it);
            continue;
        }
        ++it;
    }

    auto estimateInstCount = [&]()
    {
        auto instCount = 0;
        for (auto bb : kernel.fg.getBBList())
        {
            auto loop = kernel.fg.getLoops().getInnerMostLoop(bb);
            if (loop)
                instCount += (loop->getNestingLevel() * EstimatedLoopTripCount) * bb->size();
            else
                instCount += bb->size();
        }
        return instCount;
    };

    candidates.clear();
    unsigned int totalMovsNeeded = 0;
    unsigned int estimatedInstCount = estimateInstCount();
    for (auto candidate : candidateList)
    {
        bool isCandidate = false;
        auto numMovsNeeded = computeNumMovs(candidate);

        // Allow any candidate that was marked as spilled, that needs no movs,
        // or that was marked as requiring callee-save allocation.
        if (spilledDclSet.find(candidate) != spilledDclSet.end() ||
            calleeSaveBiased.find(candidate) != calleeSaveBiased.end() ||
            numMovsNeeded == 0)
            isCandidate = true;
        else
        {
            // Mark as a candidate for splitting only if doing so doesn't
            // increase the mov count above a set threshold.
            if ((totalMovsNeeded + numMovsNeeded) < (unsigned int)(((float)estimatedInstCount * BloatAllowed)))
                isCandidate = true;
        }

        if (isCandidate)
        {
            candidates.push_back(candidate);
            totalMovsNeeded += numMovsNeeded;
        }
    }
}

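// A dcl is a candidate if it is a GRF-aligned scalar GRF variable that is
// not address-taken, aliased, pre-assigned a physical register, an
// input/output, or otherwise pinned by earlier phases.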
bool SplitAlignedScalars::isDclCandidate(G4_Declare* dcl)
311 {
312 if (dcl->getRegFile() == G4_RegFileKind::G4_GRF &&
313 dcl->getNumElems() == 1 &&
314 !dcl->getAddressed() &&
315 !dcl->getIsPartialDcl() &&
316 !dcl->getRegVar()->isPhyRegAssigned() &&
317 !dcl->getAliasDeclare() &&
318 !dcl->isInput() &&
319 !dcl->isOutput() &&
320 !dcl->isPayloadLiveOut() &&
321 !dcl->isDoNotSpill() &&
322 gra.getSubRegAlign(dcl) == GRFALIGN)
323 return true;
324 return false;
325 }
326
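// Split a candidate only if its def-use distance is large enough for the
// split to plausibly help register allocation.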
bool SplitAlignedScalars::heuristic(G4_Declare* dcl, Data& d)
{
    if (d.getDUMaxDist() < MinOptDist)
        return false;

    if (!d.allowed)
    {
        return false;
    }

    return true;
}

// T is either G4_SrcRegRegion or G4_DstRegRegion
template<class T>
G4_Declare* SplitAlignedScalars::getDclForRgn(T* rgn, G4_Declare* newTopDcl)
{
    // newTopDcl is the replacement dcl computed by this pass to replace rgn.
    // If rgn's dcl is not aliased then return newTopDcl, as the new rgn can
    // use it directly.
    // If rgn's dcl is aliased, create a new alias dcl based off newTopDcl
    // with the correct offset and return it.
    auto rgnDcl = rgn->getBase()->asRegVar()->getDeclare();

    if (rgn->getTopDcl() == rgnDcl)
        return newTopDcl;

    auto newAliasDcl = kernel.fg.builder->createTempVar(rgnDcl->getNumElems(),
        rgnDcl->getElemType(), rgnDcl->getSubRegAlign());
    newAliasDcl->setAliasDeclare(newTopDcl, rgnDcl->getOffsetFromBase());

    return newAliasDcl;
}

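// Main entry point: gather and prune candidates, then rewrite each def and
// use of every surviving candidate. Operands that cannot be replaced
// directly keep a GRF-aligned temporary that is bridged to the replacement
// variable with a mov.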
void SplitAlignedScalars::run()
{
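    // lazily create (and cache) the replacement dcl; it uses relaxed (Any)
    // sub-register alignment, which is the point of this pass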
    auto getNewDcl = [&](G4_Declare* oldDcl)
    {
        auto newTopDcl = oldNewDcls[oldDcl];
        if (newTopDcl == nullptr)
        {
            newTopDcl = kernel.fg.builder->createTempVar(oldDcl->getNumElems(),
                oldDcl->getElemType(), G4_SubReg_Align::Any);
            oldNewDcls[oldDcl] = newTopDcl;
        }
        return newTopDcl;
    };

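    // map a type to the (execSize, type) pair used for copy movs; 64-bit
    // types are copied as two UDs when the platform lacks int64 support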
    auto getTypeExecSize = [&](G4_Type type)
    {
        if (TypeSize(type) < 8)
            return std::make_tuple(1, type);

        if (kernel.fg.builder->noInt64())
            return std::make_tuple(2, Type_UD);

        return std::make_tuple(1, Type_UQ);
    };

    auto candidates = gatherCandidates();

    pruneCandidates(candidates);

    for (auto dcl : candidates)
    {
        auto dclDataIt = dclData.find(dcl);
        if (dclDataIt != dclData.end())
        {
            oldNewDcls[dcl] = nullptr;
            numDclsReplaced++;
        }
    }

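    // rewrite every def and use of the chosen candidates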
    for (auto bb : kernel.fg)
    {
        for (auto instIt = bb->begin(), instItEnd = bb->end(); instIt != instItEnd;)
        {
            auto newInstIt = instIt;
            auto inst = (*instIt);

            auto dst = inst->getDst();
            if (dst &&
                oldNewDcls.find(dst->getTopDcl()) != oldNewDcls.end())
            {
                auto oldTopDcl = dst->getTopDcl();
                auto newTopDcl = getNewDcl(oldTopDcl);
                auto newDcl = getDclForRgn(dst, newTopDcl);

                if (canReplaceDst(inst))
                {
                    auto newDstRgn = kernel.fg.builder->createDst(newDcl->getRegVar(), dst->getRegOff(), dst->getSubRegOff(),
                        dst->getHorzStride(), dst->getType());
                    inst->setDest(newDstRgn);
                }
                else
                {
                    // found an instruction where dst has to be GRF aligned,
                    // so we keep the old dst but insert a copy into the new dcl
                    auto newAlignedTmpTopDcl = kernel.fg.builder->createTempVar(oldTopDcl->getNumElems(), oldTopDcl->getElemType(),
                        oldTopDcl->getSubRegAlign());
                    newAlignedTmpTopDcl->copyAlign(oldTopDcl);
                    auto newAlignedVar = getDclForRgn(dst, newAlignedTmpTopDcl);
                    auto dstRgn = kernel.fg.builder->createDst(newAlignedVar->getRegVar(), dst->getRegOff(), dst->getSubRegOff(),
                        dst->getHorzStride(), dst->getType());
                    inst->setDest(dstRgn);

                    // emit a copy to store data to the original non-aligned scalar
                    unsigned int execSize = 1;
                    G4_Type typeToUse = Type_UD;
                    std::tie(execSize, typeToUse) = getTypeExecSize(dst->getType());

                    auto src = kernel.fg.builder->createSrc(dstRgn->getBase(), dstRgn->getRegOff(),
                        dstRgn->getSubRegOff(), execSize == g4::SIMD1 ? kernel.fg.builder->getRegionScalar() :
                        kernel.fg.builder->getRegionStride1(), typeToUse);
                    auto dstRgnOfCopy = kernel.fg.builder->createDst(newDcl->getRegVar(),
                        dst->getRegOff(), dst->getSubRegOff(), dst->getHorzStride(), typeToUse);

                    G4_Predicate* dupPred = nullptr;
                    if (inst->getPredicate())
                        dupPred = kernel.fg.builder->createPredicate(*inst->getPredicate());
                    auto newInst = kernel.fg.builder->createInternalInst(dupPred, G4_mov, nullptr,
                        g4::NOSAT, G4_ExecSize(execSize), dstRgnOfCopy, src, nullptr, InstOpt_WriteEnable);
                    newInstIt = bb->insertAfter(instIt, newInst);

                    gra.addNoRemat(newInst);

                    numMovsAdded++;
                }
                changesMade = true;
            }

            for (unsigned int i = 0; i != inst->getNumSrc(); ++i)
            {
                auto src = inst->getSrc(i);
                if (!src || !src->isSrcRegRegion())
                    continue;

                auto srcRgn = src->asSrcRegRegion();

                if (oldNewDcls.find(srcRgn->getTopDcl()) != oldNewDcls.end())
                {
                    auto oldTopDcl = srcRgn->getTopDcl();
                    auto newTopDcl = getNewDcl(oldTopDcl);
                    auto newDcl = getDclForRgn(srcRgn, newTopDcl);

                    if (canReplaceSrc(inst, i))
                    {
                        auto newSrcRgn = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(),
                            srcRgn->getRegAccess(), newDcl->getRegVar(),
                            srcRgn->getRegOff(), srcRgn->getSubRegOff(), srcRgn->getRegion(), srcRgn->getType());
                        inst->setSrc(newSrcRgn, i);
                    }
                    else
                    {
                        // create a new aligned tmp
                        auto newAlignedTmpTopDcl = kernel.fg.builder->createTempVar(oldTopDcl->getNumElems(), oldTopDcl->getElemType(),
                            oldTopDcl->getSubRegAlign());
                        newAlignedTmpTopDcl->copyAlign(oldTopDcl);

                        unsigned int execSize = 1;
                        G4_Type typeToUse = Type_UD;
                        std::tie(execSize, typeToUse) = getTypeExecSize(srcRgn->getType());

                        // copy oldDcl into newAlignedTmpTopDcl
                        auto tmpDst = kernel.fg.builder->createDst(newAlignedTmpTopDcl->getRegVar(), typeToUse);
                        auto src = kernel.fg.builder->createSrc(newTopDcl->getRegVar(), 0, 0,
                            execSize == 1 ? kernel.fg.builder->getRegionScalar() : kernel.fg.builder->getRegionStride1(),
                            typeToUse);
                        auto copy = kernel.fg.builder->createMov(G4_ExecSize(execSize), tmpDst, src, InstOpt_WriteEnable, false);

                        // now create src out of tmpDst
                        auto dclToUse = getDclForRgn(srcRgn, newAlignedTmpTopDcl);

                        auto newAlignedSrc = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(), srcRgn->getRegAccess(),
                            dclToUse->getRegVar(), srcRgn->getRegOff(), srcRgn->getSubRegOff(), srcRgn->getRegion(), srcRgn->getType());
                        inst->setSrc(newAlignedSrc, i);
                        bb->insertBefore(instIt, copy);

                        // this copy shouldn't be rematerialized
                        gra.addNoRemat(copy);

                        numMovsAdded++;
                    }
                    changesMade = true;
                }
            }

            instIt = ++newInstIt;
        }
    }
}

void SplitAlignedScalars::dump(std::ostream& os)
{
    os << "# GRF aligned scalar dcls replaced: " << numDclsReplaced << std::endl;
    os << "# movs added: " << numMovsAdded << std::endl;
}