1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "SpillManagerGMRF.h"
10 #include "G4_IR.hpp"
11 #include "Mem_Manager.h"
12 #include "FlowGraph.h"
13 #include "GraphColor.h"
14 #include "BuildIR.h"
15 #include "DebugInfo.h"
16
17 #include <cmath>
18 #include <sstream>
19 #include <fstream>
20 #include <unordered_set>
21
22 using namespace vISA;
23
// Configurations

// When this macro is defined, shouldSpillRegister() does not exclude
// address-sensitive variables from spilling (see its #ifndef guard).
#define ADDRESS_SENSITIVE_SPILLS_IMPLEMENTED
// GRF size in dwords and in bytes; both expand to a getGRFSize() call.
#define REG_DWORD_SIZE (getGRFSize() / 4)
#define REG_BYTE_SIZE (getGRFSize())
// NOTE(review): presumably log2 of the scratch-space addressing granularity
// (2^5 = 32 bytes) -- confirm against the users of this macro.
#define SCRATCH_SPACE_ADDRESS_UNIT 5

// Optional switches, disabled by default.
//#define DISABLE_SPILL_MEMORY_COMPRESSION
//#define VERIFY_SPILL_ASSIGNMENTS

// Constant declarations

// Sizes, in bytes, of the memory units used by the spill/fill messages.
static constexpr unsigned DWORD_BYTE_SIZE = 4;
static constexpr unsigned OWORD_BYTE_SIZE = 16;
static constexpr unsigned HWORD_BYTE_SIZE = 32;

// Register / sub-register positions inside the message payloads.
static constexpr unsigned PAYLOAD_INPUT_REG_OFFSET = 0;
static constexpr unsigned PAYLOAD_INPUT_SUBREG_OFFSET = 0;
static constexpr unsigned OWORD_PAYLOAD_SPOFFSET_REG_OFFSET = 0;
static constexpr unsigned OWORD_PAYLOAD_SPOFFSET_SUBREG_OFFSET = 2;
static constexpr unsigned DWORD_PAYLOAD_SPOFFSET_REG_OFFSET = 1;
static constexpr unsigned DWORD_PAYLOAD_SPOFFSET_SUBREG_OFFSET = 0;
static constexpr unsigned OWORD_PAYLOAD_WRITE_REG_OFFSET = 1;
static constexpr unsigned OWORD_PAYLOAD_WRITE_SUBREG_OFFSET = 0;
// dword scatter is always in SIMD8 mode
static constexpr unsigned DWORD_PAYLOAD_WRITE_REG_OFFSET = 2;
static constexpr unsigned DWORD_PAYLOAD_WRITE_SUBREG_OFFSET = 0;

// Minimum / maximum payload header heights.
static constexpr unsigned OWORD_PAYLOAD_HEADER_MIN_HEIGHT = 1;
static constexpr unsigned DWORD_PAYLOAD_HEADER_MIN_HEIGHT = 2;
static constexpr unsigned OWORD_PAYLOAD_HEADER_MAX_HEIGHT = 1;
static constexpr unsigned DWORD_PAYLOAD_HEADER_MAX_HEIGHT = 3;

// Default region parameters.
static constexpr unsigned DEF_HORIZ_STRIDE = 1;
static constexpr unsigned REG_ORIGIN = 0;
static constexpr unsigned SUBREG_ORIGIN = 0;

// "GT" send descriptor fields and message type encodings.
static constexpr unsigned SEND_GT_READ_TYPE_BIT_OFFSET = 13;
static constexpr unsigned SEND_GT_WRITE_TYPE_BIT_OFFSET = 13;
static constexpr unsigned SEND_GT_DESC_DATA_SIZE_BIT_OFFSET = 8;
static constexpr unsigned SEND_GT_OW_READ_TYPE = 0;
static constexpr unsigned SEND_GT_OW_WRITE_TYPE = 8;
static constexpr unsigned SEND_GT_SC_READ_TYPE = 6;
static constexpr unsigned SEND_GT_SC_WRITE_TYPE = 11;
static constexpr unsigned SEND_GT_DP_RD_EX_DESC_IMM = 5;
static constexpr unsigned SEND_GT_DP_SC_RD_EX_DESC_IMM = 4; //scatter reads go to sampler cache
static constexpr unsigned SEND_GT_DP_WR_EX_DESC_IMM = 5;

// "IVB" send descriptor fields and message type encodings.
static constexpr unsigned SEND_IVB_MSG_TYPE_BIT_OFFSET = 14;
static constexpr unsigned SEND_IVB_OW_READ_TYPE = 0;
static constexpr unsigned SEND_IVB_OW_WRITE_TYPE = 8;
static constexpr unsigned SEND_IVB_SC_READ_TYPE = 3;
static constexpr unsigned SEND_IVB_SC_WRITE_TYPE = 11;
static constexpr unsigned SEND_IVB_DP_RD_EX_DESC_IMM = 10; //data cache
static constexpr unsigned SEND_IVB_DP_WR_EX_DESC_IMM = 10; //data cache

// Scratch msg
static constexpr unsigned SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT = 1;
// Name kept as spelled ("CATEORY") because other code may reference it.
static constexpr unsigned SCRATCH_MSG_DESC_CATEORY = 18;
static constexpr unsigned SCRATCH_MSG_DESC_OPERATION_MODE = 17;
static constexpr unsigned SCRATCH_MSG_DESC_CHANNEL_MODE = 16;
static constexpr unsigned SCRATCH_MSG_INVALIDATE_AFTER_READ = 15;
static constexpr unsigned SCRATCH_MSG_DESC_BLOCK_SIZE = 12;


// Clamp an execution size to the maximum of 16 used for send instructions.
#define LIMIT_SEND_EXEC_SIZE(EXEC_SIZE) (((EXEC_SIZE) > 16) ? 16 : (EXEC_SIZE))
#define SPILL_PAYLOAD_HEIGHT_LIMIT 4
88
splice(G4_BB * bb,INST_LIST_ITER iter,INST_LIST & instList,unsigned int CISAOff)89 void splice(G4_BB* bb, INST_LIST_ITER iter, INST_LIST& instList, unsigned int CISAOff)
90 {
91 // Update CISA offset of all instructions in instList before splicing
92 // operation.
93 for (auto inst : instList)
94 {
95 inst->setCISAOff(CISAOff);
96 }
97
98 bb->splice(iter, instList);
99 }
100
101 // spill/fill temps are always GRF-aligned, and are also even/odd aligned
102 // following the original declare's alignment
setNewDclAlignment(GlobalRA & gra,G4_Declare * newDcl,bool evenAlign)103 static void setNewDclAlignment(GlobalRA& gra, G4_Declare* newDcl, bool evenAlign)
104 {
105 newDcl->setSubRegAlign(GRFALIGN);
106 if (evenAlign)
107 {
108 newDcl->setEvenAlign();
109 }
110
111 gra.setSubRegAlign(newDcl, GRFALIGN);
112 gra.setEvenAligned(newDcl, evenAlign);
113 }
114
SpillManagerGRF(GlobalRA & g,unsigned spillAreaOffset,unsigned varIdCount,const LivenessAnalysis * lvInfo,LiveRange ** lrInfo,const Interference * intf,const LR_LIST * spilledLRs,unsigned iterationNo,bool failSafeSpill,unsigned spillRegSize,unsigned indrSpillRegSize,bool enableSpillSpaceCompression,bool useScratchMsg,bool avoidDstSrcOverlap)115 SpillManagerGRF::SpillManagerGRF(
116 GlobalRA& g,
117 unsigned spillAreaOffset,
118 unsigned varIdCount,
119 const LivenessAnalysis* lvInfo,
120 LiveRange** lrInfo,
121 const Interference* intf,
122 const LR_LIST* spilledLRs,
123 unsigned iterationNo,
124 bool failSafeSpill,
125 unsigned spillRegSize,
126 unsigned indrSpillRegSize,
127 bool enableSpillSpaceCompression,
128 bool useScratchMsg,
129 bool avoidDstSrcOverlap)
130 : gra(g)
131 , builder_(g.kernel.fg.builder)
132 , varIdCount_(varIdCount)
133 , latestImplicitVarIdCount_(0)
134 , lvInfo_(lvInfo)
135 , lrInfo_(lrInfo)
136 , spilledLRs_(spilledLRs)
137 , nextSpillOffset_(spillAreaOffset)
138 , iterationNo_(iterationNo)
139 , doSpillSpaceCompression(enableSpillSpaceCompression)
140 , failSafeSpill_(failSafeSpill)
141 , spillIntf_(intf)
142 , mem_(1024)
143 , useScratchMsg_(useScratchMsg)
144 , avoidDstSrcOverlap_(avoidDstSrcOverlap)
145 , refs(g.kernel)
146 {
147 const unsigned size = sizeof(unsigned) * varIdCount;
148 spillRangeCount_ = (unsigned*)allocMem(size);
149 memset(spillRangeCount_, 0, size);
150 fillRangeCount_ = (unsigned*)allocMem(size);
151 memset(fillRangeCount_, 0, size);
152 tmpRangeCount_ = (unsigned*)allocMem(size);
153 memset(tmpRangeCount_, 0, size);
154 msgSpillRangeCount_ = (unsigned*)allocMem(size);
155 memset(msgSpillRangeCount_, 0, size);
156 msgFillRangeCount_ = (unsigned*)allocMem(size);
157 memset(msgFillRangeCount_, 0, size);
158 spillAreaOffset_ = spillAreaOffset;
159 builder_->instList.clear();
160 spillRegStart_ = g.kernel.getNumRegTotal();
161 indrSpillRegStart_ = spillRegStart_;
162 spillRegOffset_ = spillRegStart_;
163 if (failSafeSpill) {
164 bool isStackCall = builder_->usesStack();
165 unsigned int stackCallRegSize = isStackCall ? builder_->kernel.numReservedABIGRF() : 0;
166 indrSpillRegStart_ -= (stackCallRegSize + indrSpillRegSize);
167 spillRegStart_ = indrSpillRegStart_ - spillRegSize;
168 }
169 curInst = nullptr;
170 globalScratchOffset = gra.kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
171 spilledLSLRs_ = nullptr;
172 if (builder_->hasScratchSurface())
173 {
174 builder_->initScratchSurfaceOffset();
175 auto entryBB = builder_->kernel.fg.getEntryBB();
176 auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
177 splice(entryBB, iter, builder_->instList, UNMAPPABLE_VISA_INDEX);
178 }
179
180 // LSC messages are used when:
181 // a. Stack call is used on PVC+,
182 // b. Spill size exceeds what can be represented using hword msg on PVC+
183 useLSCMsg = gra.useLscForSpillFill;
184 useLscNonstackCall = gra.useLscForNonStackCallSpillFill;
185 }
186
SpillManagerGRF(GlobalRA & g,unsigned spillAreaOffset,unsigned varIdCount,const LivenessAnalysis * lvInfo,LSLR_LIST * spilledLSLRs,bool enableSpillSpaceCompression,bool useScratchMsg,bool avoidDstSrcOverlap)187 SpillManagerGRF::SpillManagerGRF(
188 GlobalRA& g,
189 unsigned spillAreaOffset,
190 unsigned varIdCount,
191 const LivenessAnalysis* lvInfo,
192 LSLR_LIST* spilledLSLRs,
193 bool enableSpillSpaceCompression,
194 bool useScratchMsg,
195 bool avoidDstSrcOverlap)
196 : gra(g)
197 , builder_(g.kernel.fg.builder)
198 , varIdCount_(varIdCount)
199 , latestImplicitVarIdCount_(0)
200 , lvInfo_(lvInfo)
201 , spilledLSLRs_(spilledLSLRs)
202 , nextSpillOffset_(spillAreaOffset)
203 , doSpillSpaceCompression(enableSpillSpaceCompression)
204 , failSafeSpill_(false)
205 , mem_(1024)
206 , useScratchMsg_(useScratchMsg)
207 , avoidDstSrcOverlap_(avoidDstSrcOverlap)
208 , refs(g.kernel)
209 {
210 const unsigned size = sizeof(unsigned) * varIdCount;
211 spillRangeCount_ = (unsigned*)allocMem(size);
212 memset(spillRangeCount_, 0, size);
213 fillRangeCount_ = (unsigned*)allocMem(size);
214 memset(fillRangeCount_, 0, size);
215 tmpRangeCount_ = (unsigned*)allocMem(size);
216 memset(tmpRangeCount_, 0, size);
217 msgSpillRangeCount_ = (unsigned*)allocMem(size);
218 memset(msgSpillRangeCount_, 0, size);
219 msgFillRangeCount_ = (unsigned*)allocMem(size);
220 memset(msgFillRangeCount_, 0, size);
221 addrSpillFillRangeCount_ = (unsigned*)allocMem(size);
222 memset(addrSpillFillRangeCount_, 0, size);
223 spillAreaOffset_ = spillAreaOffset;
224 builder_->instList.clear();
225 curInst = NULL;
226 globalScratchOffset = gra.kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
227
228 if (builder_->hasScratchSurface())
229 {
230 builder_->initScratchSurfaceOffset();
231 auto entryBB = builder_->kernel.fg.getEntryBB();
232 auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
233 splice(entryBB, iter, builder_->instList, UNMAPPABLE_VISA_INDEX);
234 }
235 // LSC messages are used when:
236 // a. Stack call is used on PVC+,
237 // b. Spill size exceeds what can be represented using hword msg on PVC+
238 useLSCMsg = gra.useLscForSpillFill;
239 useLscNonstackCall = gra.useLscForNonStackCallSpillFill;
240 }
241
242 // Get the base regvar for the source or destination region.
243 template <class REGION_TYPE>
getRegVar(REGION_TYPE * region) const244 G4_RegVar *SpillManagerGRF::getRegVar(REGION_TYPE * region) const
245 {
246 G4_RegVar * spilledRegVar = (G4_RegVar *) region->getBase();
247 return spilledRegVar;
248 }
249
250 // Get the representative regvar that will be assigned a unique spill
251 // disp and not a relative spill disp.
getReprRegVar(G4_RegVar * regVar) const252 G4_RegVar *SpillManagerGRF::getReprRegVar(G4_RegVar * regVar) const
253 {
254 G4_RegVar * absBase = regVar->getAbsBaseRegVar();
255 if (absBase->isAliased())
256 return getReprRegVar(absBase->getDeclare()->getAliasDeclare()->getRegVar());
257 else
258 return absBase;
259 }
260
261 // Obtain the register file type of the regvar.
getRFType(G4_RegVar * regvar) const262 G4_RegFileKind SpillManagerGRF::getRFType(G4_RegVar * regvar) const
263 {
264 return regvar->getDeclare()->getRegFile();
265 }
266
267 // Obtain the register file type of the region.
268 template <class REGION_TYPE>
getRFType(REGION_TYPE * region) const269 G4_RegFileKind SpillManagerGRF::getRFType(REGION_TYPE * region) const
270 {
271 if (region->getBase()->isRegVar())
272 return getRFType(region->getBase()->asRegVar());
273 else if (region->getBase()->isGreg())
274 return G4_GRF;
275 else
276 return G4_ADDRESS;
277 }
278
279 // Get the byte offset of the origin of the source or destination region.
280 // The row offset component is calculated based on the the parameters of
281 // the corresponding declare directive, while the column offset is calculated
282 // based on the region parameters.
283 template <class REGION_TYPE>
getRegionOriginOffset(REGION_TYPE * region) const284 unsigned SpillManagerGRF::getRegionOriginOffset(REGION_TYPE * region) const
285 {
286 unsigned rowOffset = REG_BYTE_SIZE * region->getRegOff();
287 unsigned columnOffset = region->getSubRegOff() * region->getElemSize();
288 return rowOffset + columnOffset;
289 }
290
291 // Get a GRF aligned mask
grfMask() const292 unsigned SpillManagerGRF::grfMask() const
293 {
294 unsigned mask = 0;
295 mask = (mask - 1);
296 MUST_BE_TRUE(std::log2(numEltPerGRF<Type_UB>()) == (float)((int)(std::log2(numEltPerGRF<Type_UB>()))), "expected integral value");
297 unsigned int bits = (unsigned int)std::log2(numEltPerGRF<Type_UB>());
298 mask = mask << bits;
299 return mask;
300 }
301
302 // Get an hex word mask with the lower 5 bits zeroed.
hwordMask() const303 unsigned SpillManagerGRF::hwordMask() const
304 {
305 unsigned mask = 0;
306 mask = (mask - 1);
307 mask = mask << 5;
308 return mask;
309 }
310
311 // Get an octal word mask with the lower 4 bits zeroed.
owordMask() const312 unsigned SpillManagerGRF::owordMask() const
313 {
314 unsigned mask = 0;
315 mask = (mask - 1);
316 mask = mask << 4;
317 return mask;
318 }
319
320 // Get an dword word mask with the lower 2 bits zeroed.
dwordMask() const321 unsigned SpillManagerGRF::dwordMask() const
322 {
323 unsigned mask = 0;
324 mask = (mask - 1);
325 mask = mask << 2;
326 return mask;
327 }
328
329 // Test of the offset is oword aligned.
owordAligned(unsigned offset) const330 bool SpillManagerGRF::owordAligned(unsigned offset) const
331 {
332 return (offset & owordMask()) == offset;
333 }
334
335 // Test of the offset is oword aligned.
dwordAligned(unsigned offset) const336 bool SpillManagerGRF::dwordAligned(unsigned offset) const
337 {
338 return (offset & dwordMask ()) == offset;
339 }
340
341 // Get the ceil of the ratio.
cdiv(unsigned dvd,unsigned dvr)342 unsigned SpillManagerGRF::cdiv(unsigned dvd, unsigned dvr)
343 {
344 return (dvd / dvr) + ((dvd % dvr) ? 1 : 0);
345 }
346
347 // Get the live range corresponding to id.
shouldSpillRegister(G4_RegVar * regVar) const348 bool SpillManagerGRF::shouldSpillRegister(G4_RegVar * regVar) const
349 {
350 if (getRFType(regVar) == G4_ADDRESS)
351 {
352 return false;
353 }
354 G4_RegVar * actualRegVar =
355 (regVar->getDeclare()->getAliasDeclare()) ?
356 regVar->getDeclare()->getAliasDeclare()->getRegVar() :
357 regVar;
358 if (actualRegVar->getId() == UNDEFINED_VAL)
359 return false;
360 else if (regVar->isRegVarTransient() || regVar->isRegVarTmp())
361 return false;
362 #ifndef ADDRESS_SENSITIVE_SPILLS_IMPLEMENTED
363 else if (lvInfo_->isAddressSensitive (regVar->getId()))
364 return false;
365 #endif
366 else if (builder_->kernel.fg.isPseudoVCADcl(actualRegVar->getDeclare()) ||
367 builder_->kernel.fg.isPseudoVCEDcl(actualRegVar->getDeclare()))
368 return false;
369 else
370 return lrInfo_[actualRegVar->getId()]->getPhyReg() == NULL;
371 }
372
373 // Get the regvar with the id.
getRegVar(unsigned id) const374 G4_RegVar *SpillManagerGRF::getRegVar(unsigned id) const
375 {
376 return (lvInfo_->vars)[id];
377 }
378
379 // Get the byte size of the live range.
getByteSize(G4_RegVar * regVar) const380 unsigned SpillManagerGRF::getByteSize(G4_RegVar * regVar) const
381 {
382 unsigned normalizedRowSize =
383 (regVar->getDeclare()->getNumRows() > 1) ?
384 REG_BYTE_SIZE :
385 regVar->getDeclare()->getNumElems() *
386 regVar->getDeclare()->getElemSize();
387 return normalizedRowSize * regVar->getDeclare()->getNumRows();
388 }
389
390 // Check if the lifetime of the spill/fill memory of live range i interferes
391 // with the lifetime of the spill/fill memory of live range j
spillMemLifetimeInterfere(unsigned i,unsigned j) const392 bool SpillManagerGRF::spillMemLifetimeInterfere(
393 unsigned i, unsigned j) const
394 {
395 G4_RegVar * ireg = getRegVar(i);
396 G4_RegVar * jreg = getRegVar(j);
397 G4_RegVar * irep = getReprRegVar(ireg);
398 G4_RegVar * jrep = getReprRegVar(jreg);
399 G4_RegVar * inont = ireg->getNonTransientBaseRegVar();
400 G4_RegVar * jnont = jreg->getNonTransientBaseRegVar();
401
402 if (ireg->isRegVarTmp()) {
403 return
404 ireg->getBaseRegVar() == jrep ||
405 spillMemLifetimeInterfere(ireg->getBaseRegVar()->getId(), j);
406 }
407 else if (jreg->isRegVarTmp()) {
408 return
409 jreg->getBaseRegVar() == irep ||
410 spillMemLifetimeInterfere (jreg->getBaseRegVar()->getId(), i);
411 }
412
413 else if (inont->isRegVarTmp()) {
414 return
415 inont->getBaseRegVar() == jrep ||
416 spillMemLifetimeInterfere(inont->getBaseRegVar()->getId(), j);
417
418 }
419
420 else if (jnont->isRegVarTmp()) {
421 return
422 jnont->getBaseRegVar() == irep ||
423 spillMemLifetimeInterfere (jnont->getBaseRegVar()->getId(), i);
424 }
425
426 else {
427 if (spillIntf_->interfereBetween(irep->getId(), jrep->getId()))
428 return true;
429 else if (getRFType (irep) != getRFType (jrep))
430 return true;
431 else
432 #ifdef DISABLE_SPILL_MEMORY_COMPRESSION
433 return irep != jrep;
434 #else
435 return false;
436 #endif
437 }
438 }
439
440 // Calculate the spill memory displacement for the regvar.
calculateSpillDisp(G4_RegVar * regVar) const441 unsigned SpillManagerGRF::calculateSpillDisp(G4_RegVar * regVar) const
442 {
443 assert(regVar->getDisp () == UINT_MAX);
444
445 // Locate the blocked locations calculated from the interfering
446 // spilled live ranges and put them into a list in ascending order.
447
448 using LocList = std::list<G4_RegVar*>;
449 LocList locList;
450 unsigned lrId =
451 (regVar->getId() >= varIdCount_)?
452 regVar->getBaseRegVar()->getId(): regVar->getId();
453 assert(lrId < varIdCount_);
454
455 const std::vector<unsigned int>& intfs = spillIntf_->getSparseIntfForVar(lrId);
456 for (auto edge : intfs)
457 {
458 auto lrEdge = getRegVar(edge);
459 if (lrEdge->isRegVarTransient())
460 continue;
461 if (lrEdge->getDisp() == UINT_MAX)
462 continue;
463 locList.push_back(lrEdge);
464 }
465 locList.sort([](G4_RegVar* v1, G4_RegVar* v2) { return v1->getDisp() < v2->getDisp(); });
466
467 // Find a spill slot for lRange within the locList.
468 // we always start searching from nextSpillOffset_ to facilitate intra-iteration reuse.
469 // cross iteration reuse is not done in interest of compile time.
470 unsigned regVarLocDisp = ROUND(nextSpillOffset_, numEltPerGRF<Type_UB>());
471 unsigned regVarSize = getByteSize (regVar);
472
473 for (G4_RegVar *curLoc : locList) {
474 unsigned curLocDisp = curLoc->getDisp ();
475 if (regVarLocDisp < curLocDisp &&
476 regVarLocDisp + regVarSize <= curLocDisp)
477 break;
478 unsigned curLocEnd = curLocDisp + getByteSize(curLoc);
479 {
480 if (curLocEnd % numEltPerGRF<Type_UB>() != 0)
481 curLocEnd = ROUND(curLocEnd, numEltPerGRF<Type_UB>());
482 }
483
484 regVarLocDisp = (regVarLocDisp > curLocEnd)? regVarLocDisp: curLocEnd;
485 }
486
487 return regVarLocDisp;
488 }
489
calculateSpillDispForLS(G4_RegVar * regVar) const490 unsigned SpillManagerGRF::calculateSpillDispForLS(G4_RegVar* regVar) const
491 {
492 assert(regVar->getDisp() == UINT_MAX);
493
494 // Locate the blocked locations calculated from the interfering
495 // spilled live ranges and put them into a list in ascending order.
496
497 typedef std::deque < G4_RegVar* > LocList;
498 LocList locList;
499 unsigned lrId =
500 (regVar->getId() >= varIdCount_) ?
501 regVar->getBaseRegVar()->getId() : regVar->getId();
502 assert(lrId < varIdCount_);
503
504 for (auto lr : activeLR_)
505 {
506 G4_RegVar* intfRegVar = lr->getTopDcl()->getRegVar();
507 if (intfRegVar->isRegVarTransient()) continue;
508
509 unsigned iDisp = intfRegVar->getDisp();
510 if (iDisp == UINT_MAX) continue;
511
512 LocList::iterator loc;
513 for (loc = locList.begin();
514 loc != locList.end() && (*loc)->getDisp() < iDisp;
515 ++loc);
516 if (loc != locList.end())
517 locList.insert(loc, intfRegVar);
518 else
519 locList.push_back(intfRegVar);
520 }
521
522 // Find a spill slot for lRange within the locList.
523 // we always start searching from nextSpillOffset_ to facilitate intra-iteration reuse.
524 // cross iteration reuse is not done in interest of compile time.
525 unsigned regVarLocDisp = ROUND(nextSpillOffset_, numEltPerGRF<Type_UB>());
526 unsigned regVarSize = getByteSize(regVar);
527
528 for (LocList::iterator curLoc = locList.begin(), end = locList.end(); curLoc != end;
529 ++curLoc) {
530 unsigned curLocDisp = (*curLoc)->getDisp();
531 if (regVarLocDisp < curLocDisp &&
532 regVarLocDisp + regVarSize <= curLocDisp)
533 break;
534 unsigned curLocEnd = curLocDisp + getByteSize(*curLoc);
535 {
536 if (curLocEnd % numEltPerGRF<Type_UB>() != 0)
537 curLocEnd = ROUND(curLocEnd, numEltPerGRF<Type_UB>());
538 }
539
540 regVarLocDisp = (regVarLocDisp > curLocEnd) ? regVarLocDisp : curLocEnd;
541 }
542
543 return regVarLocDisp;
544 }
545
546 // Get the spill/fill displacement of the segment containing the region.
547 // A segment is the smallest dword or oword aligned portion of memory
548 // containing the destination or source operand that can be read or saved.
549 template <class REGION_TYPE>
getSegmentDisp(REGION_TYPE * region,G4_ExecSize execSize)550 unsigned SpillManagerGRF::getSegmentDisp (
551 REGION_TYPE * region,
552 G4_ExecSize execSize
553 )
554 {
555 assert(region->getElemSize () && execSize);
556 if (isUnalignedRegion(region, execSize))
557 return getEncAlignedSegmentDisp(region, execSize);
558 else
559 return getRegionDisp(region);
560 }
561
562 // Get the spill/fill displacement of the regvar.
getDisp(G4_RegVar * regVar)563 unsigned SpillManagerGRF::getDisp(G4_RegVar * regVar)
564 {
565 // Already calculated spill memory disp
566
567 if (regVar->getDisp() != UINT_MAX)
568 {
569 return regVar->getDisp();
570 }
571 else if (regVar->isAliased()) {
572 // If it is an aliased regvar then calculate the disp for the
573 // actual regvar and then calculate the disp of the aliased regvar
574 // based on it.
575 G4_Declare * regVarDcl = regVar->getDeclare();
576 return getDisp(regVarDcl->getAliasDeclare()->getRegVar()) +
577 regVarDcl->getAliasOffset();
578 }
579 else if (gra.splitResults.find(regVar->getDeclare()->getRootDeclare()) !=
580 gra.splitResults.end())
581 {
582 // this variable is result of variable splitting optimization.
583 // original variable is guaranteed to have spilled. if split
584 // variable also spills then reuse original variable's spill
585 // location.
586 auto it = gra.splitResults.find(regVar->getDeclare()->getRootDeclare());
587 auto disp = getDisp((*it).second.origDcl->getRegVar());
588 regVar->setDisp(disp);
589 }
590 else if (regVar->isRegVarTransient() &&
591 getDisp(regVar->getBaseRegVar()) != UINT_MAX)
592 {
593 // If its base regvar has been assigned a disp, then the spill memory
594 // has already been allocated for it, simply calculate the disp based
595 // on the enclosing segment disp.
596 assert(regVar->getBaseRegVar() != regVar);
597 unsigned itsDisp;
598
599 if (regVar->isRegVarSpill()) {
600 G4_RegVarTransient * tRegVar = static_cast <G4_RegVarTransient*> (regVar);
601 assert(
602 getSegmentByteSize(
603 tRegVar->getDstRepRegion(), tRegVar->getExecSize()) <=
604 getByteSize(tRegVar));
605 itsDisp =
606 getSegmentDisp(
607 tRegVar->getDstRepRegion(), tRegVar->getExecSize());
608 }
609 else if (regVar->isRegVarFill()) {
610 G4_RegVarTransient * tRegVar = static_cast <G4_RegVarTransient*> (regVar);
611 assert(
612 getSegmentByteSize(
613 tRegVar->getSrcRepRegion(),
614 tRegVar->getExecSize()) <= getByteSize(tRegVar));
615 itsDisp =
616 getSegmentDisp(tRegVar->getSrcRepRegion(), tRegVar->getExecSize());
617 }
618 else {
619 MUST_BE_TRUE(false, "Incorrect spill/fill ranges.");
620 itsDisp = 0;
621 }
622
623 regVar->setDisp(itsDisp);
624 }
625 else {
626 // Allocate the spill and evaluate its disp
627 if (doSpillSpaceCompression)
628 {
629 assert(regVar->isRegVarTransient() == false);
630 if (spilledLSLRs_ != nullptr)
631 {
632 regVar->setDisp(calculateSpillDispForLS(regVar));
633 }
634 else
635 {
636 regVar->setDisp(calculateSpillDisp(regVar));
637 }
638 }
639 else
640 {
641 assert(regVar->isRegVarTransient() == false);
642 if (regVar->getId() >= varIdCount_)
643 {
644 if (regVar->getBaseRegVar()->getDisp() != UINT_MAX)
645 {
646 regVar->setDisp(regVar->getBaseRegVar()->getDisp());
647 return regVar->getDisp();
648 }
649 }
650
651 if ((spillAreaOffset_) % numEltPerGRF<Type_UB>() != 0)
652 {
653 (spillAreaOffset_) = ROUND(spillAreaOffset_, numEltPerGRF<Type_UB>());
654 }
655
656 regVar->setDisp(spillAreaOffset_);
657 spillAreaOffset_ += getByteSize(regVar);
658 }
659 }
660
661 // ToDo: log this in some dump to help debug
662 //regVar->getDeclare()->dump();
663 //std::cerr << "spill offset = " << regVar->getDisp() << "\n";
664
665 return regVar->getDisp();
666 }
667
668 // Get the spill/fill displacement of the region.
669 template <class REGION_TYPE>
getRegionDisp(REGION_TYPE * region)670 unsigned SpillManagerGRF::getRegionDisp(REGION_TYPE * region)
671 {
672 return getDisp (getRegVar(region)) + getRegionOriginOffset(region);
673 }
674
675 // Get the type of send message to use to spill/fill the region.
676 // The type can be either on oword read/write or a scatter read/write.
677 // If the segment corresponding to the region is dword sized then a
678 // dword read/write is used else an oword read/write is used.
679 template <class REGION_TYPE>
getMsgType(REGION_TYPE * region,G4_ExecSize execSize)680 unsigned SpillManagerGRF::getMsgType(
681 REGION_TYPE * region, G4_ExecSize execSize)
682 {
683 unsigned regionDisp = getRegionDisp(region);
684 unsigned regionByteSize = getRegionByteSize(region, execSize);
685 if (owordAligned (regionDisp) && owordAligned (regionByteSize))
686 return owordMask();
687 else
688 return getEncAlignedSegmentMsgType(region, execSize);
689 }
690
691 // Determine if the region is unaligned w.r.t spill/fill memory read/writes.
692 // If the exact region cannot be read/written from spill/fill memory using
693 // one send instruction, then it is unaligned.
694 template <class REGION_TYPE>
isUnalignedRegion(REGION_TYPE * region,G4_ExecSize execSize)695 bool SpillManagerGRF::isUnalignedRegion(
696 REGION_TYPE * region, G4_ExecSize execSize)
697 {
698 unsigned regionDisp = getRegionDisp(region);
699 unsigned regionByteSize = getRegionByteSize(region, execSize);
700
701 bool needs32ByteAlign = useScratchMsg_;
702 needs32ByteAlign |= useLSCMsg;
703
704 auto bytePerGRF = numEltPerGRF<Type_UB>();
705 if (needs32ByteAlign)
706 {
707 if (regionDisp % bytePerGRF == 0 && regionByteSize % bytePerGRF == 0)
708 {
709 return
710 regionByteSize / bytePerGRF != 1 &&
711 regionByteSize / bytePerGRF != 2 &&
712 regionByteSize / bytePerGRF != 4;
713 }
714 else
715 return true;
716 }
717 else
718 {
719 if (owordAligned(regionDisp) && owordAligned(regionByteSize))
720 {
721 // Current intrinsic spill/fill cannot handle partial region spill.
722 // If it's the partial region of a large size variable, such as V91 in following instructions, the preload is needed.
723 // mov (16) V91(6,0)<1>:ub %retval_ub(0,0)<1;1,0>:ub {H1, Align1}
724 // mov (16) V91(6,16)<1>:ub %retval_ub(0,16)<1;1,0>:ub {H1, Align1}
725 G4_RegVar* var = getRegVar(region);
726 if ((var->getDeclare()->getByteSize() > bytePerGRF) &&
727 (regionByteSize < bytePerGRF || regionDisp % bytePerGRF))
728 {
729 return true;
730 }
731 return
732 regionByteSize / OWORD_BYTE_SIZE != 1 &&
733 regionByteSize / OWORD_BYTE_SIZE != 2 &&
734 regionByteSize / OWORD_BYTE_SIZE != 4;
735 }
736 else
737 return true;
738 }
739 }
740
741 // Calculate the smallest aligned segment encompassing the region.
742 template <class REGION_TYPE>
calculateEncAlignedSegment(REGION_TYPE * region,G4_ExecSize execSize,unsigned & start,unsigned & end,unsigned & type)743 void SpillManagerGRF::calculateEncAlignedSegment(
744 REGION_TYPE * region,
745 G4_ExecSize execSize,
746 unsigned & start,
747 unsigned & end,
748 unsigned & type)
749 {
750 unsigned regionDisp = getRegionDisp(region);
751 unsigned regionByteSize = getRegionByteSize(region, execSize);
752
753 if (needGRFAlignedOffset())
754 {
755 unsigned hwordLB = regionDisp & grfMask();
756 unsigned hwordRB = hwordLB + numEltPerGRF<Type_UB>();
757 unsigned blockSize = numEltPerGRF<Type_UB>();
758
759 while (regionDisp + regionByteSize > hwordRB) {
760 hwordRB += blockSize;
761 }
762
763 assert((hwordRB - hwordLB) / REG_BYTE_SIZE <= 4);
764 start = hwordLB;
765 end = hwordRB;
766 type = grfMask();
767 }
768 else
769 {
770 unsigned owordLB = regionDisp & owordMask();
771 unsigned owordRB = owordLB + OWORD_BYTE_SIZE;
772 unsigned blockSize = OWORD_BYTE_SIZE;
773
774 while (regionDisp + regionByteSize > owordRB) {
775 owordRB += blockSize;
776 blockSize *= 2;
777 }
778
779 assert((owordRB - owordLB) / REG_BYTE_SIZE <= 4);
780 start = owordLB;
781 end = owordRB;
782 type = owordMask();
783 }
784 }
785
786 // Get the byte size of the aligned segment for the region.
787
788 template <class REGION_TYPE>
789 unsigned
getEncAlignedSegmentByteSize(REGION_TYPE * region,G4_ExecSize execSize)790 SpillManagerGRF::getEncAlignedSegmentByteSize(
791 REGION_TYPE * region,
792 G4_ExecSize execSize
793 )
794 {
795 unsigned start, end, type;
796 calculateEncAlignedSegment(region, execSize, start, end, type);
797 return end - start;
798 }
799
800 // Get the start offset of the aligned segment for the region.
801 template <class REGION_TYPE>
802 unsigned
getEncAlignedSegmentDisp(REGION_TYPE * region,G4_ExecSize execSize)803 SpillManagerGRF::getEncAlignedSegmentDisp(
804 REGION_TYPE * region,
805 G4_ExecSize execSize
806 )
807 {
808 unsigned start, end, type;
809 calculateEncAlignedSegment(region, execSize, start, end, type);
810 return start;
811 }
812
813 // Get the type of message to be used to read/write the enclosing aligned
814 // segment for the region.
815 template <class REGION_TYPE>
getEncAlignedSegmentMsgType(REGION_TYPE * region,G4_ExecSize execSize)816 unsigned SpillManagerGRF::getEncAlignedSegmentMsgType(
817 REGION_TYPE * region,
818 G4_ExecSize execSize
819 )
820 {
821 unsigned start, end, type;
822 calculateEncAlignedSegment(region, execSize, start, end, type);
823 return type;
824 }
825
826 // Get the byte size of the segment for the region.
827 template <class REGION_TYPE>
getSegmentByteSize(REGION_TYPE * region,G4_ExecSize execSize)828 unsigned SpillManagerGRF::getSegmentByteSize(
829 REGION_TYPE * region,
830 G4_ExecSize execSize
831 )
832 {
833 assert(region->getElemSize () && execSize);
834 if (isUnalignedRegion(region, execSize))
835 return getEncAlignedSegmentByteSize(region, execSize);
836 else
837 return getRegionByteSize(region, execSize);
838 }
839
840 // Get the byte size of the destination region.
getRegionByteSize(G4_DstRegRegion * region,G4_ExecSize execSize) const841 unsigned SpillManagerGRF::getRegionByteSize(
842 G4_DstRegRegion * region,
843 G4_ExecSize execSize
844 ) const
845 {
846 unsigned size = region->getHorzStride() * region->getElemSize() *
847 (execSize - 1) + region->getElemSize();
848
849 return size;
850 }
851
852 // Get the byte size of the source region.
853
getRegionByteSize(G4_SrcRegRegion * region,G4_ExecSize execSize) const854 unsigned SpillManagerGRF::getRegionByteSize(
855 G4_SrcRegRegion * region,
856 G4_ExecSize execSize) const
857 {
858 assert(execSize % region->getRegion ()->width == 0);
859 unsigned nRows = execSize / region->getRegion ()->width;
860 unsigned size = 0;
861
862 for (unsigned int i = 0; i < nRows - 1; i++) {
863 size += region->getRegion ()->vertStride * region->getElemSize ();
864 }
865
866 size +=
867 region->getRegion ()->horzStride * region->getElemSize () *
868 (region->getRegion ()->width - 1) + region->getElemSize ();
869 return size;
870 }
871
872 // Get the max exec size on a 256 bit vector for the input operand.
getMaxExecSize(G4_Operand * operand)873 static unsigned getMaxExecSize(G4_Operand * operand)
874 {
875 const unsigned size = Type_UNDEF + 1;
876 static unsigned maxExecSize [size] {8, 8, 16, 16, 16, 16, 8, 8, 0};
877 return maxExecSize[operand->getType()];
878 }
879
880 // Check if the instruction is a SIMD 16 or 32 instruction that is logically
881 // equivalent to two instructions the second of which uses register operands
882 // at the following row with the same sub-register index.
isComprInst(G4_INST * inst) const883 bool SpillManagerGRF::isComprInst(G4_INST * inst) const
884 {
885 return inst->isComprInst();
886 }
887
888 // Check if the source in a compressed instruction operand occupies a second
889 // register.
isMultiRegComprSource(G4_SrcRegRegion * src,G4_INST * inst) const890 bool SpillManagerGRF::isMultiRegComprSource(
891 G4_SrcRegRegion* src,
892 G4_INST * inst) const
893 {
894 if (!inst->isComprInst ()) {
895 return false;
896 }
897 else if (isScalarReplication(src)) {
898 return false;
899 }
900 else if (inst->getExecSize() <= 8) {
901 return false;
902 }
903 else if (!src->asSrcRegRegion()->crossGRF())
904 {
905 return false;
906 }
907 else if (inst->getExecSize() == 16 &&
908 inst->getDst() &&
909 inst->getDst()->getTypeSize() == 4 &&
910 inst->getDst()->getHorzStride() == 1)
911 {
912 if (src->getTypeSize() == 2 && src->isNativePackedRegion()) {
913 return false;
914 } else {
915 return true;
916 }
917 }
918 else {
919 return true;
920 }
921 }
922
923 // Send message information query
getSendRspLengthBitOffset() const924 unsigned SpillManagerGRF::getSendRspLengthBitOffset() const
925 {
926 return SEND_GT_RSP_LENGTH_BIT_OFFSET;
927 }
928
929 // Send message information query
getSendMaxResponseLength() const930 unsigned SpillManagerGRF::getSendMaxResponseLength() const
931 {
932 //return SEND_GT_MAX_RESPONSE_LENGTH;
933 return 8;
934 }
935
936 // Send message information query
getSendMsgLengthBitOffset()937 unsigned SpillManagerGRF::getSendMsgLengthBitOffset()
938 {
939 return SEND_GT_MSG_LENGTH_BIT_OFFSET;
940 }
941
942 // Send message information query
getSendMaxMessageLength() const943 unsigned SpillManagerGRF::getSendMaxMessageLength() const
944 {
945 return SEND_GT_MAX_MESSAGE_LENGTH;
946 }
947
948 // Send message information query
getSendDescDataSizeBitOffset()949 unsigned SpillManagerGRF::getSendDescDataSizeBitOffset()
950 {
951 return SEND_GT_DESC_DATA_SIZE_BIT_OFFSET;
952 }
953
954 // Send message information query
getSendReadTypeBitOffset() const955 unsigned SpillManagerGRF::getSendReadTypeBitOffset() const
956 {
957 return SEND_IVB_MSG_TYPE_BIT_OFFSET;
958 }
959
960 // Send message information query
getSendWriteTypeBitOffset()961 unsigned SpillManagerGRF::getSendWriteTypeBitOffset()
962 {
963 return SEND_IVB_MSG_TYPE_BIT_OFFSET;
964 }
965
966 // Send message information query
getSendScReadType() const967 unsigned SpillManagerGRF::getSendScReadType() const
968 {
969 return SEND_IVB_SC_READ_TYPE;
970 }
971
972 // Send message information query
getSendScWriteType() const973 unsigned SpillManagerGRF::getSendScWriteType() const
974 {
975 return SEND_IVB_SC_WRITE_TYPE;
976 }
977
978 // Send message information query
getSendOwordReadType() const979 unsigned SpillManagerGRF::getSendOwordReadType() const
980 {
981 return SEND_IVB_OW_READ_TYPE;
982 }
983
984 // Send message information query
getSendOwordWriteType()985 unsigned SpillManagerGRF::getSendOwordWriteType()
986 {
987 return SEND_IVB_OW_WRITE_TYPE;
988 }
989
getSendExDesc(bool isWrite,bool isScatter) const990 unsigned SpillManagerGRF::getSendExDesc(bool isWrite, bool isScatter) const
991 {
992 return isWrite ? SEND_IVB_DP_WR_EX_DESC_IMM : SEND_IVB_DP_RD_EX_DESC_IMM;
993 }
994
995 // Allocate from custom memory allocator
allocMem(unsigned size) const996 void *SpillManagerGRF::allocMem(unsigned size) const
997 {
998 return builder_->mem.alloc(size);
999 }
1000
useSplitSend() const1001 bool SpillManagerGRF::useSplitSend() const
1002 {
1003 return builder_->useSends();
1004 }
1005
1006 // Get a unique spill range index for regvar.
getSpillIndex(G4_RegVar * spilledRegVar)1007 unsigned SpillManagerGRF::getSpillIndex(G4_RegVar * spilledRegVar)
1008 {
1009 return spillRangeCount_[spilledRegVar->getId()]++;
1010 }
1011
1012 // Get a unique fill range index for regvar.
getFillIndex(G4_RegVar * spilledRegVar)1013 unsigned SpillManagerGRF::getFillIndex(
1014 G4_RegVar * spilledRegVar
1015 )
1016 {
1017 return fillRangeCount_[spilledRegVar->getId()]++;
1018 }
1019
1020 // Get a unique tmp index for spilled regvar.
getTmpIndex(G4_RegVar * spilledRegVar)1021 unsigned SpillManagerGRF::getTmpIndex(G4_RegVar * spilledRegVar)
1022 {
1023 return tmpRangeCount_[spilledRegVar->getId()]++;
1024 }
1025
1026 // Get a unique msg index for spilled regvar.
getMsgSpillIndex(G4_RegVar * spilledRegVar)1027 unsigned SpillManagerGRF::getMsgSpillIndex(
1028 G4_RegVar * spilledRegVar)
1029 {
1030 return msgSpillRangeCount_[spilledRegVar->getId()]++;
1031 }
1032
1033 // Get a unique msg index for filled regvar.
getMsgFillIndex(G4_RegVar * spilledRegVar)1034 unsigned SpillManagerGRF::getMsgFillIndex(
1035 G4_RegVar * spilledRegVar)
1036 {
1037 return msgFillRangeCount_[spilledRegVar->getId()]++;
1038 }
1039
1040 // Get a unique msg index for addr spill fill regvar.
getAddrSpillFillIndex(G4_RegVar * spilledRegVar)1041 unsigned SpillManagerGRF::getAddrSpillFillIndex(
1042 G4_RegVar * spilledRegVar)
1043 {
1044 return addrSpillFillRangeCount_[spilledRegVar->getId()]++;
1045 }
1046
1047 // Create a unique name for a regvar representing a spill/fill/msg live range.
createImplicitRangeName(const char * baseName,G4_RegVar * spilledRegVar,unsigned index)1048 const char *SpillManagerGRF::createImplicitRangeName(
1049 const char * baseName,
1050 G4_RegVar * spilledRegVar,
1051 unsigned index)
1052 {
1053 std::stringstream nameStrm;
1054 nameStrm << baseName << "_" << spilledRegVar->getName()
1055 << "_" << index << std::ends;
1056 int nameLen = unsigned(nameStrm.str().length()) + 1;
1057 char * name = (char *) allocMem(nameLen);
1058 strcpy_s(name, nameLen, nameStrm.str().c_str ());
1059 return name;
1060 }
1061
1062 // Check if the region is a scalar replication region.
isScalarReplication(G4_SrcRegRegion * region) const1063 bool SpillManagerGRF::isScalarReplication(G4_SrcRegRegion * region) const
1064 {
1065 return region->isScalar();
1066 }
1067
1068 // Check if we have to repeat the simd16 source in the simd8 equivalents.
1069 // The BPSEC mentions that if a replicated scalar appears in an simd16
1070 // instruction, logically we need to repeat the source region used in
1071 // the first simd8 instruction in the second simd8 instruction as well
1072 // (i.e. the reg no is not incremented by one for the second).
repeatSIMD16or32Source(G4_SrcRegRegion * region) const1073 bool SpillManagerGRF::repeatSIMD16or32Source(G4_SrcRegRegion * region) const
1074 {
1075 return isScalarReplication(region);
1076 }
1077
1078 // Create a declare directive for a new live range (spill/fill/msg)
1079 // introduced as part of the spill code generation.
1080 G4_Declare *
createRangeDeclare(const char * name,G4_RegFileKind regFile,unsigned short nElems,unsigned short nRows,G4_Type type,DeclareType kind,G4_RegVar * base,G4_Operand * repRegion,G4_ExecSize execSize)1081 SpillManagerGRF::createRangeDeclare(
1082 const char* name,
1083 G4_RegFileKind regFile,
1084 unsigned short nElems,
1085 unsigned short nRows,
1086 G4_Type type,
1087 DeclareType kind,
1088 G4_RegVar * base,
1089 G4_Operand * repRegion,
1090 G4_ExecSize execSize)
1091 {
1092 G4_Declare * rangeDeclare =
1093 builder_->createDeclareNoLookup(
1094 name, regFile, nElems, nRows, type, kind,
1095 base, repRegion, execSize);
1096 rangeDeclare->getRegVar()->setId(
1097 varIdCount_ + latestImplicitVarIdCount_++);
1098 gra.setBBId(rangeDeclare, bbId_);
1099 return rangeDeclare;
1100 }
1101
1102 // Create a GRF regvar and its declare directive to represent the spill/fill
1103 // live range.
1104 // The size of the regvar is calculated from the size of the spill/fill
1105 // region. If the spill/fill region fits into one row, then width of the
1106 // regvar is exactly as needed for the spill/fill segment, else it is
1107 // made to occupy exactly two full rows. In either case the regvar is made
1108 // to have 16 word alignment requirement. This is to satisfy the requirements
1109 // of the send instruction used to save/load the value from memory. For
1110 // region's in simd16 instruction contexts we multiply the height by 2
1111 // except for source region's with scalar replication.
1112 template <class REGION_TYPE>
createTransientGRFRangeDeclare(REGION_TYPE * region,const char * baseName,unsigned index,G4_ExecSize execSize,G4_INST * inst)1113 G4_Declare * SpillManagerGRF::createTransientGRFRangeDeclare(
1114 REGION_TYPE * region,
1115 const char * baseName,
1116 unsigned index,
1117 G4_ExecSize execSize,
1118 G4_INST * inst)
1119 {
1120 const char * name =
1121 createImplicitRangeName(baseName, getRegVar(region), index);
1122 G4_Type type = region->getType();
1123 unsigned segmentByteSize = getSegmentByteSize(region, execSize);
1124 DeclareType regVarKind =
1125 (region->isDstRegRegion ())? DeclareType::Spill : DeclareType::Fill;
1126 unsigned short width, height;
1127
1128 if (segmentByteSize > REG_BYTE_SIZE || region->crossGRF()) {
1129 assert(REG_BYTE_SIZE % region->getElemSize () == 0);
1130 width = REG_BYTE_SIZE / region->getElemSize ();
1131 assert(segmentByteSize / REG_BYTE_SIZE <= 2);
1132 height = 2;
1133 } else {
1134 assert(segmentByteSize % region->getElemSize () == 0);
1135 width = segmentByteSize / region->getElemSize ();
1136 height = 1;
1137 }
1138
1139 if (needGRFAlignedOffset())
1140 {
1141 // the message will read/write a minimum of one GRF
1142 if (height == 1 && width < (getGRFSize() / region->getElemSize()))
1143 width = getGRFSize() / region->getElemSize();
1144 }
1145
1146 G4_Declare * transientRangeDeclare =
1147 createRangeDeclare(
1148 name, G4_GRF, width, height, type,
1149 regVarKind, region->getBase()->asRegVar(), region, execSize);
1150
1151 if (failSafeSpill_)
1152 {
1153 transientRangeDeclare->getRegVar()->setPhyReg(
1154 builder_->phyregpool.getGreg(spillRegOffset_), 0);
1155 spillRegOffset_ += height;
1156 }
1157
1158 // FIXME: We should take the original declare's alignment too, but I'm worried
1159 // we may get perf regression if FE is over-aligning or the alignment is not necessary for this inst.
1160 // So Either is used for now and we can change it later if there are bugs
1161 setNewDclAlignment(gra, transientRangeDeclare, false);
1162 return transientRangeDeclare;
1163 }
1164
getSpillRowSizeForSendDst(G4_INST * inst)1165 static unsigned short getSpillRowSizeForSendDst(G4_INST * inst)
1166 {
1167 unsigned short nRows = 0;
1168
1169 auto dst = inst->getDst();
1170
1171 if (inst->isSend())
1172 {
1173 G4_SendDesc* msgDesc = inst->getMsgDesc();
1174 nRows = msgDesc->getDstLenRegs();
1175 if (dst->getTopDcl()->getByteSize() <= getGRFSize())
1176 {
1177 // we may have a send that that writes to a <1 GRF variable, but due to A64 message requirements
1178 // the send has a response length > 1. We return row size as one instead as we've only allocated
1179 // one GRF for the spilled variable in scratch space
1180 nRows = 1;
1181 }
1182 }
1183 else
1184 {
1185 assert(dst->getLinearizedStart() % numEltPerGRF<Type_UB>() == 0);
1186 nRows = (dst->getLinearizedEnd() - dst->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
1187 }
1188 return nRows;
1189 }
1190
1191 // Create a regvar and its declare directive to represent the spill live
1192 // range that appears as a send instruction post destination GRF.
1193 // The type of the regvar is set as dword and its width 8. The type of
1194 // the post destination does not matter, so we just use type dword, and
1195 // a width of 8 so that a row corresponds to a physical register.
createPostDstSpillRangeDeclare(G4_INST * sendOut)1196 G4_Declare * SpillManagerGRF::createPostDstSpillRangeDeclare(G4_INST *sendOut)
1197 {
1198 auto dst = sendOut->getDst();
1199 G4_RegVar * spilledRegVar = getRegVar(dst);
1200 const char * name =
1201 createImplicitRangeName(
1202 "SP_GRF", spilledRegVar, getSpillIndex (spilledRegVar));
1203 unsigned short nRows = getSpillRowSizeForSendDst(sendOut);
1204
1205 G4_DstRegRegion * normalizedPostDst = builder_->createDst(
1206 spilledRegVar, dst->getRegOff(), SUBREG_ORIGIN,
1207 DEF_HORIZ_STRIDE, Type_UD);
1208
1209 // We use the width as the user specified, the height however is
1210 // calculated based on the message descriptor to limit register
1211 // pressure induced by the spill range.
1212
1213 G4_Declare * transientRangeDeclare =
1214 createRangeDeclare(
1215 name, G4_GRF, REG_DWORD_SIZE, nRows, Type_UD,
1216 DeclareType::Spill, spilledRegVar, normalizedPostDst,
1217 G4_ExecSize(REG_DWORD_SIZE));
1218
1219 if (failSafeSpill_)
1220 {
1221 if (useSplitSend())
1222 {
1223 transientRangeDeclare->getRegVar()->setPhyReg(
1224 builder_->phyregpool.getGreg(spillRegStart_), 0);
1225 spillRegOffset_ += nRows;
1226 }
1227 else
1228 {
1229 transientRangeDeclare->getRegVar()->setPhyReg(
1230 builder_->phyregpool.getGreg(spillRegStart_+1), 0);
1231 spillRegOffset_ += nRows + 1;
1232 }
1233 }
1234
1235 return transientRangeDeclare;
1236 }
1237
1238 // Create a regvar and its declare directive to represent the spill live range.
createSpillRangeDeclare(G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,G4_INST * inst)1239 G4_Declare * SpillManagerGRF::createSpillRangeDeclare(
1240 G4_DstRegRegion * spilledRegion,
1241 G4_ExecSize execSize,
1242 G4_INST * inst
1243 )
1244 {
1245 return
1246 createTransientGRFRangeDeclare(
1247 spilledRegion, "SP_GRF",
1248 getSpillIndex (getRegVar(spilledRegion)),
1249 execSize, inst);
1250 }
1251
1252 // Create a regvar and its declare directive to represent the GRF fill live
1253 // range.
createGRFFillRangeDeclare(G4_SrcRegRegion * fillRegion,G4_ExecSize execSize,G4_INST * inst)1254 G4_Declare * SpillManagerGRF::createGRFFillRangeDeclare(
1255 G4_SrcRegRegion * fillRegion,
1256 G4_ExecSize execSize,
1257 G4_INST * inst
1258 )
1259 {
1260 assert(getRFType (fillRegion) == G4_GRF);
1261 G4_Declare * fillRangeDecl =
1262 createTransientGRFRangeDeclare(
1263 fillRegion, "FL_GRF", getFillIndex(getRegVar(fillRegion)),
1264 execSize, inst);
1265 return fillRangeDecl;
1266 }
1267
getSpillRowSizeForSendSrc(G4_INST * inst,G4_SrcRegRegion * filledRegion)1268 static unsigned short getSpillRowSizeForSendSrc(
1269 G4_INST * inst,
1270 G4_SrcRegRegion * filledRegion)
1271 {
1272 unsigned short nRows = 0;
1273
1274 if (inst->isSend())
1275 {
1276 G4_SendDesc* msgDesc = inst->getMsgDesc();
1277 if (inst->isSplitSend() &&
1278 (inst->getSrc(1)->asSrcRegRegion() == filledRegion))
1279 {
1280 nRows = msgDesc->getSrc1LenRegs();
1281 }
1282 else
1283 {
1284 nRows = msgDesc->getSrc0LenRegs();
1285 }
1286 }
1287 else
1288 {
1289 nRows = (filledRegion->getLinearizedEnd() - filledRegion->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
1290 }
1291
1292 return nRows;
1293 }
1294
1295
1296 // Create a regvar and its declare directive to represent the GRF fill live range.
createSendFillRangeDeclare(G4_SrcRegRegion * filledRegion,G4_INST * sendInst)1297 G4_Declare * SpillManagerGRF::createSendFillRangeDeclare(
1298 G4_SrcRegRegion * filledRegion,
1299 G4_INST * sendInst)
1300 {
1301 G4_RegVar * filledRegVar = getRegVar(filledRegion);
1302 const char * name =
1303 createImplicitRangeName(
1304 "FL_Send", filledRegVar, getFillIndex(filledRegVar));
1305 unsigned short nRows = getSpillRowSizeForSendSrc(sendInst, filledRegion);
1306
1307 G4_SrcRegRegion * normalizedSendSrc =
1308 builder_->createSrcRegRegion(
1309 filledRegion->getModifier(), Direct, filledRegVar,
1310 filledRegion->getRegOff(), filledRegion->getSubRegOff(), filledRegion->getRegion(),
1311 filledRegion->getType());
1312 unsigned short width = REG_BYTE_SIZE / filledRegion->getElemSize ();
1313 assert(REG_BYTE_SIZE % filledRegion->getElemSize () == 0);
1314
1315 // We use the width as the user specified, the height however is
1316 // calculated based on the message descriptor to limit register
1317 // pressure induced by the spill range.
1318
1319 G4_Declare * transientRangeDeclare =
1320 createRangeDeclare(
1321 name,
1322 G4_GRF,
1323 width, nRows, filledRegion->getType(),
1324 DeclareType::Fill, filledRegVar, normalizedSendSrc,
1325 G4_ExecSize(width));
1326
1327 setNewDclAlignment(gra, transientRangeDeclare, gra.isEvenAligned(filledRegVar->getDeclare()));
1328
1329 if (failSafeSpill_)
1330 {
1331 if (sendInst->isEOT() && builder_->hasEOTGRFBinding())
1332 {
1333 // make sure eot src is in last 16 GRF
1334 uint32_t eotStart = gra.kernel.getNumRegTotal() - 16;
1335 if (spillRegOffset_ < eotStart)
1336 {
1337 spillRegOffset_ = eotStart;
1338 }
1339 }
1340 transientRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1341 spillRegOffset_ += nRows;
1342 }
1343
1344 return transientRangeDeclare;
1345 }
1346
1347 // Create a regvar and its declare directive to represent the temporary live
1348 // range.
createTemporaryRangeDeclare(G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,bool forceSegmentAlignment)1349 G4_Declare * SpillManagerGRF::createTemporaryRangeDeclare(
1350 G4_DstRegRegion * spilledRegion,
1351 G4_ExecSize execSize,
1352 bool forceSegmentAlignment)
1353 {
1354 const char * name =
1355 createImplicitRangeName(
1356 "TM_GRF", getRegVar(spilledRegion),
1357 getTmpIndex(getRegVar(spilledRegion)));
1358 unsigned byteSize =
1359 (forceSegmentAlignment)?
1360 getSegmentByteSize(spilledRegion, execSize):
1361 getRegionByteSize(spilledRegion, execSize);
1362
1363 // ensure tmp reg is large enough to hold all data when sub-reg offset is non-zero
1364 byteSize += spilledRegion->getSubRegOff() * spilledRegion->getElemSize();
1365
1366 assert(byteSize <= 2u * REG_BYTE_SIZE);
1367 assert(byteSize % spilledRegion->getElemSize () == 0);
1368
1369 G4_Type type = spilledRegion->getType();
1370 DeclareType regVarKind = DeclareType::Tmp;
1371
1372 unsigned short width, height;
1373 if (byteSize > REG_BYTE_SIZE)
1374 {
1375 height = 2;
1376 width = REG_BYTE_SIZE/spilledRegion->getElemSize();
1377 }
1378 else
1379 {
1380 height = 1;
1381 width = byteSize/spilledRegion->getElemSize();
1382 }
1383
1384 G4_RegVar* spilledRegVar = getRegVar(spilledRegion);
1385
1386 G4_Declare * temporaryRangeDeclare =
1387 createRangeDeclare(
1388 name, G4_GRF, width, height, type,
1389 regVarKind, spilledRegVar, NULL, G4_ExecSize(0));
1390
1391 if (failSafeSpill_)
1392 {
1393 temporaryRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1394 spillRegOffset_ += height;
1395 }
1396
1397 setNewDclAlignment(gra, temporaryRangeDeclare, false);
1398 return temporaryRangeDeclare;
1399 }
1400
1401 // Create a destination region that could be used in place of the spill regvar.
1402 // If the region is unaligned then the origin of the destination region
1403 // is the displacement of the orginal region from its segment, else the
1404 // origin is 0.
createSpillRangeDstRegion(G4_RegVar * spillRangeRegVar,G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,unsigned regOff)1405 G4_DstRegRegion * SpillManagerGRF::createSpillRangeDstRegion(
1406 G4_RegVar * spillRangeRegVar,
1407 G4_DstRegRegion * spilledRegion,
1408 G4_ExecSize execSize,
1409 unsigned regOff)
1410 {
1411 if (isUnalignedRegion (spilledRegion, execSize)) {
1412 unsigned segmentDisp =
1413 getEncAlignedSegmentDisp(spilledRegion, execSize);
1414 unsigned regionDisp = getRegionDisp(spilledRegion);
1415 assert(regionDisp >= segmentDisp);
1416 unsigned short subRegOff =
1417 (regionDisp - segmentDisp) / spilledRegion->getElemSize ();
1418 assert(
1419 (regionDisp - segmentDisp) % spilledRegion->getElemSize () == 0);
1420 assert(subRegOff * spilledRegion->getElemSize () +
1421 getRegionByteSize(spilledRegion, execSize) <=
1422 2u * REG_BYTE_SIZE);
1423
1424 if (useScratchMsg_)
1425 {
1426 G4_Declare* parent_dcl = spilledRegion->getBase()->asRegVar()->getDeclare();
1427 unsigned off = 0;
1428 while (parent_dcl->getAliasDeclare() != NULL)
1429 {
1430 // off is in bytes
1431 off += parent_dcl->getAliasOffset();
1432 parent_dcl = parent_dcl->getAliasDeclare();
1433 }
1434 off = off%numEltPerGRF<Type_UB>();
1435 // sub-regoff is in units of element size
1436 subRegOff = spilledRegion->getSubRegOff() + off/spilledRegion->getElemSize();
1437 }
1438
1439 return builder_->createDst(
1440 spillRangeRegVar, (unsigned short) regOff, subRegOff,
1441 spilledRegion->getHorzStride(), spilledRegion->getType());
1442 }
1443
1444 else {
1445 return builder_->createDst(
1446 spillRangeRegVar, (short) regOff, SUBREG_ORIGIN,
1447 spilledRegion->getHorzStride(), spilledRegion->getType());
1448 }
1449 }
1450
1451 // Create a source region that could be used to copy out the temporary range
1452 // (that was created to replace the portion of the spilled live range appearing
1453 // in an instruction destination) into the segment aligned spill range for the
1454 // spilled live range that can be written out to spill memory.
createTemporaryRangeSrcRegion(G4_RegVar * tmpRangeRegVar,G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,unsigned regOff)1455 G4_SrcRegRegion * SpillManagerGRF::createTemporaryRangeSrcRegion (
1456 G4_RegVar * tmpRangeRegVar,
1457 G4_DstRegRegion * spilledRegion,
1458 G4_ExecSize execSize,
1459 unsigned regOff)
1460 {
1461 uint16_t horzStride = spilledRegion->getHorzStride();
1462 // A scalar region is returned when execsize is 1.
1463 const RegionDesc *rDesc = builder_->createRegionDesc(execSize, horzStride, 1, 0);
1464
1465 return builder_->createSrc(tmpRangeRegVar, (short) regOff, spilledRegion->getSubRegOff(),
1466 rDesc, spilledRegion->getType());
1467 }
1468
1469 // Create a source region that could be used in place of the fill regvar.
1470 // If the region is unaligned then the origin of the destination region
1471 // is the displacement of the orginal region from its segment, else the
1472 // origin is 0.
createFillRangeSrcRegion(G4_RegVar * fillRangeRegVar,G4_SrcRegRegion * filledRegion,G4_ExecSize execSize)1473 G4_SrcRegRegion * SpillManagerGRF::createFillRangeSrcRegion (
1474 G4_RegVar * fillRangeRegVar,
1475 G4_SrcRegRegion * filledRegion,
1476 G4_ExecSize execSize)
1477 {
1478 // we need to preserve accRegSel if it's set
1479 if (isUnalignedRegion(filledRegion, execSize)) {
1480 unsigned segmentDisp =
1481 getEncAlignedSegmentDisp(filledRegion, execSize);
1482 unsigned regionDisp = getRegionDisp(filledRegion);
1483 assert(regionDisp >= segmentDisp);
1484 unsigned short subRegOff =
1485 (regionDisp - segmentDisp) / filledRegion->getElemSize ();
1486 assert(
1487 (regionDisp - segmentDisp) % filledRegion->getElemSize () == 0);
1488
1489 return builder_->createSrcRegRegion(
1490 filledRegion->getModifier (), Direct, fillRangeRegVar, REG_ORIGIN,
1491 subRegOff, filledRegion->getRegion(), filledRegion->getType(), filledRegion->getAccRegSel());
1492 }
1493 else
1494 {
1495 // fill intrinsic's sub-reg offset is always 0 since it is GRF aligned.
1496 // but original filled range's offset may not be 0, so actual filled
1497 // src needs to use sub-reg offset from original region.
1498 return builder_->createSrcRegRegion(
1499 filledRegion->getModifier (), Direct, fillRangeRegVar,
1500 REG_ORIGIN, filledRegion->getSubRegOff(), filledRegion->getRegion (),
1501 filledRegion->getType(), filledRegion->getAccRegSel());
1502 }
1503 }
1504
1505 // Create a source region for the spill regvar that can be used as an operand
1506 // for a mov instruction used to copy the value to an send payload for
1507 // an oword block write message. The spillRangeRegVar segment is guaranteed
1508 // to start at an dword boundary and of a dword aligned size by construction.
1509 // The whole spillRangeRegVar segment needs to be copied out to the send
1510 // payload. The source region generated is <4;4,1>:ud so that a row occupies
1511 // a packed oword. The exec size used in the copy instruction needs to be a
1512 // multiple of 4 depending on the size of the spill regvar - 4 or 8 for the
1513 // the spill regvar appearing as the destination in a regulat 2 cycle
1514 // instructions and 16 when appearing in simd16 instructions.
createBlockSpillRangeSrcRegion(G4_RegVar * spillRangeRegVar,unsigned regOff,unsigned subregOff)1515 G4_SrcRegRegion * SpillManagerGRF::createBlockSpillRangeSrcRegion(
1516 G4_RegVar * spillRangeRegVar,
1517 unsigned regOff,
1518 unsigned subregOff)
1519 {
1520 assert(getByteSize (spillRangeRegVar) % DWORD_BYTE_SIZE == 0);
1521 const RegionDesc * rDesc =
1522 builder_->rgnpool.createRegion(DWORD_BYTE_SIZE, DWORD_BYTE_SIZE, 1);
1523 return builder_->createSrc(spillRangeRegVar, (short) regOff, (short) subregOff,
1524 rDesc, Type_UD);
1525 }
1526
1527 // Create a GRF regvar and a declare directive for it, to represent an
1528 // implicit MFR live range that will be used as the send message payload
1529 // header and write payload for spilling a regvar to memory.
createMRangeDeclare(G4_RegVar * regVar)1530 G4_Declare * SpillManagerGRF::createMRangeDeclare(G4_RegVar * regVar)
1531 {
1532 if (useSplitSend() && useScratchMsg_)
1533 {
1534 return builder_->getBuiltinR0();
1535 }
1536 else if (useLSCMsg)
1537 {
1538 return nullptr;
1539 }
1540
1541 G4_RegVar * repRegVar =
1542 (regVar->isRegVarTransient ()) ? regVar->getBaseRegVar(): regVar;
1543 const char * name =
1544 createImplicitRangeName(
1545 "SP_MSG", repRegVar, getMsgSpillIndex(repRegVar));
1546 unsigned regVarByteSize = getByteSize (regVar);
1547 unsigned writePayloadHeight = cdiv(regVarByteSize, REG_BYTE_SIZE);
1548
1549 if (writePayloadHeight > SPILL_PAYLOAD_HEIGHT_LIMIT) {
1550 writePayloadHeight = SPILL_PAYLOAD_HEIGHT_LIMIT;
1551 }
1552
1553 unsigned payloadHeaderHeight =
1554 (regVarByteSize != DWORD_BYTE_SIZE)?
1555 OWORD_PAYLOAD_HEADER_MAX_HEIGHT: DWORD_PAYLOAD_HEADER_MAX_HEIGHT;
1556 unsigned short height = payloadHeaderHeight + writePayloadHeight;
1557 unsigned short width = REG_DWORD_SIZE;
1558
1559 // We should not find ourselves using dword scattered write
1560 if (useScratchMsg_)
1561 {
1562 assert(payloadHeaderHeight != DWORD_PAYLOAD_HEADER_MAX_HEIGHT);
1563 }
1564
1565 G4_Declare * msgRangeDeclare =
1566 createRangeDeclare(
1567 name,
1568 G4_GRF,
1569 width, height, Type_UD,
1570 DeclareType::Tmp, regVar->getNonTransientBaseRegVar (), NULL, G4_ExecSize(0));
1571
1572 if (failSafeSpill_)
1573 {
1574 msgRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegStart_), 0);
1575 }
1576
1577 return msgRangeDeclare;
1578 }
1579
1580 // Create a GRF regvar and a declare directive for it, to represent an
1581 // implicit MFR live range that will be used as the send message payload
1582 // header and write payload for spilling a regvar region to memory.
createMRangeDeclare(G4_DstRegRegion * region,G4_ExecSize execSize)1583 G4_Declare * SpillManagerGRF::createMRangeDeclare(
1584 G4_DstRegRegion * region,
1585 G4_ExecSize execSize)
1586 {
1587 if (useSplitSend() && useScratchMsg_)
1588 {
1589 return builder_->getBuiltinR0();
1590 }
1591 else if (useLSCMsg)
1592 {
1593 return nullptr;
1594 }
1595
1596 const char * name =
1597 createImplicitRangeName(
1598 "SP_MSG", getRegVar(region),
1599 getMsgSpillIndex(getRegVar(region)));
1600 unsigned regionByteSize = getSegmentByteSize(region, execSize);
1601 unsigned writePayloadHeight = cdiv(regionByteSize, REG_BYTE_SIZE);
1602 unsigned msgType = getMsgType (region, execSize);
1603 unsigned payloadHeaderHeight =
1604 (msgType == owordMask() || msgType == hwordMask ()) ?
1605 OWORD_PAYLOAD_HEADER_MAX_HEIGHT: DWORD_PAYLOAD_HEADER_MAX_HEIGHT;
1606
1607 // We should not find ourselves using dword scattered write
1608 if (useScratchMsg_)
1609 {
1610 assert(payloadHeaderHeight != DWORD_PAYLOAD_HEADER_MAX_HEIGHT);
1611 }
1612
1613 unsigned height = payloadHeaderHeight + writePayloadHeight;
1614 unsigned short width = REG_DWORD_SIZE;
1615 G4_Declare * msgRangeDeclare =
1616 createRangeDeclare(
1617 name,
1618 G4_GRF,
1619 width, (unsigned short) height, Type_UD,
1620 DeclareType::Tmp, region->getBase()->asRegVar(), NULL, G4_ExecSize(0));
1621
1622 if (failSafeSpill_)
1623 {
1624 msgRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1625 spillRegOffset_ += height;
1626 }
1627
1628 return msgRangeDeclare;
1629 }
1630
1631 // Create a GRF regvar and a declare directive for it, that will be used as
1632 // the send message payload header and write payload for filling a regvar
1633 // from memory.
1634
1635 G4_Declare *
createMRangeDeclare(G4_SrcRegRegion * region,G4_ExecSize execSize)1636 SpillManagerGRF::createMRangeDeclare(
1637 G4_SrcRegRegion * region,
1638 G4_ExecSize execSize
1639 )
1640 {
1641 if (useSplitSend() && useScratchMsg_)
1642 {
1643 return builder_->getBuiltinR0();
1644 }
1645 else if (useLSCMsg)
1646 {
1647 return nullptr;
1648 }
1649
1650 const char * name =
1651 createImplicitRangeName(
1652 "FL_MSG", getRegVar(region),
1653 getMsgFillIndex(getRegVar(region)));
1654 getSegmentByteSize(region, execSize);
1655 unsigned payloadHeaderHeight =
1656 (getMsgType (region, execSize) == owordMask()) ?
1657 OWORD_PAYLOAD_HEADER_MIN_HEIGHT : DWORD_PAYLOAD_HEADER_MIN_HEIGHT;
1658
1659 // We should not find ourselves using dword scattered write
1660 if (useScratchMsg_)
1661 {
1662 assert(payloadHeaderHeight != DWORD_PAYLOAD_HEADER_MAX_HEIGHT);
1663 // When using scratch msg descriptor we dont need to use a
1664 // separate GRF for payload. Source operand of send can directly
1665 // use r0.0.
1666 return builder_->getBuiltinR0();
1667 }
1668
1669 unsigned height = payloadHeaderHeight;
1670 unsigned width = REG_DWORD_SIZE;
1671 G4_Declare * msgRangeDeclare =
1672 createRangeDeclare (
1673 name,
1674 G4_GRF,
1675 (unsigned short) width, (unsigned short) height, Type_UD,
1676 DeclareType::Tmp, region->getBase()->asRegVar(), NULL, G4_ExecSize(0));
1677
1678 if (failSafeSpill_)
1679 {
1680 msgRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1681 spillRegOffset_ += height;
1682 }
1683
1684 return msgRangeDeclare;
1685 }
1686
1687 // Create a destination region for the GRF regvar for the write payload
1688 // portion of the oword block send message (used for spill). The exec size
1689 // can be either 4 or 8 for a regular 2 cycle instruction detination spills or
1690 // 16 for simd16 instruction destination spills.
createMPayloadBlockWriteDstRegion(G4_RegVar * grfRange,unsigned regOff,unsigned subregOff)1691 G4_DstRegRegion * SpillManagerGRF::createMPayloadBlockWriteDstRegion(
1692 G4_RegVar * grfRange,
1693 unsigned regOff,
1694 unsigned subregOff)
1695 {
1696 regOff += OWORD_PAYLOAD_WRITE_REG_OFFSET;
1697 subregOff += OWORD_PAYLOAD_WRITE_SUBREG_OFFSET;
1698 return builder_->createDst(
1699 grfRange, (short) regOff, (short) subregOff, DEF_HORIZ_STRIDE, Type_UD);
1700 }
1701
1702 // Create a destination region for the GRF regvar for the input header
1703 // payload portion of the send message to the data port. The exec size
1704 // needs to be 8 for the mov instruction that uses this as a destination.
createMHeaderInputDstRegion(G4_RegVar * grfRange,unsigned subregOff)1705 G4_DstRegRegion * SpillManagerGRF::createMHeaderInputDstRegion(
1706 G4_RegVar * grfRange,
1707 unsigned subregOff)
1708 {
1709 return builder_->createDst(
1710 grfRange, PAYLOAD_INPUT_REG_OFFSET, (short) subregOff,
1711 DEF_HORIZ_STRIDE, Type_UD);
1712 }
1713
1714 // Create a destination region for the GRF regvar for the payload offset
1715 // portion of the oword block send message. The exec size needs to be 1
1716 // for the mov instruction that uses this as a destination.
createMHeaderBlockOffsetDstRegion(G4_RegVar * grfRange)1717 G4_DstRegRegion * SpillManagerGRF::createMHeaderBlockOffsetDstRegion(
1718 G4_RegVar * grfRange)
1719 {
1720 return builder_->createDst(
1721 grfRange, OWORD_PAYLOAD_SPOFFSET_REG_OFFSET,
1722 OWORD_PAYLOAD_SPOFFSET_SUBREG_OFFSET, DEF_HORIZ_STRIDE,
1723 Type_UD);
1724 }
1725
1726 // Create a source region for the input payload (r0.0). The exec size
1727 // needs to be 8 for the mov instruction that uses this as a source.
1728 G4_SrcRegRegion *
createInputPayloadSrcRegion()1729 SpillManagerGRF::createInputPayloadSrcRegion()
1730 {
1731 G4_RegVar * inputPayloadDirectReg = builder_->getBuiltinR0()->getRegVar();
1732 const RegionDesc * rDesc =
1733 builder_->rgnpool.createRegion(
1734 REG_DWORD_SIZE, REG_DWORD_SIZE, DEF_HORIZ_STRIDE);
1735 return builder_->createSrc(inputPayloadDirectReg,
1736 PAYLOAD_INPUT_REG_OFFSET, PAYLOAD_INPUT_SUBREG_OFFSET,
1737 rDesc, Type_UD);
1738 }
1739
1740 // Create and initialize the message header for the send instruction for
1741 // save/load of value to/from memory.
1742 // The header includes the input payload and the offset (for spill disp).
1743 template <class REGION_TYPE>
createAndInitMHeader(REGION_TYPE * region,G4_ExecSize execSize)1744 G4_Declare * SpillManagerGRF::createAndInitMHeader(
1745 REGION_TYPE * region,
1746 G4_ExecSize execSize)
1747 {
1748 G4_Declare * mRangeDcl = createMRangeDeclare(region, execSize);
1749 return initMHeader (mRangeDcl, region, execSize);
1750 }
1751
1752 // Initialize the message header for the send instruction for save/load
1753 // of value to/from memory.
1754 // The header includes the input payload and the offset (for spill disp).
1755 template <class REGION_TYPE>
initMHeader(G4_Declare * mRangeDcl,REGION_TYPE * region,G4_ExecSize execSize)1756 G4_Declare * SpillManagerGRF::initMHeader(
1757 G4_Declare * mRangeDcl,
1758 REGION_TYPE * region,
1759 G4_ExecSize execSize)
1760 {
1761 // Initialize the message header with the input payload.
1762 if ((useScratchMsg_ && mRangeDcl == builder_->getBuiltinR0()) || !headerNeeded())
1763 {
1764 // mRangeDcl is NULL for fills
1765 return mRangeDcl;
1766 }
1767
1768 G4_DstRegRegion * mHeaderInputDstRegion =
1769 createMHeaderInputDstRegion(mRangeDcl->getRegVar());
1770 G4_SrcRegRegion * inputPayload = createInputPayloadSrcRegion();
1771 createMovInst(G4_ExecSize(REG_DWORD_SIZE), mHeaderInputDstRegion, inputPayload);
1772 numGRFMove++;
1773
1774 if (useScratchMsg_)
1775 {
1776 // Initialize msg header when region is a spill
1777 // When using scratch msg description, we only need to copy
1778 // r0.0 in to msg header. Memory offset will be
1779 // specified in the msg descriptor.
1780 }
1781 else
1782 {
1783 // Initialize the message header with the spill disp for block
1784 // read/write.
1785 G4_DstRegRegion * mHeaderOffsetDstRegion =
1786 createMHeaderBlockOffsetDstRegion(mRangeDcl->getRegVar());
1787 int offset = getSegmentDisp(region, execSize);
1788 getSpillOffset(offset);
1789 unsigned segmentDisp = offset / OWORD_BYTE_SIZE;
1790 G4_Imm * segmentDispImm = builder_->createImm (segmentDisp, Type_UD);
1791
1792 if (!region->isSrcRegRegion() && !region->isDstRegRegion())
1793 {
1794 MUST_BE_TRUE(false, ERROR_GRAPHCOLOR);
1795 }
1796
1797 if (builder_->getIsKernel() == false)
1798 {
1799 createAddFPInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
1800 }
1801 else
1802 {
1803 createMovInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
1804 }
1805 numGRFMove++;
1806 }
1807
1808 // Initialize the message header with the spill disp for scatter
1809 // read/write.
1810 return mRangeDcl;
1811 }
1812
1813 // Create and initialize the message header for the send instruction.
1814 // The header includes the input payload (for spill disp).
createAndInitMHeader(G4_RegVar * regVar)1815 G4_Declare * SpillManagerGRF::createAndInitMHeader(G4_RegVar * regVar)
1816 {
1817 G4_Declare * mRangeDcl = createMRangeDeclare(regVar);
1818 return initMHeader (mRangeDcl);
1819 }
1820
1821 // Initialize the message header for the send instruction.
1822 // The header includes the input payload (for spill disp).
initMHeader(G4_Declare * mRangeDcl)1823 G4_Declare * SpillManagerGRF::initMHeader(G4_Declare * mRangeDcl)
1824 {
1825 // Initialize the message header with the input payload.
1826 if ((useScratchMsg_ && mRangeDcl == builder_->getBuiltinR0()) || !headerNeeded())
1827 {
1828 // mRangeDcl is NULL for fills
1829 return mRangeDcl;
1830 }
1831
1832 G4_DstRegRegion * mHeaderInputDstRegion =
1833 createMHeaderInputDstRegion(mRangeDcl->getRegVar());
1834 G4_SrcRegRegion * inputPayload = createInputPayloadSrcRegion();
1835 createMovInst(G4_ExecSize(REG_DWORD_SIZE), mHeaderInputDstRegion, inputPayload);
1836 numGRFMove ++;
1837
1838 return mRangeDcl;
1839 }
1840
1841 // Initialize the the write payload part of the message for spilled regvars.
1842 // Either of the following restrictions for spillRangeDcl are assumed:
1843 // - the regvar element type is dword and its 2 <= width <= 8 and
1844 // height - regOff == 1
1845 // - the regvar element type is dword and its width = 8 and
1846 // 2 <= height - regOff <= 8
1847 // - the regvar element type is dword and its width and height are 1
initMWritePayload(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height)1848 void SpillManagerGRF::initMWritePayload(
1849 G4_Declare * spillRangeDcl,
1850 G4_Declare * mRangeDcl,
1851 unsigned regOff,
1852 unsigned height)
1853 {
1854 if (useSplitSend())
1855 {
1856 // no need for payload moves if using sends
1857 return;
1858 }
1859
1860 // We use an block write when the spilled regvars's segment is greater
1861 // than a dword. Generate a mov to copy the oword aligned segment into
1862 // the write payload part of the message.
1863 {
1864 unsigned nRows = height;
1865
1866 for (unsigned i = 0; i < nRows; i++) {
1867 G4_SrcRegRegion * spillRangeSrcRegion =
1868 createBlockSpillRangeSrcRegion(
1869 spillRangeDcl->getRegVar(), i + regOff);
1870 G4_DstRegRegion * mPayloadWriteDstRegion =
1871 createMPayloadBlockWriteDstRegion (
1872 mRangeDcl->getRegVar(), i);
1873 G4_ExecSize movExecSize =
1874 G4_ExecSize((nRows > 1) ? REG_DWORD_SIZE : spillRangeDcl->getNumElems());
1875 createMovInst(
1876 movExecSize, mPayloadWriteDstRegion, spillRangeSrcRegion);
1877 numGRFMove ++;
1878 }
1879 }
1880 }
1881
1882 // Initialize the the write payload part of the message for spilled regions.
initMWritePayload(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize,unsigned regOff)1883 void SpillManagerGRF::initMWritePayload(
1884 G4_Declare * spillRangeDcl,
1885 G4_Declare * mRangeDcl,
1886 G4_DstRegRegion * spilledRangeRegion,
1887 G4_ExecSize execSize,
1888 unsigned regOff)
1889 {
1890 // We use an block write when the spilled region's segment is greater
1891 // than a dword. Generate a mov to copy the oword aligned segment into
1892 // the write payload part of the message.
1893 if (useSplitSend())
1894 {
1895 // no need for payload moves
1896 return;
1897 }
1898 else
1899 {
1900 G4_SrcRegRegion * spillRangeSrcRegion =
1901 createBlockSpillRangeSrcRegion(
1902 spillRangeDcl->getRegVar(), regOff);
1903 G4_DstRegRegion * mPayloadWriteDstRegion =
1904 createMPayloadBlockWriteDstRegion (mRangeDcl->getRegVar());
1905 unsigned segmentByteSize =
1906 getSegmentByteSize(spilledRangeRegion, execSize);
1907 G4_ExecSize movExecSize {segmentByteSize / DWORD_BYTE_SIZE};
1908
1909 // Write entire GRF when using scratch msg descriptor
1910 if (useScratchMsg_)
1911 {
1912 if (movExecSize <= 8)
1913 movExecSize = g4::SIMD8;
1914 else if (movExecSize < g4::SIMD16)
1915 movExecSize = g4::SIMD16;
1916 }
1917
1918 assert(segmentByteSize % DWORD_BYTE_SIZE == 0);
1919 assert(movExecSize <= g4::SIMD16);
1920 createMovInst(
1921 movExecSize, mPayloadWriteDstRegion, spillRangeSrcRegion);
1922 numGRFMove ++;
1923 }
1924 }
1925
1926 // Return the block size encoding for oword block reads.
blockSendBlockSizeCode(unsigned size)1927 unsigned SpillManagerGRF::blockSendBlockSizeCode(unsigned size)
1928 {
1929 auto code = GlobalRA::sendBlockSizeCode(size);
1930 return code << getSendDescDataSizeBitOffset();
1931 }
1932
1933 // Return the block size encoding for dword scatter reads.
scatterSendBlockSizeCode(unsigned size) const1934 unsigned SpillManagerGRF::scatterSendBlockSizeCode(unsigned size) const
1935 {
1936 unsigned code;
1937
1938 switch (size) {
1939 case 1:
1940 // We will use an exec size of 1 to perform 1 dword read/write.
1941 case 8:
1942 code = 0x02;
1943 break;
1944 case 16:
1945 code = 0x03;
1946 break;
1947 default:
1948 assert(0);
1949 code = 0;
1950 }
1951
1952 return code << getSendDescDataSizeBitOffset();
1953 }
1954
getScratchBlocksizeEncoding(int numGRF)1955 static uint32_t getScratchBlocksizeEncoding(int numGRF)
1956 {
1957
1958 int size = (numGRF * getGRFSize()) / 32; // in HWwords
1959 unsigned blocksize_encoding = 0;
1960 if (size == 1)
1961 {
1962 blocksize_encoding = 0x0;
1963 }
1964 else if (size == 2)
1965 {
1966 blocksize_encoding = 0x1;
1967 }
1968 else if (size == 4)
1969 {
1970 blocksize_encoding = 0x2;
1971 }
1972 else if (size == 8)
1973 {
1974 assert(getGenxPlatform() >= GENX_SKL);
1975 blocksize_encoding = 0x3;
1976 }
1977 else
1978 assert(false);
1979 return blocksize_encoding;
1980 }
1981
1982 std::tuple<uint32_t, G4_ExecSize>
createSpillSendMsgDescOWord(unsigned int height)1983 SpillManagerGRF::createSpillSendMsgDescOWord(unsigned int height)
1984 {
1985 unsigned segmentByteSize = height * REG_BYTE_SIZE;
1986 unsigned writePayloadCount = cdiv(segmentByteSize, REG_BYTE_SIZE);
1987 unsigned statelessSurfaceIndex = 0xFF;
1988 unsigned int message = statelessSurfaceIndex;
1989
1990 unsigned headerPresent = 0x80000;
1991 message |= headerPresent;
1992 unsigned messageType = getSendOwordWriteType();
1993 message |= messageType << getSendWriteTypeBitOffset();
1994 unsigned payloadHeaderCount = OWORD_PAYLOAD_HEADER_MAX_HEIGHT;
1995 // split send not used since msg type is oword
1996 unsigned messageLength = writePayloadCount + payloadHeaderCount;
1997 message |= messageLength << getSendMsgLengthBitOffset();
1998 unsigned segmentOwordSize = cdiv(segmentByteSize, OWORD_BYTE_SIZE);
1999 message |= blockSendBlockSizeCode(segmentOwordSize);
2000 auto execSize = G4_ExecSize(LIMIT_SEND_EXEC_SIZE(segmentOwordSize * DWORD_BYTE_SIZE));
2001
2002 return std::make_tuple(message, execSize);
2003 }
2004
2005 // Create the message descriptor for a spill send instruction for spilled
2006 // post destinations of send instructions.
createSpillSendMsgDesc(unsigned regOff,unsigned height,G4_ExecSize & execSize,G4_RegVar * base)2007 G4_Imm * SpillManagerGRF::createSpillSendMsgDesc(
2008 unsigned regOff,
2009 unsigned height,
2010 G4_ExecSize & execSize,
2011 G4_RegVar* base)
2012 {
2013 unsigned message = 0;
2014
2015 if (useScratchMsg_)
2016 {
2017 unsigned headerPresent = 0x80000;
2018 message = headerPresent;
2019 unsigned msgLength = useSplitSend() ? SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT : SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT + height;
2020 message |= (msgLength << getSendMsgLengthBitOffset());
2021 message |= (1 << SCRATCH_MSG_DESC_CATEORY);
2022 message |= (1 << SCRATCH_MSG_DESC_CHANNEL_MODE);
2023 message |= (1 << SCRATCH_MSG_DESC_OPERATION_MODE);
2024 unsigned blocksize_encoding = getScratchBlocksizeEncoding(height);
2025 message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2026 int offset = getDisp(base);
2027 getSpillOffset(offset);
2028 // message expects offsets to be in HWord
2029 message |= (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2030 execSize = g4::SIMD16;
2031 }
2032 else
2033 {
2034 auto [message, retSize] = createSpillSendMsgDescOWord(height);
2035 execSize = retSize;
2036 }
2037 return builder_->createImm (message, Type_UD);
2038 }
2039
2040 // Create the message descriptor for a spill send instruction for spilled
2041 // destination regions.
2042 std::tuple<G4_Imm*, G4_ExecSize>
createSpillSendMsgDesc(G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize)2043 SpillManagerGRF::createSpillSendMsgDesc(
2044 G4_DstRegRegion * spilledRangeRegion,
2045 G4_ExecSize execSize)
2046 {
2047 unsigned message = 0;
2048
2049 if (useScratchMsg_)
2050 {
2051 /*
2052 bits description
2053 18:0 function control
2054 19 Header present
2055 24:20 Response length
2056 28:25 Message length
2057 31:29 MBZ
2058
2059 18:0
2060 11:0 Offset (12b hword offset)
2061 13:12 Block size (00 - 1 register, 01 - 2 regs, 10 - reserved, 11 - 4 regs)
2062 14 MBZ
2063 15 Invalidate after read (0 - no invalidate, 1 - invalidate)
2064 16 Channel mode (0 - oword, 1 - dword)
2065 17 Operation type (0 - read, 1 - write)
2066 18 Category (1 - scratch block read/write)
2067 */
2068 unsigned segmentByteSize = getSegmentByteSize(spilledRangeRegion, execSize);
2069 unsigned writePayloadCount = cdiv(segmentByteSize, REG_BYTE_SIZE);
2070 unsigned headerPresent = 0x80000;
2071 message |= headerPresent;
2072
2073 unsigned payloadHeaderCount = SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT;
2074 // message length = 1 if we are using sends, 1 + payload otherwise
2075 unsigned messageLength = useSplitSend() ? payloadHeaderCount :
2076 writePayloadCount + payloadHeaderCount;
2077 message |= (messageLength << getSendMsgLengthBitOffset());
2078 message |= (1 << SCRATCH_MSG_DESC_CATEORY); // category
2079 message |= (1 << SCRATCH_MSG_DESC_CHANNEL_MODE); // channel mode
2080 message |= (1 << SCRATCH_MSG_DESC_OPERATION_MODE); // write operation
2081 unsigned numGRFs = cdiv(segmentByteSize, numEltPerGRF<Type_UB>());
2082
2083 unsigned blocksize_encoding = getScratchBlocksizeEncoding(numGRFs);
2084
2085 message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2086 int offset = getRegionDisp(spilledRangeRegion);
2087 getSpillOffset(offset);
2088 message |= offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2089 if (numGRFs > 1)
2090 {
2091 execSize = g4::SIMD16;
2092 }
2093 else
2094 {
2095 if (execSize > g4::SIMD8)
2096 {
2097 execSize = g4::SIMD16;
2098 }
2099 else
2100 {
2101 execSize = g4::SIMD8;
2102 }
2103 }
2104 }
2105 else
2106 {
2107 unsigned segmentByteSize =
2108 getSegmentByteSize(spilledRangeRegion, execSize);
2109 unsigned writePayloadCount = cdiv(segmentByteSize, REG_BYTE_SIZE);
2110 unsigned statelessSurfaceIndex = 0xFF;
2111 message = statelessSurfaceIndex;
2112
2113 unsigned headerPresent = 0x80000;
2114 message |= headerPresent;
2115 unsigned messageType = getSendOwordWriteType();
2116 message |= messageType << getSendWriteTypeBitOffset();
2117 unsigned payloadHeaderCount = OWORD_PAYLOAD_HEADER_MAX_HEIGHT;
2118 unsigned messageLength = useSplitSend() ? payloadHeaderCount : writePayloadCount + payloadHeaderCount;
2119 message |= messageLength << getSendMsgLengthBitOffset();
2120 unsigned segmentOwordSize = cdiv(segmentByteSize, OWORD_BYTE_SIZE);
2121 message |= blockSendBlockSizeCode (segmentOwordSize);
2122 execSize = G4_ExecSize(LIMIT_SEND_EXEC_SIZE(segmentOwordSize * DWORD_BYTE_SIZE));
2123 }
2124 return std::make_tuple(builder_->createImm (message, Type_UD), execSize);
2125 }
2126
2127 // Create an add instruction to add the FP needed for generating spill/fill code.
2128 // We always set the NoMask flag and use a null conditional modifier.
createAddFPInst(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src)2129 G4_INST * SpillManagerGRF::createAddFPInst(
2130 G4_ExecSize execSize,
2131 G4_DstRegRegion * dst,
2132 G4_Operand * src)
2133 {
2134 const RegionDesc* rDesc = builder_->getRegionScalar();
2135 G4_Operand* fp = builder_->createSrc(builder_->kernel.fg.framePtrDcl->getRegVar(),
2136 0, 0, rDesc, Type_UD);
2137 auto newInst = builder_->createBinOp(G4_add, execSize, dst, fp, src, InstOpt_WriteEnable, true);
2138 newInst->inheritDIFrom(curInst);
2139
2140 return newInst;
2141
2142 }
2143
2144 // Create a mov instruction needed for generating spill/fill code.
2145 // We always set the NoMask flag and use a null conditional modifier.
createMovInst(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src,G4_Predicate * predicate,unsigned int options)2146 G4_INST * SpillManagerGRF::createMovInst(
2147 G4_ExecSize execSize,
2148 G4_DstRegRegion * dst,
2149 G4_Operand * src,
2150 G4_Predicate * predicate,
2151 unsigned int options)
2152 {
2153 auto newInst = builder_->createMov(execSize, dst, src, options, true);
2154
2155 if (predicate)
2156 {
2157 newInst->setPredicate(predicate);
2158 }
2159
2160 return newInst;
2161 }
2162
2163 // Create a send instruction needed for generating spill/fill code.
2164 // We always set the NoMask flag and use a null predicate and conditional
2165 // modifier.
createSendInst(G4_ExecSize execSize,G4_DstRegRegion * postDst,G4_SrcRegRegion * payload,G4_Imm * desc,SFID funcID,bool isWrite,unsigned option)2166 G4_INST * SpillManagerGRF::createSendInst(
2167 G4_ExecSize execSize,
2168 G4_DstRegRegion * postDst,
2169 G4_SrcRegRegion * payload,
2170 G4_Imm * desc,
2171 SFID funcID,
2172 bool isWrite,
2173 unsigned option)
2174 {
2175 // ToDo: create exDesc in createSendMsgDesc()
2176 uint32_t exDesc = G4_SendDescRaw::createExtDesc(funcID);
2177 auto msgDesc = builder_->createSendMsgDesc(funcID, (uint32_t)desc->getInt(), exDesc, 0,
2178 isWrite ? SendAccess::WRITE_ONLY : SendAccess::READ_ONLY, nullptr);
2179 auto sendInst = builder_->createSendInst(
2180 NULL, G4_send, execSize, postDst,
2181 payload, desc, option, msgDesc, true);
2182 sendInst->inheritDIFrom(curInst);
2183
2184 return sendInst;
2185 }
2186
2187 // Create the send instructions to fill in the value of spillRangeDcl into
2188 // fillRangeDcl in aligned portions.
getNextSize(int height,bool useHWordMsg)2189 static int getNextSize(int height, bool useHWordMsg)
2190 {
2191 bool has8GRFMessage = useHWordMsg && getGenxPlatform() >= GENX_SKL &&
2192 getGRFSize() == 32;
2193 if (has8GRFMessage && height >= 8)
2194 {
2195 return 8;
2196 }
2197 else if (height >= 4)
2198 {
2199 return 4;
2200 }
2201 else if (height >= 2)
2202 {
2203 return 2;
2204 }
2205 else if (height >= 1)
2206 {
2207 return 1;
2208 }
2209 return 0;
2210 }
2211
2212 void
sendInSpilledRegVarPortions(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned srcRegOff)2213 SpillManagerGRF::sendInSpilledRegVarPortions(
2214 G4_Declare * fillRangeDcl,
2215 G4_Declare * mRangeDcl,
2216 unsigned regOff,
2217 unsigned height,
2218 unsigned srcRegOff)
2219 {
2220 //if (!headerNeeded())
2221 if ((useScratchMsg_ && mRangeDcl == builder_->getBuiltinR0()) || !headerNeeded())
2222 {
2223 // Skip initializing message header
2224 }
2225 else
2226 {
2227 // Initialize the message header with the spill disp for portion.
2228 int offset = getDisp(fillRangeDcl->getRegVar()) + regOff * REG_BYTE_SIZE;
2229 getSpillOffset(offset);
2230
2231 unsigned segmentDisp = offset / OWORD_BYTE_SIZE;
2232 G4_Imm * segmentDispImm = builder_->createImm(segmentDisp, Type_UD);
2233 G4_DstRegRegion * mHeaderOffsetDstRegion =
2234 createMHeaderBlockOffsetDstRegion(mRangeDcl->getRegVar());
2235
2236 if (builder_->getIsKernel() == false)
2237 {
2238 createAddFPInst(
2239 g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2240 }
2241 else
2242 {
2243 createMovInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2244 }
2245 numGRFMove ++;
2246 }
2247
2248 // Read in the portions using a greedy approach.
2249 int currentStride = getNextSize(height, useScratchMsg_);
2250
2251 if (currentStride)
2252 {
2253 if (useLSCMsg)
2254 {
2255 createLSCFill(fillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2256 }
2257 else
2258 {
2259 createFillSendInstr(fillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2260 }
2261
2262 if (height - currentStride > 0)
2263 {
2264 sendInSpilledRegVarPortions(
2265 fillRangeDcl, mRangeDcl, regOff + currentStride,
2266 height - currentStride, srcRegOff + currentStride);
2267 }
2268 }
2269 }
2270
2271 // Check if we need to perform the pre-load of the spilled region's
2272 // segment from spill memory. A pre-load is required under the following
2273 // circumstances:
2274 // - for partial writes - horizontal stride greater than one, and when
2275 // the emask and predicate can possibly disable channels (for now if
2276 // predicates or condition modofoers are present then we conservatively
2277 // assume a partial write)
2278 // - write's where the segment is larger than the actaully written region
2279 // (either because the spill offset for the region or its size is not
2280 // oword or dword aligned for writing the exact region)
shouldPreloadSpillRange(G4_INST * instContext,G4_BB * parentBB)2281 bool SpillManagerGRF::shouldPreloadSpillRange(
2282 G4_INST* instContext, G4_BB* parentBB)
2283 {
2284 // Check for partial and unaligned regions and add pre-load code, if
2285 // necessary.
2286 auto spilledRangeRegion = instContext->getDst();
2287 G4_ExecSize execSize = instContext->getExecSize();
2288
2289 if (isPartialRegion(spilledRangeRegion, execSize) ||
2290 isUnalignedRegion(spilledRangeRegion, execSize) ||
2291 instContext->isPartialWriteForSpill(!parentBB->isAllLaneActive()))
2292 {
2293 // special check for scalar variables: no need for pre-fill if instruction writes to whole variable and is not predicated
2294 auto spilledDcl = spilledRangeRegion->getTopDcl()->getRootDeclare();
2295 if (execSize == g4::SIMD1 && spilledRangeRegion->getTypeSize() == spilledDcl->getByteSize() && !instContext->getPredicate())
2296 {
2297 //ToDo: investigate why we are spilling so many scalar variables
2298 return false;
2299 }
2300 return true;
2301 }
2302 // No pre-load for whole and aligned region writes
2303 else
2304 {
2305 return false;
2306 }
2307 }
2308
2309 // Create the send instruction to perform the pre-load of the spilled region's
2310 // segment into spill memory.
preloadSpillRange(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize)2311 void SpillManagerGRF::preloadSpillRange(
2312 G4_Declare * spillRangeDcl,
2313 G4_Declare * mRangeDcl,
2314 G4_DstRegRegion * spilledRangeRegion,
2315 G4_ExecSize execSize)
2316 {
2317 // When execSize is 32, regions <32, 32, 1> or <64; 32, 2> are invalid.
2318 // Use a uniform region descriptor <stride; 1, 0>. Note that stride could
2319 // be 0 when execsize is 1.
2320 uint16_t hstride = spilledRangeRegion->getHorzStride();
2321 const RegionDesc *rDesc = builder_->createRegionDesc(execSize, hstride, 1, 0);
2322
2323 G4_SrcRegRegion* preloadRegion = builder_->createSrc(spillRangeDcl->getRegVar(),
2324 REG_ORIGIN, spilledRangeRegion->getSubRegOff(),
2325 rDesc, spilledRangeRegion->getType());
2326
2327 if (useScratchMsg_)
2328 {
2329 // src region's base refers to the filled region's base
2330 // The size of src region is equal to number of rows that
2331 // have to be filled, starting at the reg offset specified
2332 // in the original operand. For eg,
2333 // Let the spilled operand be V40(3,3)
2334 //
2335 // => mov (1) V40(3,3)<1>:ud V30(0,0)<0;1,0>:ud
2336 // When this will be replaced with a preload fill,
2337 // => mov (1) TM_GRF_V40_0(0,0)<1>:ud V30(0,0)<0;1,0>:ud
2338 // => send (16) SP_V40_0(0,0)<1>:ud ... <--- load V40's 3rd row in SP_V40_0
2339 // => mov (1) SP_V40_0(0,3)<1>:ud TM_GRF_V40_0(0,0)<8;8,1>:ud <--- overlay
2340 // => send (16) null ... <--- store V40's updated 3rd row to memory
2341 //
2342 // Since the filled register's register offset is 0,0 in first
2343 // send instruction, this change is made when creating the operand
2344 // itself.
2345
2346 // Attach preloadRegion to dummy mov so getLeftBound/getRightBound won't crash when called from crossGRF in createFillSendMsgDesc
2347 builder_->createMov(execSize, builder_->createNullDst(Type_UD), preloadRegion, InstOpt_NoOpt, false);
2348 }
2349
2350 if (useLSCMsg)
2351 {
2352 createLSCFill(spillRangeDcl, mRangeDcl, preloadRegion, execSize);
2353 }
2354 else
2355 {
2356 createFillSendInstr(spillRangeDcl, mRangeDcl, preloadRegion, execSize);
2357 }
2358
2359 }
2360
getSpillFillHeader(IR_Builder & builder,G4_Declare * decl)2361 G4_SrcRegRegion* vISA::getSpillFillHeader(IR_Builder& builder, G4_Declare * decl)
2362 {
2363 if (builder.supportsLSC())
2364 {
2365 // LSC in its current incarnation needs a header to store the address
2366 return builder.createSrcRegRegion(builder.getSpillFillHeader(), builder.getRegionStride1());
2367 }
2368 return builder.createSrcRegRegion(decl, builder.getRegionStride1());
2369 }
2370
2371 // Create the send instruction to perform the spill of the spilled regvars's
2372 // segment into spill memory.
2373 // regOff - Offset of sub-spill. If one spill is split into more than one spill,
2374 // this is the offset of them, unit in register size
2375 // spillOff - Offset of the original variable being spilled, unit in register size.
createSpillSendInstr(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2376 G4_INST *SpillManagerGRF::createSpillSendInstr(
2377 G4_Declare * spillRangeDcl,
2378 G4_Declare * mRangeDcl,
2379 unsigned regOff,
2380 unsigned height,
2381 unsigned spillOff)
2382 {
2383 G4_ExecSize execSize (0);
2384
2385 G4_Imm * messageDescImm = NULL;
2386
2387 if (useScratchMsg_)
2388 {
2389 G4_RegVar* r = spillRangeDcl->getRegVar();
2390 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2391 messageDescImm =
2392 createSpillSendMsgDesc(spillOff, height, execSize, rvar->getBaseRegVar());
2393 #ifdef _DEBUG
2394 int offset = (messageDescImm->getInt() & 0xFFF) * numEltPerGRF<Type_UB>();
2395 MUST_BE_TRUE(offset >= globalScratchOffset, "incorrect offset");
2396 #endif
2397 }
2398 else
2399 {
2400 messageDescImm =
2401 createSpillSendMsgDesc(regOff, height, execSize);
2402 }
2403
2404 G4_DstRegRegion * postDst = builder_->createNullDst(execSize > g4::SIMD8 ? Type_UW : Type_UD);
2405
2406 G4_INST* sendInst = NULL;
2407 if (useSplitSend())
2408 {
2409 auto headerOpnd = getSpillFillHeader(*builder_, mRangeDcl);
2410 G4_SrcRegRegion* srcOpnd = createBlockSpillRangeSrcRegion(spillRangeDcl->getRegVar(), regOff);
2411
2412 auto off = G4_SpillIntrinsic::InvalidOffset;
2413 G4_Declare* fp = nullptr;
2414 if (useScratchMsg_)
2415 off = (messageDescImm->getInt() & 0xfff);
2416 else
2417 {
2418 if (builder_->usesStack())
2419 {
2420 G4_RegVar* r = spillRangeDcl->getRegVar();
2421 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2422 int offset = getDisp(rvar->getBaseRegVar());
2423 getSpillOffset(offset);
2424 // message expects offsets to be in HWord
2425 off = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2426 if (builder_->usesStack())
2427 fp = builder_->kernel.fg.getFramePtrDcl();
2428
2429 if (!fp && offset < SCRATCH_MSG_LIMIT)
2430 headerOpnd = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2431 }
2432 }
2433 sendInst = builder_->createSpill(postDst, headerOpnd, srcOpnd, execSize, height, off, fp, InstOpt_WriteEnable, true);
2434 sendInst->inheritDIFrom(curInst);
2435 }
2436 else
2437 {
2438 G4_SrcRegRegion * payload = builder_->createSrc(
2439 mRangeDcl->getRegVar(), 0, 0, builder_->getRegionStride1(), Type_UD);
2440 sendInst = createSendInst(execSize, postDst, payload, messageDescImm, SFID::DP_DC0, true, InstOpt_WriteEnable);
2441 }
2442
2443 return sendInst;
2444 }
2445
2446 // Create the send instruction to perform the spill of the spilled region's
2447 // segment into spill memory.
createSpillSendInstr(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize,unsigned option)2448 G4_INST *SpillManagerGRF::createSpillSendInstr(
2449 G4_Declare * spillRangeDcl,
2450 G4_Declare * mRangeDcl,
2451 G4_DstRegRegion * spilledRangeRegion,
2452 G4_ExecSize execSize,
2453 unsigned option)
2454 {
2455
2456 G4_DstRegRegion * postDst =
2457 builder_->createNullDst(execSize > g4::SIMD8 ? Type_UW : Type_UD);
2458
2459 G4_INST* sendInst = NULL;
2460 if (useSplitSend())
2461 {
2462 unsigned extMsgLength = spillRangeDcl->getNumRows();
2463 const RegionDesc* region = builder_->getRegionStride1();
2464 auto headerOpnd = getSpillFillHeader(*builder_, mRangeDcl);
2465 G4_SrcRegRegion* srcOpnd = builder_->createSrcRegRegion(spillRangeDcl, region);
2466
2467 auto off = G4_SpillIntrinsic::InvalidOffset;
2468 G4_Declare* fp = nullptr;
2469 auto spillExecSize = execSize;
2470 if (useScratchMsg_)
2471 {
2472 auto [messageDescImm, retSize] =
2473 createSpillSendMsgDesc(spilledRangeRegion, execSize);
2474 spillExecSize = retSize;
2475 off = (messageDescImm->getInt() & 0xfff);
2476 }
2477 else
2478 {
2479 if (builder_->usesStack())
2480 {
2481 G4_RegVar* r = spillRangeDcl->getRegVar();
2482 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2483 int offset = getDisp(rvar->getBaseRegVar());
2484 getSpillOffset(offset);
2485 // message expects offsets to be in HWord
2486 auto regOff = spilledRangeRegion->getRegOff();
2487 off = (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2488 if (builder_->usesStack())
2489 fp = builder_->kernel.fg.getFramePtrDcl();
2490
2491 if (!fp && offset < SCRATCH_MSG_LIMIT)
2492 headerOpnd = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2493 }
2494 }
2495 sendInst = builder_->createSpill(postDst, headerOpnd, srcOpnd, spillExecSize, (uint16_t)extMsgLength,
2496 off, fp, static_cast<G4_InstOption>(option), true);
2497 sendInst->inheritDIFrom(curInst);
2498 }
2499 else
2500 {
2501 auto [messageDescImm, spillExecSize] =
2502 createSpillSendMsgDesc(spilledRangeRegion, execSize);
2503 G4_SrcRegRegion * payload = builder_->createSrc(
2504 mRangeDcl->getRegVar(), 0, 0, builder_->getRegionStride1(), Type_UD);
2505 sendInst = createSendInst(spillExecSize, postDst, payload, messageDescImm, SFID::DP_DC0, true, static_cast<G4_InstOption>(option));
2506 }
2507
2508 return sendInst;
2509 }
2510
2511 // Create the message description for a fill send instruction for filled
2512 // regvars.
createFillSendMsgDesc(unsigned regOff,unsigned height,G4_ExecSize & execSize,G4_RegVar * base)2513 G4_Imm *SpillManagerGRF::createFillSendMsgDesc(
2514 unsigned regOff,
2515 unsigned height,
2516 G4_ExecSize & execSize,
2517 G4_RegVar * base)
2518 {
2519 unsigned message = 0;
2520
2521 if (useScratchMsg_)
2522 {
2523 unsigned segmentByteSize = height * REG_BYTE_SIZE;
2524 unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2525 message = responseLength << getSendRspLengthBitOffset();
2526 unsigned headerPresent = 0x80000;
2527 message |= SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT << getSendMsgLengthBitOffset();
2528 message |= headerPresent;
2529
2530 message |= (1 << SCRATCH_MSG_DESC_CATEORY);
2531 message |= (0 << SCRATCH_MSG_INVALIDATE_AFTER_READ);
2532 unsigned blocksize_encoding = getScratchBlocksizeEncoding(height);
2533
2534 message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2535
2536 int offset = getDisp(base);
2537 getSpillOffset(offset);
2538 // message expects offsets to be in HWord
2539 message |= (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2540
2541 execSize = g4::SIMD16;
2542 }
2543 else
2544 {
2545 unsigned segmentByteSize = height * REG_BYTE_SIZE;
2546 unsigned statelessSurfaceIndex = 0xFF;
2547 unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2548 responseLength = responseLength << getSendRspLengthBitOffset();
2549 message = statelessSurfaceIndex | responseLength;
2550
2551 unsigned headerPresent = 0x80000;
2552 message |= headerPresent;
2553 unsigned messageType = getSendOwordReadType();
2554 message |= messageType << getSendReadTypeBitOffset();
2555 unsigned messageLength = OWORD_PAYLOAD_HEADER_MIN_HEIGHT;
2556 message |= messageLength << getSendMsgLengthBitOffset();
2557 unsigned segmentOwordSize =
2558 cdiv(segmentByteSize, OWORD_BYTE_SIZE);
2559 message |= blockSendBlockSizeCode (segmentOwordSize);
2560 execSize = G4_ExecSize(LIMIT_SEND_EXEC_SIZE (segmentOwordSize * DWORD_BYTE_SIZE));
2561 }
2562 return builder_->createImm (message, Type_UD);
2563 }
2564
2565 // Create the message description for a fill send instruction for filled
2566 // source regions.
2567 template <class REGION_TYPE>
createFillSendMsgDesc(REGION_TYPE * filledRangeRegion,G4_ExecSize execSize)2568 G4_Imm *SpillManagerGRF::createFillSendMsgDesc(
2569 REGION_TYPE * filledRangeRegion,
2570 G4_ExecSize execSize)
2571 {
2572 unsigned message = 0;
2573
2574 if (useScratchMsg_)
2575 {
2576 unsigned segmentByteSize =
2577 getSegmentByteSize(filledRangeRegion, execSize);
2578 if (filledRangeRegion->crossGRF()) {
2579 segmentByteSize = 2 * REG_BYTE_SIZE;
2580 }
2581
2582 unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2583 message = responseLength << getSendRspLengthBitOffset();
2584
2585 unsigned headerPresent = 0x80000;
2586 message |= headerPresent;
2587
2588 message |= (SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT << getSendMsgLengthBitOffset());
2589 message |= (1 << SCRATCH_MSG_DESC_CATEORY);
2590 message |= (0 << SCRATCH_MSG_INVALIDATE_AFTER_READ);
2591 unsigned blocksize_encoding = getScratchBlocksizeEncoding(responseLength);
2592
2593 message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2594 int offset = getRegionDisp(filledRangeRegion);
2595 getSpillOffset(offset);
2596 message |= offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2597 }
2598 else
2599 {
2600 unsigned segmentByteSize =
2601 getSegmentByteSize(filledRangeRegion, execSize);
2602 unsigned statelessSurfaceIndex = 0xFF;
2603 unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2604 responseLength = responseLength << getSendRspLengthBitOffset();
2605 message = statelessSurfaceIndex | responseLength;
2606
2607 unsigned headerPresent = 0x80000;
2608 message |= headerPresent;
2609 unsigned messageType = getSendOwordReadType();
2610 message |= messageType << getSendReadTypeBitOffset();
2611 unsigned messageLength = OWORD_PAYLOAD_HEADER_MIN_HEIGHT;
2612 message |= messageLength << getSendMsgLengthBitOffset();
2613 unsigned segmentOwordSize =
2614 cdiv(segmentByteSize, OWORD_BYTE_SIZE);
2615 message |= blockSendBlockSizeCode (segmentOwordSize);
2616 }
2617 return builder_->createImm(message, Type_UD);
2618 }
2619
2620 // Create the send instruction to perform the fill of the spilled regvars's
2621 // segment from spill memory.
2622 // spillOff - spill offset to the fillRangeDcl, in unit of grf size
createFillSendInstr(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2623 G4_INST * SpillManagerGRF::createFillSendInstr (
2624 G4_Declare * fillRangeDcl,
2625 G4_Declare * mRangeDcl,
2626 unsigned regOff,
2627 unsigned height,
2628 unsigned spillOff)
2629 {
2630 G4_ExecSize execSize {0};
2631
2632 G4_Imm * messageDescImm = NULL;
2633
2634 if (useScratchMsg_)
2635 {
2636 G4_RegVar* r = fillRangeDcl->getRegVar();
2637 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2638 messageDescImm =
2639 createFillSendMsgDesc (spillOff, height, execSize, rvar->getBaseRegVar());
2640 #ifdef _DEBUG
2641 int offset = (messageDescImm->getInt() & 0xFFF) * numEltPerGRF<Type_UB>();
2642 MUST_BE_TRUE(offset >= globalScratchOffset, "incorrect offset");
2643 #endif
2644 }
2645 else
2646 {
2647 messageDescImm =
2648 createFillSendMsgDesc (regOff, height, execSize);
2649 }
2650
2651 G4_DstRegRegion * postDst = builder_->createDst(
2652 fillRangeDcl->getRegVar(), (short) regOff, SUBREG_ORIGIN,
2653 DEF_HORIZ_STRIDE, (execSize > 8)? Type_UW: Type_UD);
2654
2655 auto payload = getSpillFillHeader(*builder_, mRangeDcl);
2656
2657 unsigned int off = G4_FillIntrinsic::InvalidOffset;
2658 G4_Declare* fp = nullptr;
2659 if (useScratchMsg_)
2660 off = (messageDescImm->getInt() & 0xfff);
2661 else
2662 {
2663 if (builder_->usesStack())
2664 {
2665 // compute hword offset to emit later when expanding spill/fill intrinsic
2666 G4_RegVar* r = fillRangeDcl->getRegVar();
2667 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2668 int offset = getDisp(rvar->getBaseRegVar());
2669 getSpillOffset(offset);
2670 // message expects offsets to be in HWord
2671 off = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2672 if (builder_->usesStack())
2673 fp = builder_->kernel.fg.getFramePtrDcl();
2674
2675 if (!fp && offset < SCRATCH_MSG_LIMIT)
2676 payload = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2677 }
2678 }
2679 auto fillInst = builder_->createFill(payload, postDst, execSize, height, off, fp, InstOpt_WriteEnable, true);
2680 fillInst->inheritDIFrom(curInst);
2681 return fillInst;
2682
2683 }
2684
2685 // Create the send instruction to perform the fill of the filled region's
2686 // segment into fill memory.
createFillSendInstr(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,G4_SrcRegRegion * filledRangeRegion,G4_ExecSize execSize)2687 G4_INST * SpillManagerGRF::createFillSendInstr(
2688 G4_Declare * fillRangeDcl,
2689 G4_Declare * mRangeDcl,
2690 G4_SrcRegRegion * filledRangeRegion,
2691 G4_ExecSize execSize)
2692 {
2693 auto oldExecSize = execSize;
2694
2695 if (useScratchMsg_)
2696 {
2697 execSize = g4::SIMD16;
2698 }
2699
2700 G4_DstRegRegion * postDst = builder_->createDst(
2701 fillRangeDcl->getRegVar(), 0, SUBREG_ORIGIN,
2702 DEF_HORIZ_STRIDE, (execSize > 8)? Type_UW : Type_UD);
2703
2704 auto payload = getSpillFillHeader(*builder_, mRangeDcl);
2705
2706 unsigned int off = G4_FillIntrinsic::InvalidOffset;
2707 unsigned segmentByteSize = getSegmentByteSize(filledRangeRegion, oldExecSize);
2708 G4_Declare* fp = nullptr;
2709 if (useScratchMsg_)
2710 {
2711 G4_Imm* messageDescImm =
2712 createFillSendMsgDesc(filledRangeRegion, oldExecSize);
2713
2714 off = (messageDescImm->getInt() & 0xfff);
2715 if (filledRangeRegion->crossGRF())
2716 {
2717 segmentByteSize = 2 * REG_BYTE_SIZE;
2718 }
2719 }
2720 else
2721 {
2722 if (builder_->usesStack())
2723 {
2724 // compute hword offset to emit later when expanding spill/fill intrinsic
2725 int offset = getRegionDisp(filledRangeRegion);
2726 getSpillOffset(offset);
2727 off = offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2728 if (builder_->usesStack())
2729 fp = builder_->kernel.fg.getFramePtrDcl();
2730
2731 if (!fp && offset < SCRATCH_MSG_LIMIT)
2732 payload = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2733 }
2734 }
2735
2736 unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2737 auto fillInst = builder_->createFill(payload, postDst, execSize, responseLength, off, fp, InstOpt_WriteEnable, true);
2738 fillInst->inheritDIFrom(curInst);
2739 return fillInst;
2740 }
2741
2742 // LSC versions of spill/fill, useLSCMsg must be true for these functions
2743
getLSCSpillFillHeader(G4_Declare * mRangeDcl,const G4_Declare * fp,int offset)2744 G4_SrcRegRegion *SpillManagerGRF::getLSCSpillFillHeader(
2745 G4_Declare* mRangeDcl, const G4_Declare *fp, int offset)
2746 {
2747 G4_SrcRegRegion* headerOpnd = nullptr;
2748 if (!fp && offset < SCRATCH_MSG_LIMIT && !useLscNonstackCall) {
2749 // using LSC because we exceed 128k of DC0 message
2750 headerOpnd = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2751 }
2752 else
2753 {
2754 headerOpnd = getSpillFillHeader(*builder_, mRangeDcl);
2755 }
2756 return headerOpnd;
2757 }
2758
2759 // Create the send instruction to perform the spill of the spilled regvars's
2760 // segment into spill memory.
2761 //
2762 // regOff - Offset of sub-spill. If one spill is splitted into more than one spill,
2763 // this is the offset of them, unit in register size
2764 // spillOff - Offset of the original variable being spilled, unit in register size.
createLSCSpill(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2765 G4_INST * SpillManagerGRF::createLSCSpill(
2766 G4_Declare* spillRangeDcl,
2767 G4_Declare* mRangeDcl,
2768 unsigned regOff,
2769 unsigned height,
2770 unsigned spillOff)
2771 {
2772 G4_ExecSize execSize(16);
2773
2774 G4_DstRegRegion* postDst = builder_->createNullDst(Type_UD);
2775
2776 G4_SrcRegRegion* srcOpnd = createBlockSpillRangeSrcRegion(spillRangeDcl->getRegVar(), regOff);
2777 G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2778
2779 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*>(spillRangeDcl->getRegVar());
2780 int offset = getDisp(rvar->getBaseRegVar());
2781 getSpillOffset(offset);
2782 // message expects offsets to be in HWord
2783 uint32_t offsetHwords = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2784
2785 G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2786 auto sendInst = builder_->createSpill(postDst, header, srcOpnd, execSize,
2787 height, offsetHwords, fp, InstOpt_WriteEnable, true);
2788 sendInst->inheritDIFrom(curInst);
2789
2790 return sendInst;
2791 }
2792
2793 // Create the send instruction to perform the spill of the spilled region's
2794 // segment into spill memory.
createLSCSpill(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize,unsigned option)2795 G4_INST * SpillManagerGRF::createLSCSpill(
2796 G4_Declare* spillRangeDcl,
2797 G4_Declare* mRangeDcl,
2798 G4_DstRegRegion* spilledRangeRegion,
2799 G4_ExecSize execSize,
2800 unsigned option)
2801 {
2802 G4_DstRegRegion* postDst = builder_->createNullDst(Type_UD);
2803
2804 unsigned extMsgLength = spillRangeDcl->getNumRows();
2805 const RegionDesc* region = builder_->getRegionStride1();
2806 G4_SrcRegRegion* srcOpnd = builder_->createSrcRegRegion(spillRangeDcl, region);
2807
2808 G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2809
2810 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*>(spillRangeDcl->getRegVar());
2811 int offset = getDisp(rvar->getBaseRegVar());
2812 getSpillOffset(offset);
2813 // message expects offsets to be in HWord
2814 auto regOff = spilledRangeRegion->getRegOff();
2815 uint32_t offsetHwords = (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2816
2817 G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2818 auto sendInst = builder_->createSpill(postDst, header, srcOpnd, execSize,
2819 (uint16_t)extMsgLength, offsetHwords, fp, static_cast<G4_InstOption>(option), true);
2820 sendInst->inheritDIFrom(curInst);
2821
2822 return sendInst;
2823 }
2824
2825 // Create the send instruction to perform the fill of the spilled regvars's
2826 // segment from spill memory.
2827 // spillOff - spill offset to the fillRangeDcl, in unit of grf size
createLSCFill(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2828 G4_INST * SpillManagerGRF::createLSCFill(
2829 G4_Declare * fillRangeDcl,
2830 G4_Declare * mRangeDcl,
2831 unsigned regOff,
2832 unsigned height,
2833 unsigned spillOff)
2834 {
2835 G4_DstRegRegion* postDst = builder_->createDst(
2836 fillRangeDcl->getRegVar(), (short)regOff, SUBREG_ORIGIN,
2837 DEF_HORIZ_STRIDE, Type_UD);
2838
2839 G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2840
2841 // compute hword offset to emit later when expanding spill/fill intrinsic
2842 G4_RegVar* r = fillRangeDcl->getRegVar();
2843 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2844 int offset = getDisp(rvar->getBaseRegVar());
2845 getSpillOffset(offset);
2846 // fill intrinsic expects offsets to be in HWord
2847 uint32_t offsetHwords = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2848
2849 G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2850 auto fillInst = builder_->createFill(header, postDst, g4::SIMD16, height,
2851 offsetHwords, fp, InstOpt_WriteEnable, true);
2852 fillInst->inheritDIFrom(curInst);
2853 return fillInst;
2854 }
2855
2856 // Create the send instruction to perform the fill of the filled region's
2857 // segment into fill memory.
createLSCFill(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,G4_SrcRegRegion * filledRangeRegion,G4_ExecSize execSize)2858 G4_INST * SpillManagerGRF::createLSCFill(
2859 G4_Declare * fillRangeDcl,
2860 G4_Declare * mRangeDcl,
2861 G4_SrcRegRegion * filledRangeRegion,
2862 G4_ExecSize execSize)
2863 {
2864 auto oldExecSize = execSize;
2865
2866 G4_DstRegRegion* postDst = builder_->createDst(
2867 fillRangeDcl->getRegVar(), 0, SUBREG_ORIGIN,
2868 DEF_HORIZ_STRIDE, Type_UD);
2869
2870 unsigned segmentByteSize = getSegmentByteSize(filledRangeRegion, oldExecSize);
2871 G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2872
2873 // compute hword offset to emit later when expanding spill/fill intrinsic
2874 int offset = getRegionDisp(filledRangeRegion);
2875 getSpillOffset(offset);
2876 uint32_t offsetHwords = offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2877
2878 unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2879 G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2880 auto fillInst = builder_->createFill(header, postDst, execSize,
2881 responseLength, offsetHwords, fp, InstOpt_WriteEnable, true);
2882 fillInst->inheritDIFrom(curInst);
2883 return fillInst;
2884 }
2885
2886 // Replace the reference to the spilled region with a reference to an
2887 // equivalent reference to the spill range region.
replaceSpilledRange(G4_Declare * spillRangeDcl,G4_DstRegRegion * spilledRegion,G4_INST * spilledInst,uint32_t subRegOff)2888 void SpillManagerGRF::replaceSpilledRange(
2889 G4_Declare* spillRangeDcl,
2890 G4_DstRegRegion* spilledRegion,
2891 G4_INST* spilledInst,
2892 uint32_t subRegOff)
2893 {
2894 // we need to preserve accRegSel if it's set
2895 G4_DstRegRegion * tmpRangeDstRegion = builder_->createDst(
2896 spillRangeDcl->getRegVar(), REG_ORIGIN, subRegOff,
2897 spilledRegion->getHorzStride(), spilledRegion->getType(), spilledRegion->getAccRegSel());
2898 spilledInst->setDest (tmpRangeDstRegion);
2899 }
2900
2901 // Replace the reference to the filled region with a reference to an
2902 // equivalent reference to the fill range region.
replaceFilledRange(G4_Declare * fillRangeDcl,G4_SrcRegRegion * filledRegion,G4_INST * filledInst)2903 void SpillManagerGRF::replaceFilledRange(
2904 G4_Declare * fillRangeDcl,
2905 G4_SrcRegRegion * filledRegion,
2906 G4_INST * filledInst)
2907 {
2908 G4_ExecSize execSize =
2909 isMultiRegComprSource(filledRegion, filledInst) ?
2910 G4_ExecSize(filledInst->getExecSize() / 2):
2911 filledInst->getExecSize();
2912
2913 for (int i = 0; i < G4_MAX_SRCS; i++) {
2914 G4_Operand * src = filledInst->getSrc(i);
2915
2916 if (src && src->isSrcRegRegion())
2917 {
2918 G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
2919 if (*srcRgn == *filledRegion)
2920 {
2921 G4_SrcRegRegion* fillRangeSrcRegion =
2922 createFillRangeSrcRegion(
2923 fillRangeDcl->getRegVar(), filledRegion, execSize);
2924 filledInst->setSrc(fillRangeSrcRegion, i);
2925 }
2926 }
2927 }
2928 }
2929
2930 // Create the send instructions to write out the spillRangeDcl in aligned
2931 // portions.
sendOutSpilledRegVarPortions(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned srcRegOff)2932 void SpillManagerGRF::sendOutSpilledRegVarPortions (
2933 G4_Declare * spillRangeDcl,
2934 G4_Declare * mRangeDcl,
2935 unsigned regOff,
2936 unsigned height,
2937 unsigned srcRegOff)
2938 {
2939 if (!headerNeeded())
2940 {
2941 // No need to make a copy of offset because when using
2942 // scratch msg descriptor, the offset is part of send
2943 // msg descriptor and not the header.
2944 }
2945 else
2946 {
2947 // Initialize the message header with the spill disp for portion.
2948 int offset = getDisp(spillRangeDcl->getRegVar()) + regOff * REG_BYTE_SIZE;
2949 getSpillOffset(offset);
2950 unsigned segmentDisp = offset / OWORD_BYTE_SIZE;
2951
2952 G4_Imm * segmentDispImm = builder_->createImm (segmentDisp, Type_UD);
2953 G4_DstRegRegion * mHeaderOffsetDstRegion =
2954 createMHeaderBlockOffsetDstRegion(mRangeDcl->getRegVar());
2955
2956 if (builder_->getIsKernel() == false)
2957 {
2958 createAddFPInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2959 }
2960 else
2961 {
2962 createMovInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2963 }
2964 numGRFMove ++;
2965 }
2966
2967
2968 // Write out the portions using a greedy approach.
2969 int currentStride = getNextSize(height, useScratchMsg_);
2970
2971 if (currentStride)
2972 {
2973 initMWritePayload(spillRangeDcl, mRangeDcl, regOff, currentStride);
2974
2975 if (useLSCMsg)
2976 {
2977 createLSCSpill(spillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2978 }
2979 else
2980 {
2981 createSpillSendInstr(spillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2982 }
2983
2984 if (height - currentStride > 0) {
2985 sendOutSpilledRegVarPortions(
2986 spillRangeDcl, mRangeDcl, regOff + currentStride, height - currentStride, srcRegOff + currentStride);
2987 }
2988 }
2989 }
2990
checkDefUseDomRel(G4_DstRegRegion * dst,G4_BB * defBB)2991 bool SpillManagerGRF::checkDefUseDomRel(G4_DstRegRegion* dst, G4_BB* defBB)
2992 {
2993 if (!refs.isUniqueDef(dst))
2994 return false;
2995
2996 auto dcl = dst->getTopDcl();
2997
2998 // check whether this def dominates all its uses
2999 auto uses = refs.getUses(dcl);
3000
3001 for (auto& use : *uses)
3002 {
3003 auto useBB = std::get<1>(use);
3004
3005 // check if def dominates use
3006 if (!defBB->dominates(useBB))
3007 return false;
3008
3009 if (defBB == useBB)
3010 {
3011 // defBB dominates useBB since its the same BB.
3012 // ensure def instruction appears lexically before use BB.
3013 auto useInst = std::get<0>(use);
3014 if (dst->getInst()->getLexicalId() > useInst->getLexicalId())
3015 return false;
3016 }
3017 }
3018
3019 // if def is in loop then ensure all uses are in same loop level
3020 // or inner loop nest of def's closest loop.
3021 auto defLoop = gra.kernel.fg.getLoops().getInnerMostLoop(defBB);
3022 if (defLoop)
3023 {
3024 // since def is in loop, check whether uses are also in same loop.
3025 for (auto& use : *uses)
3026 {
3027 auto useBB = std::get<1>(use);
3028 auto useLoop = gra.kernel.fg.getLoops().getInnerMostLoop(useBB);
3029 if (!useLoop)
3030 return false;
3031
3032 if (!useLoop->fullSubset(defLoop))
3033 return false;
3034 }
3035 }
3036
3037 return true;
3038 }
3039
checkUniqueDefAligned(G4_DstRegRegion * dst,G4_BB * defBB)3040 bool SpillManagerGRF::checkUniqueDefAligned(G4_DstRegRegion* dst, G4_BB* defBB)
3041 {
3042 // return true if dst is unique definition considering alignment
3043 // for spill code.
3044
3045 if (!refs.isUniqueDef(dst))
3046 return false;
3047
3048 // dst dcl may have multiple defs. As long as each def defines
3049 // different part of the variable, each def is marked as unique.
3050 // However, spill/fill is done on GRF granularity. So although
3051 // defs are unique in following sequence, we still need RMW for
3052 // 2nd def:
3053 //
3054 // .decl V361 type=w size=32
3055 //
3056 // add (M1, 8) V361(0,0)<1> V358(0,0)<1;1,0> 0x10:w
3057 // add (M1, 8) V361(0,8)<1> V358(0,0)<1;1,0> 0x18:w
3058 //
3059 // Return false if any other dominating def exists that defines
3060 // part of same row of variable dst.
3061 auto dcl = dst->getTopDcl();
3062
3063 auto defs = refs.getDefs(dcl);
3064 unsigned int GRFSize = numEltPerGRF<Type_UB>();
3065 unsigned int lb = dst->getLeftBound();
3066 unsigned int rb = dst->getRightBound();
3067 unsigned int startRow = lb / GRFSize;
3068 unsigned int endRow = rb / GRFSize;
3069
3070 for (auto& def : *defs)
3071 {
3072 // check whether dst and def write same row
3073 auto otherDefInst = std::get<0>(def);
3074
3075 if (otherDefInst == dst->getInst())
3076 continue;
3077
3078 auto otherDefDstRgn = otherDefInst->getDst();
3079 unsigned int otherLb = otherDefDstRgn->getLeftBound();
3080 unsigned int otherRb = otherDefDstRgn->getRightBound();
3081 unsigned int otherTypeSize = otherDefDstRgn->getTypeSize();
3082 bool commonRow = false;
3083 for (unsigned int i = otherLb; i <= otherRb; i += otherTypeSize)
3084 {
3085 auto rowWritten = i / GRFSize;
3086 if (rowWritten >= startRow && rowWritten <= endRow)
3087 {
3088 commonRow = true;
3089 break;
3090 }
3091 }
3092
3093 // No common row between defs, so it is safe to skip fill
3094 // wrt current def. Check with next def.
3095 if (!commonRow)
3096 continue;
3097
3098 auto otherDefBB = std::get<1>(def);
3099
3100 if (!defBB->dominates(otherDefBB))
3101 return false;
3102
3103 if (defBB == otherDefBB)
3104 {
3105 if (dst->getInst()->getLexicalId() > otherDefInst->getLexicalId())
3106 return false;
3107 }
3108 }
3109
3110 return true;
3111 }
3112
3113 // This function checks whether each spill dst region requires a read-modify-write operation
3114 // when inserting spill code. Dominator/unique defs dont require redundant read operation.
3115 // Dst regions that do not need RMW are added to a set. This functionality isnt needed for
3116 // functional correctness. This function is executed before inserting spill code because
3117 // we need all dst regions of dcl available to decide whether read is redundant. If this is
3118 // executed when inserting spill then dst regions of dcl appearing earlier than current one
3119 // would be translated to spill code already. Spill/fill code insertion replaces dst region
3120 // of spills with new temp region. This makes it difficult to check whether current dst and
3121 // an earlier spilled dst write to same GRF row.
updateRMWNeeded()3122 void SpillManagerGRF::updateRMWNeeded()
3123 {
3124 if (!gra.kernel.getOption(vISA_SkipRedundantFillInRMW))
3125 return;
3126
3127 auto isRMWNeededForSpilledDst = [&](G4_BB* bb, G4_DstRegRegion* spilledRegion)
3128 {
3129 auto isUniqueDef = checkUniqueDefAligned(spilledRegion, bb);
3130
3131 // Check0 : Def is NoMask, -- checked in isPartialWriteForSpill()
3132 // Check1 : Def is unique def,
3133 // Check2 : Def is in loop L and all use(s) of dcl are in loop L or it's inner loop nest,
3134 // Check3 : Flowgraph is reducible
3135 // RMW_Not_Needed = Check0 || (Check1 && Check2 && Check3)
3136 bool RMW_Needed = true;
3137
3138 if (isUniqueDef && builder_->kernel.fg.isReducible() && checkDefUseDomRel(spilledRegion, bb))
3139 {
3140 RMW_Needed = false;
3141 }
3142
3143 return RMW_Needed;
3144 };
3145
3146 // First pass to setup lexical ids of instruction so dominator relation can be
3147 // computed correctly intra-BB.
3148 unsigned int lexId = 0;
3149 for (auto bb : gra.kernel.fg.getBBList())
3150 {
3151 for (auto inst : bb->getInstList())
3152 {
3153 inst->setLexicalId(lexId++);
3154 }
3155 }
3156
3157 for (auto bb : gra.kernel.fg.getBBList())
3158 {
3159 for (auto inst : bb->getInstList())
3160 {
3161 if (inst->isPseudoKill())
3162 continue;
3163
3164 auto dst = inst->getDst();
3165 if (dst)
3166 {
3167 if (dst->getBase()->isRegVar())
3168 {
3169 auto dstRegVar = dst->getBase()->asRegVar();
3170 if (dstRegVar && shouldSpillRegister(dstRegVar))
3171 {
3172 if (getRFType(dstRegVar) == G4_GRF)
3173 {
3174 auto RMW_Needed = isRMWNeededForSpilledDst(bb, dst);
3175 if (!RMW_Needed)
3176 {
3177 // Any spilled dst region that doesnt need RMW
3178 // is added to noRMWNeeded set. This set is later
3179 // checked when inserting spill/fill code.
3180 noRMWNeeded.insert(dst);
3181 }
3182 }
3183 }
3184 }
3185 }
3186 }
3187 }
3188 }
3189
// Create the code to create the spill range and save it to spill memory.
//
// spilledInstIter - iterator to the instruction whose destination is spilled.
// bb              - basic block containing that instruction.
//
// Outline of what the body does:
//  1. Bails out early (erasing the instruction) for scalar immediate spills,
//     and for preheader copies removed by LoopVarSplit when
//     vISA_DoSplitOnSpill is set.
//  2. Builds the spill range and message header, optionally preloading the
//     spill range when a read-modify-write is required, and creates the
//     spill instruction (LSC or send flavour).
//  3. Rewrites the instruction's destination to the replacement range and
//     splices the instructions collected in builder_->instList into bb.
void SpillManagerGRF::insertSpillRangeCode(
    INST_LIST::iterator spilledInstIter, G4_BB* bb)
{
    G4_ExecSize execSize = (*spilledInstIter)->getExecSize();
    // Assigned on both arms of the mayExceedTwoGRF() branch below.
    G4_Declare * replacementRangeDcl;
    // Start from an empty list; it is spliced into bb near the end.
    builder_->instList.clear();

    bool optimizeSplitLLR = false;
    G4_INST* inst = *spilledInstIter;
    G4_INST* spillSendInst = NULL;
    auto spilledRegion = inst->getDst();

    auto spillDcl = spilledRegion->getTopDcl()->getRootDeclare();
    if (scalarImmSpill.find(spillDcl) != scalarImmSpill.end())
    {
        // do not spill scalar immediate values
        bb->erase(spilledInstIter);
        return;
    }

    if (builder_->getOption(vISA_DoSplitOnSpill))
    {
        // if spilled inst is copy of original variable to it's split variable
        // then simply remove the instruction.
        if (LoopVarSplit::removeFromPreheader(&gra, spillDcl, bb, spilledInstIter))
            return;
    }

    // True unless this dst region was recorded in noRMWNeeded, i.e. unless
    // the read part of a read-modify-write was found to be skippable.
    auto checkRMWNeeded = [this, spilledRegion]()
    {
        return noRMWNeeded.find(spilledRegion) == noRMWNeeded.end();
    };

    //subreg offset for new dst that replaces the spilled dst
    auto newSubregOff = 0;

    if (inst->mayExceedTwoGRF())
    {
        // Handle send instructions (special treatment)
        // Create the spill range for the whole post destination, assign spill
        // offset to the spill range and create the instructions to load and
        // save the spill range to spill memory.
        INST_LIST::iterator sendOutIter = spilledInstIter;
        assert(getRFType (spilledRegion) == G4_GRF);
        G4_Declare * spillRangeDcl =
            createPostDstSpillRangeDeclare (*sendOutIter);
        G4_Declare * mRangeDcl =
            createAndInitMHeader (
                (G4_RegVarTransient *) spillRangeDcl->getRegVar());

        // A partial write needs the old contents loaded first, unless the
        // region was marked as not needing RMW.
        bool needRMW = inst->isPartialWriteForSpill(!bb->isAllLaneActive()) &&
            checkRMWNeeded();
        if (needRMW)
        {
            sendInSpilledRegVarPortions(
                spillRangeDcl, mRangeDcl, 0,
                spillRangeDcl->getNumRows(),
                spilledRegion->getRegOff());

            // The loads are spliced in at the spilled instruction's position
            // before the spill instructions are created.
            INST_LIST::iterator insertPos = sendOutIter;
            splice(bb, insertPos, builder_->instList, curInst->getCISAOff());
        }

        sendOutSpilledRegVarPortions(
            spillRangeDcl, mRangeDcl, 0, spillRangeDcl->getNumRows(),
            spilledRegion->getRegOff());

        replacementRangeDcl = spillRangeDcl;
    }
    else
    {
        // Handle other regular single/multi destination register instructions.
        // Create the spill range for the destination region, assign spill
        // offset to the spill range and create the instructions to load and
        // save the spill range to spill memory.

        // Create the segment aligned spill range
        G4_Declare * spillRangeDcl =
            createSpillRangeDeclare(spilledRegion, execSize, *spilledInstIter);

        // Create and initialize the message header
        G4_Declare * mRangeDcl =
            createAndInitMHeader(spilledRegion, execSize);

        // Unaligned region specific handling.
        unsigned int spillSendOption = InstOpt_WriteEnable;
        auto preloadNeeded = shouldPreloadSpillRange(*spilledInstIter, bb);
        if (preloadNeeded && checkRMWNeeded())
        {

            // Preload the segment aligned spill range from memory to use
            // as an overlay

            preloadSpillRange(
                spillRangeDcl, mRangeDcl, spilledRegion, execSize);

            // Create the temporary range to use as a replacement range.

            G4_Declare* tmpRangeDcl =
                createTemporaryRangeDeclare(spilledRegion, execSize);

            // Copy out the value in the temporary range into its
            // location in the spill range.

            G4_DstRegRegion* spillRangeDstRegion =
                createSpillRangeDstRegion(
                    spillRangeDcl->getRegVar(), spilledRegion, execSize);

            G4_SrcRegRegion* tmpRangeSrcRegion =
                createTemporaryRangeSrcRegion(
                    tmpRangeDcl->getRegVar(), spilledRegion, execSize);

            // NOTE: Never use a predicate for the final mov if the spilled
            // instruction was a sel (even in a SIMD CF context).

            G4_Predicate* predicate =
                ((*spilledInstIter)->opcode() != G4_sel) ?
                (*spilledInstIter)->getPredicate() : nullptr;

            if (tmpRangeSrcRegion->getType() == spillRangeDstRegion->getType() && IS_TYPE_FLOAT_ALL(tmpRangeSrcRegion->getType()))
            {
                // use int copy when possible as floating-point copy moves may need further legalization
                auto equivIntTy = floatToSameWidthIntType(tmpRangeSrcRegion->getType());
                tmpRangeSrcRegion->setType(equivIntTy);
                spillRangeDstRegion->setType(equivIntTy);
            }

            // The copy uses the spilled instruction's own mask option.
            createMovInst(
                execSize, spillRangeDstRegion, tmpRangeSrcRegion,
                builder_->duplicateOperand(predicate),
                (*spilledInstIter)->getMaskOption());
            numGRFMove++;

            replacementRangeDcl = tmpRangeDcl;
            // maintain the spilled dst's subreg to not break the regioning restriction
            newSubregOff = spilledRegion->getSubRegOff();
        }
        else
        {
            // We're here because:
            // 1. preloadNeeded = false AND checkRMWNeeded = true OR
            // 2. preloadNeeded = true AND checkRMWNeeded = false OR
            // 3. both are false
            //
            // Case (1) occurs when:
            // Def uses dword type and writes entire row. But def doesnt define
            // complete variable, ie it isnt a kill. For such cases, we need to
            // use def's EM on spill msg.
            //
            // Case (2) occurs when:
            // Def is partial but it is unique in the program. For such cases,
            // we should use WriteEnable msg.
            //
            // Case (3) occurs when:
            // Def uses dword type and write entire row. Def defines complete
            // variable. We can use either EM.

            // Aligned regions do not need a temporary range.
            LocalLiveRange* spilledLLR = gra.getLocalLR(spilledRegion->getBase()->asRegVar()->getDeclare());
            if (spilledLLR && spilledLLR->getSplit())
            {
                // if we are spilling the dest of a copy move introduced by local live-range splitting,
                // we can spill the source value instead and delete the move
                // ToDo: we should generalize this to cover all moves
                // NOTE(review): asSrcRegRegion() and asRegVar() are applied
                // here before the isRegVar() test a few lines below --
                // confirm the source of such a copy is always a regvar
                // region.
                G4_SrcRegRegion* srcRegion = inst->getSrc(0)->asSrcRegRegion();
                G4_Declare* srcDcl = srcRegion->getBase()->asRegVar()->getDeclare();
                unsigned int lb = srcRegion->getLeftBound();
                unsigned int rb = srcRegion->getRightBound();

                G4_RegVar * regVar = NULL;
                if (srcRegion->getBase()->isRegVar())
                {
                    regVar = getRegVar(srcRegion);
                }

                // Only when the source is GRF-aligned, covers whole rows,
                // has exactly the spill range's size and is itself not
                // spilled can it stand in for the moved value.
                if (gra.getSubRegAlign(srcDcl) == GRFALIGN &&
                    lb % REG_BYTE_SIZE == 0 &&
                    (rb + 1) % REG_BYTE_SIZE == 0 &&
                    (rb - lb + 1) == spillRangeDcl->getByteSize() &&
                    regVar &&
                    !shouldSpillRegister(regVar))
                {
                    optimizeSplitLLR = true;
                }
            }

            replacementRangeDcl = spillRangeDcl;
            // maintain the spilled dst's subreg since the spill is done on a per-GRF basis
            newSubregOff = spilledRegion->getSubRegOff();

            if (preloadNeeded &&
                isUnalignedRegion(spilledRegion, execSize))
            {
                // A dst region may not need pre-fill, however, if it is unaligned,
                // we need to use non-zero sub-reg offset in newly created spill dcl.
                // This section of code computes sub-reg offset to use for such cases.
                // It is insufficient to simply use spilledRegion's sub-reg offset in
                // case the region dcl is an alias of another dcl. This typically happens
                // when 2 scalar dcls are merged by merge scalar pass, merged dcl is
                // spilled, and dominating def writes non-zeroth element.
                unsigned segmentDisp = getEncAlignedSegmentDisp(spilledRegion, execSize);
                unsigned regionDisp = getRegionDisp(spilledRegion);
                assert(regionDisp >= segmentDisp);
                unsigned short subRegOff = (regionDisp - segmentDisp) / spilledRegion->getElemSize();
                assert((regionDisp - segmentDisp) % spilledRegion->getElemSize() == 0);
                assert(subRegOff * spilledRegion->getElemSize() +
                    getRegionByteSize(spilledRegion, execSize) <=
                    2u * REG_BYTE_SIZE);
                newSubregOff = subRegOff;
            }

            // With inactive lanes possible and no preload, the spill must
            // honour the defining instruction's execution mask.
            if (!bb->isAllLaneActive() &&
                !preloadNeeded)
            {
                spillSendOption = (*spilledInstIter)->getMaskOption();
            }
        }

        // Save the spill range to memory.

        initMWritePayload(
            spillRangeDcl, mRangeDcl, spilledRegion, execSize);


        if (useLSCMsg)
        {
            spillSendInst = createLSCSpill(
                spillRangeDcl, mRangeDcl, spilledRegion, execSize, spillSendOption);
        }
        else
        {
            spillSendInst = createSpillSendInstr(
                spillRangeDcl, mRangeDcl, spilledRegion, execSize, spillSendOption);
        }

        // In fail-safe mode without dst/src overlap avoidance, rewind the
        // spill register cursor to its start.
        if (failSafeSpill_ && !avoidDstSrcOverlap_)
        {
            spillRegOffset_ = spillRegStart_;
        }
    }

    if (builder_->getOption(vISA_DoSplitOnSpill))
    {
        if (inst->isRawMov())
        {
            // check whether mov is copy in loop preheader or exit
            auto it = gra.splitResults.find(inst->getSrc(0)->getTopDcl());
            if (it != gra.splitResults.end())
            {
                if ((*it).second.origDcl == spillDcl)
                {
                    // srcRegion is a split var temp
                    // this is a copy in either preheader or loop exit.
                    // add it to list so we know it shouldnt be optimized
                    // by spill cleanup.
                    for (auto addedInst : builder_->instList)
                    {
                        (*it).second.insts[bb].insert(addedInst);
                    }
                }
            }
        }
    }

    // Replace the spilled range with the spill range and insert spill
    // instructions.

    // Instructions collected so far are spliced in at the position right
    // after the spilled instruction.
    INST_LIST::iterator insertPos = std::next(spilledInstIter);
    replaceSpilledRange (replacementRangeDcl, spilledRegion, *spilledInstIter, newSubregOff);

    splice(bb, insertPos, builder_->instList, curInst->getCISAOff());

    if (optimizeSplitLLR && spillSendInst && spillSendInst->isSplitSend())
    {
        // delete the move and spill the source instead. Note that we can't do this if split send
        // is not enabled, as payload contains header
        bb->erase(spilledInstIter);
        unsigned int pos = 1;
        spillSendInst->setSrc(inst->getSrc(0), pos);
    }
    else
    {
        splice(bb, spilledInstIter, builder_->instList, curInst->getCISAOff());
    }
}
3476
3477 // Create the code to create the GRF fill range and load it to spill memory.
insertFillGRFRangeCode(G4_SrcRegRegion * filledRegion,INST_LIST::iterator filledInstIter,G4_BB * bb)3478 void SpillManagerGRF::insertFillGRFRangeCode(
3479 G4_SrcRegRegion * filledRegion,
3480 INST_LIST::iterator filledInstIter,
3481 G4_BB* bb)
3482 {
3483 G4_ExecSize execSize = (*filledInstIter)->getExecSize();
3484
3485 // Create the fill range, assign spill offset to the fill range and
3486 // create the instructions to load the fill range from spill memory.
3487
3488 G4_Declare * fillRangeDcl = nullptr;
3489
3490 bool optimizeSplitLLR = false;
3491 G4_INST* inst = *filledInstIter;
3492 auto dstRegion = inst->getDst();
3493 G4_INST* fillSendInst = nullptr;
3494 auto spillDcl = filledRegion->getTopDcl()->getRootDeclare();
3495
3496 if (builder_->getOption(vISA_DoSplitOnSpill))
3497 {
3498 // if spilled inst is copy of split variable to it's spilled variable
3499 // then simply remove the instruction.
3500 //
3501 // if inst is:
3502 // (W) mov (8|M0) SPLIT1 V10
3503 //
3504 // and SPLIT1 is marked as spilled then dont insert spill code for it.
3505 // V10 is guaranteed to be spilled already so there is no point spilling
3506 // SPLIT1. we simply remove above instruction and any fill emitted to load
3507 // V10 and return.
3508 if (LoopVarSplit::removeFromLoopExit(&gra, spillDcl, bb, filledInstIter))
3509 return;
3510 }
3511
3512 auto sisIt = scalarImmSpill.find(spillDcl);
3513 if (sisIt != scalarImmSpill.end())
3514 {
3515 //re-materialize the scalar immediate value
3516 auto imm = sisIt->second;
3517 auto tempDcl = builder_->createTempVar(1, imm->getType(), spillDcl->getSubRegAlign());
3518 auto movInst = builder_->createMov(g4::SIMD1, builder_->createDstRegRegion(tempDcl, 1), imm, InstOpt_WriteEnable, false);
3519 bb->insertBefore(filledInstIter, movInst);
3520 assert(!filledRegion->isIndirect());
3521 auto newSrc = builder_->createSrc(tempDcl->getRegVar(), filledRegion->getRegOff(), filledRegion->getSubRegOff(), filledRegion->getRegion(),
3522 filledRegion->getType(), filledRegion->getAccRegSel());
3523 int i = 0;
3524 for (; i < inst->getNumSrc(); ++i)
3525 {
3526 if (inst->getSrc(i) == filledRegion)
3527 {
3528 break;
3529 }
3530 }
3531 inst->setSrc(newSrc, i);
3532 return;
3533 }
3534
3535 {
3536 fillRangeDcl =
3537 createGRFFillRangeDeclare(filledRegion, execSize, *filledInstIter);
3538 G4_Declare * mRangeDcl =
3539 createAndInitMHeader(filledRegion, execSize);
3540
3541 if (useLSCMsg)
3542 {
3543 fillSendInst = createLSCFill(fillRangeDcl, mRangeDcl, filledRegion, execSize);
3544 }
3545 else
3546 {
3547 fillSendInst = createFillSendInstr(fillRangeDcl, mRangeDcl, filledRegion, execSize);
3548 }
3549
3550 LocalLiveRange* filledLLR = gra.getLocalLR(filledRegion->getBase()->asRegVar()->getDeclare());
3551 if (filledLLR && filledLLR->getSplit())
3552 {
3553 G4_Declare* dstDcl = dstRegion->getBase()->asRegVar()->getDeclare();
3554 unsigned int lb = dstRegion->getLeftBound();
3555 unsigned int rb = dstRegion->getRightBound();
3556
3557 if (gra.getSubRegAlign(dstDcl) == GRFALIGN &&
3558 lb % REG_BYTE_SIZE == 0 &&
3559 (rb + 1) % REG_BYTE_SIZE == 0 &&
3560 (rb - lb + 1) == fillRangeDcl->getByteSize())
3561 {
3562 optimizeSplitLLR = true;
3563 }
3564 }
3565 }
3566
3567 if (builder_->getOption(vISA_DoSplitOnSpill))
3568 {
3569 if (inst->isRawMov())
3570 {
3571 // check whether mov is copy in loop preheader or exit
3572 auto it = gra.splitResults.find(dstRegion->getTopDcl());
3573 if (it != gra.splitResults.end())
3574 {
3575 if ((*it).second.origDcl == filledRegion->getTopDcl())
3576 {
3577 // dstRegion is a split var temp
3578 // this is a copy in either preheader or loop exit.
3579 // add it to list so we know it shouldnt be optimized
3580 // by spill cleanup.
3581 for (auto addedInst : builder_->instList)
3582 {
3583 (*it).second.insts[bb].insert(addedInst);
3584 }
3585 }
3586 }
3587 }
3588 }
3589
3590 // Replace the spilled range with the fill range and insert spill
3591 // instructions.
3592 replaceFilledRange (fillRangeDcl, filledRegion, *filledInstIter);
3593 INST_LIST::iterator insertPos = filledInstIter;
3594
3595 splice(bb, insertPos, builder_->instList, curInst->getCISAOff());
3596 if (optimizeSplitLLR)
3597 {
3598 INST_LIST::iterator nextIter = filledInstIter;
3599 INST_LIST::iterator prevIter = filledInstIter;
3600 nextIter++;
3601 prevIter--;
3602 prevIter--;
3603 bb->erase(filledInstIter);
3604 fillSendInst->setDest(dstRegion);
3605 G4_INST* prevInst = (*prevIter);
3606 if (prevInst->isPseudoKill() &&
3607 GetTopDclFromRegRegion(prevInst->getDst()) == fillRangeDcl)
3608 {
3609 prevInst->setDest(builder_->createDst(GetTopDclFromRegRegion(dstRegion)->getRegVar(), 0, 0, 1, Type_UD));
3610 }
3611 }
3612 }
3613
3614 // Create the code to create the GRF fill range and load it to spill memory.
insertSendFillRangeCode(G4_SrcRegRegion * filledRegion,INST_LIST::iterator filledInstIter,G4_BB * bb)3615 INST_LIST::iterator SpillManagerGRF::insertSendFillRangeCode(
3616 G4_SrcRegRegion * filledRegion,
3617 INST_LIST::iterator filledInstIter,
3618 G4_BB * bb)
3619 {
3620 G4_INST * sendInst = *filledInstIter;
3621
3622 unsigned width = REG_BYTE_SIZE / filledRegion->getElemSize();
3623
3624 // Create the fill range, assign spill offset to the fill range
3625
3626 G4_Declare * fillGRFRangeDcl =
3627 createSendFillRangeDeclare(filledRegion, sendInst);
3628
3629 // Create the instructions to load the fill range from spill memory.
3630
3631 G4_Declare * mRangeDcl = createMRangeDeclare(filledRegion, G4_ExecSize(width));
3632 initMHeader(mRangeDcl);
3633 sendInSpilledRegVarPortions(
3634 fillGRFRangeDcl, mRangeDcl, 0,
3635 fillGRFRangeDcl->getNumRows(), filledRegion->getRegOff());
3636
3637 // Replace the spilled range with the fill range and insert spill
3638 // instructions.
3639
3640 replaceFilledRange(fillGRFRangeDcl, filledRegion, *filledInstIter);
3641 INST_LIST::iterator insertPos = filledInstIter;
3642
3643 splice(bb, insertPos, builder_->instList, curInst->getCISAOff());
3644
3645 // Return the next instruction
3646
3647 return ++filledInstIter;
3648 }
3649
getOrCreateSpillFillDcl(G4_Declare * spilledAddrTakenDcl,G4_Kernel * kernel)3650 G4_Declare* getOrCreateSpillFillDcl(
3651 G4_Declare* spilledAddrTakenDcl, G4_Kernel* kernel)
3652 {
3653 // If spilledAddrTakenDcl already has a spill/fill range created, return it.
3654 // Else create new one and return it.
3655 G4_Declare* temp = spilledAddrTakenDcl->getAddrTakenSpillFill();
3656 if (temp == NULL)
3657 {
3658 #define ADDR_SPILL_FILL_NAME_SIZE 32
3659 const char* dclName = kernel->fg.builder->getNameString(kernel->fg.mem, ADDR_SPILL_FILL_NAME_SIZE,
3660 "ADDR_SP_FL_V%d", spilledAddrTakenDcl->getDeclId());
3661
3662 // temp is created of sub-class G4_RegVarTmp so that is
3663 // assigned infinite spill cost when coloring.
3664 temp = kernel->fg.builder->createDeclareNoLookup(dclName,
3665 G4_GRF, spilledAddrTakenDcl->getNumElems(),
3666 spilledAddrTakenDcl->getNumRows(), spilledAddrTakenDcl->getElemType() , DeclareType::Tmp, spilledAddrTakenDcl->getRegVar());
3667 spilledAddrTakenDcl->setAddrTakenSpillFill(temp);
3668 }
3669
3670 return temp;
3671 }
3672
getOrCreateAddrSpillFillDcl(G4_Declare * spilledAddrTakenDcl,G4_Kernel * kernel)3673 G4_Declare* SpillManagerGRF::getOrCreateAddrSpillFillDcl(
3674 G4_Declare* spilledAddrTakenDcl, G4_Kernel* kernel)
3675 {
3676 // If spilledAddrTakenDcl already has a spill/fill range created, return it.
3677 // Else create new one and return it.
3678 #define ADDR_SPILL_FILL_NAME_SIZE 32
3679 const char* dclName = kernel->fg.builder->getNameString(kernel->fg.mem, ADDR_SPILL_FILL_NAME_SIZE,
3680 "ADDR_SP_FL_V%d_%d", spilledAddrTakenDcl->getDeclId(), getAddrSpillFillIndex(spilledAddrTakenDcl->getRegVar()));
3681
3682 // temp is created of sub-class G4_RegVarTmp so that is
3683 // assigned infinite spill cost when coloring.
3684 G4_Declare* temp = kernel->fg.builder->createDeclareNoLookup(dclName,
3685 G4_GRF, spilledAddrTakenDcl->getNumElems(),
3686 spilledAddrTakenDcl->getNumRows(), spilledAddrTakenDcl->getElemType(), DeclareType::Tmp, spilledAddrTakenDcl->getRegVar());
3687 spilledAddrTakenDcl->setAddrTakenSpillFill(temp);
3688
3689 return temp;
3690 }
3691
3692 // For each address taken register spill find an available physical register
3693 // and assign it to the decl. This physical register will be used for inserting
3694 // spill/fill code for indirect reference instructions that point to the
3695 // spilled range.
3696 // Return true if enough registers found, false if sufficient registers unavailable.
handleAddrTakenSpills(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)3697 bool SpillManagerGRF::handleAddrTakenSpills(
3698 G4_Kernel * kernel, PointsToAnalysis& pointsToAnalysis)
3699 {
3700 bool success = true;
3701 unsigned int numAddrTakenSpills = 0;
3702
3703 for (const LiveRange* lr : *spilledLRs_)
3704 {
3705 if (lr->getDcl()->getAddressed())
3706 {
3707 getOrCreateSpillFillDcl(lr->getDcl(), kernel);
3708 }
3709
3710 if (lvInfo_->isAddressSensitive(lr->getVar()->getId()))
3711 {
3712 numAddrTakenSpills++;
3713 }
3714 }
3715
3716 if (numAddrTakenSpills > 0)
3717 {
3718 insertAddrTakenSpillFill(kernel, pointsToAnalysis);
3719 prunePointsTo(kernel, pointsToAnalysis);
3720 }
3721
3722 #ifdef _DEBUG
3723 if (success)
3724 {
3725 // Verify that each spilled address taken has a spill/fill registers assigned
3726 for (const LiveRange* lr : *spilledLRs_)
3727 {
3728 if (lr->getDcl()->getAddressed())
3729 MUST_BE_TRUE(lr->getDcl()->getAddrTakenSpillFill() != nullptr, "Spilled addr taken does not have assigned spill/fill GRF");
3730 }
3731 }
3732 #endif
3733
3734 return success;
3735 }
3736
handleAddrTakenLSSpills(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)3737 unsigned int SpillManagerGRF::handleAddrTakenLSSpills(
3738 G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
3739 {
3740 unsigned int numAddrTakenSpills = 0;
3741
3742 for (LSLiveRange* lr : *spilledLSLRs_)
3743 {
3744 if (lvInfo_->isAddressSensitive(lr->getTopDcl()->getRegVar()->getId()))
3745 {
3746 numAddrTakenSpills++;
3747 }
3748 }
3749
3750 if (numAddrTakenSpills > 0)
3751 {
3752 insertAddrTakenLSSpillFill(kernel, pointsToAnalysis);
3753 prunePointsToLS(kernel, pointsToAnalysis);
3754 }
3755
3756 #ifdef _DEBUG
3757 if (numAddrTakenSpills)
3758 {
3759 // Verify that each spilled address taken has a spill/fill registers assigned
3760 for (LSLiveRange* lr : *spilledLSLRs_)
3761 {
3762 if (lr->getTopDcl()->getAddressed())
3763 MUST_BE_TRUE(lr->getTopDcl()->getAddrTakenSpillFill() != NULL, "Spilled addr taken does not have assigned spill/fill GRF");
3764 }
3765 }
3766 #endif
3767
3768 return numAddrTakenSpills;
3769 }
3770
3771 // Insert spill and fill code for indirect GRF accesses
insertAddrTakenSpillAndFillCode(G4_Kernel * kernel,G4_BB * bb,INST_LIST::iterator inst_it,G4_Operand * opnd,PointsToAnalysis & pointsToAnalysis,bool spill,unsigned int bbid)3772 void SpillManagerGRF::insertAddrTakenSpillAndFillCode(
3773 G4_Kernel* kernel, G4_BB* bb,
3774 INST_LIST::iterator inst_it, G4_Operand* opnd,
3775 PointsToAnalysis& pointsToAnalysis, bool spill, unsigned int bbid)
3776 {
3777 curInst = (*inst_it);
3778 INST_LIST::iterator next_inst_it = ++inst_it;
3779 inst_it--;
3780
3781 // Check whether spill operand points to any spilled range
3782 for (const LiveRange* lr : *spilledLRs_) {
3783 G4_RegVar* var = nullptr;
3784
3785 if (opnd->isDstRegRegion() && opnd->asDstRegRegion()->getBase()->asRegVar())
3786 var = opnd->asDstRegRegion()->getBase()->asRegVar();
3787
3788 if (opnd->isSrcRegRegion() && opnd->asSrcRegRegion()->getBase()->asRegVar())
3789 var = opnd->asSrcRegRegion()->getBase()->asRegVar();
3790
3791 MUST_BE_TRUE(var != NULL, "Fill operand is neither a source nor dst region");
3792
3793 if (var &&
3794 pointsToAnalysis.isPresentInPointsTo(var,
3795 lr->getVar()))
3796 {
3797 unsigned int numrows = lr->getDcl()->getNumRows();
3798 G4_Declare* temp = getOrCreateSpillFillDcl(lr->getDcl(), kernel);
3799
3800 if (failSafeSpill_ &&
3801 temp->getRegVar()->getPhyReg() == NULL)
3802 {
3803 temp->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
3804 spillRegOffset_ += numrows;
3805 }
3806
3807 if (numrows > 1 || (lr->getDcl()->getNumElems() * lr->getDcl()->getElemSize() == getGRFSize()))
3808 {
3809 if (useScratchMsg_ || useSplitSend())
3810 {
3811 G4_Declare * fillGRFRangeDcl = temp;
3812 G4_Declare * mRangeDcl =
3813 createAndInitMHeader(
3814 (G4_RegVarTransient *)temp->getRegVar()->getBaseRegVar());
3815
3816 sendInSpilledRegVarPortions(
3817 fillGRFRangeDcl, mRangeDcl, 0,
3818 temp->getNumRows(), 0);
3819
3820 splice(bb, inst_it, builder_->instList, curInst->getCISAOff());
3821
3822 if (spill)
3823 {
3824 sendOutSpilledRegVarPortions (
3825 temp, mRangeDcl, 0, temp->getNumRows(),
3826 0);
3827
3828 splice(bb, next_inst_it, builder_->instList, curInst->getCISAOff());
3829 }
3830 }
3831 else
3832 {
3833
3834 for (unsigned int i = 0; i < numrows; i++)
3835 {
3836 G4_INST* inst;
3837 const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
3838 G4_ExecSize curExSize {numEltPerGRF<Type_UD>()};
3839
3840 if ((i + 1) < numrows)
3841 curExSize = G4_ExecSize(numEltPerGRF<Type_UD>()*2);
3842
3843 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getVar(), (short)i, 0, rd, Type_F);
3844
3845 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), (short)i, 0, 1, Type_F);
3846
3847 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3848
3849 bb->insertBefore(inst_it, inst);
3850
3851 if (spill)
3852 {
3853 // Also insert spill code
3854 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), (short)i, 0, rd, Type_F);
3855
3856 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getVar(), (short)i, 0, 1, Type_F);
3857
3858 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3859
3860 bb->insertBefore(next_inst_it, inst);
3861 }
3862
3863 // If 2 rows were processed then increment induction var suitably
3864 if ( curExSize == 16)
3865 i++;
3866 }
3867 }
3868
3869 // Update points to
3870 // Note: points2 set should be updated after inserting fill code,
3871 // however, this sets a bit in liveness bit-vector that
3872 // causes the temp variable to be marked as live-out from
3873 // that BB. A general fix should treat address taken variables
3874 // more accurately wrt liveness so they dont escape via
3875 // unfeasible paths.
3876 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
3877 }
3878 else if (numrows == 1)
3879 {
3880 // Insert spill/fill when there decl uses a single row, that too not completely
3881 G4_ExecSize curExSize = g4::SIMD16;
3882 unsigned short numbytes = lr->getDcl()->getNumElems() * lr->getDcl()->getElemSize();
3883
3884 //temp->setAddressed();
3885 short off = 0;
3886
3887 while (numbytes > 0)
3888 {
3889 G4_INST* inst;
3890 G4_Type type = Type_W;
3891
3892 if (numbytes >= 16)
3893 curExSize = g4::SIMD8;
3894 else if (numbytes >= 8 && numbytes < 16)
3895 curExSize = g4::SIMD4;
3896 else if (numbytes >= 4 && numbytes < 8)
3897 curExSize = g4::SIMD2;
3898 else if (numbytes >= 2 && numbytes < 4)
3899 curExSize = g4::SIMD1;
3900 else if (numbytes == 1)
3901 {
3902 // If a region has odd number of bytes, copy last byte in final iteration
3903 curExSize = g4::SIMD1;
3904 type = Type_UB;
3905 }
3906 else {
3907 MUST_BE_TRUE(false, "Cannot emit SIMD1 for byte");
3908 curExSize = G4_ExecSize(0);
3909 }
3910
3911 const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
3912
3913 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getVar(), 0, off, rd, type);
3914
3915 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), 0, off, 1, type);
3916
3917 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3918
3919 bb->insertBefore(inst_it, inst);
3920
3921 if (spill)
3922 {
3923 // Also insert spill code
3924 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), 0, off, rd, type);
3925
3926 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getVar(), 0, off, 1, type);
3927
3928 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3929
3930 bb->insertBefore(next_inst_it, inst);
3931 }
3932
3933 off += curExSize;
3934 numbytes -= curExSize*2;
3935 }
3936
3937 // Update points to
3938 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
3939 }
3940
3941 if (!spill)
3942 {
3943 // Insert pseudo_use node so that liveness keeps the
3944 // filled variable live through the indirect access.
3945 // Not required for spill because for spill we will
3946 // anyway insert a ues of the variable to emit store.
3947 const RegionDesc* rd = kernel->fg.builder->getRegionScalar();
3948
3949 G4_SrcRegRegion* pseudoUseSrc =
3950 kernel->fg.builder->createSrc(temp->getRegVar(), 0, 0, rd, Type_F);
3951
3952 G4_INST* pseudoUseInst = kernel->fg.builder->createInternalIntrinsicInst(
3953 nullptr, Intrinsic::Use, g4::SIMD1,
3954 nullptr, pseudoUseSrc, nullptr, nullptr, InstOpt_NoOpt);
3955
3956 bb->insertBefore(next_inst_it, pseudoUseInst);
3957 }
3958
3959 }
3960 }
3961 }
3962
3963 // Insert spill and fill code for indirect GRF accesses
insertAddrTakenLSSpillAndFillCode(G4_Kernel * kernel,G4_BB * bb,INST_LIST::iterator inst_it,G4_Operand * opnd,PointsToAnalysis & pointsToAnalysis,bool spill,unsigned int bbid)3964 void SpillManagerGRF::insertAddrTakenLSSpillAndFillCode(
3965 G4_Kernel* kernel, G4_BB* bb,
3966 INST_LIST::iterator inst_it, G4_Operand* opnd,
3967 PointsToAnalysis& pointsToAnalysis, bool spill, unsigned int bbid)
3968 {
3969 curInst = (*inst_it);
3970 INST_LIST::iterator next_inst_it = ++inst_it;
3971 inst_it--;
3972
3973 // Check whether spill operand points to any spilled range
3974 for (LSLiveRange* lr : *spilledLSLRs_)
3975 {
3976 G4_RegVar* var = nullptr;
3977
3978 if (opnd->isDstRegRegion() && opnd->asDstRegRegion()->getBase()->asRegVar())
3979 var = opnd->asDstRegRegion()->getBase()->asRegVar();
3980
3981 if (opnd->isSrcRegRegion() && opnd->asSrcRegRegion()->getBase()->asRegVar())
3982 var = opnd->asSrcRegRegion()->getBase()->asRegVar();
3983
3984 MUST_BE_TRUE(var != NULL, "Fill operand is neither a source nor dst region");
3985
3986 if (var &&
3987 pointsToAnalysis.isPresentInPointsTo(var,
3988 lr->getTopDcl()->getRegVar()))
3989 {
3990 unsigned int numrows = lr->getTopDcl()->getNumRows();
3991 G4_Declare* temp = getOrCreateAddrSpillFillDcl(lr->getTopDcl(), kernel);
3992
3993 if (failSafeSpill_ &&
3994 temp->getRegVar()->getPhyReg() == NULL)
3995 {
3996 temp->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
3997 spillRegOffset_ += numrows;
3998 }
3999
4000 if (!lr->isActiveLR())
4001 {
4002 lr->setActiveLR(true);
4003 updateActiveList(lr, &activeLR_);
4004 }
4005
4006 if (numrows > 1 || (lr->getTopDcl()->getNumElems() * lr->getTopDcl()->getElemSize() == getGRFSize()))
4007 {
4008 if (useScratchMsg_ || useSplitSend())
4009 {
4010 G4_Declare* fillGRFRangeDcl = temp;
4011 G4_Declare* mRangeDcl =
4012 createAndInitMHeader(
4013 (G4_RegVarTransient*)temp->getRegVar()->getBaseRegVar());
4014
4015 sendInSpilledRegVarPortions(
4016 fillGRFRangeDcl, mRangeDcl, 0,
4017 temp->getNumRows(), 0);
4018
4019 splice(bb, inst_it, builder_->instList, curInst->getCISAOff());
4020
4021 if (spill)
4022 {
4023 sendOutSpilledRegVarPortions(
4024 temp, mRangeDcl, 0, temp->getNumRows(),
4025 0);
4026
4027 splice(bb, next_inst_it, builder_->instList, curInst->getCISAOff());
4028 }
4029 }
4030 else
4031 {
4032
4033 for (unsigned int i = 0; i < numrows; i++)
4034 {
4035 G4_INST* inst;
4036 const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
4037 G4_ExecSize curExSize{ numEltPerGRF<Type_UD>() };
4038
4039 if ((i + 1) < numrows)
4040 curExSize = G4_ExecSize(numEltPerGRF<Type_UD>() * 2);
4041
4042 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getTopDcl()->getRegVar(), (short)i, 0, rd, Type_F);
4043
4044 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), (short)i, 0, 1, Type_F);
4045
4046 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4047
4048 bb->insertBefore(inst_it, inst);
4049
4050 if (spill)
4051 {
4052 // Also insert spill code
4053 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), (short)i, 0, rd, Type_F);
4054
4055 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getTopDcl()->getRegVar(), (short)i, 0, 1, Type_F);
4056
4057 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4058
4059 bb->insertBefore(next_inst_it, inst);
4060 }
4061
4062 // If 2 rows were processed then increment induction var suitably
4063 if (curExSize == 16)
4064 i++;
4065 }
4066 }
4067
4068 // Update points to
4069 // Note: points2 set should be updated after inserting fill code,
4070 // however, this sets a bit in liveness bit-vector that
4071 // causes the temp variable to be marked as live-out from
4072 // that BB. A general fix should treat address taken variables
4073 // more accurately wrt liveness so they dont escape via
4074 // unfeasible paths.
4075 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
4076 }
4077 else if (numrows == 1)
4078 {
4079 // Insert spill/fill when there decl uses a single row, that too not completely
4080 G4_ExecSize curExSize = g4::SIMD16;
4081 unsigned short numbytes = lr->getTopDcl()->getNumElems() * lr->getTopDcl()->getElemSize();
4082
4083 //temp->setAddressed();
4084 short off = 0;
4085
4086 while (numbytes > 0)
4087 {
4088 G4_INST* inst;
4089 G4_Type type = Type_W;
4090
4091 if (numbytes >= 16)
4092 curExSize = g4::SIMD8;
4093 else if (numbytes >= 8 && numbytes < 16)
4094 curExSize = g4::SIMD4;
4095 else if (numbytes >= 4 && numbytes < 8)
4096 curExSize = g4::SIMD2;
4097 else if (numbytes >= 2 && numbytes < 4)
4098 curExSize = g4::SIMD1;
4099 else if (numbytes == 1)
4100 {
4101 // If a region has odd number of bytes, copy last byte in final iteration
4102 curExSize = g4::SIMD1;
4103 type = Type_UB;
4104 }
4105 else {
4106 MUST_BE_TRUE(false, "Cannot emit SIMD1 for byte");
4107 curExSize = G4_ExecSize(0);
4108 }
4109
4110 const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
4111
4112 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getTopDcl()->getRegVar(), 0, off, rd, type);
4113
4114 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), 0, off, 1, type);
4115
4116 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4117
4118 bb->insertBefore(inst_it, inst);
4119
4120 if (spill)
4121 {
4122 // Also insert spill code
4123 G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), 0, off, rd, type);
4124
4125 G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getTopDcl()->getRegVar(), 0, off, 1, type);
4126
4127 inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4128
4129 bb->insertBefore(next_inst_it, inst);
4130 }
4131
4132 off += curExSize;
4133 numbytes -= curExSize * 2;
4134 }
4135
4136 // Update points to
4137 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
4138 }
4139
4140 if (!spill)
4141 {
4142 // Insert pseudo_use node so that liveness keeps the
4143 // filled variable live through the indirect access.
4144 // Not required for spill because for spill we will
4145 // anyway insert a ues of the variable to emit store.
4146 const RegionDesc* rd = kernel->fg.builder->getRegionScalar();
4147
4148 G4_SrcRegRegion* pseudoUseSrc =
4149 kernel->fg.builder->createSrc(temp->getRegVar(), 0, 0, rd, Type_F);
4150
4151 G4_INST* pseudoUseInst = kernel->fg.builder->createInternalIntrinsicInst(
4152 nullptr, Intrinsic::Use, g4::SIMD1,
4153 nullptr, pseudoUseSrc, nullptr, nullptr, InstOpt_NoOpt);
4154
4155 bb->insertBefore(next_inst_it, pseudoUseInst);
4156 }
4157
4158 }
4159 }
4160 }
4161
4162 // Insert any spill/fills for address taken
insertAddrTakenSpillFill(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4163 void SpillManagerGRF::insertAddrTakenSpillFill(
4164 G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4165 {
4166 for (auto bb : kernel->fg)
4167 {
4168 for (INST_LIST_ITER inst_it = bb->begin();
4169 inst_it != bb->end();
4170 inst_it++)
4171 {
4172 G4_INST* curInst = (*inst_it);
4173
4174 if (failSafeSpill_)
4175 {
4176 spillRegOffset_ = indrSpillRegStart_;
4177 }
4178
4179 // Handle indirect destination
4180 G4_DstRegRegion* dst = curInst->getDst();
4181
4182 if (dst && dst->getRegAccess() == IndirGRF)
4183 {
4184 insertAddrTakenSpillAndFillCode(kernel, bb, inst_it, dst, pointsToAnalysis, true, bb->getId());
4185 }
4186
4187 for (int i = 0; i < G4_MAX_SRCS; i++)
4188 {
4189 G4_Operand* src = curInst->getSrc(i);
4190
4191 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4192 {
4193 insertAddrTakenSpillAndFillCode(kernel, bb, inst_it, src, pointsToAnalysis, false, bb->getId());
4194 }
4195 }
4196 }
4197 }
4198 }
4199
insertAddrTakenLSSpillFill(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4200 void SpillManagerGRF::insertAddrTakenLSSpillFill(
4201 G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4202 {
4203 for (auto bb : kernel->fg)
4204 {
4205 for (INST_LIST_ITER inst_it = bb->begin();
4206 inst_it != bb->end();
4207 inst_it++)
4208 {
4209 G4_INST* curInst = (*inst_it);
4210
4211 unsigned int instID = curInst->getLexicalId();
4212 if (instID != (unsigned int)-1)
4213 {
4214 expireRanges(instID * 2, &activeLR_);
4215 }
4216
4217 if (failSafeSpill_)
4218 {
4219 spillRegOffset_ = indrSpillRegStart_;
4220 }
4221
4222 // Handle indirect destination
4223 G4_DstRegRegion* dst = curInst->getDst();
4224
4225 if (dst && dst->getRegAccess() == IndirGRF)
4226 {
4227 insertAddrTakenLSSpillAndFillCode(kernel, bb, inst_it, dst, pointsToAnalysis, true, bb->getId());
4228 }
4229
4230 for (int i = 0; i < G4_MAX_SRCS; i++)
4231 {
4232 G4_Operand* src = curInst->getSrc(i);
4233
4234 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4235 {
4236 insertAddrTakenLSSpillAndFillCode(kernel, bb, inst_it, src, pointsToAnalysis, false, bb->getId());
4237 }
4238 }
4239 }
4240 }
4241
4242 if (activeLR_.size() > 0)
4243 {
4244 // Expire any remaining ranges
4245 LSLiveRange* lastActive = activeLR_.back();
4246 unsigned int endIdx;
4247
4248 lastActive->getLastRef(endIdx);
4249
4250 expireRanges(endIdx, &activeLR_);
4251 }
4252
4253 }
4254
4255 // For address spill/fill code inserted remove from points of each indirect operand
4256 // the original regvar that is spilled.
prunePointsTo(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4257 void SpillManagerGRF::prunePointsTo(
4258 G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4259 {
4260 for (auto bb : kernel->fg)
4261 {
4262 for (INST_LIST_ITER inst_it = bb->begin();
4263 inst_it != bb->end();
4264 inst_it++)
4265 {
4266 G4_INST* curInst = (*inst_it);
4267 std::stack<G4_Operand*> st;
4268
4269 // Handle indirect destination
4270 G4_DstRegRegion* dst = curInst->getDst();
4271
4272 if (dst && dst->getRegAccess() == IndirGRF)
4273 {
4274 st.push(dst);
4275 }
4276
4277 for (int i = 0; i < G4_MAX_SRCS; i++)
4278 {
4279 G4_Operand* src = curInst->getSrc(i);
4280
4281 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4282 {
4283 st.push(src);
4284 }
4285 }
4286
4287 while (st.size() > 0)
4288 {
4289 G4_Operand* cur = st.top();
4290 st.pop();
4291
4292 // Check whether spill operand points to any spilled range
4293 for (const LiveRange* lr : *spilledLRs_) {
4294 G4_RegVar* var = nullptr;
4295
4296 if (cur->isDstRegRegion() && cur->asDstRegRegion()->getBase()->asRegVar())
4297 var = cur->asDstRegRegion()->getBase()->asRegVar();
4298
4299 if (cur->isSrcRegRegion() && cur->asSrcRegRegion()->getBase()->asRegVar())
4300 var = cur->asSrcRegRegion()->getBase()->asRegVar();
4301
4302 MUST_BE_TRUE(var != nullptr, "Operand is neither a source nor dst region");
4303
4304 if (var &&
4305 pointsToAnalysis.isPresentInPointsTo(var,
4306 lr->getVar()))
4307 {
4308 // Remove this from points to
4309 pointsToAnalysis.removeFromPointsTo(var, lr->getVar());
4310 }
4311 }
4312 }
4313 }
4314 }
4315 }
4316
prunePointsToLS(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4317 void SpillManagerGRF::prunePointsToLS(
4318 G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4319 {
4320 for (auto bb : kernel->fg)
4321 {
4322 for (INST_LIST_ITER inst_it = bb->begin();
4323 inst_it != bb->end();
4324 inst_it++)
4325 {
4326 G4_INST* curInst = (*inst_it);
4327 std::stack<G4_Operand*> st;
4328
4329 // Handle indirect destination
4330 G4_DstRegRegion* dst = curInst->getDst();
4331
4332 if (dst && dst->getRegAccess() == IndirGRF)
4333 {
4334 st.push(dst);
4335 }
4336
4337 for (int i = 0; i < G4_MAX_SRCS; i++)
4338 {
4339 G4_Operand* src = curInst->getSrc(i);
4340
4341 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4342 {
4343 st.push(src);
4344 }
4345 }
4346
4347 while (st.size() > 0)
4348 {
4349 G4_Operand* cur = st.top();
4350 st.pop();
4351
4352 // Check whether spill operand points to any spilled range
4353 for (LSLiveRange* lr : *spilledLSLRs_)
4354 {
4355 G4_RegVar* var = nullptr;
4356
4357 if (cur->isDstRegRegion() && cur->asDstRegRegion()->getBase()->asRegVar())
4358 var = cur->asDstRegRegion()->getBase()->asRegVar();
4359
4360 if (cur->isSrcRegRegion() && cur->asSrcRegRegion()->getBase()->asRegVar())
4361 var = cur->asSrcRegRegion()->getBase()->asRegVar();
4362
4363 MUST_BE_TRUE(var != NULL, "Operand is neither a source nor dst region");
4364
4365 if (var &&
4366 pointsToAnalysis.isPresentInPointsTo(var,
4367 lr->getTopDcl()->getRegVar()))
4368 {
4369 // Remove this from points to
4370 pointsToAnalysis.removeFromPointsTo(var, lr->getTopDcl()->getRegVar());
4371 }
4372 }
4373 }
4374 }
4375 }
4376 }
4377
runSpillAnalysis()4378 void SpillManagerGRF::runSpillAnalysis()
4379 {
4380 if (failSafeSpill_)
4381 {
4382 // ToDo: use the reserved GRFs to perform scalar immediate rematerialization
4383 return;
4384 }
4385
4386 std::unordered_set<G4_Declare*> spilledDcl;
4387 scalarImmSpill.clear();
4388
4389 for (auto bb : gra.kernel.fg)
4390 {
4391 for (auto inst : *bb)
4392 {
4393 auto dst = inst->getDst();
4394 auto dcl = dst && dst->getTopDcl() ? dst->getTopDcl()->getRootDeclare() : nullptr;
4395 if (!dcl || dcl->getAddressed() || dcl->getNumElems() != 1 || !shouldSpillRegister(dcl->getRegVar()))
4396 {
4397 // declare must be a scalar without address taken
4398 continue;
4399 }
4400 if (spilledDcl.count(dcl))
4401 {
4402 // this spilled declare is defined more than once
4403 scalarImmSpill.erase(dcl);
4404 continue;
4405 }
4406 spilledDcl.insert(dcl);
4407 if (inst->opcode() == G4_mov && inst->getExecSize() == g4::SIMD1 && inst->getSrc(0)->isImm() && !inst->getPredicate() && !inst->getSaturate())
4408 {
4409 scalarImmSpill[dcl] = inst->getSrc(0)->asImm();
4410 }
4411 }
4412 }
4413 }
4414
4415 // Insert spill/fill code for all registers that have not been assigned
4416 // physical registers in the current iteration of the graph coloring
4417 // allocator.
4418 // returns false if spill fails somehow
insertSpillFillCode(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4419 bool SpillManagerGRF::insertSpillFillCode(
4420 G4_Kernel * kernel, PointsToAnalysis& pointsToAnalysis)
4421 {
4422 //runSpillAnalysis();
4423 // Set the spill flag of all spilled regvars.
4424 for (const LiveRange* lr : *spilledLRs_) {
4425
4426 // Ignore request to spill/fill the spill/fill ranges
4427 // as it does not help the allocator.
4428 if (shouldSpillRegister(lr->getVar()) == false)
4429 {
4430 bool needsEOTGRF = lr->getEOTSrc() && builder_->hasEOTGRFBinding();
4431 if (failSafeSpill_ && needsEOTGRF &&
4432 (lr->getVar()->isRegVarTransient() ||
4433 lr->getVar()->isRegVarTmp()))
4434 {
4435 lr->getVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegStart_ > (kernel->getNumRegTotal() - 16) ? spillRegStart_ : (kernel->getNumRegTotal() - 16)), 0);
4436 continue;
4437 }
4438 return false;
4439 }
4440 else
4441 {
4442 lr->getVar()->getDeclare()->setSpillFlag();
4443 }
4444 }
4445
4446 // Handle address taken spills
4447 bool success = handleAddrTakenSpills(kernel, pointsToAnalysis);
4448
4449 if (!success)
4450 {
4451 DEBUG_MSG("Enough physical register not available for handling address taken spills" << std::endl);
4452 return false;
4453 }
4454
4455 // Insert spill/fill code for all basic blocks.
4456 updateRMWNeeded();
4457 FlowGraph& fg = kernel->fg;
4458
4459 unsigned int id = 0;
4460 for (BB_LIST_ITER it = fg.begin(); it != fg.end(); it++)
4461 {
4462 bbId_ = (*it)->getId();
4463 INST_LIST::iterator jt = (*it)->begin();
4464
4465 while (jt != (*it)->end()) {
4466 INST_LIST::iterator kt = jt;
4467 ++kt;
4468 G4_INST* inst = *jt;
4469
4470 curInst = inst;
4471 curInst->setLexicalId(id++);
4472
4473 if (failSafeSpill_)
4474 {
4475 spillRegOffset_ = spillRegStart_;
4476 }
4477
4478 // Insert spill code, when the target is a spilled register.
4479
4480 if (inst->getDst())
4481 {
4482 G4_RegVar* regVar = nullptr;
4483 if (inst->getDst()->getBase()->isRegVar())
4484 {
4485 regVar = getRegVar(inst->getDst());
4486 }
4487
4488 if (regVar && shouldSpillRegister(regVar))
4489 {
4490 if (getRFType(regVar) == G4_GRF)
4491 {
4492 if (inst->isPseudoKill())
4493 {
4494 (*it)->erase(jt);
4495 jt = kt;
4496 continue;
4497 }
4498
4499 insertSpillRangeCode(jt, (*it));
4500 }
4501 else
4502 {
4503 assert(0);
4504 }
4505 }
4506 }
4507
4508
4509 // Insert fill code, when the source is a spilled register.
4510
4511 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
4512 {
4513 if (inst->getSrc(i) &&
4514 inst->getSrc(i)->isSrcRegRegion ())
4515 {
4516 auto srcRR = inst->getSrc(i)->asSrcRegRegion();
4517 G4_RegVar* regVar = nullptr;
4518 if (srcRR->getBase()->isRegVar())
4519 {
4520 regVar = getRegVar(srcRR);
4521 }
4522
4523 if (regVar && shouldSpillRegister(regVar))
4524 {
4525 if (inst->isLifeTimeEnd())
4526 {
4527 (*it)->erase(jt);
4528 break;
4529 }
4530 bool mayExceedTwoGRF = (inst->isSend() && i == 0) ||
4531 inst->isDpas() ||
4532 (inst->isSplitSend() && i == 1);
4533
4534 if (mayExceedTwoGRF)
4535 {
4536 insertSendFillRangeCode(srcRR, jt, *it);
4537 }
4538 else if (getRFType(regVar) == G4_GRF)
4539 insertFillGRFRangeCode(srcRR, jt, *it);
4540 else
4541 assert(0);
4542 }
4543 }
4544 }
4545
4546 jt = kt;
4547 }
4548 }
4549
4550 bbId_ = UINT_MAX;
4551
4552 // Calculate the spill memory used in this iteration
4553
4554 for (auto spill : *spilledLRs_)
4555 {
4556 unsigned disp = spill->getVar ()->getDisp ();
4557
4558 if (spill->getVar ()->isSpilled ())
4559 {
4560 if (disp != UINT_MAX)
4561 {
4562 nextSpillOffset_ = std::max(nextSpillOffset_, disp + getByteSize(spill->getVar()));
4563 }
4564 }
4565 }
4566
4567 // Emit the instruction with the introduced spill/fill ranges in the
4568 // current iteration.
4569
4570 #ifndef NDEBUG
4571 #ifdef DEBUG_VERBOSE_ON1
4572 std::stringstream fname;
4573 fname << "spill_code_" << iterationNo_++ << "_" << kernel->getName()
4574 << ends;
4575 std::ofstream sout;
4576 sout.open(fname.str());
4577 kernel->emitDeviceAsm(sout, true, 0);
4578 sout.close ();
4579 #endif
4580 #endif
4581
4582 return true;
4583 }
4584
4585
expireRanges(unsigned int idx,std::list<LSLiveRange * > * liveList)4586 void SpillManagerGRF::expireRanges(
4587 unsigned int idx, std::list<LSLiveRange*> * liveList)
4588 {
4589 //active list is sorted in ascending order of starting index
4590
4591 while (liveList->size() > 0)
4592 {
4593 unsigned int endIdx;
4594 LSLiveRange* lr = liveList->front();
4595
4596 lr->getLastRef(endIdx);
4597
4598 if (endIdx <= idx)
4599 {
4600 #ifdef DEBUG_VERBOSE_ON
4601 DEBUG_VERBOSE("Expiring range " << lr->getTopDcl()->getName() << std::endl);
4602 #endif
4603 // Remove range from active list
4604 liveList->pop_front();
4605 lr->setActiveLR(false);
4606 }
4607 else
4608 {
4609 // As soon as we find first range that ends after ids break loop
4610 break;
4611 }
4612 }
4613
4614 return;
4615 }
4616
updateActiveList(LSLiveRange * lr,std::list<LSLiveRange * > * liveList)4617 void SpillManagerGRF::updateActiveList(
4618 LSLiveRange * lr, std::list<LSLiveRange*> * liveList)
4619 {
4620 bool done = false;
4621 unsigned int newlr_end;
4622
4623 lr->getLastRef(newlr_end);
4624
4625 for (auto active_it = liveList->begin();
4626 active_it != liveList->end();
4627 active_it++)
4628 {
4629 unsigned int end_idx;
4630 LSLiveRange* active_lr = (*active_it);
4631
4632 active_lr->getLastRef(end_idx);
4633
4634 if (end_idx > newlr_end)
4635 {
4636 liveList->insert(active_it, lr);
4637 done = true;
4638 break;
4639 }
4640 }
4641
4642 if (done == false)
4643 liveList->push_back(lr);
4644 }
4645
spillLiveRanges(G4_Kernel * kernel)4646 bool SpillManagerGRF::spillLiveRanges(G4_Kernel * kernel)
4647 {
4648 // Set the spill flag of all spilled regvars.
4649 for (LSLiveRange* lr : *spilledLSLRs_) {
4650 lr->getTopDcl()->setSpillFlag();
4651 }
4652
4653 // Handle address taken spills
4654 unsigned addrSpillNum = handleAddrTakenLSSpills(kernel, gra.pointsToAnalysis);
4655
4656 if (addrSpillNum)
4657 {
4658 for (auto spill : *spilledLSLRs_)
4659 {
4660 unsigned disp = spill->getTopDcl()->getRegVar()->getDisp();
4661
4662 if (spill->getTopDcl()->getRegVar()->isSpilled())
4663 {
4664 if (disp != UINT_MAX)
4665 {
4666 nextSpillOffset_ = std::max(nextSpillOffset_, disp + getByteSize(spill->getTopDcl()->getRegVar()));
4667 }
4668 }
4669 }
4670 }
4671
4672 // Insert spill/fill code for all basic blocks.
4673 FlowGraph& fg = kernel->fg;
4674 for (BB_LIST_ITER it = fg.begin(); it != fg.end(); it++)
4675 {
4676 bbId_ = (*it)->getId();
4677 INST_LIST::iterator jt = (*it)->begin();
4678
4679 while (jt != (*it)->end())
4680 {
4681 INST_LIST::iterator kt = jt;
4682 ++kt;
4683 G4_INST* inst = *jt;
4684 unsigned int instID = inst->getLexicalId();
4685 curInst = inst;
4686 if (instID != (unsigned int)-1)
4687 {
4688 expireRanges(instID * 2, &activeLR_);
4689 }
4690
4691 if (failSafeSpill_)
4692 {
4693 spillRegOffset_ = spillRegStart_;
4694 }
4695
4696 // Insert spill code, when the target is a spilled register.
4697 if (inst->getDst())
4698 {
4699 G4_RegVar* regVar = nullptr;
4700 if (inst->getDst()->getBase()->isRegVar())
4701 {
4702 regVar = getRegVar(inst->getDst());
4703 }
4704
4705 if (regVar && regVar->getDeclare()->isSpilled())
4706 {
4707 G4_Declare* dcl = regVar->getDeclare();
4708 while (dcl->getAliasDeclare())
4709 {
4710 dcl = dcl->getAliasDeclare();
4711 }
4712 LSLiveRange* lr = gra.getLSLR(dcl);
4713 if (!lr->isActiveLR())
4714 {
4715 lr->setActiveLR(true);
4716 updateActiveList(lr, &activeLR_);
4717 }
4718
4719 if (getRFType(regVar) == G4_GRF)
4720 {
4721 if (inst->isPseudoKill())
4722 {
4723 (*it)->erase(jt);
4724 jt = kt;
4725 continue;
4726 }
4727
4728 insertSpillRangeCode(jt, (*it));
4729 }
4730 else
4731 {
4732 assert(0);
4733 }
4734 }
4735 }
4736
4737 // Insert fill code, when the source is a spilled register.
4738 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
4739 {
4740 if (inst->getSrc(i) &&
4741 inst->getSrc(i)->isSrcRegRegion ())
4742 {
4743 auto srcRR = inst->getSrc(i)->asSrcRegRegion();
4744 G4_RegVar* regVar = nullptr;
4745 if (srcRR->getBase()->isRegVar())
4746 {
4747 regVar = getRegVar(srcRR);
4748 }
4749
4750 if (regVar && regVar->getDeclare()->isSpilled())
4751 {
4752 G4_Declare* dcl = regVar->getDeclare();
4753 while (dcl->getAliasDeclare())
4754 {
4755 dcl = dcl->getAliasDeclare();
4756 }
4757 LSLiveRange* lr = gra.getLSLR(dcl);
4758 if (!lr->isActiveLR())
4759 {
4760 lr->setActiveLR(true);
4761 updateActiveList(lr, &activeLR_);
4762 }
4763
4764 if (inst->isLifeTimeEnd())
4765 {
4766 (*it)->erase(jt);
4767 break;
4768 }
4769 bool mayExceedTwoGRF = (inst->isSend() && i == 0) ||
4770 inst->isDpas() ||
4771 (inst->isSplitSend() && i == 1);
4772
4773 if (mayExceedTwoGRF)
4774 {
4775 insertSendFillRangeCode(srcRR, jt, *it);
4776 }
4777 else if (getRFType(regVar) == G4_GRF)
4778 insertFillGRFRangeCode(srcRR, jt, *it);
4779 else
4780 assert(0);
4781 }
4782 }
4783 }
4784
4785 jt = kt;
4786 }
4787 }
4788
4789 bbId_ = UINT_MAX;
4790
4791 // Calculate the spill memory used in this iteration
4792 for (auto spill : (*spilledLSLRs_))
4793 {
4794 unsigned disp = spill->getTopDcl()->getRegVar()->getDisp();
4795
4796 if (spill->getTopDcl()->getRegVar()->isSpilled ())
4797 {
4798 if (disp != UINT_MAX)
4799 {
4800 nextSpillOffset_ = std::max(nextSpillOffset_, disp + getByteSize(spill->getTopDcl()->getRegVar()));
4801 }
4802 }
4803 }
4804
4805 return true;
4806 }
4807
4808 //
4809 // For XeHP_SDV+ scratch surface is used for the vISA stack. This means when
4810 // the scratch message cannot be used for spill/fill (e.g., stack call),
4811 // a0.2 will be used as the message descriptor for the spill/fill.
4812 // As address RA is done before GRF, we don't know if a0.2 is live at the
4813 // point of the spill/fill inst and thus may need to preserve its value.
4814 // The good news is that all spill/fill may share the same A0, so we only
4815 // need to save/restore A0 when it's actually referenced in the BB.
4816 //
saveRestoreA0(G4_BB * bb)4817 void GlobalRA::saveRestoreA0(G4_BB * bb)
4818 {
4819 G4_Declare* tmpDcl = nullptr;
4820 unsigned int subReg = 0;
4821 if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
4822 {
4823 // Use r126.6:ud for storing old a0.2 as it isn't caller/callee save
4824 tmpDcl = builder.kernel.fg.getScratchRegDcl();
4825 subReg = 6;
4826 }
4827 else
4828 {
4829 MUST_BE_TRUE(builder.hasValidOldA0Dot2(), "old a0.2 not saved");
4830 tmpDcl = builder.getOldA0Dot2Temp();
4831 subReg = 0;
4832 }
4833
4834 auto usesAddr = [](G4_INST* inst)
4835 {
4836 // ToDo: handle send with A0 msg desc better.
4837 if (inst->isSpillIntrinsic() || inst->isFillIntrinsic())
4838 {
4839 return false;
4840 }
4841 if (inst->getDst() && inst->getDst()->isAddress())
4842 {
4843 return true;
4844 }
4845 for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
4846 {
4847 if (inst->getSrc(i)->isAddress() || (inst->getSrc(i)->isSrcRegRegion() && inst->getSrc(i)->asSrcRegRegion()->isIndirect()))
4848 {
4849 return true;
4850 }
4851 }
4852 return false;
4853 };
4854
4855 // a0.2 is spilled to r126.6 (r126 is scratch reg reserved for stack call)
4856 auto a0SaveMov = [this, tmpDcl, subReg]()
4857 {
4858 auto dstSave = builder.createDst(tmpDcl->getRegVar(), 0, subReg, 1, Type_UD);
4859 auto srcSave = builder.createSrc(builder.getBuiltinA0Dot2()->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UD);
4860 auto saveInst = builder.createMov(g4::SIMD1, dstSave, srcSave, InstOpt_WriteEnable, false);
4861 return saveInst;
4862 };
4863
4864 auto a0RestoreMov = [this, tmpDcl, subReg]()
4865 {
4866 auto dstRestore = builder.createDstRegRegion(builder.getBuiltinA0Dot2(), 1);
4867 auto srcRestore = builder.createSrc(tmpDcl->getRegVar(), 0, subReg, builder.getRegionScalar(), Type_UD);
4868 auto restoreInst = builder.createMov(g4::SIMD1, dstRestore, srcRestore, InstOpt_WriteEnable, false);
4869 return restoreInst;
4870 };
4871
4872 auto a0SSOMove = [this]()
4873 {
4874 // shr (1) a0.2 SSO 0x4 {NM}
4875 // SSO is stored in r126.7
4876 auto dst = builder.createDstRegRegion(builder.getBuiltinA0Dot2(), 1);
4877 auto SSOsrc = builder.createSrc(builder.getSpillSurfaceOffset()->getRegVar(),
4878 0, 0, builder.getRegionScalar(), Type_UD);
4879 auto imm4 = builder.createImm(4, Type_UD);
4880
4881 return builder.createBinOp(G4_shr, g4::SIMD1, dst, SSOsrc, imm4, InstOpt_WriteEnable, false);
4882 };
4883
4884 auto isPrologOrEpilog = [this](G4_INST* inst)
4885 {
4886 // a0 is a caller save register. Dont save/restore it if it is used in callee save/restore sequence or
4887 // for frame descriptor spill instruction.
4888 if (inst == kernel.fg.builder->getFDSpillInst())
4889 return false;
4890
4891 if (calleeSaveInsts.find(inst) != calleeSaveInsts.end() ||
4892 calleeRestoreInsts.find(inst) != calleeRestoreInsts.end())
4893 return false;
4894
4895 return true;
4896 };
4897
4898 bool hasActiveSpillFill = false;
4899
4900 for (auto instIt = bb->begin(); instIt != bb->end(); ++instIt)
4901 {
4902 auto inst = (*instIt);
4903
4904 if (inst->isSpillIntrinsic() || inst->isFillIntrinsic())
4905 {
4906 if (!hasActiveSpillFill)
4907 {
4908 // save a0.2 to addrSpillLoc, then overwrite it with the scratch surface offset
4909 if (isPrologOrEpilog(inst))
4910 {
4911 auto addrSpill = a0SaveMov();
4912 bb->insertBefore(instIt, addrSpill);
4913 }
4914 auto a0SSO = a0SSOMove();
4915 bb->insertBefore(instIt, a0SSO);
4916 hasActiveSpillFill = true;
4917 }
4918 }
4919 else if (hasActiveSpillFill && usesAddr(inst))
4920 {
4921 // restore A0
4922 auto addrFill = a0RestoreMov();
4923 bb->insertBefore(instIt, addrFill);
4924 hasActiveSpillFill = false;
4925 }
4926 }
4927
4928 if (hasActiveSpillFill && !bb->isLastInstEOT() && !bb->isEndWithFRet())
4929 {
4930 // restore A0 before BB exit. BB is guaranteed to be non-empty as there's at least one spill/fill
4931 // If last inst is branch, insert restore before it. Otherwise insert it as last inst
4932 auto endIt = bb->back()->isCFInst() ? std::prev(bb->end()) : bb->end();
4933 bb->insertBefore(endIt, a0RestoreMov());
4934 }
4935 }
4936
computeSpillMsgDesc(unsigned int payloadSize,unsigned int offsetInGrfUnits)4937 uint32_t computeSpillMsgDesc(unsigned int payloadSize, unsigned int offsetInGrfUnits)
4938 {
4939 // Compute msg descriptor given payload size and offset.
4940 unsigned headerPresent = 0x80000;
4941 uint32_t message = headerPresent;
4942 unsigned msgLength = SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT;
4943 message |= (msgLength << getSendMsgLengthBitOffset());
4944 message |= (1 << SCRATCH_MSG_DESC_CATEORY);
4945 message |= (1 << SCRATCH_MSG_DESC_CHANNEL_MODE);
4946 message |= (1 << SCRATCH_MSG_DESC_OPERATION_MODE);
4947 unsigned blocksize_encoding = getScratchBlocksizeEncoding(payloadSize);
4948 message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
4949 int offset = offsetInGrfUnits;
4950 message |= offset;
4951
4952 return message;
4953 }
4954
computeFillMsgDesc(unsigned int payloadSize,unsigned int offsetInGrfUnits)4955 uint32_t computeFillMsgDesc(unsigned int payloadSize, unsigned int offsetInGrfUnits)
4956 {
4957 // Compute msg descriptor given payload size and offset.
4958 unsigned headerPresent = 0x80000;
4959 uint32_t message = headerPresent;
4960 unsigned msgLength = 1;
4961 message |= (msgLength << getSendMsgLengthBitOffset());
4962 message |= (1 << SCRATCH_MSG_DESC_CATEORY);
4963 message |= (0 << SCRATCH_MSG_INVALIDATE_AFTER_READ);
4964 unsigned blocksize_encoding = getScratchBlocksizeEncoding(payloadSize);
4965 message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
4966 message |= offsetInGrfUnits;
4967
4968 return message;
4969 }
4970
// Returns payload size in units of GRF rows: the largest of 8, 4, 2 that does
// not exceed numRows, or 1 when numRows is below 2 (including 0).
static unsigned int getPayloadSizeGRF(unsigned int numRows)
{
    for (unsigned int candidate = 8u; candidate > 1u; candidate /= 2u)
    {
        if (numRows >= candidate)
        {
            return candidate;
        }
    }

    return 1u;
}
4985
// Returns payload size in owords: the largest of 8, 4, 2 that does not exceed
// numOwords, or 1 when numOwords is below 2 (including 0).
static unsigned int getPayloadSizeOword(unsigned int numOwords)
{
    for (unsigned int candidate = 8u; candidate > 1u; candidate /= 2u)
    {
        if (numOwords >= candidate)
        {
            return candidate;
        }
    }

    return 1u;
}
4999
owordToGRFSize(unsigned int numOwords)5000 unsigned int GlobalRA::owordToGRFSize(unsigned int numOwords)
5001 {
5002 unsigned int GRFSize = numOwords / (2 * (numEltPerGRF<Type_UB>() / HWORD_BYTE_SIZE));
5003
5004 return GRFSize;
5005 }
5006
hwordToGRFSize(unsigned int numHwords)5007 unsigned int GlobalRA::hwordToGRFSize(unsigned int numHwords)
5008 {
5009 return owordToGRFSize(numHwords * 2);
5010 }
5011
GRFToHwordSize(unsigned int numGRFs)5012 unsigned int GlobalRA::GRFToHwordSize(unsigned int numGRFs)
5013 {
5014 return GRFSizeToOwords(numGRFs) / 2;
5015 }
5016
GRFSizeToOwords(unsigned int numGRFs)5017 unsigned int GlobalRA::GRFSizeToOwords(unsigned int numGRFs)
5018 {
5019 return numGRFs * (numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE);
5020 }
5021
// Returns the size of one hword in bytes (the file-local HWORD_BYTE_SIZE
// constant, 32).
unsigned int GlobalRA::getHWordByteSize()
{
    return HWORD_BYTE_SIZE;
}
5026
createSpillFillAddr(IR_Builder & builder,G4_Declare * addr,G4_Declare * fp,int offset)5027 static G4_INST* createSpillFillAddr(
5028 IR_Builder& builder, G4_Declare* addr, G4_Declare* fp, int offset)
5029 {
5030 auto imm = builder.createImm(offset, Type_UD);
5031 auto dst = builder.createDstRegRegion(addr, 1);
5032 if (fp)
5033 {
5034 auto src0 = builder.createSrcRegRegion(fp, builder.getRegionScalar());
5035 return builder.createBinOp(G4_add, g4::SIMD1, dst, src0, imm, InstOpt_WriteEnable, true);
5036 }
5037 else
5038 {
5039 return builder.createMov(g4::SIMD1, dst, imm, InstOpt_WriteEnable, true);
5040 }
5041 }
5042
makeSpillFillComment(const char * spillFill,const char * toFrom,const char * base,uint32_t spillOffset,const char * of)5043 static std::string makeSpillFillComment(
5044 const char *spillFill,
5045 const char *toFrom,
5046 const char *base,
5047 uint32_t spillOffset,
5048 const char *of)
5049 {
5050 std::stringstream comment;
5051 comment << spillFill << " " << toFrom << " ";
5052 comment << base << "[" << spillOffset / getGRFSize() << "*" << (int)getGRFSize() << "]";
5053 if (!of || *of == 0) // some have "" as name
5054 of = "?";
5055 comment << " of " << of;
5056 return comment.str();
5057 }
5058
expandSpillLSC(G4_BB * bb,INST_LIST_ITER & instIt)5059 void GlobalRA::expandSpillLSC(G4_BB* bb, INST_LIST_ITER& instIt)
5060 {
5061 auto& builder = kernel.fg.builder;
5062 auto inst = (*instIt)->asSpillIntrinsic();
5063 // offset into scratch surface in bytes
5064 auto spillOffset = inst->getOffsetInBytes();
5065 uint32_t numRows = inst->getNumRows();
5066 auto payload = inst->getSrc(1)->asSrcRegRegion();
5067 auto rowOffset = payload->getRegOff();
5068
5069 LSC_OP op = LSC_STORE;
5070 LSC_SFID lscSfid = LSC_UGM;
5071 LSC_CACHE_OPTS cacheOpts{ LSC_CACHING_DEFAULT, LSC_CACHING_DEFAULT };
5072
5073 LSC_ADDR addrInfo;
5074 addrInfo.type = LSC_ADDR_TYPE_SS; //Scratch memory
5075 addrInfo.immScale = 1;
5076 addrInfo.immOffset = 0;
5077 addrInfo.size = LSC_ADDR_SIZE_32b;
5078
5079 builder->instList.clear();
5080 while (numRows > 0)
5081 {
5082 auto numGRFToWrite = getPayloadSizeGRF(numRows);
5083
5084 G4_Declare* spillAddr = inst->getFP() ? kernel.fg.scratchRegDcl : inst->getHeader()->getTopDcl();
5085 {
5086 // need to calculate spill address
5087 createSpillFillAddr(*builder, spillAddr, inst->getFP(), spillOffset);
5088 }
5089
5090 LSC_DATA_SHAPE dataShape;
5091 dataShape.size = LSC_DATA_SIZE_32b;
5092 dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
5093 dataShape.elems = builder->lscGetElementNum(numGRFToWrite * getGRFSize() / 4);
5094
5095 auto src0Addr = builder->createSrcRegRegion(spillAddr, builder->getRegionStride1());
5096 auto payloadToUse = builder->createSrcWithNewRegOff(payload, rowOffset);
5097
5098 auto surface = builder->createSrcRegRegion(builder->getSpillSurfaceOffset(),
5099 builder->getRegionScalar());
5100
5101 G4_DstRegRegion* postDst = builder->createNullDst(Type_UD);
5102 G4_SendDescRaw* desc = builder->createLscMsgDesc(
5103 op,
5104 lscSfid,
5105 EXEC_SIZE_1,
5106 cacheOpts,
5107 addrInfo,
5108 dataShape,
5109 surface,
5110 0,
5111 1);
5112
5113 auto sendInst = builder->createLscSendInst(
5114 nullptr,
5115 postDst,
5116 src0Addr,
5117 payloadToUse,
5118 g4::SIMD1,
5119 desc,
5120 inst->getOption(),
5121 LSC_ADDR_TYPE_SS,
5122 false);
5123
5124 sendInst->addComment(makeSpillFillComment(
5125 "spill", "to",
5126 inst->getFP() ? "FP" : "offset",
5127 spillOffset,
5128 payload->getTopDcl()->getName()));
5129
5130 numRows -= numGRFToWrite;
5131 rowOffset += numGRFToWrite;
5132 spillOffset += numGRFToWrite * getGRFSize();
5133 }
5134
5135 if (getEUFusionWAInsts().count(inst) > 0)
5136 {
5137 removeEUFusionWAInst(inst);
5138 for (auto inst : builder->instList)
5139 addEUFusionWAInsts(inst);
5140 }
5141
5142 splice(bb, instIt, builder->instList, inst->getCISAOff());
5143 }
5144
expandFillLSC(G4_BB * bb,INST_LIST_ITER & instIt)5145 void GlobalRA::expandFillLSC(G4_BB* bb, INST_LIST_ITER& instIt)
5146 {
5147 auto& builder = kernel.fg.builder;
5148 auto inst = (*instIt)->asFillIntrinsic();
5149 // offset into scratch surface in bytes
5150 auto fillOffset = inst->getOffsetInBytes();
5151 uint32_t numRows = inst->getNumRows();
5152 auto rowOffset = inst->getDst()->getRegOff();
5153
5154 LSC_OP op = LSC_LOAD;
5155 LSC_SFID lscSfid = LSC_UGM;
5156 LSC_CACHE_OPTS cacheOpts{ LSC_CACHING_DEFAULT, LSC_CACHING_DEFAULT };
5157
5158 LSC_ADDR addrInfo;
5159 addrInfo.type = LSC_ADDR_TYPE_SS; //Scratch memory
5160 addrInfo.immScale = 1;
5161 addrInfo.immOffset = 0;
5162 addrInfo.size = LSC_ADDR_SIZE_32b;
5163
5164 builder->instList.clear();
5165
5166 while (numRows > 0)
5167 {
5168 unsigned responseLength = getPayloadSizeGRF(numRows);
5169 LSC_DATA_SHAPE dataShape;
5170 dataShape.size = LSC_DATA_SIZE_32b;
5171 dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
5172 dataShape.elems = builder->lscGetElementNum(responseLength * getGRFSize() / 4);
5173
5174 G4_Declare* fillAddr = inst->getFP() ? kernel.fg.scratchRegDcl : inst->getHeader()->getTopDcl();
5175 {
5176 // need to calculate fill address
5177 createSpillFillAddr(*builder, fillAddr, inst->getFP(), fillOffset);
5178 }
5179 auto dstRead = builder->createDst(inst->getDst()->getTopDcl()->getRegVar(),
5180 (short)rowOffset, 0, 1, Type_UD);
5181
5182 auto surface = builder->createSrcRegRegion(builder->getSpillSurfaceOffset(),
5183 builder->getRegionScalar());
5184
5185 G4_SendDescRaw* desc = builder->createLscMsgDesc(
5186 op,
5187 lscSfid,
5188 EXEC_SIZE_1,
5189 cacheOpts,
5190 addrInfo,
5191 dataShape,
5192 surface,
5193 responseLength,
5194 1);
5195
5196 auto sendInst = builder->createLscSendInst(
5197 nullptr,
5198 dstRead,
5199 builder->createSrcRegRegion(fillAddr, builder->getRegionScalar()),
5200 nullptr,
5201 g4::SIMD1,
5202 desc,
5203 inst->getOption(),
5204 LSC_ADDR_TYPE_SS,
5205 false);
5206
5207 sendInst->addComment(makeSpillFillComment(
5208 "fill", "from",
5209 inst->getFP() ? "FP" : "offset",
5210 fillOffset,
5211 dstRead->getTopDcl()->getName()));
5212
5213 numRows -= responseLength;
5214 rowOffset += responseLength;
5215 fillOffset += responseLength * getGRFSize();
5216 }
5217
5218 if (getEUFusionWAInsts().count(inst) > 0)
5219 {
5220 removeEUFusionWAInst(inst);
5221 for (auto inst : builder->instList)
5222 addEUFusionWAInsts(inst);
5223 }
5224
5225 splice(bb, instIt, builder->instList, inst->getCISAOff());
5226 }
5227
expandSpillNonStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * header,G4_SrcRegRegion * payload,G4_BB * bb,INST_LIST_ITER & instIt)5228 void GlobalRA::expandSpillNonStackcall(
5229 uint32_t numRows, uint32_t offset, short rowOffset,
5230 G4_SrcRegRegion* header, G4_SrcRegRegion* payload, G4_BB* bb,
5231 INST_LIST_ITER& instIt)
5232 {
5233 auto& builder = kernel.fg.builder;
5234 auto inst = (*instIt);
5235
5236 if (offset == G4_SpillIntrinsic::InvalidOffset)
5237 {
5238 // oword msg
5239 auto payloadToUse = builder->createSrcRegRegion(*payload);
5240 auto [spillMsgDesc, execSize] = SpillManagerGRF::createSpillSendMsgDescOWord(numRows);
5241 G4_INST* sendInst = nullptr;
5242 // Use bindless for XeHP_SDV+
5243 if (builder->hasScratchSurface())
5244 {
5245 G4_Imm* descImm = createMsgDesc(GRFSizeToOwords(numRows), true, true);
5246 // Update BTI to 251
5247 auto spillMsgDesc = descImm->getInt();
5248 spillMsgDesc = spillMsgDesc & 0xffffff00;
5249 spillMsgDesc |= 251;
5250
5251 auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, (uint32_t)spillMsgDesc, numRows);
5252 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5253
5254 // a0 is set by saveRestoreA0()
5255 auto a0Src = builder->createSrcRegRegion(builder->getBuiltinA0Dot2(), builder->getRegionScalar());
5256 sendInst = builder->createInternalSplitSendInst(execSize, inst->getDst(),
5257 header, payloadToUse, msgDescImm, inst->getOption(), msgDesc, a0Src);
5258 }
5259 else
5260 {
5261 G4_SendDescRaw * msgDesc =
5262 kernel.fg.builder->createSendMsgDesc(
5263 spillMsgDesc & 0x000FFFFFu, 0, 1, SFID::DP_DC0, numRows, 0, SendAccess::WRITE_ONLY);
5264 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5265 G4_Imm* extDesc = builder->createImm(msgDesc->getExtendedDesc(), Type_UD);
5266 sendInst = builder->createInternalSplitSendInst(execSize,
5267 inst->getDst(), header, payloadToUse, msgDescImm, inst->getOption(),
5268 msgDesc, extDesc);
5269 }
5270 instIt = bb->insertBefore(instIt, sendInst);
5271 }
5272 else
5273 {
5274 while (numRows >= 1)
5275 {
5276 auto payloadToUse = builder->createSrcWithNewRegOff(payload, rowOffset);
5277
5278 auto region = builder->getRegionStride1();
5279
5280 uint32_t spillMsgDesc = computeSpillMsgDesc(getPayloadSizeGRF(numRows), offset);
5281 auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, spillMsgDesc, getPayloadSizeGRF(numRows));
5282 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5283
5284 G4_SrcRegRegion* headerOpnd = builder->createSrcRegRegion(builder->getBuiltinR0(), region);
5285 G4_Imm* extDesc = builder->createImm(msgDesc->getExtendedDesc(), Type_UD);
5286 G4_ExecSize execSize = numRows > 1 ? g4::SIMD16 : g4::SIMD8;
5287
5288 auto sendInst = builder->createInternalSplitSendInst(execSize,
5289 inst->getDst(), headerOpnd, payloadToUse, msgDescImm,
5290 inst->getOption(), msgDesc, extDesc);
5291
5292 std::stringstream comments;
5293 comments << "scratch space spill: " << payloadToUse->getTopDcl()->getName() << " from offset[" << offset << "x32]";
5294 sendInst->addComment(comments.str());
5295
5296 instIt = bb->insertBefore(instIt, sendInst);
5297
5298 numRows -= getPayloadSizeGRF(numRows);
5299 offset += getPayloadSizeGRF(numRows);
5300 rowOffset += getPayloadSizeGRF(numRows);
5301 }
5302 }
5303 }
5304
expandSpillStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * payload,G4_BB * bb,INST_LIST_ITER & instIt)5305 void GlobalRA::expandSpillStackcall(
5306 uint32_t numRows, uint32_t offset, short rowOffset,
5307 G4_SrcRegRegion* payload, G4_BB* bb, INST_LIST_ITER& instIt)
5308 {
5309 auto& builder = kernel.fg.builder;
5310 auto inst = (*instIt);
5311
5312 auto spillIt = instIt;
5313
5314 // Use oword ld for stackcall. Lower intrinsic to:
5315 // (W) add(1 | M0) r126.2 < 1 > :ud r125.7 < 0; 1, 0 > : ud 0x0 : ud
5316 // (W) sends(8 | M0) null : ud r126 payload - src2 0x4A 0x20A02FF
5317 G4_Operand* src0 = nullptr;
5318 G4_Imm* src1 = nullptr;
5319 G4_Declare* scratchRegDcl = builder->kernel.fg.scratchRegDcl;
5320 G4_Declare* framePtr = inst->asSpillIntrinsic()->getFP();
5321
5322 // convert hword to oword offset
5323 auto numRowsOword = numRows * 2;
5324 auto offsetOword = offset * 2;
5325 auto rowOffsetOword = rowOffset * 2;
5326
5327 while (numRowsOword >= 1)
5328 {
5329 auto createOwordSpill = [&](unsigned int owordSize, G4_SrcRegRegion* payloadToUse)
5330 {
5331 G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
5332 G4_DstRegRegion* dst = builder->createNullDst((execSize > g4::SIMD8) ? Type_UW : Type_UD);
5333 auto sendSrc0 = builder->createSrc(scratchRegDcl->getRegVar(),
5334 0, 0, builder->rgnpool.createRegion(8, 8, 1), Type_UD);
5335 unsigned messageLength = owordToGRFSize(owordSize);
5336 G4_Imm* descImm = createMsgDesc(owordSize, true, true);
5337 G4_INST* sendInst = nullptr;
5338 // Use bindless for XeHP_SDV+
5339 if (builder->getPlatform() >= XeHP_SDV)
5340 {
5341 // Update BTI to 251
5342 auto spillMsgDesc = descImm->getInt();
5343 spillMsgDesc = spillMsgDesc & 0xffffff00;
5344 spillMsgDesc |= 251;
5345
5346 auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, (uint32_t)spillMsgDesc, messageLength);
5347 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5348
5349 // a0 is set by saveRestoreA0()
5350 auto a0Src = builder->createSrcRegRegion(builder->getBuiltinA0Dot2(), builder->getRegionScalar());
5351 sendInst = builder->createInternalSplitSendInst(execSize, inst->getDst(),
5352 sendSrc0, payloadToUse, msgDescImm, inst->getOption(), msgDesc, a0Src);
5353 }
5354 else
5355 {
5356 auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, (uint32_t)descImm->getInt(), messageLength);
5357 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5358 G4_Imm* extDesc = builder->createImm(msgDesc->getExtendedDesc(), Type_UD);
5359 sendInst = builder->createInternalSplitSendInst(execSize, dst, sendSrc0, payloadToUse,
5360 msgDescImm, inst->getOption() | InstOpt_WriteEnable, msgDesc, extDesc);
5361 }
5362 return sendInst;
5363 };
5364
5365 auto payloadSizeInOwords = getPayloadSizeOword(numRowsOword);
5366
5367 auto payloadToUse = builder->createSrcWithNewRegOff(payload, rowOffsetOword / 2);
5368
5369 G4_DstRegRegion* dst = builder->createDst(scratchRegDcl->getRegVar(), 0, 2, 1, Type_UD);
5370
5371 G4_INST* hdrSetInst = nullptr;
5372 if (inst->asSpillIntrinsic()->isOffsetValid())
5373 {
5374 // Skip header if spill module emits its own header
5375 if (framePtr)
5376 {
5377 src0 = builder->createSrc(framePtr->getRegVar(), 0, 0, builder->getRegionScalar(), Type_UD);
5378 src1 = builder->createImm(offsetOword, Type_UD);
5379 hdrSetInst = builder->createBinOp(G4_add, g4::SIMD1, dst, src0, src1, InstOpt_WriteEnable, false);
5380 }
5381 else
5382 {
5383 src0 = builder->createImm(offsetOword, Type_UD);
5384 hdrSetInst = builder->createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, false);
5385 }
5386
5387 bb->insertBefore(spillIt, hdrSetInst);
5388 }
5389
5390 auto spillSends = createOwordSpill(payloadSizeInOwords, payloadToUse);
5391 std::stringstream comments;
5392 comments << "stack spill: " << payload->getTopDcl()->getName() << " to FP[" << inst->asSpillIntrinsic()->getOffset() << "x32]";
5393 spillSends->addComment(comments.str());
5394
5395 bb->insertBefore(spillIt, spillSends);
5396
5397 if (getEUFusionWAInsts().count(inst) > 0)
5398 {
5399 removeEUFusionWAInst(inst);
5400 addEUFusionWAInsts(spillSends);
5401 if (hdrSetInst)
5402 addEUFusionWAInsts(hdrSetInst);
5403 }
5404
5405 if (kernel.getOption(vISA_GenerateDebugInfo))
5406 {
5407 kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asSpillIntrinsic(), hdrSetInst);
5408 kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asSpillIntrinsic(), spillSends);
5409 }
5410
5411 numRowsOword -= payloadSizeInOwords;
5412 offsetOword += payloadSizeInOwords;
5413 rowOffsetOword += payloadSizeInOwords;
5414 }
5415 }
5416
5417 // Non-stack call:
5418 // sends <-- scratch - default, supported
5419 // send <-- scratch - disable split send using compiler option, not supported by intrinsic
5420 // send <-- non-scratch - used when scratch space usage is very high, supported
5421
5422 // Stack call :
5423 // sends <-- non-scratch - default spill, supported
5424 // send <-- non-scratch - default fill, supported
expandSpillIntrinsic(G4_BB * bb)5425 void GlobalRA::expandSpillIntrinsic(G4_BB* bb)
5426 {
5427 // spill (1) null:ud bitmask:ud r0:ud payload:ud
5428 for (auto instIt = bb->begin(); instIt != bb->end();)
5429 {
5430 auto inst = (*instIt);
5431 if (inst->isSpillIntrinsic())
5432 {
5433 bool isOffBP = inst->asSpillIntrinsic()->isOffBP();
5434 uint32_t numRows = inst->asSpillIntrinsic()->getNumRows();
5435 uint32_t offset = inst->asSpillIntrinsic()->getOffset() *
5436 (numEltPerGRF<Type_UB>() / HWORD_BYTE_SIZE);
5437 auto header = inst->getSrc(0)->asSrcRegRegion();
5438 auto payload = inst->getSrc(1)->asSrcRegRegion();
5439 auto spillIt = instIt;
5440
5441 auto rowOffset = payload->getRegOff();
5442 if (useLscForNonStackCallSpillFill || spillFillIntrinUsesLSC(inst)) {
5443 expandSpillLSC(bb, instIt);
5444 }
5445 else
5446 {
5447 if (!isOffBP)
5448 {
5449 expandSpillNonStackcall(numRows, offset, rowOffset, header, payload, bb, instIt);
5450 }
5451 else
5452 {
5453 expandSpillStackcall(numRows, offset, rowOffset, payload, bb, instIt);
5454 }
5455 }
5456 numGRFSpill++;
5457 instIt = bb->erase(spillIt);
5458 continue;
5459 }
5460 instIt++;
5461 }
5462 }
5463
expandFillNonStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * header,G4_DstRegRegion * resultRgn,G4_BB * bb,INST_LIST_ITER & instIt)5464 void GlobalRA::expandFillNonStackcall(uint32_t numRows, uint32_t offset, short rowOffset, G4_SrcRegRegion* header, G4_DstRegRegion* resultRgn, G4_BB* bb, INST_LIST_ITER& instIt)
5465 {
5466 auto& builder = kernel.fg.builder;
5467 auto inst = (*instIt);
5468
5469 if (offset == G4_FillIntrinsic::InvalidOffset)
5470 {
5471 // oword msg
5472 G4_ExecSize execSize = g4::SIMD16;
5473 auto numRowsOword = GRFSizeToOwords(numRows);
5474 auto fillDst = builder->createDst(resultRgn->getBase(), rowOffset,
5475 0, resultRgn->getHorzStride(), resultRgn->getType());
5476 auto sendSrc0 = builder->createSrc(header->getBase(),
5477 0, 0, builder->rgnpool.createRegion(8, 8, 1), Type_UD);
5478 G4_Imm* desc = createMsgDesc(numRowsOword, false, false);
5479 G4_INST* sendInst = nullptr;
5480 auto sfId = SFID::DP_DC0;
5481
5482 // Use bindless for XeHP_SDV+
5483 if (builder->hasScratchSurface())
5484 {
5485 // Update BTI to 251
5486 auto newDesc = desc->getInt() & 0xffffff00;
5487 newDesc |= 251;
5488 desc = builder->createImm(newDesc, Type_UD);
5489
5490 auto msgDesc = builder->createReadMsgDesc(sfId, (uint32_t)desc->getInt());
5491 G4_Operand* msgDescOpnd = builder->createImm(msgDesc->getDesc(), Type_UD);
5492
5493 // a0 is set by saveRestoreA0()
5494 auto src1 = builder->createSrc(builder->getBuiltinA0Dot2()->getRegVar(), 0, 0,
5495 builder->getRegionScalar(), Type_UD);
5496
5497 sendInst = builder->createInternalSplitSendInst(execSize, fillDst, sendSrc0,
5498 nullptr, msgDescOpnd, InstOpt_WriteEnable, msgDesc, src1);
5499 }
5500 else
5501 {
5502 auto msgDesc = builder->createReadMsgDesc(sfId, (uint32_t)desc->getInt());
5503 G4_Operand* msgDescOpnd = builder->createImm(msgDesc->getDesc(), Type_UD);
5504 sendInst = builder->createInternalSendInst(nullptr, G4_send, execSize, fillDst, sendSrc0, msgDescOpnd,
5505 InstOpt_WriteEnable, msgDesc);
5506 }
5507 instIt = bb->insertBefore(instIt, sendInst);
5508 }
5509 else
5510 {
5511 while (numRows >= 1)
5512 {
5513 auto fillDst = builder->createDst(resultRgn->getBase(), rowOffset,
5514 0, resultRgn->getHorzStride(), resultRgn->getType());
5515
5516 auto region = builder->getRegionStride1();
5517 G4_SrcRegRegion* headerOpnd = builder->createSrcRegRegion(builder->getBuiltinR0(), region);
5518
5519 uint32_t fillMsgDesc = computeFillMsgDesc(getPayloadSizeGRF(numRows), offset);
5520
5521 G4_SendDescRaw* msgDesc = kernel.fg.builder->createSendMsgDesc(fillMsgDesc,
5522 getPayloadSizeGRF(numRows), 1, SFID::DP_DC0, 0, 0, SendAccess::READ_ONLY);
5523
5524 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5525
5526 auto sendInst = builder->createInternalSendInst(nullptr,
5527 G4_send, g4::SIMD16, fillDst, headerOpnd, msgDescImm, inst->getOption(),
5528 msgDesc);
5529
5530 std::stringstream comments;
5531 comments << "scratch space fill: " << inst->getDst()->getTopDcl()->getName() << " from offset[" << offset << "x32]";
5532 sendInst->addComment(comments.str());
5533
5534 instIt = bb->insertBefore(instIt, sendInst);
5535
5536 numRows -= getPayloadSizeGRF(numRows);
5537 offset += getPayloadSizeGRF(numRows);
5538 rowOffset += getPayloadSizeGRF(numRows);
5539 }
5540 }
5541 }
5542
expandFillStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * header,G4_DstRegRegion * resultRgn,G4_BB * bb,INST_LIST_ITER & instIt)5543 void GlobalRA::expandFillStackcall(uint32_t numRows, uint32_t offset, short rowOffset, G4_SrcRegRegion* header, G4_DstRegRegion* resultRgn, G4_BB* bb, INST_LIST_ITER& instIt)
5544 {
5545 auto& builder = kernel.fg.builder;
5546 auto inst = (*instIt);
5547 auto fillIt = instIt;
5548
5549 // Use oword ld for stackcall. Lower intrinsic to:
5550 // add (1) r126.2<1>:d FP<0;1,0>:d offset
5551 // send (16) r[startReg]<1>:uw r126 0xa desc:ud
5552 G4_Operand* src0 = nullptr;
5553 G4_Imm* src1 = nullptr;
5554 G4_Declare* scratchRegDcl = builder->kernel.fg.scratchRegDcl;
5555 G4_Declare* framePtr = inst->asFillIntrinsic()->getFP();
5556
5557 // convert hword to oword offset
5558 auto numRowsOword = numRows * 2;
5559 auto offsetOword = offset * 2;
5560 auto rowOffsetOword = rowOffset * 2;
5561
5562 while (numRowsOword >= 1)
5563 {
5564 auto createOwordFill = [&](unsigned int owordSize, G4_DstRegRegion* fillVar)
5565 {
5566 G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
5567 auto sendSrc0 = builder->createSrc(scratchRegDcl->getRegVar(),
5568 0, 0, builder->rgnpool.createRegion(8, 8, 1), Type_UD);
5569 G4_Imm* desc = createMsgDesc(owordSize, false, false);
5570 G4_INST* sendInst = nullptr;
5571 auto sfId = SFID::DP_DC0;
5572
5573 // Use bindless for XeHP_SDV+
5574 if (builder->getPlatform() >= XeHP_SDV)
5575 {
5576 // Update BTI to 251
5577 auto newDesc = desc->getInt() & 0xffffff00;
5578 newDesc |= 251;
5579 desc = builder->createImm(newDesc, Type_UD);
5580
5581 auto msgDesc = builder->createReadMsgDesc(sfId, (uint32_t)desc->getInt());
5582 G4_Operand* msgDescOpnd = builder->createImm(msgDesc->getDesc(), Type_UD);
5583
5584 // a0 is set by saveRestoreA0()
5585 auto src1 = builder->createSrc(builder->getBuiltinA0Dot2()->getRegVar(), 0, 0,
5586 builder->getRegionScalar(), Type_UD);
5587
5588 sendInst = builder->createInternalSplitSendInst(
5589 execSize, fillVar, sendSrc0,
5590 nullptr, msgDescOpnd, InstOpt_WriteEnable, msgDesc, src1);
5591 }
5592 else
5593 {
5594 auto msgDesc = builder->createReadMsgDesc(SFID::DP_DC0, (uint32_t)desc->getInt());
5595 auto msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5596 sendInst = builder->createInternalSendInst(
5597 nullptr, G4_send, execSize, fillVar, sendSrc0, msgDescImm,
5598 InstOpt_WriteEnable, msgDesc);
5599 }
5600 return sendInst;
5601 };
5602
5603 auto respSizeInOwords = getPayloadSizeOword(numRowsOword);
5604 auto fillDst = builder->createDst(resultRgn->getBase(), rowOffsetOword / 2,
5605 0, resultRgn->getHorzStride(), resultRgn->getType());
5606
5607 G4_DstRegRegion* dst = builder->createDst(scratchRegDcl->getRegVar(), 0, 2, 1, Type_UD);
5608
5609 G4_INST* hdrSetInst = nullptr;
5610 if (inst->asFillIntrinsic()->isOffsetValid())
5611 {
5612 // Skip header if spill module emits its own header
5613 if (framePtr)
5614 {
5615 src0 = builder->createSrc(framePtr->getRegVar(), 0, 0, builder->getRegionScalar(), Type_UD);
5616 src1 = builder->createImm(offsetOword, Type_UD);
5617 hdrSetInst = builder->createBinOp(G4_add, g4::SIMD1, dst, src0, src1, InstOpt_WriteEnable, false);
5618 }
5619 else
5620 {
5621 src0 = builder->createImm(offsetOword, Type_UD);
5622 hdrSetInst = builder->createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, false);
5623 }
5624
5625 bb->insertBefore(fillIt, hdrSetInst);
5626 }
5627
5628 auto fillSends = createOwordFill(respSizeInOwords, fillDst);
5629
5630 if (getEUFusionWAInsts().count(inst) > 0)
5631 {
5632 removeEUFusionWAInst(inst);
5633 addEUFusionWAInsts(fillSends);
5634 if (hdrSetInst)
5635 addEUFusionWAInsts(hdrSetInst);
5636 }
5637
5638 std::stringstream comments;
5639 comments << "stack fill: " << resultRgn->getTopDcl()->getName() << " from FP[" << inst->asFillIntrinsic()->getOffset() << "x32]";
5640 fillSends->addComment(comments.str());
5641
5642 bb->insertBefore(fillIt, fillSends);
5643
5644 if (kernel.getOption(vISA_GenerateDebugInfo))
5645 {
5646 kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asFillIntrinsic(), hdrSetInst);
5647 kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asFillIntrinsic(), fillSends);
5648 }
5649
5650 numRowsOword -= respSizeInOwords;
5651 offsetOword += respSizeInOwords;
5652 rowOffsetOword += respSizeInOwords;
5653 }
5654 }
5655
spillFillIntrinUsesLSC(G4_INST * spillFillIntrin)5656 bool GlobalRA::spillFillIntrinUsesLSC(G4_INST* spillFillIntrin)
5657 {
5658 G4_Declare* headerDcl = nullptr;
5659 if (!spillFillIntrin)
5660 return false;
5661
5662 if (spillFillIntrin->isFillIntrinsic())
5663 headerDcl = spillFillIntrin->asFillIntrinsic()->getHeader()->getTopDcl();
5664 else if (spillFillIntrin->isSpillIntrinsic())
5665 headerDcl = spillFillIntrin->asSpillIntrinsic()->getHeader()->getTopDcl();
5666
5667 if (useLscForSpillFill && headerDcl != builder.getBuiltinR0()->getRootDeclare())
5668 {
5669 return true;
5670 }
5671 return false;
5672 }
5673
expandFillIntrinsic(G4_BB * bb)5674 void GlobalRA::expandFillIntrinsic(G4_BB* bb)
5675 {
5676 // fill (1) fill_var:ud bitmask:ud offset:ud
5677 for (auto instIt = bb->begin(); instIt != bb->end();)
5678 {
5679 auto inst = (*instIt);
5680 if (inst->isFillIntrinsic())
5681 {
5682 bool isOffBP = inst->asFillIntrinsic()->isOffBP();
5683 uint32_t numRows = inst->asFillIntrinsic()->getNumRows();
5684 uint32_t offset = inst->asFillIntrinsic()->getOffset() *
5685 (numEltPerGRF<Type_UB>() / HWORD_BYTE_SIZE);
5686 auto header = inst->getSrc(0)->asSrcRegRegion();
5687 auto resultRgn = inst->getDst();
5688 auto fillIt = instIt;
5689
5690 auto rowOffset = resultRgn->getRegOff();
5691 if (useLscForNonStackCallSpillFill || spillFillIntrinUsesLSC(inst)) {
5692 expandFillLSC(bb, instIt);
5693 }
5694 else
5695 {
5696 if (!isOffBP)
5697 {
5698 expandFillNonStackcall(numRows, offset, rowOffset, header, resultRgn, bb, instIt);
5699 }
5700 else
5701 {
5702 expandFillStackcall(numRows, offset, rowOffset, header, resultRgn, bb, instIt);
5703 }
5704 }
5705 numGRFFill++;
5706 instIt = bb->erase(fillIt);
5707 continue;
5708 }
5709 instIt++;
5710 }
5711 }
5712
5713
expandSpillFillIntrinsics(unsigned int spillSizeInBytes)5714 void GlobalRA::expandSpillFillIntrinsics(unsigned int spillSizeInBytes)
5715 {
5716 auto globalScratchOffset = kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
5717
5718 for (auto bb : kernel.fg)
5719 {
5720 if (builder.hasScratchSurface() &&
5721 (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc() || kernel.fg.builder->hasValidOldA0Dot2()
5722 || (useLscForSpillFill && (spillSizeInBytes + globalScratchOffset) > SCRATCH_MSG_LIMIT &&
5723 spillSizeInBytes > 0)
5724 || (useLscForNonStackCallSpillFill && spillSizeInBytes > 0)
5725 // Following cases exist:
5726 // a. XeHP_SDV without stackcall => use hword scratch msg
5727 // b. XeHP_SDV without stackcall => using oword block msg
5728 // c. XeHP_SDV with stackcall
5729 // d. DG2+ without stackcall => hword scratch msg
5730 // e. DG2+ without stackcall => using LSC
5731 // f. DG2+ with stackcall => using LSC
5732 //
5733 // (a), (d) are similar to SKL with hword scratch msg.
5734 //
5735 // (c), (f):
5736 // a0.2 is saved/restored from r126.6:ud
5737 // SSO is saved in r126.7:ud (in replaceSSO function)
5738 // XeHP_SDV uses oword msg, DG2+ uses LSC msg
5739 // For DG2+, offset is computed in r126.0
5740 //
5741 // (b):
5742 // oword header is prepared in a temp variable, allocated by RA
5743 // a0.2 is saved/restored in oldA0Dot2(0,0) whenever required
5744 // SSO is allocated to a live-out temp (not tied to r126.7:ud)
5745 //
5746 // (e):
5747 // LSC msg is used for spill/fill
5748 // Spill offset is computed in spillHeader(0,0)
5749 // a0.2 is saved/restored in oldA0Dot2(0,0) whenever required
5750 // spillHeader is marked as live-out
5751 //
5752 // When needed:
5753 // SSO is marked as live-out
5754 // r0 is stored in r127
5755 //
5756 ))
5757 {
5758 saveRestoreA0(bb);
5759 }
5760 expandSpillIntrinsic(bb);
5761 expandFillIntrinsic(bb);
5762 }
5763 kernel.fg.builder->getcompilerStats().SetI64(CompilerStats::numGRFSpillStr(), numGRFSpill, kernel.getSimdSize());
5764 kernel.fg.builder->getcompilerStats().SetI64(CompilerStats::numGRFFillStr(), numGRFFill, kernel.getSimdSize());
5765
5766 }
5767
~SpillAnalysis()5768 SpillAnalysis::~SpillAnalysis()
5769 {
5770 if (Refs)
5771 {
5772 delete Refs;
5773 Refs = nullptr;
5774 }
5775 }
5776
Dump(std::ostream & OS)5777 void SpillAnalysis::Dump(std::ostream& OS)
5778 {
5779 auto& GRA = GC->getGRA();
5780 auto& Kernel = GRA.kernel;
5781 auto& Loops = Kernel.fg.getLoops();
5782 const auto& Spills = GC->getSpilledLiveRanges();
5783 std::unordered_map<G4_INST*, G4_BB*> InstBBMap;
5784
5785 for (auto* BB : Kernel.fg.getBBList())
5786 for (auto* Inst : BB->getInstList())
5787 InstBBMap[Inst] = BB;
5788
5789 OS << "Name, Dcl Byte Size, Spill Cost, Degree, #Defs, #Uses, Distance, #BBs, All BBs Where Live" << std::endl;
5790
5791 for (auto* Spill : Spills)
5792 {
5793 // dump - {Dcl size, Spill cost, Live BBs (loop annotation)}
5794 auto Dcl = Spill->getDcl();
5795 auto DclSizeBytes = Dcl->getByteSize();
5796 auto SpillCost = Spill->getSpillCost();
5797 auto Degree = DclDegree[Dcl];
5798 auto LiveBBs = GetLiveBBs(Dcl, InstBBMap);
5799 auto Distance = GetDistance(Dcl);
5800 auto NumDefs = Refs->getDefCount(Dcl);
5801 auto NumUses = Refs->getUseCount(Dcl);
5802
5803 OS << Dcl->getName() << "," << DclSizeBytes << ", " << SpillCost << ", " << Degree << ", "
5804 << NumDefs << ", " << NumUses << ", "
5805 << Distance << ", " << LiveBBs.size() << ", ";
5806
5807 for (auto* LiveBB : LiveBBs)
5808 {
5809 OS << "BB" << LiveBB->getId();
5810 auto* ClosestLoop = Loops.getInnerMostLoop(LiveBB);
5811 if (ClosestLoop)
5812 {
5813 OS << " [L" << ClosestLoop->id << "]";
5814 }
5815 OS << " ";
5816 }
5817
5818 OS << std::endl;
5819 }
5820 }
5821
GetDistance(G4_Declare * Dcl)5822 unsigned int SpillAnalysis::GetDistance(G4_Declare* Dcl)
5823 {
5824 if (AugIntervals.count(Dcl) == 0)
5825 {
5826 // Construct distance in conventional way
5827 auto& Kernel = GC->getGRA().kernel;
5828 unsigned int Start = 0xffffffff, End = 0x0;
5829
5830 auto* Defs = Refs->getDefs(Dcl);
5831 auto* Uses = Refs->getUses(Dcl);
5832
5833 for (auto& Def : *Defs)
5834 {
5835 auto* DefInst = std::get<0>(Def);
5836 Start = std::min(Start, DefInst->getLexicalId());
5837 End = std::max(End, DefInst->getLexicalId());
5838 }
5839
5840 for (auto& Use : *Uses)
5841 {
5842 auto* UseInst = std::get<0>(Use);
5843 Start = std::min(Start, UseInst->getLexicalId());
5844 End = std::max(End, UseInst->getLexicalId());
5845 }
5846
5847 for (auto* BB : Kernel.fg.getBBList())
5848 {
5849 if (LA->isLiveAtEntry(BB, Dcl->getRegVar()->getId()))
5850 Start = std::min(Start, BB->front()->getLexicalId());
5851 if (LA->isLiveAtExit(BB, Dcl->getRegVar()->getId()))
5852 End = std::max(End, BB->back()->getLexicalId());
5853 }
5854
5855 return End - Start;
5856 }
5857
5858 // Return augmentation distance when available
5859 auto Distance = AugIntervals[Dcl];
5860 return Distance.second->getLexicalId() - Distance.first->getLexicalId();
5861 }
5862
LoadAugIntervals(DECLARE_LIST & SortedIntervals,GlobalRA & GRA)5863 void SpillAnalysis::LoadAugIntervals(DECLARE_LIST& SortedIntervals, GlobalRA& GRA)
5864 {
5865 for (auto& LR : SortedIntervals)
5866 {
5867 auto* Start = GRA.getStartInterval(LR);
5868 auto* End = GRA.getEndInterval(LR);
5869 AugIntervals[LR] = std::make_pair(Start, End);
5870 }
5871 }
5872
LoadDegree(G4_Declare * Dcl,unsigned int degree)5873 void SpillAnalysis::LoadDegree(G4_Declare* Dcl, unsigned int degree)
5874 {
5875 // This should be called after degree computation and before simplification.
5876 DclDegree[Dcl] = degree;
5877 }
5878
Clear()5879 void SpillAnalysis::Clear()
5880 {
5881 if(Refs)
5882 delete Refs;
5883
5884 Refs = nullptr;
5885 LA = nullptr;
5886 GC = nullptr;
5887 SM = nullptr;
5888 AugIntervals.clear();
5889 DclDegree.clear();
5890
5891 }
5892
DumpHistogram(std::ostream & OS)5893 void SpillAnalysis::DumpHistogram(std::ostream& OS)
5894 {
5895 // Compute and dump histogram
5896 std::map<unsigned int, unsigned int> SpillSizeHistogram;
5897 for (auto Spill : GC->getSpilledLiveRanges())
5898 {
5899 auto ByteSize = Spill->getDcl()->getByteSize();
5900 SpillSizeHistogram[ByteSize] += 1;
5901 }
5902
5903 OS << "Spill Size Histogram For Iter#" << GC->getGRA().getIterNo() << " : " << std::endl;
5904 for (auto& Item : SpillSizeHistogram)
5905 {
5906 OS << "# vars of " << Item.first << " bytes spilled: " << Item.second << std::endl;
5907 }
5908
5909 OS << std::endl;
5910 }
5911
Do(LivenessAnalysis * L,GraphColor * C,SpillManagerGRF * S)5912 void SpillAnalysis::Do(LivenessAnalysis* L, GraphColor* C, SpillManagerGRF* S)
5913 {
5914 SetLivenessAnalysis(L);
5915 SetGraphColor(C);
5916 SetSpillManager(S);
5917
5918 unsigned int LexId = 0;
5919 for (auto* BB : C->getGRA().kernel.fg.getBBList())
5920 for (auto* Inst : BB->getInstList())
5921 Inst->setLexicalId(LexId++);
5922
5923 Refs = new VarReferences(C->getGRA().kernel);
5924
5925 auto IterNo = C->getGRA().getIterNo();
5926
5927 std::string FN = "spill-iter-";
5928 FN += std::to_string(IterNo);
5929 FN += std::string(".csv");
5930 std::ofstream OF;
5931 OF.open(FN, std::ofstream::out);
5932 Dump(OF);
5933 OF.close();
5934
5935 FN = "misc-data";
5936 OF.open(FN, IterNo == 0 ? std::ofstream::out : std::ofstream::app);
5937 if (IterNo == 0)
5938 {
5939 ((vISA::Analysis*)&C->getGRA().kernel.fg.getLoops())->dump(OF);
5940 }
5941 DumpHistogram(OF);
5942 OF.close();
5943 }
5944
GetLiveBBs(G4_Declare * Dcl,std::unordered_map<G4_INST *,G4_BB * > & InstBBMap)5945 std::vector<G4_BB*> SpillAnalysis::GetLiveBBs(G4_Declare* Dcl, std::unordered_map<G4_INST*, G4_BB*>& InstBBMap)
5946 {
5947 // Return all BBs over which Dcl is live. This includes augmentation data.
5948 auto Order = [](const G4_BB* First, const G4_BB* Second)
5949 {
5950 return First->getId() < Second->getId();
5951 };
5952 std::set<G4_BB*, decltype(Order)> BBs(Order);
5953 auto& Kernel = GC->getGRA().kernel;
5954
5955 VarReferences VarRefs(Kernel);
5956 auto* Defs = VarRefs.getDefs(Dcl);
5957 auto* Uses = VarRefs.getUses(Dcl);
5958
5959 for (auto Def : *Defs)
5960 {
5961 auto* BB = std::get<1>(Def);
5962 BBs.insert(BB);
5963 }
5964
5965 for (auto Use : *Uses)
5966 {
5967 auto* BB = std::get<1>(Use);
5968 BBs.insert(BB);
5969 }
5970
5971 for (auto BB : Kernel.fg.getBBList())
5972 {
5973 if (LA->isLiveAtEntry(BB, Dcl->getRegVar()->getId()) ||
5974 LA->isLiveAtExit(BB, Dcl->getRegVar()->getId()))
5975 {
5976 BBs.insert(BB);
5977 }
5978 }
5979
5980 if (AugIntervals.count(Dcl))
5981 {
5982 auto& Interval = AugIntervals[Dcl];
5983 auto AugBBs = GetIntervalBBs(Interval.first, Interval.second, InstBBMap);
5984 std::for_each(AugBBs.begin(), AugBBs.end(), [&](G4_BB* BB) {BBs.insert(BB); });
5985 }
5986
5987 std::vector<G4_BB*> RetBBs;
5988 std::for_each(BBs.begin(), BBs.end(), [&](G4_BB* BB) {RetBBs.push_back(BB); });
5989
5990 return RetBBs;
5991 }
5992
GetIntervalBBs(G4_INST * Start,G4_INST * End,std::unordered_map<G4_INST *,G4_BB * > & InstBBMap)5993 std::vector<G4_BB*> vISA::SpillAnalysis::GetIntervalBBs(G4_INST* Start, G4_INST* End, std::unordered_map<G4_INST*, G4_BB*>& InstBBMap)
5994 {
5995 // Return vector of BBs given Start/End G4_INST*s
5996 std::vector<G4_BB*> BBs;
5997 auto& Kernel = GC->getGRA().kernel;
5998 bool Started = false;
5999 for (auto* BB : Kernel.fg.getBBList())
6000 {
6001 bool BBAdded = false;
6002 for (auto* Inst : BB->getInstList())
6003 {
6004 if (Inst == Start)
6005 Started = true;
6006
6007 if (Started && !BBAdded)
6008 {
6009 BBs.push_back(BB);
6010 BBAdded = true;
6011 }
6012
6013 if (Inst == End)
6014 return BBs;
6015 }
6016 }
6017
6018 return BBs;
6019 }
6020