1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "SpillManagerGMRF.h"
10 #include "G4_IR.hpp"
11 #include "Mem_Manager.h"
12 #include "FlowGraph.h"
13 #include "GraphColor.h"
14 #include "BuildIR.h"
15 #include "DebugInfo.h"
16 
17 #include <cmath>
18 #include <sstream>
19 #include <fstream>
20 #include <unordered_set>
21 
22 using namespace vISA;
23 
24 // Configurations
25 
26 #define ADDRESS_SENSITIVE_SPILLS_IMPLEMENTED
27 #define REG_DWORD_SIZE (getGRFSize() / 4)
28 #define REG_BYTE_SIZE (getGRFSize())
29 #define SCRATCH_SPACE_ADDRESS_UNIT 5
30 
31 //#define DISABLE_SPILL_MEMORY_COMPRESSION
32 //#define VERIFY_SPILL_ASSIGNMENTS
33 
34 // Constant declarations
35 
36 static const unsigned DWORD_BYTE_SIZE                        = 4;
37 static const unsigned OWORD_BYTE_SIZE                        = 16;
38 static const unsigned HWORD_BYTE_SIZE                        = 32;
39 static const unsigned PAYLOAD_INPUT_REG_OFFSET               = 0;
40 static const unsigned PAYLOAD_INPUT_SUBREG_OFFSET            = 0;
41 static const unsigned OWORD_PAYLOAD_SPOFFSET_REG_OFFSET      = 0;
42 static const unsigned OWORD_PAYLOAD_SPOFFSET_SUBREG_OFFSET   = 2;
43 static const unsigned DWORD_PAYLOAD_SPOFFSET_REG_OFFSET      = 1;
44 static const unsigned DWORD_PAYLOAD_SPOFFSET_SUBREG_OFFSET   = 0;
45 static const unsigned OWORD_PAYLOAD_WRITE_REG_OFFSET         = 1;
46 static const unsigned OWORD_PAYLOAD_WRITE_SUBREG_OFFSET      = 0;
47 // dword scatter is always in SIMD8 mode
48 static const unsigned DWORD_PAYLOAD_WRITE_REG_OFFSET         = 2;
49 static const unsigned DWORD_PAYLOAD_WRITE_SUBREG_OFFSET      = 0;
50 static const unsigned OWORD_PAYLOAD_HEADER_MIN_HEIGHT        = 1;
51 static const unsigned DWORD_PAYLOAD_HEADER_MIN_HEIGHT        = 2;
52 static const unsigned OWORD_PAYLOAD_HEADER_MAX_HEIGHT        = 1;
53 static const unsigned DWORD_PAYLOAD_HEADER_MAX_HEIGHT        = 3;
54 static const unsigned DEF_HORIZ_STRIDE                       = 1;
55 static const unsigned REG_ORIGIN                             = 0;
56 static const unsigned SUBREG_ORIGIN                          = 0;
57 
58 static const unsigned SEND_GT_READ_TYPE_BIT_OFFSET           = 13;
59 static const unsigned SEND_GT_WRITE_TYPE_BIT_OFFSET          = 13;
60 static const unsigned SEND_GT_DESC_DATA_SIZE_BIT_OFFSET      = 8;
61 static const unsigned SEND_GT_OW_READ_TYPE                   = 0;
62 static const unsigned SEND_GT_OW_WRITE_TYPE                  = 8;
63 static const unsigned SEND_GT_SC_READ_TYPE                   = 6;
64 static const unsigned SEND_GT_SC_WRITE_TYPE                  = 11;
65 static const unsigned SEND_GT_DP_RD_EX_DESC_IMM              = 5;
66 static const unsigned SEND_GT_DP_SC_RD_EX_DESC_IMM           = 4;    //scatter reads go to sampler cache
67 static const unsigned SEND_GT_DP_WR_EX_DESC_IMM              = 5;
68 
69 static const unsigned SEND_IVB_MSG_TYPE_BIT_OFFSET         = 14;
70 static const unsigned SEND_IVB_OW_READ_TYPE                = 0;
71 static const unsigned SEND_IVB_OW_WRITE_TYPE               = 8;
72 static const unsigned SEND_IVB_SC_READ_TYPE                = 3;
73 static const unsigned SEND_IVB_SC_WRITE_TYPE               = 11;
74 static const unsigned SEND_IVB_DP_RD_EX_DESC_IMM           = 10; //data cache
75 static const unsigned SEND_IVB_DP_WR_EX_DESC_IMM           = 10; //data cache
76 
77 // Scratch msg
78 static const unsigned SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT     = 1;
79 static const unsigned SCRATCH_MSG_DESC_CATEORY             = 18;
80 static const unsigned SCRATCH_MSG_DESC_OPERATION_MODE      = 17;
81 static const unsigned SCRATCH_MSG_DESC_CHANNEL_MODE        = 16;
82 static const unsigned SCRATCH_MSG_INVALIDATE_AFTER_READ    = 15;
83 static const unsigned SCRATCH_MSG_DESC_BLOCK_SIZE          = 12;
84 
85 
86 #define LIMIT_SEND_EXEC_SIZE(EXEC_SIZE)  (((EXEC_SIZE) > 16) ? 16 : (EXEC_SIZE))
87 #define SPILL_PAYLOAD_HEIGHT_LIMIT 4
88 
splice(G4_BB * bb,INST_LIST_ITER iter,INST_LIST & instList,unsigned int CISAOff)89 void splice(G4_BB* bb, INST_LIST_ITER iter, INST_LIST& instList, unsigned int CISAOff)
90 {
91     // Update CISA offset of all instructions in instList before splicing
92     // operation.
93     for (auto inst : instList)
94     {
95         inst->setCISAOff(CISAOff);
96     }
97 
98     bb->splice(iter, instList);
99 }
100 
101 // spill/fill temps are always GRF-aligned, and are also even/odd aligned
102 // following the original declare's alignment
setNewDclAlignment(GlobalRA & gra,G4_Declare * newDcl,bool evenAlign)103 static void setNewDclAlignment(GlobalRA& gra, G4_Declare* newDcl, bool evenAlign)
104 {
105     newDcl->setSubRegAlign(GRFALIGN);
106     if (evenAlign)
107     {
108         newDcl->setEvenAlign();
109     }
110 
111     gra.setSubRegAlign(newDcl, GRFALIGN);
112     gra.setEvenAligned(newDcl, evenAlign);
113 }
114 
SpillManagerGRF(GlobalRA & g,unsigned spillAreaOffset,unsigned varIdCount,const LivenessAnalysis * lvInfo,LiveRange ** lrInfo,const Interference * intf,const LR_LIST * spilledLRs,unsigned iterationNo,bool failSafeSpill,unsigned spillRegSize,unsigned indrSpillRegSize,bool enableSpillSpaceCompression,bool useScratchMsg,bool avoidDstSrcOverlap)115 SpillManagerGRF::SpillManagerGRF(
116     GlobalRA& g,
117     unsigned spillAreaOffset,
118     unsigned varIdCount,
119     const LivenessAnalysis* lvInfo,
120     LiveRange** lrInfo,
121     const Interference* intf,
122     const LR_LIST* spilledLRs,
123     unsigned iterationNo,
124     bool failSafeSpill,
125     unsigned spillRegSize,
126     unsigned indrSpillRegSize,
127     bool enableSpillSpaceCompression,
128     bool useScratchMsg,
129     bool avoidDstSrcOverlap)
130     : gra(g)
131     , builder_(g.kernel.fg.builder)
132     , varIdCount_(varIdCount)
133     , latestImplicitVarIdCount_(0)
134     , lvInfo_(lvInfo)
135     , lrInfo_(lrInfo)
136     , spilledLRs_(spilledLRs)
137     , nextSpillOffset_(spillAreaOffset)
138     , iterationNo_(iterationNo)
139     , doSpillSpaceCompression(enableSpillSpaceCompression)
140     , failSafeSpill_(failSafeSpill)
141     , spillIntf_(intf)
142     , mem_(1024)
143     , useScratchMsg_(useScratchMsg)
144     , avoidDstSrcOverlap_(avoidDstSrcOverlap)
145     , refs(g.kernel)
146 {
147     const unsigned size = sizeof(unsigned) * varIdCount;
148     spillRangeCount_ = (unsigned*)allocMem(size);
149     memset(spillRangeCount_, 0, size);
150     fillRangeCount_ = (unsigned*)allocMem(size);
151     memset(fillRangeCount_, 0, size);
152     tmpRangeCount_ = (unsigned*)allocMem(size);
153     memset(tmpRangeCount_, 0, size);
154     msgSpillRangeCount_ = (unsigned*)allocMem(size);
155     memset(msgSpillRangeCount_, 0, size);
156     msgFillRangeCount_ = (unsigned*)allocMem(size);
157     memset(msgFillRangeCount_, 0, size);
158     spillAreaOffset_ = spillAreaOffset;
159     builder_->instList.clear();
160     spillRegStart_ = g.kernel.getNumRegTotal();
161     indrSpillRegStart_ = spillRegStart_;
162     spillRegOffset_ = spillRegStart_;
163     if (failSafeSpill) {
164         bool isStackCall = builder_->usesStack();
165         unsigned int stackCallRegSize = isStackCall ? builder_->kernel.numReservedABIGRF() : 0;
166         indrSpillRegStart_ -= (stackCallRegSize + indrSpillRegSize);
167         spillRegStart_ = indrSpillRegStart_ - spillRegSize;
168     }
169     curInst = nullptr;
170     globalScratchOffset = gra.kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
171     spilledLSLRs_ = nullptr;
172     if (builder_->hasScratchSurface())
173     {
174         builder_->initScratchSurfaceOffset();
175         auto entryBB = builder_->kernel.fg.getEntryBB();
176         auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
177         splice(entryBB, iter, builder_->instList, UNMAPPABLE_VISA_INDEX);
178     }
179 
180     // LSC messages are used when:
181     // a. Stack call is used on PVC+,
182     // b. Spill size exceeds what can be represented using hword msg on PVC+
183     useLSCMsg = gra.useLscForSpillFill;
184     useLscNonstackCall = gra.useLscForNonStackCallSpillFill;
185 }
186 
SpillManagerGRF(GlobalRA & g,unsigned spillAreaOffset,unsigned varIdCount,const LivenessAnalysis * lvInfo,LSLR_LIST * spilledLSLRs,bool enableSpillSpaceCompression,bool useScratchMsg,bool avoidDstSrcOverlap)187 SpillManagerGRF::SpillManagerGRF(
188     GlobalRA& g,
189     unsigned spillAreaOffset,
190     unsigned varIdCount,
191     const LivenessAnalysis* lvInfo,
192     LSLR_LIST* spilledLSLRs,
193     bool enableSpillSpaceCompression,
194     bool useScratchMsg,
195     bool avoidDstSrcOverlap)
196     : gra(g)
197     , builder_(g.kernel.fg.builder)
198     , varIdCount_(varIdCount)
199     , latestImplicitVarIdCount_(0)
200     , lvInfo_(lvInfo)
201     , spilledLSLRs_(spilledLSLRs)
202     , nextSpillOffset_(spillAreaOffset)
203     , doSpillSpaceCompression(enableSpillSpaceCompression)
204     , failSafeSpill_(false)
205     , mem_(1024)
206     , useScratchMsg_(useScratchMsg)
207     , avoidDstSrcOverlap_(avoidDstSrcOverlap)
208     , refs(g.kernel)
209 {
210     const unsigned size = sizeof(unsigned) * varIdCount;
211     spillRangeCount_ = (unsigned*)allocMem(size);
212     memset(spillRangeCount_, 0, size);
213     fillRangeCount_ = (unsigned*)allocMem(size);
214     memset(fillRangeCount_, 0, size);
215     tmpRangeCount_ = (unsigned*)allocMem(size);
216     memset(tmpRangeCount_, 0, size);
217     msgSpillRangeCount_ = (unsigned*)allocMem(size);
218     memset(msgSpillRangeCount_, 0, size);
219     msgFillRangeCount_ = (unsigned*)allocMem(size);
220     memset(msgFillRangeCount_, 0, size);
221     addrSpillFillRangeCount_ = (unsigned*)allocMem(size);
222     memset(addrSpillFillRangeCount_, 0, size);
223     spillAreaOffset_ = spillAreaOffset;
224     builder_->instList.clear();
225     curInst = NULL;
226     globalScratchOffset = gra.kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
227 
228     if (builder_->hasScratchSurface())
229     {
230         builder_->initScratchSurfaceOffset();
231         auto entryBB = builder_->kernel.fg.getEntryBB();
232         auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
233         splice(entryBB, iter, builder_->instList, UNMAPPABLE_VISA_INDEX);
234     }
235     // LSC messages are used when:
236     // a. Stack call is used on PVC+,
237     // b. Spill size exceeds what can be represented using hword msg on PVC+
238     useLSCMsg = gra.useLscForSpillFill;
239     useLscNonstackCall = gra.useLscForNonStackCallSpillFill;
240 }
241 
242 // Get the base regvar for the source or destination region.
243 template <class REGION_TYPE>
getRegVar(REGION_TYPE * region) const244 G4_RegVar *SpillManagerGRF::getRegVar(REGION_TYPE * region) const
245 {
246     G4_RegVar * spilledRegVar = (G4_RegVar *) region->getBase();
247     return spilledRegVar;
248 }
249 
250 // Get the representative regvar that will be assigned a unique spill
251 // disp and not a relative spill disp.
getReprRegVar(G4_RegVar * regVar) const252 G4_RegVar *SpillManagerGRF::getReprRegVar(G4_RegVar * regVar) const
253 {
254     G4_RegVar * absBase = regVar->getAbsBaseRegVar();
255     if (absBase->isAliased())
256         return getReprRegVar(absBase->getDeclare()->getAliasDeclare()->getRegVar());
257     else
258         return absBase;
259 }
260 
261 // Obtain the register file type of the regvar.
getRFType(G4_RegVar * regvar) const262 G4_RegFileKind SpillManagerGRF::getRFType(G4_RegVar * regvar) const
263 {
264     return regvar->getDeclare()->getRegFile();
265 }
266 
267 // Obtain the register file type of the region.
268 template <class REGION_TYPE>
getRFType(REGION_TYPE * region) const269 G4_RegFileKind SpillManagerGRF::getRFType(REGION_TYPE * region) const
270 {
271     if (region->getBase()->isRegVar())
272         return getRFType(region->getBase()->asRegVar());
273     else if (region->getBase()->isGreg())
274         return G4_GRF;
275     else
276         return G4_ADDRESS;
277 }
278 
279 // Get the byte offset of the origin of the source or destination region.
280 // The row offset component is calculated based on the the parameters of
281 // the corresponding declare directive, while the column offset is calculated
282 // based on the region parameters.
283 template <class REGION_TYPE>
getRegionOriginOffset(REGION_TYPE * region) const284 unsigned SpillManagerGRF::getRegionOriginOffset(REGION_TYPE * region) const
285 {
286     unsigned rowOffset = REG_BYTE_SIZE * region->getRegOff();
287     unsigned columnOffset = region->getSubRegOff() * region->getElemSize();
288     return rowOffset + columnOffset;
289 }
290 
291 // Get a GRF aligned mask
grfMask() const292 unsigned SpillManagerGRF::grfMask() const
293 {
294     unsigned mask = 0;
295     mask = (mask - 1);
296     MUST_BE_TRUE(std::log2(numEltPerGRF<Type_UB>()) == (float)((int)(std::log2(numEltPerGRF<Type_UB>()))), "expected integral value");
297     unsigned int bits = (unsigned int)std::log2(numEltPerGRF<Type_UB>());
298     mask = mask << bits;
299     return mask;
300 }
301 
302 // Get an hex word mask with the lower 5 bits zeroed.
hwordMask() const303 unsigned SpillManagerGRF::hwordMask() const
304 {
305     unsigned mask = 0;
306     mask = (mask - 1);
307     mask = mask << 5;
308     return mask;
309 }
310 
311 // Get an octal word mask with the lower 4 bits zeroed.
owordMask() const312 unsigned SpillManagerGRF::owordMask() const
313 {
314     unsigned mask = 0;
315     mask = (mask - 1);
316     mask = mask << 4;
317     return mask;
318 }
319 
320 // Get an dword word mask with the lower 2 bits zeroed.
dwordMask() const321 unsigned SpillManagerGRF::dwordMask() const
322 {
323     unsigned mask = 0;
324     mask = (mask - 1);
325     mask = mask << 2;
326     return mask;
327 }
328 
329 // Test of the offset is oword aligned.
owordAligned(unsigned offset) const330 bool SpillManagerGRF::owordAligned(unsigned offset) const
331 {
332     return (offset & owordMask()) == offset;
333 }
334 
335 // Test of the offset is oword aligned.
dwordAligned(unsigned offset) const336 bool SpillManagerGRF::dwordAligned(unsigned offset) const
337 {
338     return (offset & dwordMask ()) == offset;
339 }
340 
341 // Get the ceil of the ratio.
cdiv(unsigned dvd,unsigned dvr)342 unsigned SpillManagerGRF::cdiv(unsigned dvd, unsigned dvr)
343 {
344     return (dvd / dvr) + ((dvd % dvr) ? 1 : 0);
345 }
346 
347 // Get the live range corresponding to id.
shouldSpillRegister(G4_RegVar * regVar) const348 bool SpillManagerGRF::shouldSpillRegister(G4_RegVar * regVar) const
349 {
350     if (getRFType(regVar) == G4_ADDRESS)
351     {
352         return false;
353     }
354     G4_RegVar * actualRegVar =
355         (regVar->getDeclare()->getAliasDeclare()) ?
356             regVar->getDeclare()->getAliasDeclare()->getRegVar() :
357             regVar;
358     if (actualRegVar->getId() == UNDEFINED_VAL)
359         return false;
360     else if (regVar->isRegVarTransient() || regVar->isRegVarTmp())
361         return false;
362 #ifndef ADDRESS_SENSITIVE_SPILLS_IMPLEMENTED
363     else if (lvInfo_->isAddressSensitive (regVar->getId()))
364         return false;
365 #endif
366     else if (builder_->kernel.fg.isPseudoVCADcl(actualRegVar->getDeclare()) ||
367         builder_->kernel.fg.isPseudoVCEDcl(actualRegVar->getDeclare()))
368         return false;
369     else
370         return lrInfo_[actualRegVar->getId()]->getPhyReg() == NULL;
371 }
372 
373 // Get the regvar with the id.
getRegVar(unsigned id) const374 G4_RegVar *SpillManagerGRF::getRegVar(unsigned id) const
375 {
376     return (lvInfo_->vars)[id];
377 }
378 
379 // Get the byte size of the live range.
getByteSize(G4_RegVar * regVar) const380 unsigned SpillManagerGRF::getByteSize(G4_RegVar * regVar) const
381 {
382     unsigned normalizedRowSize =
383         (regVar->getDeclare()->getNumRows() > 1) ?
384             REG_BYTE_SIZE :
385             regVar->getDeclare()->getNumElems() *
386                 regVar->getDeclare()->getElemSize();
387     return normalizedRowSize * regVar->getDeclare()->getNumRows();
388 }
389 
390 // Check if the lifetime of the spill/fill memory of live range i interferes
391 // with the lifetime of the spill/fill memory of live range j
spillMemLifetimeInterfere(unsigned i,unsigned j) const392 bool SpillManagerGRF::spillMemLifetimeInterfere(
393     unsigned i, unsigned j) const
394 {
395     G4_RegVar * ireg = getRegVar(i);
396     G4_RegVar * jreg = getRegVar(j);
397     G4_RegVar * irep = getReprRegVar(ireg);
398     G4_RegVar * jrep = getReprRegVar(jreg);
399     G4_RegVar * inont = ireg->getNonTransientBaseRegVar();
400     G4_RegVar * jnont = jreg->getNonTransientBaseRegVar();
401 
402     if (ireg->isRegVarTmp()) {
403         return
404             ireg->getBaseRegVar() == jrep ||
405             spillMemLifetimeInterfere(ireg->getBaseRegVar()->getId(), j);
406     }
407     else if (jreg->isRegVarTmp()) {
408         return
409             jreg->getBaseRegVar() == irep ||
410             spillMemLifetimeInterfere (jreg->getBaseRegVar()->getId(), i);
411     }
412 
413     else if (inont->isRegVarTmp()) {
414         return
415             inont->getBaseRegVar() == jrep ||
416             spillMemLifetimeInterfere(inont->getBaseRegVar()->getId(), j);
417 
418     }
419 
420     else if (jnont->isRegVarTmp()) {
421         return
422             jnont->getBaseRegVar() == irep ||
423             spillMemLifetimeInterfere (jnont->getBaseRegVar()->getId(), i);
424     }
425 
426     else {
427         if (spillIntf_->interfereBetween(irep->getId(), jrep->getId()))
428             return true;
429         else if (getRFType (irep) != getRFType (jrep))
430             return true;
431         else
432 #ifdef DISABLE_SPILL_MEMORY_COMPRESSION
433             return irep != jrep;
434 #else
435             return false;
436 #endif
437     }
438 }
439 
440 // Calculate the spill memory displacement for the regvar.
calculateSpillDisp(G4_RegVar * regVar) const441 unsigned SpillManagerGRF::calculateSpillDisp(G4_RegVar *   regVar) const
442 {
443     assert(regVar->getDisp () == UINT_MAX);
444 
445     // Locate the blocked locations calculated from the interfering
446     // spilled live ranges and put them into a list in ascending order.
447 
448     using LocList = std::list<G4_RegVar*>;
449     LocList locList;
450     unsigned lrId =
451         (regVar->getId() >= varIdCount_)?
452         regVar->getBaseRegVar()->getId(): regVar->getId();
453     assert(lrId < varIdCount_);
454 
455     const std::vector<unsigned int>& intfs = spillIntf_->getSparseIntfForVar(lrId);
456     for (auto edge : intfs)
457     {
458         auto lrEdge = getRegVar(edge);
459         if (lrEdge->isRegVarTransient())
460             continue;
461         if (lrEdge->getDisp() == UINT_MAX)
462             continue;
463         locList.push_back(lrEdge);
464     }
465     locList.sort([](G4_RegVar* v1, G4_RegVar* v2) { return v1->getDisp() < v2->getDisp(); });
466 
467     // Find a spill slot for lRange within the locList.
468     // we always start searching from nextSpillOffset_ to facilitate intra-iteration reuse.
469     // cross iteration reuse is not done in interest of compile time.
470     unsigned regVarLocDisp = ROUND(nextSpillOffset_, numEltPerGRF<Type_UB>());
471     unsigned regVarSize = getByteSize (regVar);
472 
473     for (G4_RegVar *curLoc : locList) {
474         unsigned curLocDisp = curLoc->getDisp ();
475         if (regVarLocDisp < curLocDisp &&
476             regVarLocDisp + regVarSize <= curLocDisp)
477             break;
478         unsigned curLocEnd = curLocDisp + getByteSize(curLoc);
479         {
480             if (curLocEnd % numEltPerGRF<Type_UB>() != 0)
481                 curLocEnd = ROUND(curLocEnd, numEltPerGRF<Type_UB>());
482         }
483 
484         regVarLocDisp = (regVarLocDisp > curLocEnd)? regVarLocDisp: curLocEnd;
485     }
486 
487     return regVarLocDisp;
488 }
489 
calculateSpillDispForLS(G4_RegVar * regVar) const490 unsigned SpillManagerGRF::calculateSpillDispForLS(G4_RegVar* regVar) const
491 {
492     assert(regVar->getDisp() == UINT_MAX);
493 
494     // Locate the blocked locations calculated from the interfering
495     // spilled live ranges and put them into a list in ascending order.
496 
497     typedef std::deque < G4_RegVar* > LocList;
498     LocList locList;
499     unsigned lrId =
500         (regVar->getId() >= varIdCount_) ?
501             regVar->getBaseRegVar()->getId() : regVar->getId();
502     assert(lrId < varIdCount_);
503 
504     for (auto lr : activeLR_)
505     {
506         G4_RegVar* intfRegVar = lr->getTopDcl()->getRegVar();
507         if (intfRegVar->isRegVarTransient()) continue;
508 
509         unsigned iDisp = intfRegVar->getDisp();
510         if (iDisp == UINT_MAX) continue;
511 
512         LocList::iterator loc;
513         for (loc = locList.begin();
514             loc != locList.end() && (*loc)->getDisp() < iDisp;
515             ++loc);
516         if (loc != locList.end())
517             locList.insert(loc, intfRegVar);
518         else
519             locList.push_back(intfRegVar);
520     }
521 
522     // Find a spill slot for lRange within the locList.
523     // we always start searching from nextSpillOffset_ to facilitate intra-iteration reuse.
524     // cross iteration reuse is not done in interest of compile time.
525     unsigned regVarLocDisp = ROUND(nextSpillOffset_, numEltPerGRF<Type_UB>());
526     unsigned regVarSize = getByteSize(regVar);
527 
528     for (LocList::iterator curLoc = locList.begin(), end = locList.end(); curLoc != end;
529         ++curLoc) {
530         unsigned curLocDisp = (*curLoc)->getDisp();
531         if (regVarLocDisp < curLocDisp &&
532             regVarLocDisp + regVarSize <= curLocDisp)
533             break;
534         unsigned curLocEnd = curLocDisp + getByteSize(*curLoc);
535         {
536             if (curLocEnd % numEltPerGRF<Type_UB>() != 0)
537                 curLocEnd = ROUND(curLocEnd, numEltPerGRF<Type_UB>());
538         }
539 
540         regVarLocDisp = (regVarLocDisp > curLocEnd) ? regVarLocDisp : curLocEnd;
541     }
542 
543     return regVarLocDisp;
544 }
545 
546 // Get the spill/fill displacement of the segment containing the region.
547 // A segment is the smallest dword or oword aligned portion of memory
548 // containing the destination or source operand that can be read or saved.
549 template <class REGION_TYPE>
getSegmentDisp(REGION_TYPE * region,G4_ExecSize execSize)550 unsigned SpillManagerGRF::getSegmentDisp (
551     REGION_TYPE * region,
552     G4_ExecSize  execSize
553 )
554 {
555     assert(region->getElemSize () && execSize);
556     if (isUnalignedRegion(region, execSize))
557         return getEncAlignedSegmentDisp(region, execSize);
558     else
559         return getRegionDisp(region);
560 }
561 
562 // Get the spill/fill displacement of the regvar.
getDisp(G4_RegVar * regVar)563 unsigned SpillManagerGRF::getDisp(G4_RegVar * regVar)
564 {
565     // Already calculated spill memory disp
566 
567     if (regVar->getDisp() != UINT_MAX)
568     {
569         return regVar->getDisp();
570     }
571     else if (regVar->isAliased()) {
572         // If it is an aliased regvar then calculate the disp for the
573         // actual regvar and then calculate the disp of the aliased regvar
574         // based on it.
575         G4_Declare * regVarDcl = regVar->getDeclare();
576         return getDisp(regVarDcl->getAliasDeclare()->getRegVar()) +
577             regVarDcl->getAliasOffset();
578     }
579     else if (gra.splitResults.find(regVar->getDeclare()->getRootDeclare()) !=
580         gra.splitResults.end())
581     {
582         // this variable is result of variable splitting optimization.
583         // original variable is guaranteed to have spilled. if split
584         // variable also spills then reuse original variable's spill
585         // location.
586         auto it = gra.splitResults.find(regVar->getDeclare()->getRootDeclare());
587         auto disp = getDisp((*it).second.origDcl->getRegVar());
588         regVar->setDisp(disp);
589     }
590     else if (regVar->isRegVarTransient() &&
591         getDisp(regVar->getBaseRegVar()) != UINT_MAX)
592     {
593         // If its base regvar has been assigned a disp, then the spill memory
594         // has already been allocated for it, simply calculate the disp based
595         // on the enclosing segment disp.
596         assert(regVar->getBaseRegVar() != regVar);
597         unsigned itsDisp;
598 
599         if (regVar->isRegVarSpill()) {
600             G4_RegVarTransient * tRegVar = static_cast <G4_RegVarTransient*> (regVar);
601             assert(
602                 getSegmentByteSize(
603                 tRegVar->getDstRepRegion(), tRegVar->getExecSize()) <=
604                     getByteSize(tRegVar));
605             itsDisp =
606                 getSegmentDisp(
607                     tRegVar->getDstRepRegion(), tRegVar->getExecSize());
608         }
609         else if (regVar->isRegVarFill()) {
610             G4_RegVarTransient * tRegVar = static_cast <G4_RegVarTransient*> (regVar);
611             assert(
612                 getSegmentByteSize(
613                     tRegVar->getSrcRepRegion(),
614                     tRegVar->getExecSize()) <= getByteSize(tRegVar));
615             itsDisp =
616                 getSegmentDisp(tRegVar->getSrcRepRegion(), tRegVar->getExecSize());
617         }
618         else {
619             MUST_BE_TRUE(false, "Incorrect spill/fill ranges.");
620             itsDisp = 0;
621         }
622 
623         regVar->setDisp(itsDisp);
624     }
625     else {
626         // Allocate the spill and evaluate its disp
627         if (doSpillSpaceCompression)
628         {
629             assert(regVar->isRegVarTransient() == false);
630             if (spilledLSLRs_ != nullptr)
631             {
632                 regVar->setDisp(calculateSpillDispForLS(regVar));
633             }
634             else
635             {
636                 regVar->setDisp(calculateSpillDisp(regVar));
637             }
638         }
639         else
640         {
641             assert(regVar->isRegVarTransient() == false);
642             if (regVar->getId() >= varIdCount_)
643             {
644                 if (regVar->getBaseRegVar()->getDisp() != UINT_MAX)
645                 {
646                     regVar->setDisp(regVar->getBaseRegVar()->getDisp());
647                     return regVar->getDisp();
648                 }
649             }
650 
651             if ((spillAreaOffset_) % numEltPerGRF<Type_UB>() != 0)
652             {
653                 (spillAreaOffset_) = ROUND(spillAreaOffset_, numEltPerGRF<Type_UB>());
654             }
655 
656             regVar->setDisp(spillAreaOffset_);
657             spillAreaOffset_ += getByteSize(regVar);
658         }
659     }
660 
661     // ToDo: log this in some dump to help debug
662     //regVar->getDeclare()->dump();
663     //std::cerr << "spill offset = " << regVar->getDisp() << "\n";
664 
665     return regVar->getDisp();
666 }
667 
668 // Get the spill/fill displacement of the region.
669 template <class REGION_TYPE>
getRegionDisp(REGION_TYPE * region)670 unsigned SpillManagerGRF::getRegionDisp(REGION_TYPE * region)
671 {
672     return getDisp (getRegVar(region)) + getRegionOriginOffset(region);
673 }
674 
675 // Get the type of send message to use to spill/fill the region.
676 // The type can be either on oword read/write or a scatter read/write.
677 // If the segment corresponding to the region is dword sized then a
678 // dword read/write is used else an oword read/write is used.
679 template <class REGION_TYPE>
getMsgType(REGION_TYPE * region,G4_ExecSize execSize)680 unsigned SpillManagerGRF::getMsgType(
681     REGION_TYPE * region, G4_ExecSize execSize)
682 {
683     unsigned regionDisp = getRegionDisp(region);
684     unsigned regionByteSize = getRegionByteSize(region, execSize);
685     if (owordAligned (regionDisp) && owordAligned (regionByteSize))
686         return owordMask();
687     else
688         return getEncAlignedSegmentMsgType(region, execSize);
689 }
690 
691 // Determine if the region is unaligned w.r.t spill/fill memory read/writes.
692 // If the exact region cannot be read/written from spill/fill memory using
693 // one send instruction, then it is unaligned.
694 template <class REGION_TYPE>
isUnalignedRegion(REGION_TYPE * region,G4_ExecSize execSize)695 bool SpillManagerGRF::isUnalignedRegion(
696     REGION_TYPE * region, G4_ExecSize execSize)
697 {
698     unsigned regionDisp = getRegionDisp(region);
699     unsigned regionByteSize = getRegionByteSize(region, execSize);
700 
701     bool needs32ByteAlign = useScratchMsg_;
702     needs32ByteAlign |= useLSCMsg;
703 
704     auto bytePerGRF = numEltPerGRF<Type_UB>();
705     if (needs32ByteAlign)
706     {
707         if (regionDisp % bytePerGRF == 0 && regionByteSize % bytePerGRF == 0)
708         {
709             return
710                 regionByteSize / bytePerGRF != 1 &&
711                 regionByteSize / bytePerGRF != 2 &&
712                 regionByteSize / bytePerGRF != 4;
713         }
714         else
715             return true;
716     }
717     else
718     {
719         if (owordAligned(regionDisp) && owordAligned(regionByteSize))
720         {
721             //  Current intrinsic spill/fill cannot handle partial region spill.
722             //  If it's the partial region of a large size variable, such as V91 in following instructions, the preload is needed.
723             //  mov (16) V91(6,0)<1>:ub  %retval_ub(0,0)<1;1,0>:ub {H1, Align1}
724             //  mov (16) V91(6,16)<1>:ub %retval_ub(0,16)<1;1,0>:ub {H1, Align1}
725             G4_RegVar* var = getRegVar(region);
726             if ((var->getDeclare()->getByteSize() > bytePerGRF) &&
727                 (regionByteSize < bytePerGRF || regionDisp % bytePerGRF))
728             {
729                 return true;
730             }
731             return
732                 regionByteSize / OWORD_BYTE_SIZE != 1 &&
733                 regionByteSize / OWORD_BYTE_SIZE != 2 &&
734                 regionByteSize / OWORD_BYTE_SIZE != 4;
735         }
736         else
737             return true;
738     }
739 }
740 
741 // Calculate the smallest aligned segment encompassing the region.
742 template <class REGION_TYPE>
calculateEncAlignedSegment(REGION_TYPE * region,G4_ExecSize execSize,unsigned & start,unsigned & end,unsigned & type)743 void SpillManagerGRF::calculateEncAlignedSegment(
744     REGION_TYPE * region,
745     G4_ExecSize  execSize,
746     unsigned &    start,
747     unsigned &    end,
748     unsigned &    type)
749 {
750     unsigned regionDisp = getRegionDisp(region);
751     unsigned regionByteSize = getRegionByteSize(region, execSize);
752 
753     if (needGRFAlignedOffset())
754     {
755         unsigned hwordLB = regionDisp & grfMask();
756         unsigned hwordRB = hwordLB + numEltPerGRF<Type_UB>();
757         unsigned blockSize = numEltPerGRF<Type_UB>();
758 
759         while (regionDisp + regionByteSize > hwordRB) {
760             hwordRB += blockSize;
761         }
762 
763         assert((hwordRB - hwordLB) / REG_BYTE_SIZE <= 4);
764         start = hwordLB;
765         end = hwordRB;
766         type = grfMask();
767     }
768     else
769     {
770         unsigned owordLB = regionDisp & owordMask();
771         unsigned owordRB = owordLB + OWORD_BYTE_SIZE;
772         unsigned blockSize = OWORD_BYTE_SIZE;
773 
774         while (regionDisp + regionByteSize > owordRB) {
775             owordRB += blockSize;
776             blockSize *= 2;
777         }
778 
779         assert((owordRB - owordLB) / REG_BYTE_SIZE <= 4);
780         start = owordLB;
781         end = owordRB;
782         type = owordMask();
783     }
784 }
785 
786 // Get the byte size of the aligned segment for the region.
787 
788 template <class REGION_TYPE>
789 unsigned
getEncAlignedSegmentByteSize(REGION_TYPE * region,G4_ExecSize execSize)790 SpillManagerGRF::getEncAlignedSegmentByteSize(
791     REGION_TYPE * region,
792     G4_ExecSize  execSize
793 )
794 {
795     unsigned start, end, type;
796     calculateEncAlignedSegment(region, execSize, start, end, type);
797     return end - start;
798 }
799 
800 // Get the start offset of the aligned segment for the region.
801 template <class REGION_TYPE>
802 unsigned
getEncAlignedSegmentDisp(REGION_TYPE * region,G4_ExecSize execSize)803 SpillManagerGRF::getEncAlignedSegmentDisp(
804     REGION_TYPE * region,
805     G4_ExecSize  execSize
806 )
807 {
808     unsigned start, end, type;
809     calculateEncAlignedSegment(region, execSize, start, end, type);
810     return start;
811 }
812 
813 // Get the type of message to be used to read/write the enclosing aligned
814 // segment for the region.
815 template <class REGION_TYPE>
getEncAlignedSegmentMsgType(REGION_TYPE * region,G4_ExecSize execSize)816 unsigned SpillManagerGRF::getEncAlignedSegmentMsgType(
817     REGION_TYPE * region,
818     G4_ExecSize   execSize
819 )
820 {
821     unsigned start, end, type;
822     calculateEncAlignedSegment(region, execSize, start, end, type);
823     return type;
824 }
825 
826 // Get the byte size of the segment for the region.
827 template <class REGION_TYPE>
getSegmentByteSize(REGION_TYPE * region,G4_ExecSize execSize)828 unsigned SpillManagerGRF::getSegmentByteSize(
829     REGION_TYPE * region,
830     G4_ExecSize   execSize
831 )
832 {
833     assert(region->getElemSize () && execSize);
834     if (isUnalignedRegion(region, execSize))
835         return getEncAlignedSegmentByteSize(region, execSize);
836     else
837         return getRegionByteSize(region, execSize);
838 }
839 
840 // Get the byte size of the destination region.
getRegionByteSize(G4_DstRegRegion * region,G4_ExecSize execSize) const841 unsigned SpillManagerGRF::getRegionByteSize(
842     G4_DstRegRegion * region,
843     G4_ExecSize       execSize
844 ) const
845 {
846     unsigned size = region->getHorzStride() * region->getElemSize() *
847         (execSize - 1) + region->getElemSize();
848 
849     return size;
850 }
851 
852 // Get the byte size of the source region.
853 
getRegionByteSize(G4_SrcRegRegion * region,G4_ExecSize execSize) const854 unsigned SpillManagerGRF::getRegionByteSize(
855     G4_SrcRegRegion * region,
856     G4_ExecSize       execSize) const
857 {
858     assert(execSize % region->getRegion ()->width == 0);
859     unsigned nRows = execSize / region->getRegion ()->width;
860     unsigned size = 0;
861 
862     for (unsigned int i = 0; i < nRows - 1; i++) {
863         size += region->getRegion ()->vertStride * region->getElemSize ();
864     }
865 
866     size +=
867         region->getRegion ()->horzStride * region->getElemSize () *
868         (region->getRegion ()->width - 1) + region->getElemSize ();
869     return size;
870 }
871 
872 // Get the max exec size on a 256 bit vector for the input operand.
getMaxExecSize(G4_Operand * operand)873 static unsigned getMaxExecSize(G4_Operand * operand)
874 {
875     const unsigned size = Type_UNDEF + 1;
876     static unsigned maxExecSize [size] {8, 8, 16, 16, 16, 16, 8, 8, 0};
877     return maxExecSize[operand->getType()];
878 }
879 
880 // Check if the instruction is a SIMD 16 or 32 instruction that is logically
881 // equivalent to two instructions the second of which uses register operands
882 // at the following row with the same sub-register index.
isComprInst(G4_INST * inst) const883 bool SpillManagerGRF::isComprInst(G4_INST * inst) const
884 {
885     return inst->isComprInst();
886 }
887 
888 // Check if the source in a compressed instruction operand occupies a second
889 // register.
isMultiRegComprSource(G4_SrcRegRegion * src,G4_INST * inst) const890 bool SpillManagerGRF::isMultiRegComprSource(
891     G4_SrcRegRegion* src,
892     G4_INST *        inst) const
893 {
894     if (!inst->isComprInst ()) {
895         return false;
896     }
897     else if (isScalarReplication(src)) {
898         return false;
899     }
900     else if (inst->getExecSize() <= 8) {
901         return false;
902     }
903     else if (!src->asSrcRegRegion()->crossGRF())
904     {
905         return false;
906     }
907     else if (inst->getExecSize() == 16 &&
908              inst->getDst() &&
909              inst->getDst()->getTypeSize() == 4 &&
910              inst->getDst()->getHorzStride() == 1)
911     {
912         if (src->getTypeSize() == 2 && src->isNativePackedRegion()) {
913             return false;
914         } else {
915             return true;
916         }
917     }
918     else {
919         return true;
920     }
921 }
922 
923 // Send message information query
getSendRspLengthBitOffset() const924 unsigned SpillManagerGRF::getSendRspLengthBitOffset() const
925 {
926     return SEND_GT_RSP_LENGTH_BIT_OFFSET;
927 }
928 
929 // Send message information query
getSendMaxResponseLength() const930 unsigned SpillManagerGRF::getSendMaxResponseLength() const
931 {
932     //return SEND_GT_MAX_RESPONSE_LENGTH;
933     return 8;
934 }
935 
936 // Send message information query
getSendMsgLengthBitOffset()937 unsigned SpillManagerGRF::getSendMsgLengthBitOffset()
938 {
939     return SEND_GT_MSG_LENGTH_BIT_OFFSET;
940 }
941 
942 // Send message information query
getSendMaxMessageLength() const943 unsigned SpillManagerGRF::getSendMaxMessageLength() const
944 {
945     return SEND_GT_MAX_MESSAGE_LENGTH;
946 }
947 
948 // Send message information query
getSendDescDataSizeBitOffset()949 unsigned SpillManagerGRF::getSendDescDataSizeBitOffset()
950 {
951     return SEND_GT_DESC_DATA_SIZE_BIT_OFFSET;
952 }
953 
954 // Send message information query
getSendReadTypeBitOffset() const955 unsigned SpillManagerGRF::getSendReadTypeBitOffset() const
956 {
957     return SEND_IVB_MSG_TYPE_BIT_OFFSET;
958 }
959 
960 // Send message information query
getSendWriteTypeBitOffset()961 unsigned SpillManagerGRF::getSendWriteTypeBitOffset()
962 {
963     return SEND_IVB_MSG_TYPE_BIT_OFFSET;
964 }
965 
966 // Send message information query
getSendScReadType() const967 unsigned SpillManagerGRF::getSendScReadType() const
968 {
969     return SEND_IVB_SC_READ_TYPE;
970 }
971 
972 // Send message information query
getSendScWriteType() const973 unsigned SpillManagerGRF::getSendScWriteType() const
974 {
975     return SEND_IVB_SC_WRITE_TYPE;
976 }
977 
978 // Send message information query
getSendOwordReadType() const979 unsigned SpillManagerGRF::getSendOwordReadType() const
980 {
981     return SEND_IVB_OW_READ_TYPE;
982 }
983 
984 // Send message information query
getSendOwordWriteType()985 unsigned SpillManagerGRF::getSendOwordWriteType()
986 {
987     return SEND_IVB_OW_WRITE_TYPE;
988 }
989 
getSendExDesc(bool isWrite,bool isScatter) const990 unsigned SpillManagerGRF::getSendExDesc(bool isWrite, bool isScatter) const
991 {
992     return isWrite ? SEND_IVB_DP_WR_EX_DESC_IMM : SEND_IVB_DP_RD_EX_DESC_IMM;
993 }
994 
995 // Allocate from custom memory allocator
allocMem(unsigned size) const996 void *SpillManagerGRF::allocMem(unsigned size) const
997 {
998     return builder_->mem.alloc(size);
999 }
1000 
useSplitSend() const1001 bool SpillManagerGRF::useSplitSend() const
1002 {
1003     return builder_->useSends();
1004 }
1005 
1006 // Get a unique spill range index for regvar.
getSpillIndex(G4_RegVar * spilledRegVar)1007 unsigned SpillManagerGRF::getSpillIndex(G4_RegVar *  spilledRegVar)
1008 {
1009     return spillRangeCount_[spilledRegVar->getId()]++;
1010 }
1011 
1012 // Get a unique fill range index for regvar.
getFillIndex(G4_RegVar * spilledRegVar)1013 unsigned SpillManagerGRF::getFillIndex(
1014     G4_RegVar *  spilledRegVar
1015 )
1016 {
1017     return fillRangeCount_[spilledRegVar->getId()]++;
1018 }
1019 
1020 // Get a unique tmp index for spilled regvar.
getTmpIndex(G4_RegVar * spilledRegVar)1021 unsigned SpillManagerGRF::getTmpIndex(G4_RegVar *  spilledRegVar)
1022 {
1023     return tmpRangeCount_[spilledRegVar->getId()]++;
1024 }
1025 
1026 // Get a unique msg index for spilled regvar.
getMsgSpillIndex(G4_RegVar * spilledRegVar)1027 unsigned SpillManagerGRF::getMsgSpillIndex(
1028     G4_RegVar *  spilledRegVar)
1029 {
1030     return msgSpillRangeCount_[spilledRegVar->getId()]++;
1031 }
1032 
1033 // Get a unique msg index for filled regvar.
getMsgFillIndex(G4_RegVar * spilledRegVar)1034 unsigned SpillManagerGRF::getMsgFillIndex(
1035     G4_RegVar *  spilledRegVar)
1036 {
1037     return msgFillRangeCount_[spilledRegVar->getId()]++;
1038 }
1039 
1040 // Get a unique msg index for addr spill fill regvar.
getAddrSpillFillIndex(G4_RegVar * spilledRegVar)1041 unsigned SpillManagerGRF::getAddrSpillFillIndex(
1042     G4_RegVar *  spilledRegVar)
1043 {
1044     return addrSpillFillRangeCount_[spilledRegVar->getId()]++;
1045 }
1046 
1047 // Create a unique name for a regvar representing a spill/fill/msg live range.
createImplicitRangeName(const char * baseName,G4_RegVar * spilledRegVar,unsigned index)1048 const char *SpillManagerGRF::createImplicitRangeName(
1049     const char * baseName,
1050     G4_RegVar *  spilledRegVar,
1051     unsigned     index)
1052 {
1053     std::stringstream nameStrm;
1054     nameStrm << baseName << "_" << spilledRegVar->getName()
1055              << "_" << index << std::ends;
1056     int nameLen = unsigned(nameStrm.str().length()) + 1;
1057     char * name = (char *) allocMem(nameLen);
1058     strcpy_s(name, nameLen, nameStrm.str().c_str ());
1059     return name;
1060 }
1061 
1062 // Check if the region is a scalar replication region.
isScalarReplication(G4_SrcRegRegion * region) const1063 bool SpillManagerGRF::isScalarReplication(G4_SrcRegRegion * region) const
1064 {
1065     return region->isScalar();
1066 }
1067 
1068 // Check if we have to repeat the simd16 source in the simd8 equivalents.
1069 // The BPSEC mentions that if a replicated scalar appears in an simd16
1070 // instruction, logically we need to repeat the source region used in
1071 // the first simd8 instruction in the second simd8 instruction as well
1072 // (i.e. the reg no is not incremented by one for the second).
repeatSIMD16or32Source(G4_SrcRegRegion * region) const1073 bool SpillManagerGRF::repeatSIMD16or32Source(G4_SrcRegRegion * region) const
1074 {
1075     return isScalarReplication(region);
1076 }
1077 
1078 // Create a declare directive for a new live range (spill/fill/msg)
1079 // introduced as part of the spill code generation.
1080 G4_Declare *
createRangeDeclare(const char * name,G4_RegFileKind regFile,unsigned short nElems,unsigned short nRows,G4_Type type,DeclareType kind,G4_RegVar * base,G4_Operand * repRegion,G4_ExecSize execSize)1081 SpillManagerGRF::createRangeDeclare(
1082     const char*    name,
1083     G4_RegFileKind regFile,
1084     unsigned short nElems,
1085     unsigned short nRows,
1086     G4_Type        type,
1087     DeclareType    kind,
1088     G4_RegVar *    base,
1089     G4_Operand *   repRegion,
1090     G4_ExecSize    execSize)
1091 {
1092     G4_Declare * rangeDeclare =
1093         builder_->createDeclareNoLookup(
1094             name, regFile, nElems, nRows, type, kind,
1095             base, repRegion, execSize);
1096     rangeDeclare->getRegVar()->setId(
1097         varIdCount_ + latestImplicitVarIdCount_++);
1098     gra.setBBId(rangeDeclare, bbId_);
1099     return rangeDeclare;
1100 }
1101 
1102 // Create a GRF regvar and its declare directive to represent the spill/fill
1103 // live range.
1104 // The size of the regvar is calculated from the size of the spill/fill
1105 // region. If the spill/fill region fits into one row, then width of the
1106 // regvar is exactly as needed for the spill/fill segment, else it is
1107 // made to occupy exactly two full rows. In either case the regvar is made
1108 // to have 16 word alignment requirement. This is to satisfy the requirements
1109 // of the send instruction used to save/load the value from memory. For
1110 // region's in simd16 instruction contexts we multiply the height by 2
1111 // except for source region's with scalar replication.
1112 template <class REGION_TYPE>
createTransientGRFRangeDeclare(REGION_TYPE * region,const char * baseName,unsigned index,G4_ExecSize execSize,G4_INST * inst)1113 G4_Declare * SpillManagerGRF::createTransientGRFRangeDeclare(
1114     REGION_TYPE * region,
1115     const char  * baseName,
1116     unsigned      index,
1117     G4_ExecSize   execSize,
1118     G4_INST     * inst)
1119 {
1120     const char * name =
1121         createImplicitRangeName(baseName, getRegVar(region), index);
1122     G4_Type type = region->getType();
1123     unsigned segmentByteSize = getSegmentByteSize(region, execSize);
1124     DeclareType regVarKind =
1125         (region->isDstRegRegion ())? DeclareType::Spill : DeclareType::Fill;
1126     unsigned short width, height;
1127 
1128     if (segmentByteSize > REG_BYTE_SIZE || region->crossGRF()) {
1129         assert(REG_BYTE_SIZE % region->getElemSize () == 0);
1130         width = REG_BYTE_SIZE / region->getElemSize ();
1131         assert(segmentByteSize / REG_BYTE_SIZE <= 2);
1132         height = 2;
1133     } else {
1134         assert(segmentByteSize % region->getElemSize () == 0);
1135         width = segmentByteSize / region->getElemSize ();
1136         height = 1;
1137     }
1138 
1139     if (needGRFAlignedOffset())
1140     {
1141         // the message will read/write a minimum of one GRF
1142         if (height == 1 && width < (getGRFSize() / region->getElemSize()))
1143             width = getGRFSize() / region->getElemSize();
1144     }
1145 
1146     G4_Declare * transientRangeDeclare =
1147         createRangeDeclare(
1148             name, G4_GRF, width, height, type,
1149             regVarKind, region->getBase()->asRegVar(), region, execSize);
1150 
1151     if (failSafeSpill_)
1152     {
1153         transientRangeDeclare->getRegVar()->setPhyReg(
1154             builder_->phyregpool.getGreg(spillRegOffset_), 0);
1155         spillRegOffset_ += height;
1156     }
1157 
1158     // FIXME: We should take the original declare's alignment too, but I'm worried
1159     // we may get perf regression if FE is over-aligning or the alignment is not necessary for this inst.
1160     // So Either is used for now and we can change it later if there are bugs
1161     setNewDclAlignment(gra, transientRangeDeclare, false);
1162     return transientRangeDeclare;
1163 }
1164 
getSpillRowSizeForSendDst(G4_INST * inst)1165 static unsigned short getSpillRowSizeForSendDst(G4_INST * inst)
1166 {
1167     unsigned short nRows = 0;
1168 
1169     auto dst = inst->getDst();
1170 
1171     if (inst->isSend())
1172     {
1173         G4_SendDesc* msgDesc = inst->getMsgDesc();
1174         nRows = msgDesc->getDstLenRegs();
1175         if (dst->getTopDcl()->getByteSize() <= getGRFSize())
1176         {
1177             // we may have a send that that writes to a <1 GRF variable, but due to A64 message requirements
1178             // the send has a response length > 1. We return row size as one instead as we've only allocated
1179             // one GRF for the spilled variable in scratch space
1180             nRows = 1;
1181         }
1182     }
1183     else
1184     {
1185         assert(dst->getLinearizedStart() % numEltPerGRF<Type_UB>() == 0);
1186         nRows = (dst->getLinearizedEnd() - dst->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
1187     }
1188     return nRows;
1189 }
1190 
1191 // Create a regvar and its declare directive to represent the spill live
1192 // range that appears as a send instruction post destination GRF.
1193 // The type of the regvar is set as dword and its width 8. The type of
1194 // the post destination does not matter, so we just use type dword, and
1195 // a width of 8 so that a row corresponds to a physical register.
createPostDstSpillRangeDeclare(G4_INST * sendOut)1196 G4_Declare * SpillManagerGRF::createPostDstSpillRangeDeclare(G4_INST *sendOut)
1197 {
1198     auto dst = sendOut->getDst();
1199     G4_RegVar * spilledRegVar = getRegVar(dst);
1200     const char * name =
1201         createImplicitRangeName(
1202             "SP_GRF", spilledRegVar, getSpillIndex (spilledRegVar));
1203     unsigned short nRows = getSpillRowSizeForSendDst(sendOut);
1204 
1205       G4_DstRegRegion * normalizedPostDst = builder_->createDst(
1206         spilledRegVar, dst->getRegOff(), SUBREG_ORIGIN,
1207         DEF_HORIZ_STRIDE, Type_UD);
1208 
1209     // We use the width as the user specified, the height however is
1210     // calculated based on the message descriptor to limit register
1211     // pressure induced by the spill range.
1212 
1213     G4_Declare * transientRangeDeclare =
1214         createRangeDeclare(
1215             name, G4_GRF, REG_DWORD_SIZE, nRows, Type_UD,
1216             DeclareType::Spill, spilledRegVar, normalizedPostDst,
1217             G4_ExecSize(REG_DWORD_SIZE));
1218 
1219     if (failSafeSpill_)
1220     {
1221         if (useSplitSend())
1222         {
1223             transientRangeDeclare->getRegVar()->setPhyReg(
1224                 builder_->phyregpool.getGreg(spillRegStart_), 0);
1225             spillRegOffset_ += nRows;
1226         }
1227         else
1228         {
1229             transientRangeDeclare->getRegVar()->setPhyReg(
1230                 builder_->phyregpool.getGreg(spillRegStart_+1), 0);
1231             spillRegOffset_ += nRows + 1;
1232         }
1233     }
1234 
1235     return transientRangeDeclare;
1236 }
1237 
1238 // Create a regvar and its declare directive to represent the spill live range.
createSpillRangeDeclare(G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,G4_INST * inst)1239 G4_Declare * SpillManagerGRF::createSpillRangeDeclare(
1240     G4_DstRegRegion * spilledRegion,
1241     G4_ExecSize       execSize,
1242     G4_INST         * inst
1243 )
1244 {
1245     return
1246         createTransientGRFRangeDeclare(
1247             spilledRegion, "SP_GRF",
1248             getSpillIndex (getRegVar(spilledRegion)),
1249             execSize, inst);
1250 }
1251 
1252 // Create a regvar and its declare directive to represent the GRF fill live
1253 // range.
createGRFFillRangeDeclare(G4_SrcRegRegion * fillRegion,G4_ExecSize execSize,G4_INST * inst)1254 G4_Declare * SpillManagerGRF::createGRFFillRangeDeclare(
1255     G4_SrcRegRegion * fillRegion,
1256     G4_ExecSize       execSize,
1257     G4_INST         * inst
1258 )
1259 {
1260     assert(getRFType (fillRegion) == G4_GRF);
1261     G4_Declare * fillRangeDecl =
1262         createTransientGRFRangeDeclare(
1263             fillRegion, "FL_GRF", getFillIndex(getRegVar(fillRegion)),
1264             execSize, inst);
1265     return fillRangeDecl;
1266 }
1267 
getSpillRowSizeForSendSrc(G4_INST * inst,G4_SrcRegRegion * filledRegion)1268 static unsigned short getSpillRowSizeForSendSrc(
1269     G4_INST *         inst,
1270     G4_SrcRegRegion * filledRegion)
1271 {
1272     unsigned short nRows = 0;
1273 
1274     if (inst->isSend())
1275     {
1276         G4_SendDesc* msgDesc = inst->getMsgDesc();
1277         if (inst->isSplitSend() &&
1278             (inst->getSrc(1)->asSrcRegRegion() == filledRegion))
1279         {
1280             nRows = msgDesc->getSrc1LenRegs();
1281         }
1282         else
1283         {
1284             nRows = msgDesc->getSrc0LenRegs();
1285         }
1286     }
1287     else
1288     {
1289         nRows = (filledRegion->getLinearizedEnd() - filledRegion->getLinearizedStart() + 1) / numEltPerGRF<Type_UB>();
1290     }
1291 
1292     return nRows;
1293 }
1294 
1295 
1296 // Create a regvar and its declare directive to represent the GRF fill live range.
createSendFillRangeDeclare(G4_SrcRegRegion * filledRegion,G4_INST * sendInst)1297 G4_Declare * SpillManagerGRF::createSendFillRangeDeclare(
1298     G4_SrcRegRegion * filledRegion,
1299     G4_INST *         sendInst)
1300 {
1301     G4_RegVar * filledRegVar = getRegVar(filledRegion);
1302     const char * name =
1303         createImplicitRangeName(
1304             "FL_Send", filledRegVar, getFillIndex(filledRegVar));
1305     unsigned short nRows = getSpillRowSizeForSendSrc(sendInst, filledRegion);
1306 
1307     G4_SrcRegRegion * normalizedSendSrc =
1308         builder_->createSrcRegRegion(
1309         filledRegion->getModifier(), Direct, filledRegVar,
1310         filledRegion->getRegOff(), filledRegion->getSubRegOff(), filledRegion->getRegion(),
1311         filledRegion->getType());
1312     unsigned short width = REG_BYTE_SIZE / filledRegion->getElemSize ();
1313     assert(REG_BYTE_SIZE % filledRegion->getElemSize () == 0);
1314 
1315     // We use the width as the user specified, the height however is
1316     // calculated based on the message descriptor to limit register
1317     // pressure induced by the spill range.
1318 
1319     G4_Declare * transientRangeDeclare =
1320         createRangeDeclare(
1321         name,
1322         G4_GRF,
1323         width, nRows, filledRegion->getType(),
1324         DeclareType::Fill, filledRegVar, normalizedSendSrc,
1325         G4_ExecSize(width));
1326 
1327     setNewDclAlignment(gra, transientRangeDeclare, gra.isEvenAligned(filledRegVar->getDeclare()));
1328 
1329     if (failSafeSpill_)
1330     {
1331         if (sendInst->isEOT() && builder_->hasEOTGRFBinding())
1332         {
1333             // make sure eot src is in last 16 GRF
1334             uint32_t eotStart = gra.kernel.getNumRegTotal() - 16;
1335             if (spillRegOffset_ < eotStart)
1336             {
1337                 spillRegOffset_ = eotStart;
1338             }
1339         }
1340         transientRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1341         spillRegOffset_ += nRows;
1342     }
1343 
1344     return transientRangeDeclare;
1345 }
1346 
1347 // Create a regvar and its declare directive to represent the temporary live
1348 // range.
createTemporaryRangeDeclare(G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,bool forceSegmentAlignment)1349 G4_Declare * SpillManagerGRF::createTemporaryRangeDeclare(
1350     G4_DstRegRegion * spilledRegion,
1351     G4_ExecSize       execSize,
1352     bool              forceSegmentAlignment)
1353 {
1354     const char * name =
1355         createImplicitRangeName(
1356             "TM_GRF", getRegVar(spilledRegion),
1357             getTmpIndex(getRegVar(spilledRegion)));
1358     unsigned byteSize =
1359         (forceSegmentAlignment)?
1360         getSegmentByteSize(spilledRegion, execSize):
1361         getRegionByteSize(spilledRegion, execSize);
1362 
1363     // ensure tmp reg is large enough to hold all data when sub-reg offset is non-zero
1364     byteSize += spilledRegion->getSubRegOff() * spilledRegion->getElemSize();
1365 
1366     assert(byteSize <= 2u * REG_BYTE_SIZE);
1367     assert(byteSize % spilledRegion->getElemSize () == 0);
1368 
1369     G4_Type type = spilledRegion->getType();
1370     DeclareType regVarKind = DeclareType::Tmp;
1371 
1372     unsigned short width, height;
1373     if (byteSize > REG_BYTE_SIZE)
1374     {
1375         height = 2;
1376         width = REG_BYTE_SIZE/spilledRegion->getElemSize();
1377     }
1378     else
1379     {
1380         height = 1;
1381         width = byteSize/spilledRegion->getElemSize();
1382     }
1383 
1384     G4_RegVar* spilledRegVar = getRegVar(spilledRegion);
1385 
1386     G4_Declare * temporaryRangeDeclare =
1387         createRangeDeclare(
1388             name, G4_GRF, width, height, type,
1389             regVarKind, spilledRegVar, NULL, G4_ExecSize(0));
1390 
1391     if (failSafeSpill_)
1392     {
1393         temporaryRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1394         spillRegOffset_ += height;
1395     }
1396 
1397     setNewDclAlignment(gra, temporaryRangeDeclare, false);
1398     return temporaryRangeDeclare;
1399 }
1400 
1401 // Create a destination region that could be used in place of the spill regvar.
1402 // If the region is unaligned then the origin of the destination region
1403 // is the displacement of the orginal region from its segment, else the
1404 // origin is 0.
createSpillRangeDstRegion(G4_RegVar * spillRangeRegVar,G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,unsigned regOff)1405 G4_DstRegRegion * SpillManagerGRF::createSpillRangeDstRegion(
1406     G4_RegVar *       spillRangeRegVar,
1407     G4_DstRegRegion * spilledRegion,
1408     G4_ExecSize       execSize,
1409     unsigned          regOff)
1410 {
1411     if (isUnalignedRegion  (spilledRegion, execSize)) {
1412         unsigned segmentDisp =
1413             getEncAlignedSegmentDisp(spilledRegion, execSize);
1414         unsigned regionDisp = getRegionDisp(spilledRegion);
1415         assert(regionDisp >= segmentDisp);
1416         unsigned short subRegOff =
1417             (regionDisp - segmentDisp) / spilledRegion->getElemSize ();
1418         assert(
1419             (regionDisp - segmentDisp) % spilledRegion->getElemSize () == 0);
1420         assert(subRegOff * spilledRegion->getElemSize () +
1421                 getRegionByteSize(spilledRegion, execSize) <=
1422                 2u * REG_BYTE_SIZE);
1423 
1424         if (useScratchMsg_)
1425         {
1426             G4_Declare* parent_dcl = spilledRegion->getBase()->asRegVar()->getDeclare();
1427             unsigned off = 0;
1428             while (parent_dcl->getAliasDeclare() != NULL)
1429             {
1430                 // off is in bytes
1431                 off += parent_dcl->getAliasOffset();
1432                 parent_dcl = parent_dcl->getAliasDeclare();
1433             }
1434             off = off%numEltPerGRF<Type_UB>();
1435             // sub-regoff is in units of element size
1436             subRegOff = spilledRegion->getSubRegOff() + off/spilledRegion->getElemSize();
1437         }
1438 
1439         return builder_->createDst(
1440             spillRangeRegVar, (unsigned short) regOff, subRegOff,
1441             spilledRegion->getHorzStride(), spilledRegion->getType());
1442     }
1443 
1444     else {
1445         return builder_->createDst(
1446             spillRangeRegVar, (short) regOff, SUBREG_ORIGIN,
1447             spilledRegion->getHorzStride(), spilledRegion->getType());
1448     }
1449 }
1450 
1451 // Create a source region that could be used to copy out the temporary range
1452 // (that was created to replace the portion of the spilled live range appearing
1453 // in an instruction destination) into the segment aligned spill range for the
1454 // spilled live range that can be written out to spill memory.
createTemporaryRangeSrcRegion(G4_RegVar * tmpRangeRegVar,G4_DstRegRegion * spilledRegion,G4_ExecSize execSize,unsigned regOff)1455 G4_SrcRegRegion * SpillManagerGRF::createTemporaryRangeSrcRegion (
1456     G4_RegVar *       tmpRangeRegVar,
1457     G4_DstRegRegion * spilledRegion,
1458     G4_ExecSize       execSize,
1459     unsigned          regOff)
1460 {
1461     uint16_t horzStride = spilledRegion->getHorzStride();
1462     // A scalar region is returned when execsize is 1.
1463     const RegionDesc *rDesc = builder_->createRegionDesc(execSize, horzStride, 1, 0);
1464 
1465     return builder_->createSrc(tmpRangeRegVar, (short) regOff, spilledRegion->getSubRegOff(),
1466         rDesc, spilledRegion->getType());
1467 }
1468 
1469 // Create a source region that could be used in place of the fill regvar.
1470 // If the region is unaligned then the origin of the destination region
1471 // is the displacement of the orginal region from its segment, else the
1472 // origin is 0.
createFillRangeSrcRegion(G4_RegVar * fillRangeRegVar,G4_SrcRegRegion * filledRegion,G4_ExecSize execSize)1473 G4_SrcRegRegion * SpillManagerGRF::createFillRangeSrcRegion (
1474     G4_RegVar *       fillRangeRegVar,
1475     G4_SrcRegRegion * filledRegion,
1476     G4_ExecSize       execSize)
1477 {
1478     // we need to preserve accRegSel if it's set
1479     if (isUnalignedRegion(filledRegion, execSize)) {
1480         unsigned segmentDisp =
1481             getEncAlignedSegmentDisp(filledRegion, execSize);
1482         unsigned regionDisp = getRegionDisp(filledRegion);
1483         assert(regionDisp >= segmentDisp);
1484         unsigned short subRegOff =
1485             (regionDisp - segmentDisp) / filledRegion->getElemSize ();
1486         assert(
1487             (regionDisp - segmentDisp) % filledRegion->getElemSize () == 0);
1488 
1489         return builder_->createSrcRegRegion(
1490             filledRegion->getModifier (), Direct, fillRangeRegVar, REG_ORIGIN,
1491             subRegOff, filledRegion->getRegion(), filledRegion->getType(), filledRegion->getAccRegSel());
1492     }
1493     else
1494     {
1495         // fill intrinsic's sub-reg offset is always 0 since it is GRF aligned.
1496         // but original filled range's offset may not be 0, so actual filled
1497         // src needs to use sub-reg offset from original region.
1498         return builder_->createSrcRegRegion(
1499             filledRegion->getModifier (), Direct, fillRangeRegVar,
1500             REG_ORIGIN, filledRegion->getSubRegOff(), filledRegion->getRegion (),
1501             filledRegion->getType(), filledRegion->getAccRegSel());
1502     }
1503 }
1504 
1505 // Create a source region for the spill regvar that can be used as an operand
1506 // for a mov instruction used to copy the value to an send payload for
1507 // an oword block write message. The spillRangeRegVar segment is guaranteed
1508 // to start at an dword boundary and of a dword aligned size by construction.
1509 // The whole spillRangeRegVar segment needs to be copied out to the send
1510 // payload. The source region generated is <4;4,1>:ud so that a row occupies
1511 // a packed oword. The exec size used in the copy instruction needs to be a
1512 // multiple of 4 depending on the size of the spill regvar - 4 or 8 for the
1513 // the spill regvar appearing as the destination in a regulat 2 cycle
1514 // instructions and 16 when appearing in simd16 instructions.
createBlockSpillRangeSrcRegion(G4_RegVar * spillRangeRegVar,unsigned regOff,unsigned subregOff)1515 G4_SrcRegRegion * SpillManagerGRF::createBlockSpillRangeSrcRegion(
1516     G4_RegVar *       spillRangeRegVar,
1517     unsigned          regOff,
1518     unsigned          subregOff)
1519 {
1520     assert(getByteSize (spillRangeRegVar) % DWORD_BYTE_SIZE == 0);
1521     const RegionDesc * rDesc =
1522         builder_->rgnpool.createRegion(DWORD_BYTE_SIZE, DWORD_BYTE_SIZE, 1);
1523     return builder_->createSrc(spillRangeRegVar, (short) regOff, (short) subregOff,
1524         rDesc, Type_UD);
1525 }
1526 
1527 // Create a GRF regvar and a declare directive for it, to represent an
1528 // implicit MFR live range that will be used as the send message payload
1529 // header and write payload for spilling a regvar to memory.
createMRangeDeclare(G4_RegVar * regVar)1530 G4_Declare * SpillManagerGRF::createMRangeDeclare(G4_RegVar * regVar)
1531 {
1532     if (useSplitSend() && useScratchMsg_)
1533     {
1534         return builder_->getBuiltinR0();
1535     }
1536     else if (useLSCMsg)
1537     {
1538         return nullptr;
1539     }
1540 
1541     G4_RegVar * repRegVar =
1542         (regVar->isRegVarTransient ()) ? regVar->getBaseRegVar(): regVar;
1543     const char * name =
1544         createImplicitRangeName(
1545             "SP_MSG", repRegVar, getMsgSpillIndex(repRegVar));
1546     unsigned regVarByteSize = getByteSize (regVar);
1547     unsigned writePayloadHeight = cdiv(regVarByteSize, REG_BYTE_SIZE);
1548 
1549     if (writePayloadHeight > SPILL_PAYLOAD_HEIGHT_LIMIT) {
1550         writePayloadHeight = SPILL_PAYLOAD_HEIGHT_LIMIT;
1551     }
1552 
1553     unsigned payloadHeaderHeight =
1554         (regVarByteSize != DWORD_BYTE_SIZE)?
1555         OWORD_PAYLOAD_HEADER_MAX_HEIGHT: DWORD_PAYLOAD_HEADER_MAX_HEIGHT;
1556     unsigned short height = payloadHeaderHeight + writePayloadHeight;
1557     unsigned short width = REG_DWORD_SIZE;
1558 
1559     // We should not find ourselves using dword scattered write
1560     if (useScratchMsg_)
1561     {
1562         assert(payloadHeaderHeight != DWORD_PAYLOAD_HEADER_MAX_HEIGHT);
1563     }
1564 
1565     G4_Declare * msgRangeDeclare =
1566         createRangeDeclare(
1567             name,
1568             G4_GRF,
1569             width, height, Type_UD,
1570             DeclareType::Tmp, regVar->getNonTransientBaseRegVar (), NULL, G4_ExecSize(0));
1571 
1572     if (failSafeSpill_)
1573     {
1574         msgRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegStart_), 0);
1575     }
1576 
1577     return msgRangeDeclare;
1578 }
1579 
1580 // Create a GRF regvar and a declare directive for it, to represent an
1581 // implicit MFR live range that will be used as the send message payload
1582 // header and write payload for spilling a regvar region to memory.
createMRangeDeclare(G4_DstRegRegion * region,G4_ExecSize execSize)1583 G4_Declare * SpillManagerGRF::createMRangeDeclare(
1584     G4_DstRegRegion * region,
1585     G4_ExecSize       execSize)
1586 {
1587     if (useSplitSend() && useScratchMsg_)
1588     {
1589         return builder_->getBuiltinR0();
1590     }
1591     else if (useLSCMsg)
1592     {
1593         return nullptr;
1594     }
1595 
1596     const char * name =
1597         createImplicitRangeName(
1598             "SP_MSG", getRegVar(region),
1599             getMsgSpillIndex(getRegVar(region)));
1600     unsigned regionByteSize = getSegmentByteSize(region, execSize);
1601     unsigned writePayloadHeight = cdiv(regionByteSize, REG_BYTE_SIZE);
1602     unsigned msgType = getMsgType (region, execSize);
1603     unsigned payloadHeaderHeight =
1604         (msgType == owordMask() || msgType == hwordMask ()) ?
1605         OWORD_PAYLOAD_HEADER_MAX_HEIGHT: DWORD_PAYLOAD_HEADER_MAX_HEIGHT;
1606 
1607     // We should not find ourselves using dword scattered write
1608     if (useScratchMsg_)
1609     {
1610         assert(payloadHeaderHeight != DWORD_PAYLOAD_HEADER_MAX_HEIGHT);
1611     }
1612 
1613     unsigned height = payloadHeaderHeight + writePayloadHeight;
1614     unsigned short width = REG_DWORD_SIZE;
1615     G4_Declare * msgRangeDeclare =
1616         createRangeDeclare(
1617             name,
1618             G4_GRF,
1619             width, (unsigned short) height, Type_UD,
1620             DeclareType::Tmp, region->getBase()->asRegVar(), NULL, G4_ExecSize(0));
1621 
1622     if (failSafeSpill_)
1623     {
1624         msgRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1625         spillRegOffset_ += height;
1626     }
1627 
1628     return msgRangeDeclare;
1629 }
1630 
1631 // Create a GRF regvar and a declare directive for it, that will be used as
1632 // the send message payload header and write payload for filling a regvar
1633 // from memory.
1634 
1635 G4_Declare *
createMRangeDeclare(G4_SrcRegRegion * region,G4_ExecSize execSize)1636 SpillManagerGRF::createMRangeDeclare(
1637     G4_SrcRegRegion * region,
1638     G4_ExecSize       execSize
1639 )
1640 {
1641     if (useSplitSend() && useScratchMsg_)
1642     {
1643         return builder_->getBuiltinR0();
1644     }
1645     else if (useLSCMsg)
1646     {
1647         return nullptr;
1648     }
1649 
1650     const char * name =
1651         createImplicitRangeName(
1652             "FL_MSG", getRegVar(region),
1653             getMsgFillIndex(getRegVar(region)));
1654     getSegmentByteSize(region, execSize);
1655     unsigned payloadHeaderHeight =
1656         (getMsgType (region, execSize) == owordMask()) ?
1657         OWORD_PAYLOAD_HEADER_MIN_HEIGHT : DWORD_PAYLOAD_HEADER_MIN_HEIGHT;
1658 
1659     // We should not find ourselves using dword scattered write
1660     if (useScratchMsg_)
1661     {
1662         assert(payloadHeaderHeight != DWORD_PAYLOAD_HEADER_MAX_HEIGHT);
1663         // When using scratch msg descriptor we dont need to use a
1664         // separate GRF for payload. Source operand of send can directly
1665         // use r0.0.
1666         return builder_->getBuiltinR0();
1667     }
1668 
1669     unsigned height = payloadHeaderHeight;
1670     unsigned width = REG_DWORD_SIZE;
1671     G4_Declare * msgRangeDeclare =
1672         createRangeDeclare (
1673             name,
1674             G4_GRF,
1675             (unsigned short) width, (unsigned short) height, Type_UD,
1676             DeclareType::Tmp, region->getBase()->asRegVar(), NULL, G4_ExecSize(0));
1677 
1678     if (failSafeSpill_)
1679     {
1680         msgRangeDeclare->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
1681         spillRegOffset_ += height;
1682     }
1683 
1684     return msgRangeDeclare;
1685 }
1686 
1687 // Create a destination region for the GRF regvar for the write payload
1688 // portion of the oword block send message (used for spill). The exec size
1689 // can be either 4 or 8 for a regular 2 cycle instruction detination spills or
1690 // 16 for simd16 instruction destination spills.
createMPayloadBlockWriteDstRegion(G4_RegVar * grfRange,unsigned regOff,unsigned subregOff)1691 G4_DstRegRegion * SpillManagerGRF::createMPayloadBlockWriteDstRegion(
1692     G4_RegVar *       grfRange,
1693     unsigned          regOff,
1694     unsigned          subregOff)
1695 {
1696     regOff += OWORD_PAYLOAD_WRITE_REG_OFFSET;
1697     subregOff += OWORD_PAYLOAD_WRITE_SUBREG_OFFSET;
1698     return builder_->createDst(
1699         grfRange, (short) regOff, (short) subregOff, DEF_HORIZ_STRIDE, Type_UD);
1700 }
1701 
1702 // Create a destination region for the GRF regvar for the input header
1703 // payload portion of the send message to the data port. The exec size
1704 // needs to be 8 for the mov instruction that uses this as a destination.
createMHeaderInputDstRegion(G4_RegVar * grfRange,unsigned subregOff)1705 G4_DstRegRegion * SpillManagerGRF::createMHeaderInputDstRegion(
1706     G4_RegVar *       grfRange,
1707     unsigned          subregOff)
1708 {
1709     return builder_->createDst(
1710         grfRange, PAYLOAD_INPUT_REG_OFFSET, (short) subregOff,
1711         DEF_HORIZ_STRIDE, Type_UD);
1712 }
1713 
1714 // Create a destination region for the GRF regvar for the payload offset
1715 // portion of the oword block send message. The exec size needs to be 1
1716 // for the mov instruction that uses this as a destination.
createMHeaderBlockOffsetDstRegion(G4_RegVar * grfRange)1717 G4_DstRegRegion * SpillManagerGRF::createMHeaderBlockOffsetDstRegion(
1718     G4_RegVar *       grfRange)
1719 {
1720     return builder_->createDst(
1721         grfRange, OWORD_PAYLOAD_SPOFFSET_REG_OFFSET,
1722         OWORD_PAYLOAD_SPOFFSET_SUBREG_OFFSET, DEF_HORIZ_STRIDE,
1723         Type_UD);
1724 }
1725 
1726 // Create a source region for the input payload (r0.0). The exec size
1727 // needs to be 8 for the mov instruction that uses this as a source.
1728 G4_SrcRegRegion *
createInputPayloadSrcRegion()1729 SpillManagerGRF::createInputPayloadSrcRegion()
1730 {
1731     G4_RegVar * inputPayloadDirectReg = builder_->getBuiltinR0()->getRegVar();
1732     const RegionDesc * rDesc =
1733         builder_->rgnpool.createRegion(
1734             REG_DWORD_SIZE, REG_DWORD_SIZE, DEF_HORIZ_STRIDE);
1735     return builder_->createSrc(inputPayloadDirectReg,
1736         PAYLOAD_INPUT_REG_OFFSET, PAYLOAD_INPUT_SUBREG_OFFSET,
1737         rDesc, Type_UD);
1738 }
1739 
1740 // Create and initialize the message header for the send instruction for
1741 // save/load of value to/from memory.
1742 // The header includes the input payload and the offset (for spill disp).
1743 template <class REGION_TYPE>
createAndInitMHeader(REGION_TYPE * region,G4_ExecSize execSize)1744 G4_Declare * SpillManagerGRF::createAndInitMHeader(
1745     REGION_TYPE * region,
1746     G4_ExecSize  execSize)
1747 {
1748     G4_Declare * mRangeDcl = createMRangeDeclare(region, execSize);
1749     return initMHeader (mRangeDcl, region, execSize);
1750 }
1751 
1752 // Initialize the message header for the send instruction for save/load
1753 // of value to/from memory.
1754 // The header includes the input payload and the offset (for spill disp).
1755 template <class REGION_TYPE>
initMHeader(G4_Declare * mRangeDcl,REGION_TYPE * region,G4_ExecSize execSize)1756 G4_Declare * SpillManagerGRF::initMHeader(
1757     G4_Declare *  mRangeDcl,
1758     REGION_TYPE * region,
1759     G4_ExecSize  execSize)
1760 {
1761     // Initialize the message header with the input payload.
1762     if ((useScratchMsg_ && mRangeDcl == builder_->getBuiltinR0()) || !headerNeeded())
1763     {
1764         // mRangeDcl is NULL for fills
1765         return mRangeDcl;
1766     }
1767 
1768     G4_DstRegRegion * mHeaderInputDstRegion =
1769         createMHeaderInputDstRegion(mRangeDcl->getRegVar());
1770     G4_SrcRegRegion * inputPayload = createInputPayloadSrcRegion();
1771     createMovInst(G4_ExecSize(REG_DWORD_SIZE), mHeaderInputDstRegion, inputPayload);
1772     numGRFMove++;
1773 
1774     if (useScratchMsg_)
1775     {
1776         // Initialize msg header when region is a spill
1777         // When using scratch msg description, we only need to copy
1778         // r0.0 in to msg header. Memory offset will be
1779         // specified in the msg descriptor.
1780     }
1781     else
1782     {
1783       // Initialize the message header with the spill disp for block
1784       // read/write.
1785         G4_DstRegRegion * mHeaderOffsetDstRegion =
1786             createMHeaderBlockOffsetDstRegion(mRangeDcl->getRegVar());
1787         int offset = getSegmentDisp(region, execSize);
1788         getSpillOffset(offset);
1789         unsigned segmentDisp = offset / OWORD_BYTE_SIZE;
1790         G4_Imm * segmentDispImm = builder_->createImm (segmentDisp, Type_UD);
1791 
1792         if (!region->isSrcRegRegion() && !region->isDstRegRegion())
1793         {
1794             MUST_BE_TRUE(false, ERROR_GRAPHCOLOR);
1795         }
1796 
1797         if (builder_->getIsKernel() == false)
1798         {
1799             createAddFPInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
1800         }
1801         else
1802         {
1803             createMovInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
1804         }
1805         numGRFMove++;
1806     }
1807 
1808     // Initialize the message header with the spill disp for scatter
1809     // read/write.
1810     return mRangeDcl;
1811 }
1812 
1813 // Create and initialize the message header for the send instruction.
1814 // The header includes the input payload (for spill disp).
createAndInitMHeader(G4_RegVar * regVar)1815 G4_Declare * SpillManagerGRF::createAndInitMHeader(G4_RegVar * regVar)
1816 {
1817     G4_Declare * mRangeDcl = createMRangeDeclare(regVar);
1818     return initMHeader (mRangeDcl);
1819 }
1820 
1821 // Initialize the message header for the send instruction.
1822 // The header includes the input payload (for spill disp).
initMHeader(G4_Declare * mRangeDcl)1823 G4_Declare * SpillManagerGRF::initMHeader(G4_Declare * mRangeDcl)
1824 {
1825     // Initialize the message header with the input payload.
1826     if ((useScratchMsg_ && mRangeDcl == builder_->getBuiltinR0()) || !headerNeeded())
1827     {
1828         // mRangeDcl is NULL for fills
1829         return mRangeDcl;
1830     }
1831 
1832     G4_DstRegRegion * mHeaderInputDstRegion =
1833         createMHeaderInputDstRegion(mRangeDcl->getRegVar());
1834     G4_SrcRegRegion * inputPayload = createInputPayloadSrcRegion();
1835     createMovInst(G4_ExecSize(REG_DWORD_SIZE), mHeaderInputDstRegion, inputPayload);
1836     numGRFMove ++;
1837 
1838     return mRangeDcl;
1839 }
1840 
1841 // Initialize the the write payload part of the message for spilled regvars.
1842 // Either of the following restrictions for spillRangeDcl are assumed:
1843 //        - the regvar element type is dword and its 2 <= width <= 8 and
1844 //        height - regOff == 1
1845 //        - the regvar element type is dword and its width = 8 and
1846 //        2 <= height - regOff <= 8
1847 //      - the regvar element type is dword and its width and height are 1
initMWritePayload(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height)1848 void SpillManagerGRF::initMWritePayload(
1849     G4_Declare *      spillRangeDcl,
1850     G4_Declare *      mRangeDcl,
1851     unsigned          regOff,
1852     unsigned          height)
1853 {
1854     if (useSplitSend())
1855     {
1856         // no need for payload moves if using sends
1857         return;
1858     }
1859 
1860     // We use an block write when the spilled regvars's segment is greater
1861     // than a dword. Generate a mov to copy the oword aligned segment into
1862     // the write payload part of the message.
1863     {
1864         unsigned nRows = height;
1865 
1866         for (unsigned i = 0; i < nRows; i++) {
1867             G4_SrcRegRegion * spillRangeSrcRegion =
1868                 createBlockSpillRangeSrcRegion(
1869                     spillRangeDcl->getRegVar(), i + regOff);
1870             G4_DstRegRegion * mPayloadWriteDstRegion =
1871                 createMPayloadBlockWriteDstRegion (
1872                     mRangeDcl->getRegVar(), i);
1873             G4_ExecSize movExecSize =
1874                 G4_ExecSize((nRows > 1) ? REG_DWORD_SIZE : spillRangeDcl->getNumElems());
1875             createMovInst(
1876                 movExecSize, mPayloadWriteDstRegion, spillRangeSrcRegion);
1877             numGRFMove ++;
1878         }
1879     }
1880 }
1881 
1882 // Initialize the the write payload part of the message for spilled regions.
initMWritePayload(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize,unsigned regOff)1883 void SpillManagerGRF::initMWritePayload(
1884     G4_Declare *      spillRangeDcl,
1885     G4_Declare *      mRangeDcl,
1886     G4_DstRegRegion * spilledRangeRegion,
1887     G4_ExecSize       execSize,
1888     unsigned          regOff)
1889 {
1890     // We use an block write when the spilled region's segment is greater
1891     // than a dword. Generate a mov to copy the oword aligned segment into
1892     // the write payload part of the message.
1893     if (useSplitSend())
1894     {
1895         // no need for payload moves
1896         return;
1897     }
1898     else
1899     {
1900         G4_SrcRegRegion * spillRangeSrcRegion =
1901             createBlockSpillRangeSrcRegion(
1902                 spillRangeDcl->getRegVar(), regOff);
1903         G4_DstRegRegion * mPayloadWriteDstRegion =
1904             createMPayloadBlockWriteDstRegion (mRangeDcl->getRegVar());
1905         unsigned segmentByteSize =
1906             getSegmentByteSize(spilledRangeRegion, execSize);
1907         G4_ExecSize movExecSize {segmentByteSize / DWORD_BYTE_SIZE};
1908 
1909         // Write entire GRF when using scratch msg descriptor
1910         if (useScratchMsg_)
1911         {
1912             if (movExecSize <= 8)
1913                 movExecSize = g4::SIMD8;
1914             else if (movExecSize < g4::SIMD16)
1915                 movExecSize = g4::SIMD16;
1916         }
1917 
1918         assert(segmentByteSize % DWORD_BYTE_SIZE == 0);
1919         assert(movExecSize <= g4::SIMD16);
1920         createMovInst(
1921             movExecSize, mPayloadWriteDstRegion, spillRangeSrcRegion);
1922         numGRFMove ++;
1923     }
1924 }
1925 
1926 // Return the block size encoding for oword block reads.
blockSendBlockSizeCode(unsigned size)1927 unsigned SpillManagerGRF::blockSendBlockSizeCode(unsigned size)
1928 {
1929     auto code = GlobalRA::sendBlockSizeCode(size);
1930     return code << getSendDescDataSizeBitOffset();
1931 }
1932 
1933 // Return the block size encoding for dword scatter reads.
scatterSendBlockSizeCode(unsigned size) const1934 unsigned SpillManagerGRF::scatterSendBlockSizeCode(unsigned size) const
1935 {
1936     unsigned code;
1937 
1938     switch (size) {
1939         case 1:
1940             // We will use an exec size of 1 to perform 1 dword read/write.
1941         case 8:
1942             code = 0x02;
1943             break;
1944         case 16:
1945             code = 0x03;
1946             break;
1947         default:
1948             assert(0);
1949             code = 0;
1950     }
1951 
1952     return code << getSendDescDataSizeBitOffset();
1953 }
1954 
getScratchBlocksizeEncoding(int numGRF)1955 static uint32_t getScratchBlocksizeEncoding(int numGRF)
1956 {
1957 
1958     int size = (numGRF * getGRFSize()) / 32; // in HWwords
1959     unsigned blocksize_encoding = 0;
1960     if (size == 1)
1961     {
1962         blocksize_encoding = 0x0;
1963     }
1964     else if (size == 2)
1965     {
1966         blocksize_encoding = 0x1;
1967     }
1968     else if (size == 4)
1969     {
1970         blocksize_encoding = 0x2;
1971     }
1972     else if (size == 8)
1973     {
1974         assert(getGenxPlatform() >= GENX_SKL);
1975         blocksize_encoding = 0x3;
1976     }
1977     else
1978         assert(false);
1979     return blocksize_encoding;
1980 }
1981 
1982 std::tuple<uint32_t, G4_ExecSize>
createSpillSendMsgDescOWord(unsigned int height)1983 SpillManagerGRF::createSpillSendMsgDescOWord(unsigned int height)
1984 {
1985     unsigned segmentByteSize = height * REG_BYTE_SIZE;
1986     unsigned writePayloadCount = cdiv(segmentByteSize, REG_BYTE_SIZE);
1987     unsigned statelessSurfaceIndex = 0xFF;
1988     unsigned int message = statelessSurfaceIndex;
1989 
1990     unsigned headerPresent = 0x80000;
1991     message |= headerPresent;
1992     unsigned messageType = getSendOwordWriteType();
1993     message |= messageType << getSendWriteTypeBitOffset();
1994     unsigned payloadHeaderCount = OWORD_PAYLOAD_HEADER_MAX_HEIGHT;
1995     // split send not used since msg type is oword
1996     unsigned messageLength = writePayloadCount + payloadHeaderCount;
1997     message |= messageLength << getSendMsgLengthBitOffset();
1998     unsigned segmentOwordSize = cdiv(segmentByteSize, OWORD_BYTE_SIZE);
1999     message |= blockSendBlockSizeCode(segmentOwordSize);
2000     auto execSize = G4_ExecSize(LIMIT_SEND_EXEC_SIZE(segmentOwordSize * DWORD_BYTE_SIZE));
2001 
2002     return std::make_tuple(message, execSize);
2003 }
2004 
2005 // Create the message descriptor for a spill send instruction for spilled
2006 // post destinations of send instructions.
createSpillSendMsgDesc(unsigned regOff,unsigned height,G4_ExecSize & execSize,G4_RegVar * base)2007 G4_Imm * SpillManagerGRF::createSpillSendMsgDesc(
2008     unsigned      regOff,
2009     unsigned      height,
2010     G4_ExecSize & execSize,
2011     G4_RegVar*    base)
2012 {
2013     unsigned message = 0;
2014 
2015     if (useScratchMsg_)
2016     {
2017         unsigned headerPresent = 0x80000;
2018         message = headerPresent;
2019         unsigned msgLength = useSplitSend() ? SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT : SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT + height;
2020         message |= (msgLength << getSendMsgLengthBitOffset());
2021         message |= (1 << SCRATCH_MSG_DESC_CATEORY);
2022         message |= (1 << SCRATCH_MSG_DESC_CHANNEL_MODE);
2023         message |= (1 << SCRATCH_MSG_DESC_OPERATION_MODE);
2024         unsigned blocksize_encoding = getScratchBlocksizeEncoding(height);
2025         message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2026         int offset = getDisp(base);
2027         getSpillOffset(offset);
2028         // message expects offsets to be in HWord
2029         message |= (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2030         execSize = g4::SIMD16;
2031     }
2032     else
2033     {
2034         auto [message, retSize] = createSpillSendMsgDescOWord(height);
2035         execSize = retSize;
2036     }
2037     return builder_->createImm (message, Type_UD);
2038 }
2039 
2040 // Create the message descriptor for a spill send instruction for spilled
2041 // destination regions.
2042 std::tuple<G4_Imm*, G4_ExecSize>
createSpillSendMsgDesc(G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize)2043 SpillManagerGRF::createSpillSendMsgDesc(
2044     G4_DstRegRegion * spilledRangeRegion,
2045     G4_ExecSize     execSize)
2046 {
2047     unsigned message = 0;
2048 
2049     if (useScratchMsg_)
2050     {
2051         /*
2052         bits    description
2053         18:0    function control
2054         19    Header present
2055         24:20    Response length
2056         28:25    Message length
2057         31:29    MBZ
2058 
2059         18:0
2060         11:0    Offset (12b hword offset)
2061         13:12    Block size (00 - 1 register, 01 - 2 regs, 10 - reserved, 11 - 4 regs)
2062         14    MBZ
2063         15    Invalidate after read (0 - no invalidate, 1 - invalidate)
2064         16    Channel mode (0 - oword, 1 - dword)
2065         17    Operation type (0 - read, 1 - write)
2066         18    Category (1 - scratch block read/write)
2067         */
2068         unsigned segmentByteSize = getSegmentByteSize(spilledRangeRegion, execSize);
2069         unsigned writePayloadCount = cdiv(segmentByteSize, REG_BYTE_SIZE);
2070         unsigned headerPresent = 0x80000;
2071         message |= headerPresent;
2072 
2073         unsigned payloadHeaderCount = SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT;
2074         // message length = 1 if we are using sends, 1 + payload otherwise
2075         unsigned messageLength = useSplitSend() ? payloadHeaderCount :
2076             writePayloadCount + payloadHeaderCount;
2077         message |= (messageLength << getSendMsgLengthBitOffset());
2078         message |= (1 << SCRATCH_MSG_DESC_CATEORY); // category
2079         message |= (1 << SCRATCH_MSG_DESC_CHANNEL_MODE); // channel mode
2080         message |= (1 << SCRATCH_MSG_DESC_OPERATION_MODE); // write operation
2081         unsigned numGRFs = cdiv(segmentByteSize, numEltPerGRF<Type_UB>());
2082 
2083         unsigned blocksize_encoding = getScratchBlocksizeEncoding(numGRFs);
2084 
2085         message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2086         int offset = getRegionDisp(spilledRangeRegion);
2087         getSpillOffset(offset);
2088         message |= offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2089         if (numGRFs > 1)
2090         {
2091             execSize = g4::SIMD16;
2092         }
2093         else
2094         {
2095             if (execSize > g4::SIMD8)
2096             {
2097                 execSize = g4::SIMD16;
2098             }
2099             else
2100             {
2101                 execSize = g4::SIMD8;
2102             }
2103         }
2104     }
2105     else
2106     {
2107         unsigned segmentByteSize =
2108             getSegmentByteSize(spilledRangeRegion, execSize);
2109         unsigned writePayloadCount = cdiv(segmentByteSize, REG_BYTE_SIZE);
2110         unsigned statelessSurfaceIndex = 0xFF;
2111         message = statelessSurfaceIndex;
2112 
2113         unsigned headerPresent = 0x80000;
2114         message |= headerPresent;
2115         unsigned messageType = getSendOwordWriteType();
2116         message |= messageType << getSendWriteTypeBitOffset();
2117         unsigned payloadHeaderCount = OWORD_PAYLOAD_HEADER_MAX_HEIGHT;
2118         unsigned messageLength = useSplitSend() ? payloadHeaderCount : writePayloadCount + payloadHeaderCount;
2119         message |= messageLength << getSendMsgLengthBitOffset();
2120         unsigned segmentOwordSize = cdiv(segmentByteSize, OWORD_BYTE_SIZE);
2121         message |= blockSendBlockSizeCode (segmentOwordSize);
2122         execSize = G4_ExecSize(LIMIT_SEND_EXEC_SIZE(segmentOwordSize * DWORD_BYTE_SIZE));
2123     }
2124     return std::make_tuple(builder_->createImm (message, Type_UD), execSize);
2125 }
2126 
2127 // Create an add instruction to add the FP needed for generating spill/fill code.
2128 // We always set the NoMask flag and use a null conditional modifier.
createAddFPInst(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src)2129 G4_INST * SpillManagerGRF::createAddFPInst(
2130     G4_ExecSize       execSize,
2131     G4_DstRegRegion * dst,
2132     G4_Operand *      src)
2133 {
2134     const RegionDesc* rDesc = builder_->getRegionScalar();
2135     G4_Operand* fp = builder_->createSrc(builder_->kernel.fg.framePtrDcl->getRegVar(),
2136         0, 0, rDesc, Type_UD);
2137     auto newInst = builder_->createBinOp(G4_add, execSize, dst, fp, src, InstOpt_WriteEnable, true);
2138     newInst->inheritDIFrom(curInst);
2139 
2140     return newInst;
2141 
2142 }
2143 
2144 // Create a mov instruction needed for generating spill/fill code.
2145 // We always set the NoMask flag and use a null conditional modifier.
createMovInst(G4_ExecSize execSize,G4_DstRegRegion * dst,G4_Operand * src,G4_Predicate * predicate,unsigned int options)2146 G4_INST * SpillManagerGRF::createMovInst(
2147     G4_ExecSize       execSize,
2148     G4_DstRegRegion * dst,
2149     G4_Operand *      src,
2150     G4_Predicate *    predicate,
2151     unsigned int      options)
2152 {
2153     auto newInst = builder_->createMov(execSize, dst, src, options, true);
2154 
2155     if (predicate)
2156     {
2157         newInst->setPredicate(predicate);
2158     }
2159 
2160     return newInst;
2161 }
2162 
2163 // Create a send instruction needed for generating spill/fill code.
2164 // We always set the NoMask flag and use a null predicate and conditional
2165 // modifier.
createSendInst(G4_ExecSize execSize,G4_DstRegRegion * postDst,G4_SrcRegRegion * payload,G4_Imm * desc,SFID funcID,bool isWrite,unsigned option)2166 G4_INST * SpillManagerGRF::createSendInst(
2167     G4_ExecSize       execSize,
2168     G4_DstRegRegion * postDst,
2169     G4_SrcRegRegion * payload,
2170     G4_Imm *          desc,
2171     SFID              funcID,
2172     bool              isWrite,
2173     unsigned          option)
2174 {
2175     // ToDo: create exDesc in createSendMsgDesc()
2176     uint32_t exDesc = G4_SendDescRaw::createExtDesc(funcID);
2177     auto msgDesc = builder_->createSendMsgDesc(funcID, (uint32_t)desc->getInt(), exDesc, 0,
2178         isWrite ? SendAccess::WRITE_ONLY : SendAccess::READ_ONLY, nullptr);
2179     auto sendInst = builder_->createSendInst(
2180         NULL, G4_send, execSize, postDst,
2181         payload, desc, option, msgDesc, true);
2182     sendInst->inheritDIFrom(curInst);
2183 
2184     return sendInst;
2185 }
2186 
2187 // Create the send instructions to fill in the value of spillRangeDcl into
2188 // fillRangeDcl in aligned portions.
getNextSize(int height,bool useHWordMsg)2189 static int getNextSize(int height, bool useHWordMsg)
2190 {
2191     bool has8GRFMessage = useHWordMsg && getGenxPlatform() >= GENX_SKL &&
2192         getGRFSize() == 32;
2193     if (has8GRFMessage && height >= 8)
2194     {
2195         return 8;
2196     }
2197     else if (height >= 4)
2198     {
2199         return 4;
2200     }
2201     else if (height >= 2)
2202     {
2203         return 2;
2204     }
2205     else if (height >= 1)
2206     {
2207         return 1;
2208     }
2209     return 0;
2210 }
2211 
2212 void
sendInSpilledRegVarPortions(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned srcRegOff)2213 SpillManagerGRF::sendInSpilledRegVarPortions(
2214     G4_Declare *      fillRangeDcl,
2215     G4_Declare *      mRangeDcl,
2216     unsigned          regOff,
2217     unsigned          height,
2218     unsigned          srcRegOff)
2219 {
2220     //if (!headerNeeded())
2221     if ((useScratchMsg_ && mRangeDcl == builder_->getBuiltinR0()) || !headerNeeded())
2222     {
2223         // Skip initializing message header
2224     }
2225     else
2226     {
2227         // Initialize the message header with the spill disp for portion.
2228         int offset = getDisp(fillRangeDcl->getRegVar()) + regOff * REG_BYTE_SIZE;
2229         getSpillOffset(offset);
2230 
2231         unsigned segmentDisp = offset / OWORD_BYTE_SIZE;
2232         G4_Imm * segmentDispImm = builder_->createImm(segmentDisp, Type_UD);
2233         G4_DstRegRegion * mHeaderOffsetDstRegion =
2234             createMHeaderBlockOffsetDstRegion(mRangeDcl->getRegVar());
2235 
2236         if (builder_->getIsKernel() == false)
2237         {
2238             createAddFPInst(
2239                 g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2240         }
2241         else
2242         {
2243             createMovInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2244         }
2245         numGRFMove ++;
2246     }
2247 
2248     // Read in the portions using a greedy approach.
2249     int currentStride = getNextSize(height, useScratchMsg_);
2250 
2251     if (currentStride)
2252     {
2253         if (useLSCMsg)
2254         {
2255             createLSCFill(fillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2256         }
2257         else
2258         {
2259             createFillSendInstr(fillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2260         }
2261 
2262         if (height - currentStride > 0)
2263         {
2264             sendInSpilledRegVarPortions(
2265                 fillRangeDcl, mRangeDcl, regOff + currentStride,
2266                 height - currentStride, srcRegOff + currentStride);
2267         }
2268     }
2269 }
2270 
2271 // Check if we need to perform the pre-load of the spilled region's
2272 // segment from spill memory. A pre-load is required under the following
2273 // circumstances:
2274 //        - for partial writes - horizontal stride greater than one, and when
2275 //          the emask and predicate can possibly disable channels (for now if
2276 //        predicates or condition modofoers are present then we conservatively
2277 //        assume a partial write)
2278 //        - write's where the segment is larger than the actaully written region
2279 //        (either because the spill offset for the region or its size is not
2280 //         oword or dword aligned for writing the exact region)
shouldPreloadSpillRange(G4_INST * instContext,G4_BB * parentBB)2281 bool SpillManagerGRF::shouldPreloadSpillRange(
2282     G4_INST* instContext, G4_BB* parentBB)
2283 {
2284     // Check for partial and unaligned regions and add pre-load code, if
2285     // necessary.
2286     auto spilledRangeRegion = instContext->getDst();
2287     G4_ExecSize execSize = instContext->getExecSize();
2288 
2289     if (isPartialRegion(spilledRangeRegion, execSize) ||
2290         isUnalignedRegion(spilledRangeRegion, execSize) ||
2291         instContext->isPartialWriteForSpill(!parentBB->isAllLaneActive()))
2292     {
2293         // special check for scalar variables: no need for pre-fill if instruction writes to whole variable and is not predicated
2294         auto spilledDcl = spilledRangeRegion->getTopDcl()->getRootDeclare();
2295         if (execSize == g4::SIMD1 && spilledRangeRegion->getTypeSize() == spilledDcl->getByteSize() && !instContext->getPredicate())
2296         {
2297             //ToDo: investigate why we are spilling so many scalar variables
2298             return false;
2299         }
2300         return true;
2301     }
2302     // No pre-load for whole and aligned region writes
2303     else
2304     {
2305         return false;
2306     }
2307 }
2308 
2309 // Create the send instruction to perform the pre-load of the spilled region's
2310 // segment into spill memory.
preloadSpillRange(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize)2311 void SpillManagerGRF::preloadSpillRange(
2312     G4_Declare *      spillRangeDcl,
2313     G4_Declare *      mRangeDcl,
2314     G4_DstRegRegion * spilledRangeRegion,
2315     G4_ExecSize       execSize)
2316 {
2317     // When execSize is 32, regions <32, 32, 1> or <64; 32, 2> are invalid.
2318     // Use a uniform region descriptor <stride; 1, 0>. Note that stride could
2319     // be 0 when execsize is 1.
2320     uint16_t hstride = spilledRangeRegion->getHorzStride();
2321     const RegionDesc *rDesc = builder_->createRegionDesc(execSize, hstride, 1, 0);
2322 
2323     G4_SrcRegRegion* preloadRegion = builder_->createSrc(spillRangeDcl->getRegVar(),
2324         REG_ORIGIN, spilledRangeRegion->getSubRegOff(),
2325         rDesc, spilledRangeRegion->getType());
2326 
2327     if (useScratchMsg_)
2328     {
2329         // src region's base refers to the filled region's base
2330         // The size of src region is equal to number of rows that
2331         // have to be filled, starting at the reg offset specified
2332         // in the original operand. For eg,
2333         // Let the spilled operand be V40(3,3)
2334         //
2335         // => mov (1) V40(3,3)<1>:ud    V30(0,0)<0;1,0>:ud
2336         // When this will be replaced with a preload fill,
2337         // => mov (1) TM_GRF_V40_0(0,0)<1>:ud   V30(0,0)<0;1,0>:ud
2338         // => send (16) SP_V40_0(0,0)<1>:ud ...                            <---  load V40's 3rd row in SP_V40_0
2339         // => mov (1) SP_V40_0(0,3)<1>:ud   TM_GRF_V40_0(0,0)<8;8,1>:ud <--- overlay
2340         // => send (16) null ...                                        <--- store V40's updated 3rd row to memory
2341         //
2342         // Since the filled register's register offset is 0,0 in first
2343         // send instruction, this change is made when creating the operand
2344         // itself.
2345 
2346         // Attach preloadRegion to dummy mov so getLeftBound/getRightBound won't crash when called from crossGRF in createFillSendMsgDesc
2347         builder_->createMov(execSize, builder_->createNullDst(Type_UD), preloadRegion, InstOpt_NoOpt, false);
2348     }
2349 
2350     if (useLSCMsg)
2351     {
2352         createLSCFill(spillRangeDcl, mRangeDcl, preloadRegion, execSize);
2353     }
2354     else
2355     {
2356         createFillSendInstr(spillRangeDcl, mRangeDcl, preloadRegion, execSize);
2357     }
2358 
2359 }
2360 
getSpillFillHeader(IR_Builder & builder,G4_Declare * decl)2361 G4_SrcRegRegion* vISA::getSpillFillHeader(IR_Builder& builder, G4_Declare * decl)
2362 {
2363     if (builder.supportsLSC())
2364     {
2365         // LSC in its current incarnation needs a header to store the address
2366         return builder.createSrcRegRegion(builder.getSpillFillHeader(), builder.getRegionStride1());
2367     }
2368     return builder.createSrcRegRegion(decl, builder.getRegionStride1());
2369 }
2370 
2371 // Create the send instruction to perform the spill of the spilled regvars's
2372 // segment into spill memory.
2373 // regOff - Offset of sub-spill. If one spill is split into more than one spill,
2374 // this is the offset of them, unit in register size
2375 //  spillOff - Offset of the original variable being spilled, unit in register size.
createSpillSendInstr(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2376 G4_INST *SpillManagerGRF::createSpillSendInstr(
2377     G4_Declare *      spillRangeDcl,
2378     G4_Declare *      mRangeDcl,
2379     unsigned          regOff,
2380     unsigned          height,
2381     unsigned          spillOff)
2382 {
2383     G4_ExecSize execSize (0);
2384 
2385     G4_Imm * messageDescImm = NULL;
2386 
2387     if (useScratchMsg_)
2388     {
2389         G4_RegVar* r = spillRangeDcl->getRegVar();
2390         G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2391         messageDescImm =
2392             createSpillSendMsgDesc(spillOff, height, execSize, rvar->getBaseRegVar());
2393 #ifdef _DEBUG
2394         int offset = (messageDescImm->getInt() & 0xFFF) * numEltPerGRF<Type_UB>();
2395         MUST_BE_TRUE(offset >= globalScratchOffset, "incorrect offset");
2396 #endif
2397     }
2398     else
2399     {
2400         messageDescImm =
2401             createSpillSendMsgDesc(regOff, height, execSize);
2402     }
2403 
2404     G4_DstRegRegion * postDst = builder_->createNullDst(execSize > g4::SIMD8 ? Type_UW : Type_UD);
2405 
2406     G4_INST* sendInst = NULL;
2407     if (useSplitSend())
2408     {
2409         auto headerOpnd = getSpillFillHeader(*builder_, mRangeDcl);
2410         G4_SrcRegRegion* srcOpnd = createBlockSpillRangeSrcRegion(spillRangeDcl->getRegVar(), regOff);
2411 
2412         auto off = G4_SpillIntrinsic::InvalidOffset;
2413         G4_Declare* fp = nullptr;
2414         if (useScratchMsg_)
2415             off = (messageDescImm->getInt() & 0xfff);
2416         else
2417         {
2418             if (builder_->usesStack())
2419             {
2420                 G4_RegVar* r = spillRangeDcl->getRegVar();
2421                 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2422                 int offset = getDisp(rvar->getBaseRegVar());
2423                 getSpillOffset(offset);
2424                 // message expects offsets to be in HWord
2425                 off = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2426                 if (builder_->usesStack())
2427                     fp = builder_->kernel.fg.getFramePtrDcl();
2428 
2429                 if (!fp && offset < SCRATCH_MSG_LIMIT)
2430                     headerOpnd = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2431             }
2432         }
2433         sendInst = builder_->createSpill(postDst, headerOpnd, srcOpnd, execSize, height, off, fp, InstOpt_WriteEnable, true);
2434         sendInst->inheritDIFrom(curInst);
2435     }
2436     else
2437     {
2438         G4_SrcRegRegion * payload = builder_->createSrc(
2439             mRangeDcl->getRegVar(), 0, 0, builder_->getRegionStride1(), Type_UD);
2440         sendInst = createSendInst(execSize, postDst, payload, messageDescImm, SFID::DP_DC0, true, InstOpt_WriteEnable);
2441     }
2442 
2443     return sendInst;
2444 }
2445 
2446 // Create the send instruction to perform the spill of the spilled region's
2447 // segment into spill memory.
createSpillSendInstr(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize,unsigned option)2448 G4_INST *SpillManagerGRF::createSpillSendInstr(
2449     G4_Declare *      spillRangeDcl,
2450     G4_Declare *      mRangeDcl,
2451     G4_DstRegRegion * spilledRangeRegion,
2452     G4_ExecSize       execSize,
2453     unsigned          option)
2454 {
2455 
2456     G4_DstRegRegion * postDst =
2457         builder_->createNullDst(execSize > g4::SIMD8 ? Type_UW : Type_UD);
2458 
2459     G4_INST* sendInst = NULL;
2460     if (useSplitSend())
2461     {
2462         unsigned extMsgLength = spillRangeDcl->getNumRows();
2463         const RegionDesc* region = builder_->getRegionStride1();
2464         auto headerOpnd = getSpillFillHeader(*builder_, mRangeDcl);
2465         G4_SrcRegRegion* srcOpnd = builder_->createSrcRegRegion(spillRangeDcl, region);
2466 
2467         auto off = G4_SpillIntrinsic::InvalidOffset;
2468         G4_Declare* fp = nullptr;
2469         auto spillExecSize = execSize;
2470         if (useScratchMsg_)
2471         {
2472             auto [messageDescImm, retSize] =
2473                 createSpillSendMsgDesc(spilledRangeRegion, execSize);
2474             spillExecSize = retSize;
2475             off = (messageDescImm->getInt() & 0xfff);
2476         }
2477         else
2478         {
2479             if (builder_->usesStack())
2480             {
2481                 G4_RegVar* r = spillRangeDcl->getRegVar();
2482                 G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2483                 int offset = getDisp(rvar->getBaseRegVar());
2484                 getSpillOffset(offset);
2485                 // message expects offsets to be in HWord
2486                 auto regOff = spilledRangeRegion->getRegOff();
2487                 off = (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2488                 if (builder_->usesStack())
2489                     fp = builder_->kernel.fg.getFramePtrDcl();
2490 
2491                 if (!fp && offset < SCRATCH_MSG_LIMIT)
2492                     headerOpnd = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2493             }
2494         }
2495         sendInst = builder_->createSpill(postDst, headerOpnd, srcOpnd, spillExecSize, (uint16_t)extMsgLength,
2496             off, fp, static_cast<G4_InstOption>(option), true);
2497         sendInst->inheritDIFrom(curInst);
2498     }
2499     else
2500     {
2501         auto [messageDescImm, spillExecSize] =
2502             createSpillSendMsgDesc(spilledRangeRegion, execSize);
2503         G4_SrcRegRegion * payload = builder_->createSrc(
2504             mRangeDcl->getRegVar(), 0, 0, builder_->getRegionStride1(), Type_UD);
2505         sendInst = createSendInst(spillExecSize, postDst, payload, messageDescImm, SFID::DP_DC0, true, static_cast<G4_InstOption>(option));
2506     }
2507 
2508     return sendInst;
2509 }
2510 
2511 // Create the message description for a fill send instruction for filled
2512 // regvars.
createFillSendMsgDesc(unsigned regOff,unsigned height,G4_ExecSize & execSize,G4_RegVar * base)2513 G4_Imm *SpillManagerGRF::createFillSendMsgDesc(
2514     unsigned          regOff,
2515     unsigned          height,
2516     G4_ExecSize &     execSize,
2517     G4_RegVar *       base)
2518 {
2519     unsigned message = 0;
2520 
2521     if (useScratchMsg_)
2522     {
2523         unsigned segmentByteSize = height * REG_BYTE_SIZE;
2524         unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2525         message = responseLength << getSendRspLengthBitOffset();
2526         unsigned headerPresent = 0x80000;
2527         message |= SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT << getSendMsgLengthBitOffset();
2528         message |= headerPresent;
2529 
2530         message |= (1 << SCRATCH_MSG_DESC_CATEORY);
2531         message |= (0 << SCRATCH_MSG_INVALIDATE_AFTER_READ);
2532         unsigned blocksize_encoding = getScratchBlocksizeEncoding(height);
2533 
2534         message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2535 
2536         int offset = getDisp(base);
2537         getSpillOffset(offset);
2538         // message expects offsets to be in HWord
2539         message |= (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2540 
2541         execSize = g4::SIMD16;
2542     }
2543     else
2544     {
2545         unsigned segmentByteSize = height * REG_BYTE_SIZE;
2546         unsigned statelessSurfaceIndex = 0xFF;
2547         unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2548         responseLength = responseLength << getSendRspLengthBitOffset();
2549         message = statelessSurfaceIndex | responseLength;
2550 
2551         unsigned headerPresent = 0x80000;
2552         message |= headerPresent;
2553         unsigned messageType = getSendOwordReadType();
2554         message |= messageType << getSendReadTypeBitOffset();
2555         unsigned messageLength = OWORD_PAYLOAD_HEADER_MIN_HEIGHT;
2556         message |= messageLength << getSendMsgLengthBitOffset();
2557         unsigned segmentOwordSize =
2558             cdiv(segmentByteSize, OWORD_BYTE_SIZE);
2559         message |= blockSendBlockSizeCode (segmentOwordSize);
2560         execSize = G4_ExecSize(LIMIT_SEND_EXEC_SIZE (segmentOwordSize * DWORD_BYTE_SIZE));
2561     }
2562     return builder_->createImm (message, Type_UD);
2563 }
2564 
2565 // Create the message description for a fill send instruction for filled
2566 // source regions.
2567 template <class REGION_TYPE>
createFillSendMsgDesc(REGION_TYPE * filledRangeRegion,G4_ExecSize execSize)2568 G4_Imm *SpillManagerGRF::createFillSendMsgDesc(
2569     REGION_TYPE * filledRangeRegion,
2570     G4_ExecSize    execSize)
2571 {
2572     unsigned message = 0;
2573 
2574     if (useScratchMsg_)
2575     {
2576         unsigned segmentByteSize =
2577             getSegmentByteSize(filledRangeRegion, execSize);
2578         if (filledRangeRegion->crossGRF()) {
2579             segmentByteSize = 2 * REG_BYTE_SIZE;
2580         }
2581 
2582         unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2583         message = responseLength << getSendRspLengthBitOffset();
2584 
2585         unsigned headerPresent = 0x80000;
2586         message |= headerPresent;
2587 
2588         message |= (SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT << getSendMsgLengthBitOffset());
2589         message |= (1 << SCRATCH_MSG_DESC_CATEORY);
2590         message |= (0 << SCRATCH_MSG_INVALIDATE_AFTER_READ);
2591         unsigned blocksize_encoding = getScratchBlocksizeEncoding(responseLength);
2592 
2593         message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
2594         int offset = getRegionDisp(filledRangeRegion);
2595         getSpillOffset(offset);
2596         message |= offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2597     }
2598     else
2599     {
2600         unsigned segmentByteSize =
2601             getSegmentByteSize(filledRangeRegion, execSize);
2602         unsigned statelessSurfaceIndex = 0xFF;
2603         unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2604         responseLength = responseLength << getSendRspLengthBitOffset();
2605         message = statelessSurfaceIndex | responseLength;
2606 
2607         unsigned headerPresent = 0x80000;
2608         message |= headerPresent;
2609         unsigned messageType = getSendOwordReadType();
2610         message |= messageType << getSendReadTypeBitOffset();
2611         unsigned messageLength = OWORD_PAYLOAD_HEADER_MIN_HEIGHT;
2612         message |= messageLength << getSendMsgLengthBitOffset();
2613         unsigned segmentOwordSize =
2614             cdiv(segmentByteSize, OWORD_BYTE_SIZE);
2615         message |= blockSendBlockSizeCode (segmentOwordSize);
2616     }
2617     return builder_->createImm(message, Type_UD);
2618 }
2619 
2620 // Create the send instruction to perform the fill of the spilled regvars's
2621 // segment from spill memory.
2622 // spillOff - spill offset to the fillRangeDcl, in unit of grf size
createFillSendInstr(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2623 G4_INST * SpillManagerGRF::createFillSendInstr (
2624     G4_Declare *      fillRangeDcl,
2625     G4_Declare *      mRangeDcl,
2626     unsigned          regOff,
2627     unsigned          height,
2628     unsigned          spillOff)
2629 {
2630     G4_ExecSize execSize {0};
2631 
2632     G4_Imm * messageDescImm = NULL;
2633 
2634     if (useScratchMsg_)
2635     {
2636         G4_RegVar* r = fillRangeDcl->getRegVar();
2637         G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2638         messageDescImm =
2639             createFillSendMsgDesc (spillOff, height, execSize, rvar->getBaseRegVar());
2640 #ifdef _DEBUG
2641         int offset = (messageDescImm->getInt() & 0xFFF) * numEltPerGRF<Type_UB>();
2642         MUST_BE_TRUE(offset >= globalScratchOffset, "incorrect offset");
2643 #endif
2644     }
2645     else
2646     {
2647         messageDescImm =
2648             createFillSendMsgDesc (regOff, height, execSize);
2649     }
2650 
2651     G4_DstRegRegion * postDst = builder_->createDst(
2652         fillRangeDcl->getRegVar(), (short) regOff, SUBREG_ORIGIN,
2653         DEF_HORIZ_STRIDE, (execSize > 8)? Type_UW: Type_UD);
2654 
2655     auto payload = getSpillFillHeader(*builder_, mRangeDcl);
2656 
2657     unsigned int off = G4_FillIntrinsic::InvalidOffset;
2658     G4_Declare* fp = nullptr;
2659     if (useScratchMsg_)
2660         off = (messageDescImm->getInt() & 0xfff);
2661     else
2662     {
2663         if (builder_->usesStack())
2664         {
2665             // compute hword offset to emit later when expanding spill/fill intrinsic
2666             G4_RegVar* r = fillRangeDcl->getRegVar();
2667             G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2668             int offset = getDisp(rvar->getBaseRegVar());
2669             getSpillOffset(offset);
2670             // message expects offsets to be in HWord
2671             off = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2672             if (builder_->usesStack())
2673                 fp = builder_->kernel.fg.getFramePtrDcl();
2674 
2675             if (!fp && offset < SCRATCH_MSG_LIMIT)
2676                 payload = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2677         }
2678     }
2679     auto fillInst = builder_->createFill(payload, postDst, execSize, height, off, fp, InstOpt_WriteEnable, true);
2680     fillInst->inheritDIFrom(curInst);
2681     return fillInst;
2682 
2683 }
2684 
2685 // Create the send instruction to perform the fill of the filled region's
2686 // segment into fill memory.
createFillSendInstr(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,G4_SrcRegRegion * filledRangeRegion,G4_ExecSize execSize)2687 G4_INST * SpillManagerGRF::createFillSendInstr(
2688     G4_Declare *      fillRangeDcl,
2689     G4_Declare *      mRangeDcl,
2690     G4_SrcRegRegion * filledRangeRegion,
2691     G4_ExecSize       execSize)
2692 {
2693     auto oldExecSize = execSize;
2694 
2695     if (useScratchMsg_)
2696     {
2697         execSize = g4::SIMD16;
2698     }
2699 
2700     G4_DstRegRegion * postDst = builder_->createDst(
2701         fillRangeDcl->getRegVar(), 0, SUBREG_ORIGIN,
2702         DEF_HORIZ_STRIDE, (execSize > 8)? Type_UW : Type_UD);
2703 
2704     auto payload = getSpillFillHeader(*builder_, mRangeDcl);
2705 
2706     unsigned int off = G4_FillIntrinsic::InvalidOffset;
2707     unsigned segmentByteSize = getSegmentByteSize(filledRangeRegion, oldExecSize);
2708     G4_Declare* fp = nullptr;
2709     if (useScratchMsg_)
2710     {
2711         G4_Imm* messageDescImm =
2712             createFillSendMsgDesc(filledRangeRegion, oldExecSize);
2713 
2714         off = (messageDescImm->getInt() & 0xfff);
2715         if (filledRangeRegion->crossGRF())
2716         {
2717             segmentByteSize = 2 * REG_BYTE_SIZE;
2718         }
2719     }
2720     else
2721     {
2722         if (builder_->usesStack())
2723         {
2724             // compute hword offset to emit later when expanding spill/fill intrinsic
2725             int offset = getRegionDisp(filledRangeRegion);
2726             getSpillOffset(offset);
2727             off = offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2728             if (builder_->usesStack())
2729                 fp = builder_->kernel.fg.getFramePtrDcl();
2730 
2731             if (!fp && offset < SCRATCH_MSG_LIMIT)
2732                 payload = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2733         }
2734     }
2735 
2736     unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2737     auto fillInst = builder_->createFill(payload, postDst, execSize, responseLength, off, fp, InstOpt_WriteEnable, true);
2738     fillInst->inheritDIFrom(curInst);
2739     return fillInst;
2740 }
2741 
2742 // LSC versions of spill/fill, useLSCMsg must be true for these functions
2743 
getLSCSpillFillHeader(G4_Declare * mRangeDcl,const G4_Declare * fp,int offset)2744 G4_SrcRegRegion *SpillManagerGRF::getLSCSpillFillHeader(
2745     G4_Declare* mRangeDcl, const G4_Declare *fp, int offset)
2746 {
2747     G4_SrcRegRegion* headerOpnd = nullptr;
2748     if (!fp && offset < SCRATCH_MSG_LIMIT && !useLscNonstackCall) {
2749         // using LSC because we exceed 128k of DC0 message
2750         headerOpnd = builder_->createSrcRegRegion(builder_->getBuiltinR0(), builder_->getRegionStride1());
2751     }
2752     else
2753     {
2754         headerOpnd = getSpillFillHeader(*builder_, mRangeDcl);
2755     }
2756     return headerOpnd;
2757 }
2758 
2759 // Create the send instruction to perform the spill of the spilled regvars's
2760 // segment into spill memory.
2761 //
2762 // regOff - Offset of sub-spill. If one spill is splitted into more than one spill,
2763 // this is the offset of them, unit in register size
2764 // spillOff - Offset of the original variable being spilled, unit in register size.
createLSCSpill(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2765 G4_INST * SpillManagerGRF::createLSCSpill(
2766     G4_Declare*       spillRangeDcl,
2767     G4_Declare*       mRangeDcl,
2768     unsigned          regOff,
2769     unsigned          height,
2770     unsigned          spillOff)
2771 {
2772     G4_ExecSize execSize(16);
2773 
2774     G4_DstRegRegion* postDst = builder_->createNullDst(Type_UD);
2775 
2776     G4_SrcRegRegion* srcOpnd = createBlockSpillRangeSrcRegion(spillRangeDcl->getRegVar(), regOff);
2777     G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2778 
2779     G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*>(spillRangeDcl->getRegVar());
2780     int offset = getDisp(rvar->getBaseRegVar());
2781     getSpillOffset(offset);
2782     // message expects offsets to be in HWord
2783     uint32_t offsetHwords = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2784 
2785     G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2786     auto sendInst = builder_->createSpill(postDst, header, srcOpnd, execSize,
2787         height, offsetHwords, fp, InstOpt_WriteEnable, true);
2788     sendInst->inheritDIFrom(curInst);
2789 
2790     return sendInst;
2791 }
2792 
2793 // Create the send instruction to perform the spill of the spilled region's
2794 // segment into spill memory.
createLSCSpill(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,G4_DstRegRegion * spilledRangeRegion,G4_ExecSize execSize,unsigned option)2795 G4_INST * SpillManagerGRF::createLSCSpill(
2796     G4_Declare*       spillRangeDcl,
2797     G4_Declare*       mRangeDcl,
2798     G4_DstRegRegion*  spilledRangeRegion,
2799     G4_ExecSize       execSize,
2800     unsigned          option)
2801 {
2802     G4_DstRegRegion* postDst = builder_->createNullDst(Type_UD);
2803 
2804     unsigned extMsgLength = spillRangeDcl->getNumRows();
2805     const RegionDesc* region = builder_->getRegionStride1();
2806     G4_SrcRegRegion* srcOpnd = builder_->createSrcRegRegion(spillRangeDcl, region);
2807 
2808     G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2809 
2810     G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*>(spillRangeDcl->getRegVar());
2811     int offset = getDisp(rvar->getBaseRegVar());
2812     getSpillOffset(offset);
2813     // message expects offsets to be in HWord
2814     auto regOff = spilledRangeRegion->getRegOff();
2815     uint32_t offsetHwords = (offset + regOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2816 
2817     G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2818     auto sendInst = builder_->createSpill(postDst, header, srcOpnd, execSize,
2819         (uint16_t)extMsgLength, offsetHwords, fp, static_cast<G4_InstOption>(option), true);
2820     sendInst->inheritDIFrom(curInst);
2821 
2822     return sendInst;
2823 }
2824 
2825 // Create the send instruction to perform the fill of the spilled regvars's
2826 // segment from spill memory.
2827 // spillOff - spill offset to the fillRangeDcl, in unit of grf size
createLSCFill(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned spillOff)2828 G4_INST * SpillManagerGRF::createLSCFill(
2829     G4_Declare * fillRangeDcl,
2830     G4_Declare * mRangeDcl,
2831     unsigned     regOff,
2832     unsigned     height,
2833     unsigned     spillOff)
2834 {
2835     G4_DstRegRegion* postDst = builder_->createDst(
2836         fillRangeDcl->getRegVar(), (short)regOff, SUBREG_ORIGIN,
2837         DEF_HORIZ_STRIDE, Type_UD);
2838 
2839     G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2840 
2841     // compute hword offset to emit later when expanding spill/fill intrinsic
2842     G4_RegVar* r = fillRangeDcl->getRegVar();
2843     G4_RegVarTmp* rvar = static_cast<G4_RegVarTmp*> (r);
2844     int offset = getDisp(rvar->getBaseRegVar());
2845     getSpillOffset(offset);
2846     // fill intrinsic expects offsets to be in HWord
2847     uint32_t offsetHwords = (offset + spillOff * getGRFSize()) >> SCRATCH_SPACE_ADDRESS_UNIT;
2848 
2849     G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2850     auto fillInst = builder_->createFill(header, postDst, g4::SIMD16, height,
2851         offsetHwords, fp, InstOpt_WriteEnable, true);
2852     fillInst->inheritDIFrom(curInst);
2853     return fillInst;
2854 }
2855 
2856 // Create the send instruction to perform the fill of the filled region's
2857 // segment into fill memory.
createLSCFill(G4_Declare * fillRangeDcl,G4_Declare * mRangeDcl,G4_SrcRegRegion * filledRangeRegion,G4_ExecSize execSize)2858 G4_INST * SpillManagerGRF::createLSCFill(
2859     G4_Declare *      fillRangeDcl,
2860     G4_Declare *      mRangeDcl,
2861     G4_SrcRegRegion * filledRangeRegion,
2862     G4_ExecSize       execSize)
2863 {
2864     auto oldExecSize = execSize;
2865 
2866     G4_DstRegRegion* postDst = builder_->createDst(
2867         fillRangeDcl->getRegVar(), 0, SUBREG_ORIGIN,
2868         DEF_HORIZ_STRIDE, Type_UD);
2869 
2870     unsigned segmentByteSize = getSegmentByteSize(filledRangeRegion, oldExecSize);
2871     G4_Declare* fp = builder_->usesStack() ? builder_->kernel.fg.getFramePtrDcl() : nullptr;
2872 
2873     // compute hword offset to emit later when expanding spill/fill intrinsic
2874     int offset = getRegionDisp(filledRangeRegion);
2875     getSpillOffset(offset);
2876     uint32_t offsetHwords = offset >> SCRATCH_SPACE_ADDRESS_UNIT;
2877 
2878     unsigned responseLength = cdiv(segmentByteSize, REG_BYTE_SIZE);
2879     G4_SrcRegRegion* header = getLSCSpillFillHeader(mRangeDcl, fp, offset);
2880     auto fillInst = builder_->createFill(header, postDst, execSize,
2881         responseLength, offsetHwords, fp, InstOpt_WriteEnable, true);
2882     fillInst->inheritDIFrom(curInst);
2883     return fillInst;
2884 }
2885 
2886 // Replace the reference to the spilled region with a reference to an
2887 // equivalent reference to the spill range region.
replaceSpilledRange(G4_Declare * spillRangeDcl,G4_DstRegRegion * spilledRegion,G4_INST * spilledInst,uint32_t subRegOff)2888 void SpillManagerGRF::replaceSpilledRange(
2889     G4_Declare* spillRangeDcl,
2890     G4_DstRegRegion* spilledRegion,
2891     G4_INST* spilledInst,
2892     uint32_t subRegOff)
2893 {
2894     // we need to preserve accRegSel if it's set
2895     G4_DstRegRegion * tmpRangeDstRegion = builder_->createDst(
2896         spillRangeDcl->getRegVar(), REG_ORIGIN, subRegOff,
2897         spilledRegion->getHorzStride(), spilledRegion->getType(), spilledRegion->getAccRegSel());
2898     spilledInst->setDest (tmpRangeDstRegion);
2899 }
2900 
2901 // Replace the reference to the filled region with a reference to an
2902 // equivalent reference to the fill range region.
replaceFilledRange(G4_Declare * fillRangeDcl,G4_SrcRegRegion * filledRegion,G4_INST * filledInst)2903 void SpillManagerGRF::replaceFilledRange(
2904     G4_Declare *      fillRangeDcl,
2905     G4_SrcRegRegion * filledRegion,
2906     G4_INST *         filledInst)
2907 {
2908     G4_ExecSize execSize =
2909         isMultiRegComprSource(filledRegion, filledInst) ?
2910             G4_ExecSize(filledInst->getExecSize() / 2):
2911             filledInst->getExecSize();
2912 
2913     for (int i = 0; i < G4_MAX_SRCS; i++) {
2914         G4_Operand * src = filledInst->getSrc(i);
2915 
2916         if (src && src->isSrcRegRegion())
2917         {
2918             G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
2919             if (*srcRgn == *filledRegion)
2920             {
2921                 G4_SrcRegRegion* fillRangeSrcRegion =
2922                     createFillRangeSrcRegion(
2923                         fillRangeDcl->getRegVar(), filledRegion, execSize);
2924                 filledInst->setSrc(fillRangeSrcRegion, i);
2925             }
2926         }
2927     }
2928 }
2929 
2930 // Create the send instructions to write out the spillRangeDcl in aligned
2931 // portions.
sendOutSpilledRegVarPortions(G4_Declare * spillRangeDcl,G4_Declare * mRangeDcl,unsigned regOff,unsigned height,unsigned srcRegOff)2932 void SpillManagerGRF::sendOutSpilledRegVarPortions (
2933     G4_Declare *      spillRangeDcl,
2934     G4_Declare *      mRangeDcl,
2935     unsigned          regOff,
2936     unsigned          height,
2937     unsigned          srcRegOff)
2938 {
2939     if (!headerNeeded())
2940     {
2941         // No need to make a copy of offset because when using
2942         // scratch msg descriptor, the offset is part of send
2943         // msg descriptor and not the header.
2944     }
2945     else
2946     {
2947         // Initialize the message header with the spill disp for portion.
2948         int offset = getDisp(spillRangeDcl->getRegVar()) + regOff * REG_BYTE_SIZE;
2949         getSpillOffset(offset);
2950         unsigned segmentDisp = offset / OWORD_BYTE_SIZE;
2951 
2952         G4_Imm * segmentDispImm = builder_->createImm (segmentDisp, Type_UD);
2953         G4_DstRegRegion * mHeaderOffsetDstRegion =
2954             createMHeaderBlockOffsetDstRegion(mRangeDcl->getRegVar());
2955 
2956         if (builder_->getIsKernel() == false)
2957         {
2958             createAddFPInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2959         }
2960         else
2961         {
2962             createMovInst(g4::SIMD1, mHeaderOffsetDstRegion, segmentDispImm);
2963         }
2964         numGRFMove ++;
2965     }
2966 
2967 
2968     // Write out the portions using a greedy approach.
2969     int currentStride = getNextSize(height, useScratchMsg_);
2970 
2971     if (currentStride)
2972     {
2973         initMWritePayload(spillRangeDcl, mRangeDcl, regOff, currentStride);
2974 
2975         if (useLSCMsg)
2976         {
2977             createLSCSpill(spillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2978         }
2979         else
2980         {
2981             createSpillSendInstr(spillRangeDcl, mRangeDcl, regOff, currentStride, srcRegOff);
2982         }
2983 
2984         if (height - currentStride > 0) {
2985             sendOutSpilledRegVarPortions(
2986                 spillRangeDcl, mRangeDcl, regOff + currentStride, height - currentStride, srcRegOff + currentStride);
2987         }
2988     }
2989 }
2990 
checkDefUseDomRel(G4_DstRegRegion * dst,G4_BB * defBB)2991 bool SpillManagerGRF::checkDefUseDomRel(G4_DstRegRegion* dst, G4_BB* defBB)
2992 {
2993     if (!refs.isUniqueDef(dst))
2994         return false;
2995 
2996     auto dcl = dst->getTopDcl();
2997 
2998     // check whether this def dominates all its uses
2999     auto uses = refs.getUses(dcl);
3000 
3001     for (auto& use : *uses)
3002     {
3003         auto useBB = std::get<1>(use);
3004 
3005         // check if def dominates use
3006         if (!defBB->dominates(useBB))
3007             return false;
3008 
3009         if (defBB == useBB)
3010         {
3011             // defBB dominates useBB since its the same BB.
3012             // ensure def instruction appears lexically before use BB.
3013             auto useInst = std::get<0>(use);
3014             if (dst->getInst()->getLexicalId() > useInst->getLexicalId())
3015                 return false;
3016         }
3017     }
3018 
3019     // if def is in loop then ensure all uses are in same loop level
3020     // or inner loop nest of def's closest loop.
3021     auto defLoop = gra.kernel.fg.getLoops().getInnerMostLoop(defBB);
3022     if (defLoop)
3023     {
3024         // since def is in loop, check whether uses are also in same loop.
3025         for (auto& use : *uses)
3026         {
3027             auto useBB = std::get<1>(use);
3028             auto useLoop = gra.kernel.fg.getLoops().getInnerMostLoop(useBB);
3029             if (!useLoop)
3030                 return false;
3031 
3032             if (!useLoop->fullSubset(defLoop))
3033                 return false;
3034         }
3035     }
3036 
3037     return true;
3038 }
3039 
checkUniqueDefAligned(G4_DstRegRegion * dst,G4_BB * defBB)3040 bool SpillManagerGRF::checkUniqueDefAligned(G4_DstRegRegion* dst, G4_BB* defBB)
3041 {
3042     // return true if dst is unique definition considering alignment
3043     // for spill code.
3044 
3045     if (!refs.isUniqueDef(dst))
3046         return false;
3047 
3048     // dst dcl may have multiple defs. As long as each def defines
3049     // different part of the variable, each def is marked as unique.
3050     // However, spill/fill is done on GRF granularity. So although
3051     // defs are unique in following sequence, we still need RMW for
3052     // 2nd def:
3053     //
3054     // .decl V361 type=w size=32
3055     //
3056     // add (M1, 8) V361(0,0)<1>   V358(0,0)<1;1,0>   0x10:w
3057     // add (M1, 8) V361(0,8)<1>   V358(0,0)<1;1,0>   0x18:w
3058     //
3059     // Return false if any other dominating def exists that defines
3060     // part of same row of variable dst.
3061     auto dcl = dst->getTopDcl();
3062 
3063     auto defs = refs.getDefs(dcl);
3064     unsigned int GRFSize = numEltPerGRF<Type_UB>();
3065     unsigned int lb = dst->getLeftBound();
3066     unsigned int rb = dst->getRightBound();
3067     unsigned int startRow = lb / GRFSize;
3068     unsigned int endRow = rb / GRFSize;
3069 
3070     for (auto& def : *defs)
3071     {
3072         // check whether dst and def write same row
3073         auto otherDefInst = std::get<0>(def);
3074 
3075         if (otherDefInst == dst->getInst())
3076             continue;
3077 
3078         auto otherDefDstRgn = otherDefInst->getDst();
3079         unsigned int otherLb = otherDefDstRgn->getLeftBound();
3080         unsigned int otherRb = otherDefDstRgn->getRightBound();
3081         unsigned int otherTypeSize = otherDefDstRgn->getTypeSize();
3082         bool commonRow = false;
3083         for (unsigned int i = otherLb; i <= otherRb; i += otherTypeSize)
3084         {
3085             auto rowWritten = i / GRFSize;
3086             if (rowWritten >= startRow && rowWritten <= endRow)
3087             {
3088                 commonRow = true;
3089                 break;
3090             }
3091         }
3092 
3093         // No common row between defs, so it is safe to skip fill
3094         // wrt current def. Check with next def.
3095         if (!commonRow)
3096             continue;
3097 
3098         auto otherDefBB = std::get<1>(def);
3099 
3100         if (!defBB->dominates(otherDefBB))
3101             return false;
3102 
3103         if (defBB == otherDefBB)
3104         {
3105             if (dst->getInst()->getLexicalId() > otherDefInst->getLexicalId())
3106                 return false;
3107         }
3108     }
3109 
3110     return true;
3111 }
3112 
3113 // This function checks whether each spill dst region requires a read-modify-write operation
3114 // when inserting spill code. Dominator/unique defs dont require redundant read operation.
3115 // Dst regions that do not need RMW are added to a set. This functionality isnt needed for
3116 // functional correctness. This function is executed before inserting spill code because
3117 // we need all dst regions of dcl available to decide whether read is redundant. If this is
3118 // executed when inserting spill then dst regions of dcl appearing earlier than current one
3119 // would be translated to spill code already. Spill/fill code insertion replaces dst region
3120 // of spills with new temp region. This makes it difficult to check whether current dst and
3121 // an earlier spilled dst write to same GRF row.
updateRMWNeeded()3122 void SpillManagerGRF::updateRMWNeeded()
3123 {
3124     if (!gra.kernel.getOption(vISA_SkipRedundantFillInRMW))
3125         return;
3126 
3127     auto isRMWNeededForSpilledDst = [&](G4_BB* bb, G4_DstRegRegion* spilledRegion)
3128     {
3129         auto isUniqueDef = checkUniqueDefAligned(spilledRegion, bb);
3130 
3131         // Check0 : Def is NoMask, -- checked in isPartialWriteForSpill()
3132         // Check1 : Def is unique def,
3133         // Check2 : Def is in loop L and all use(s) of dcl are in loop L or it's inner loop nest,
3134         // Check3 : Flowgraph is reducible
3135         // RMW_Not_Needed = Check0 || (Check1 && Check2 && Check3)
3136         bool RMW_Needed = true;
3137 
3138         if (isUniqueDef && builder_->kernel.fg.isReducible() && checkDefUseDomRel(spilledRegion, bb))
3139         {
3140             RMW_Needed = false;
3141         }
3142 
3143         return RMW_Needed;
3144     };
3145 
3146     // First pass to setup lexical ids of instruction so dominator relation can be
3147     // computed correctly intra-BB.
3148     unsigned int lexId = 0;
3149     for (auto bb : gra.kernel.fg.getBBList())
3150     {
3151         for (auto inst : bb->getInstList())
3152         {
3153             inst->setLexicalId(lexId++);
3154         }
3155     }
3156 
3157     for (auto bb : gra.kernel.fg.getBBList())
3158     {
3159         for (auto inst : bb->getInstList())
3160         {
3161             if (inst->isPseudoKill())
3162                 continue;
3163 
3164             auto dst = inst->getDst();
3165             if (dst)
3166             {
3167                 if (dst->getBase()->isRegVar())
3168                 {
3169                     auto dstRegVar = dst->getBase()->asRegVar();
3170                     if (dstRegVar && shouldSpillRegister(dstRegVar))
3171                     {
3172                         if (getRFType(dstRegVar) == G4_GRF)
3173                         {
3174                             auto RMW_Needed = isRMWNeededForSpilledDst(bb, dst);
3175                             if (!RMW_Needed)
3176                             {
3177                                 // Any spilled dst region that doesnt need RMW
3178                                 // is added to noRMWNeeded set. This set is later
3179                                 // checked when inserting spill/fill code.
3180                                 noRMWNeeded.insert(dst);
3181                             }
3182                         }
3183                     }
3184                 }
3185             }
3186         }
3187     }
3188 }
3189 
3190 // Create the code to create the spill range and save it to spill memory.
insertSpillRangeCode(INST_LIST::iterator spilledInstIter,G4_BB * bb)3191 void SpillManagerGRF::insertSpillRangeCode(
3192     INST_LIST::iterator spilledInstIter, G4_BB* bb)
3193 {
3194     G4_ExecSize execSize = (*spilledInstIter)->getExecSize();
3195     G4_Declare * replacementRangeDcl;
3196     builder_->instList.clear();
3197 
3198     bool optimizeSplitLLR = false;
3199     G4_INST* inst = *spilledInstIter;
3200     G4_INST* spillSendInst = NULL;
3201     auto spilledRegion = inst->getDst();
3202 
3203     auto spillDcl = spilledRegion->getTopDcl()->getRootDeclare();
3204     if (scalarImmSpill.find(spillDcl) != scalarImmSpill.end())
3205     {
3206         // do not spill scalar immediate values
3207         bb->erase(spilledInstIter);
3208         return;
3209     }
3210 
3211     if (builder_->getOption(vISA_DoSplitOnSpill))
3212     {
3213         // if spilled inst is copy of original variable to it's split variable
3214         // then simply remove the instruction.
3215         if (LoopVarSplit::removeFromPreheader(&gra, spillDcl, bb, spilledInstIter))
3216             return;
3217     }
3218 
3219     auto checkRMWNeeded = [this, spilledRegion]()
3220     {
3221         return noRMWNeeded.find(spilledRegion) == noRMWNeeded.end();
3222     };
3223 
3224     //subreg offset for new dst that replaces the spilled dst
3225     auto newSubregOff = 0;
3226 
3227     if (inst->mayExceedTwoGRF())
3228     {
3229         // Handle send instructions (special treatment)
3230         // Create the spill range for the whole post destination, assign spill
3231         // offset to the spill range and create the instructions to load the
3232         // save the spill range to spill memory.
3233         INST_LIST::iterator sendOutIter = spilledInstIter;
3234         assert(getRFType (spilledRegion) == G4_GRF);
3235         G4_Declare * spillRangeDcl =
3236             createPostDstSpillRangeDeclare (*sendOutIter);
3237         G4_Declare * mRangeDcl =
3238             createAndInitMHeader (
3239                 (G4_RegVarTransient *) spillRangeDcl->getRegVar());
3240 
3241         bool needRMW = inst->isPartialWriteForSpill(!bb->isAllLaneActive()) &&
3242             checkRMWNeeded();
3243         if (needRMW)
3244         {
3245             sendInSpilledRegVarPortions(
3246                 spillRangeDcl, mRangeDcl, 0,
3247                 spillRangeDcl->getNumRows(),
3248                 spilledRegion->getRegOff());
3249 
3250             INST_LIST::iterator insertPos = sendOutIter;
3251             splice(bb, insertPos, builder_->instList, curInst->getCISAOff());
3252         }
3253 
3254         sendOutSpilledRegVarPortions(
3255             spillRangeDcl, mRangeDcl, 0, spillRangeDcl->getNumRows(),
3256             spilledRegion->getRegOff());
3257 
3258         replacementRangeDcl = spillRangeDcl;
3259     }
3260     else
3261     {
3262         // Handle other regular single/multi destination register instructions.
3263         // Create the spill range for the destination region, assign spill
3264         // offset to the spill range and create the instructions to load the
3265         // save the spill range to spill memory.
3266 
3267         // Create the segment aligned spill range
3268         G4_Declare * spillRangeDcl =
3269             createSpillRangeDeclare(spilledRegion, execSize, *spilledInstIter);
3270 
3271         // Create and initialize the message header
3272         G4_Declare * mRangeDcl =
3273             createAndInitMHeader(spilledRegion, execSize);
3274 
3275         // Unaligned region specific handling.
3276         unsigned int spillSendOption = InstOpt_WriteEnable;
3277         auto preloadNeeded = shouldPreloadSpillRange(*spilledInstIter, bb);
3278         if (preloadNeeded && checkRMWNeeded())
3279         {
3280 
3281             // Preload the segment aligned spill range from memory to use
3282             // as an overlay
3283 
3284             preloadSpillRange(
3285                 spillRangeDcl, mRangeDcl, spilledRegion, execSize);
3286 
3287             // Create the temporary range to use as a replacement range.
3288 
3289             G4_Declare* tmpRangeDcl =
3290                 createTemporaryRangeDeclare(spilledRegion, execSize);
3291 
3292             // Copy out the value in the temporary range into its
3293             // location in the spill range.
3294 
3295             G4_DstRegRegion* spillRangeDstRegion =
3296                 createSpillRangeDstRegion(
3297                     spillRangeDcl->getRegVar(), spilledRegion, execSize);
3298 
3299             G4_SrcRegRegion* tmpRangeSrcRegion =
3300                 createTemporaryRangeSrcRegion(
3301                     tmpRangeDcl->getRegVar(), spilledRegion, execSize);
3302 
3303             // NOTE: Never use a predicate for the final mov if the spilled
3304             //       instruction was a sel (even in a SIMD CF context).
3305 
3306             G4_Predicate* predicate =
3307                 ((*spilledInstIter)->opcode() != G4_sel) ?
3308                 (*spilledInstIter)->getPredicate() : nullptr;
3309 
3310             if (tmpRangeSrcRegion->getType() == spillRangeDstRegion->getType() && IS_TYPE_FLOAT_ALL(tmpRangeSrcRegion->getType()))
3311             {
3312                 // use int copy when possible as floating-point copy moves may need further legalization
3313                 auto equivIntTy = floatToSameWidthIntType(tmpRangeSrcRegion->getType());
3314                 tmpRangeSrcRegion->setType(equivIntTy);
3315                 spillRangeDstRegion->setType(equivIntTy);
3316             }
3317 
3318             createMovInst(
3319                 execSize, spillRangeDstRegion, tmpRangeSrcRegion,
3320                 builder_->duplicateOperand(predicate),
3321                 (*spilledInstIter)->getMaskOption());
3322             numGRFMove++;
3323 
3324             replacementRangeDcl = tmpRangeDcl;
3325             // maintain the spilled dst's subreg to not break the regioning restriction
3326             newSubregOff = spilledRegion->getSubRegOff();
3327         }
3328         else
3329         {
3330             // We're here because:
3331             // 1. preloadNeeded = false AND checkRMWNeeded = true OR
3332             // 2. preloadNeeded = true AND checkRMWNeeded = false OR
3333             // 3. both are false
3334             //
3335             // Case (1) occurs when:
3336             // Def uses dword type and writes entire row. But def doesnt define
3337             // complete variable, ie it isnt a kill. For such cases, we need to
3338             // use def's EM on spill msg.
3339             //
3340             // Case (2) occurs when:
3341             // Def is partial but it is unique in the program. For such cases,
3342             // we should use WriteEnable msg.
3343             //
3344             // Case (3) occurs when:
3345             // Def uses dword type and write entire row. Def defines complete
3346             // variable. We can use either EM.
3347 
3348             // Aligned regions do not need a temporary range.
3349             LocalLiveRange* spilledLLR = gra.getLocalLR(spilledRegion->getBase()->asRegVar()->getDeclare());
3350             if (spilledLLR && spilledLLR->getSplit())
3351             {
3352                 // if we are spilling the dest of a copy move introduced by local live-range splitting,
3353                 // we can spill the source value instead and delete the move
3354                 // ToDo: we should generalize this to cover all moves
3355                 G4_SrcRegRegion* srcRegion = inst->getSrc(0)->asSrcRegRegion();
3356                 G4_Declare* srcDcl = srcRegion->getBase()->asRegVar()->getDeclare();
3357                 unsigned int lb = srcRegion->getLeftBound();
3358                 unsigned int rb = srcRegion->getRightBound();
3359 
3360                 G4_RegVar * regVar = NULL;
3361                 if (srcRegion->getBase()->isRegVar())
3362                 {
3363                     regVar = getRegVar(srcRegion);
3364                 }
3365 
3366                 if (gra.getSubRegAlign(srcDcl) == GRFALIGN &&
3367                     lb %  REG_BYTE_SIZE == 0 &&
3368                     (rb + 1) % REG_BYTE_SIZE == 0 &&
3369                     (rb - lb + 1) == spillRangeDcl->getByteSize() &&
3370                     regVar &&
3371                     !shouldSpillRegister(regVar))
3372                 {
3373                     optimizeSplitLLR = true;
3374                 }
3375             }
3376 
3377             replacementRangeDcl = spillRangeDcl;
3378             // maintain the spilled dst's subreg since the spill is done on a per-GRF basis
3379             newSubregOff = spilledRegion->getSubRegOff();
3380 
3381             if (preloadNeeded &&
3382                 isUnalignedRegion(spilledRegion, execSize))
3383             {
3384                 // A dst region may be not need pre-fill, however, if it is unaligned,
3385                 // we need to use non-zero sub-reg offset in newly created spill dcl.
3386                 // This section of code computes sub-reg offset to use for such cases.
3387                 // It is insufficient to simply use spilledRegion's sub-reg offset in
3388                 // case the region dcl is an alias of another dcl. This typically happens
3389                 // when 2 scalar dcls are merged by merge scalar pass, merged dcl is
3390                 // spilled, and dominating def writes non-zeroth element.
3391                 unsigned segmentDisp = getEncAlignedSegmentDisp(spilledRegion, execSize);
3392                 unsigned regionDisp = getRegionDisp(spilledRegion);
3393                 assert(regionDisp >= segmentDisp);
3394                 unsigned short subRegOff = (regionDisp - segmentDisp) / spilledRegion->getElemSize();
3395                 assert((regionDisp - segmentDisp) % spilledRegion->getElemSize() == 0);
3396                 assert(subRegOff * spilledRegion->getElemSize() +
3397                     getRegionByteSize(spilledRegion, execSize) <=
3398                     2u * REG_BYTE_SIZE);
3399                 newSubregOff = subRegOff;
3400             }
3401 
3402             if (!bb->isAllLaneActive() &&
3403                 !preloadNeeded)
3404             {
3405                 spillSendOption = (*spilledInstIter)->getMaskOption();
3406             }
3407         }
3408 
3409         // Save the spill range to memory.
3410 
3411         initMWritePayload(
3412             spillRangeDcl, mRangeDcl, spilledRegion, execSize);
3413 
3414 
3415         if (useLSCMsg)
3416         {
3417             spillSendInst = createLSCSpill(
3418                 spillRangeDcl, mRangeDcl, spilledRegion, execSize, spillSendOption);
3419         }
3420         else
3421         {
3422             spillSendInst = createSpillSendInstr(
3423                 spillRangeDcl, mRangeDcl, spilledRegion, execSize, spillSendOption);
3424         }
3425 
3426         if (failSafeSpill_ && !avoidDstSrcOverlap_)
3427         {
3428             spillRegOffset_ = spillRegStart_;
3429         }
3430     }
3431 
3432     if (builder_->getOption(vISA_DoSplitOnSpill))
3433     {
3434         if (inst->isRawMov())
3435         {
3436             // check whether mov is copy in loop preheader or exit
3437             auto it = gra.splitResults.find(inst->getSrc(0)->getTopDcl());
3438             if (it != gra.splitResults.end())
3439             {
3440                 if ((*it).second.origDcl == spillDcl)
3441                 {
3442                     // srcRegion is a split var temp
3443                     // this is a copy in either preheader or loop exit.
3444                     // add it to list so we know it shouldnt be optimized
3445                     // by spill cleanup.
3446                     for (auto addedInst : builder_->instList)
3447                     {
3448                         (*it).second.insts[bb].insert(addedInst);
3449                     }
3450                 }
3451             }
3452         }
3453     }
3454 
3455     // Replace the spilled range with the spill range and insert spill
3456     // instructions.
3457 
3458     INST_LIST::iterator insertPos = std::next(spilledInstIter);
3459     replaceSpilledRange (replacementRangeDcl, spilledRegion, *spilledInstIter, newSubregOff);
3460 
3461     splice(bb, insertPos, builder_->instList, curInst->getCISAOff());
3462 
3463     if (optimizeSplitLLR && spillSendInst && spillSendInst->isSplitSend())
3464     {
3465         // delete the move and spill the source instead. Note that we can't do this if split send
3466         // is not enabled, as payload contains header
3467         bb->erase(spilledInstIter);
3468         unsigned int pos = 1;
3469         spillSendInst->setSrc(inst->getSrc(0), pos);
3470     }
3471     else
3472     {
3473         splice(bb, spilledInstIter, builder_->instList, curInst->getCISAOff());
3474     }
3475 }
3476 
3477 // Create the code to create the GRF fill range and load it to spill memory.
insertFillGRFRangeCode(G4_SrcRegRegion * filledRegion,INST_LIST::iterator filledInstIter,G4_BB * bb)3478 void SpillManagerGRF::insertFillGRFRangeCode(
3479     G4_SrcRegRegion *   filledRegion,
3480     INST_LIST::iterator filledInstIter,
3481     G4_BB* bb)
3482 {
3483     G4_ExecSize execSize = (*filledInstIter)->getExecSize();
3484 
3485     // Create the fill range, assign spill offset to the fill range and
3486     // create the instructions to load the fill range from spill memory.
3487 
3488     G4_Declare * fillRangeDcl = nullptr;
3489 
3490     bool optimizeSplitLLR = false;
3491     G4_INST* inst = *filledInstIter;
3492     auto dstRegion = inst->getDst();
3493     G4_INST* fillSendInst = nullptr;
3494     auto spillDcl = filledRegion->getTopDcl()->getRootDeclare();
3495 
3496     if (builder_->getOption(vISA_DoSplitOnSpill))
3497     {
3498         // if spilled inst is copy of split variable to it's spilled variable
3499         // then simply remove the instruction.
3500         //
3501         // if inst is:
3502         // (W) mov (8|M0) SPLIT1    V10
3503         //
3504         // and SPLIT1 is marked as spilled then dont insert spill code for it.
3505         // V10 is guaranteed to be spilled already so there is no point spilling
3506         // SPLIT1. we simply remove above instruction and any fill emitted to load
3507         // V10 and return.
3508         if (LoopVarSplit::removeFromLoopExit(&gra, spillDcl, bb, filledInstIter))
3509             return;
3510     }
3511 
3512     auto sisIt = scalarImmSpill.find(spillDcl);
3513     if (sisIt != scalarImmSpill.end())
3514     {
3515         //re-materialize the scalar immediate value
3516         auto imm = sisIt->second;
3517         auto tempDcl = builder_->createTempVar(1, imm->getType(), spillDcl->getSubRegAlign());
3518         auto movInst = builder_->createMov(g4::SIMD1, builder_->createDstRegRegion(tempDcl, 1), imm, InstOpt_WriteEnable, false);
3519         bb->insertBefore(filledInstIter, movInst);
3520         assert(!filledRegion->isIndirect());
3521         auto newSrc = builder_->createSrc(tempDcl->getRegVar(), filledRegion->getRegOff(), filledRegion->getSubRegOff(), filledRegion->getRegion(),
3522             filledRegion->getType(), filledRegion->getAccRegSel());
3523         int i = 0;
3524         for (; i < inst->getNumSrc(); ++i)
3525         {
3526             if (inst->getSrc(i) == filledRegion)
3527             {
3528                 break;
3529             }
3530         }
3531         inst->setSrc(newSrc, i);
3532         return;
3533     }
3534 
3535     {
3536         fillRangeDcl =
3537             createGRFFillRangeDeclare(filledRegion, execSize, *filledInstIter);
3538         G4_Declare * mRangeDcl =
3539             createAndInitMHeader(filledRegion, execSize);
3540 
3541         if (useLSCMsg)
3542         {
3543             fillSendInst = createLSCFill(fillRangeDcl, mRangeDcl, filledRegion, execSize);
3544         }
3545         else
3546         {
3547             fillSendInst = createFillSendInstr(fillRangeDcl, mRangeDcl, filledRegion, execSize);
3548         }
3549 
3550         LocalLiveRange* filledLLR = gra.getLocalLR(filledRegion->getBase()->asRegVar()->getDeclare());
3551         if (filledLLR && filledLLR->getSplit())
3552         {
3553             G4_Declare* dstDcl = dstRegion->getBase()->asRegVar()->getDeclare();
3554             unsigned int lb = dstRegion->getLeftBound();
3555             unsigned int rb = dstRegion->getRightBound();
3556 
3557             if (gra.getSubRegAlign(dstDcl) == GRFALIGN  &&
3558                 lb %  REG_BYTE_SIZE == 0 &&
3559                 (rb + 1) % REG_BYTE_SIZE == 0 &&
3560                 (rb - lb + 1) == fillRangeDcl->getByteSize())
3561             {
3562                 optimizeSplitLLR = true;
3563             }
3564         }
3565     }
3566 
3567     if (builder_->getOption(vISA_DoSplitOnSpill))
3568     {
3569         if (inst->isRawMov())
3570         {
3571             // check whether mov is copy in loop preheader or exit
3572             auto it = gra.splitResults.find(dstRegion->getTopDcl());
3573             if (it != gra.splitResults.end())
3574             {
3575                 if ((*it).second.origDcl == filledRegion->getTopDcl())
3576                 {
3577                     // dstRegion is a split var temp
3578                     // this is a copy in either preheader or loop exit.
3579                     // add it to list so we know it shouldnt be optimized
3580                     // by spill cleanup.
3581                     for (auto addedInst : builder_->instList)
3582                     {
3583                         (*it).second.insts[bb].insert(addedInst);
3584                     }
3585                 }
3586             }
3587         }
3588     }
3589 
3590     // Replace the spilled range with the fill range and insert spill
3591     // instructions.
3592     replaceFilledRange (fillRangeDcl, filledRegion, *filledInstIter);
3593     INST_LIST::iterator insertPos = filledInstIter;
3594 
3595     splice(bb, insertPos, builder_->instList, curInst->getCISAOff());
3596     if (optimizeSplitLLR)
3597     {
3598         INST_LIST::iterator nextIter = filledInstIter;
3599         INST_LIST::iterator prevIter = filledInstIter;
3600         nextIter++;
3601         prevIter--;
3602         prevIter--;
3603         bb->erase(filledInstIter);
3604         fillSendInst->setDest(dstRegion);
3605         G4_INST* prevInst = (*prevIter);
3606         if (prevInst->isPseudoKill() &&
3607             GetTopDclFromRegRegion(prevInst->getDst()) == fillRangeDcl)
3608         {
3609             prevInst->setDest(builder_->createDst(GetTopDclFromRegRegion(dstRegion)->getRegVar(), 0, 0, 1, Type_UD));
3610         }
3611     }
3612 }
3613 
3614 // Create the code to create the GRF fill range and load it to spill memory.
insertSendFillRangeCode(G4_SrcRegRegion * filledRegion,INST_LIST::iterator filledInstIter,G4_BB * bb)3615 INST_LIST::iterator SpillManagerGRF::insertSendFillRangeCode(
3616     G4_SrcRegRegion *   filledRegion,
3617     INST_LIST::iterator filledInstIter,
3618     G4_BB *             bb)
3619 {
3620     G4_INST * sendInst = *filledInstIter;
3621 
3622     unsigned width = REG_BYTE_SIZE / filledRegion->getElemSize();
3623 
3624     // Create the fill range, assign spill offset to the fill range
3625 
3626     G4_Declare * fillGRFRangeDcl =
3627         createSendFillRangeDeclare(filledRegion, sendInst);
3628 
3629     // Create the instructions to load the fill range from spill memory.
3630 
3631     G4_Declare * mRangeDcl = createMRangeDeclare(filledRegion, G4_ExecSize(width));
3632     initMHeader(mRangeDcl);
3633     sendInSpilledRegVarPortions(
3634         fillGRFRangeDcl, mRangeDcl, 0,
3635         fillGRFRangeDcl->getNumRows(), filledRegion->getRegOff());
3636 
3637     // Replace the spilled range with the fill range and insert spill
3638     // instructions.
3639 
3640     replaceFilledRange(fillGRFRangeDcl, filledRegion, *filledInstIter);
3641     INST_LIST::iterator insertPos = filledInstIter;
3642 
3643     splice(bb, insertPos, builder_->instList, curInst->getCISAOff());
3644 
3645     // Return the next instruction
3646 
3647     return ++filledInstIter;
3648 }
3649 
getOrCreateSpillFillDcl(G4_Declare * spilledAddrTakenDcl,G4_Kernel * kernel)3650 G4_Declare* getOrCreateSpillFillDcl(
3651     G4_Declare* spilledAddrTakenDcl, G4_Kernel* kernel)
3652 {
3653     // If spilledAddrTakenDcl already has a spill/fill range created, return it.
3654     // Else create new one and return it.
3655     G4_Declare* temp = spilledAddrTakenDcl->getAddrTakenSpillFill();
3656     if (temp == NULL)
3657     {
3658 #define ADDR_SPILL_FILL_NAME_SIZE 32
3659         const char* dclName = kernel->fg.builder->getNameString(kernel->fg.mem, ADDR_SPILL_FILL_NAME_SIZE,
3660             "ADDR_SP_FL_V%d", spilledAddrTakenDcl->getDeclId());
3661 
3662         // temp is created of sub-class G4_RegVarTmp so that is
3663         // assigned infinite spill cost when coloring.
3664         temp = kernel->fg.builder->createDeclareNoLookup(dclName,
3665             G4_GRF, spilledAddrTakenDcl->getNumElems(),
3666             spilledAddrTakenDcl->getNumRows(), spilledAddrTakenDcl->getElemType() , DeclareType::Tmp, spilledAddrTakenDcl->getRegVar());
3667         spilledAddrTakenDcl->setAddrTakenSpillFill(temp);
3668     }
3669 
3670     return temp;
3671 }
3672 
getOrCreateAddrSpillFillDcl(G4_Declare * spilledAddrTakenDcl,G4_Kernel * kernel)3673 G4_Declare* SpillManagerGRF::getOrCreateAddrSpillFillDcl(
3674     G4_Declare* spilledAddrTakenDcl, G4_Kernel* kernel)
3675 {
3676     // If spilledAddrTakenDcl already has a spill/fill range created, return it.
3677     // Else create new one and return it.
3678 #define ADDR_SPILL_FILL_NAME_SIZE 32
3679     const char* dclName = kernel->fg.builder->getNameString(kernel->fg.mem, ADDR_SPILL_FILL_NAME_SIZE,
3680         "ADDR_SP_FL_V%d_%d", spilledAddrTakenDcl->getDeclId(), getAddrSpillFillIndex(spilledAddrTakenDcl->getRegVar()));
3681 
3682     // temp is created of sub-class G4_RegVarTmp so that is
3683     // assigned infinite spill cost when coloring.
3684     G4_Declare* temp = kernel->fg.builder->createDeclareNoLookup(dclName,
3685         G4_GRF, spilledAddrTakenDcl->getNumElems(),
3686         spilledAddrTakenDcl->getNumRows(), spilledAddrTakenDcl->getElemType(), DeclareType::Tmp, spilledAddrTakenDcl->getRegVar());
3687     spilledAddrTakenDcl->setAddrTakenSpillFill(temp);
3688 
3689     return temp;
3690 }
3691 
3692 // For each address taken register spill find an available physical register
3693 // and assign it to the decl. This physical register will be used for inserting
3694 // spill/fill code for indirect reference instructions that point to the
3695 // spilled range.
3696 // Return true if enough registers found, false if sufficient registers unavailable.
handleAddrTakenSpills(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)3697 bool SpillManagerGRF::handleAddrTakenSpills(
3698     G4_Kernel * kernel, PointsToAnalysis& pointsToAnalysis)
3699 {
3700     bool success = true;
3701     unsigned int numAddrTakenSpills = 0;
3702 
3703     for (const LiveRange* lr : *spilledLRs_)
3704     {
3705         if (lr->getDcl()->getAddressed())
3706         {
3707             getOrCreateSpillFillDcl(lr->getDcl(), kernel);
3708         }
3709 
3710         if (lvInfo_->isAddressSensitive(lr->getVar()->getId()))
3711         {
3712             numAddrTakenSpills++;
3713         }
3714     }
3715 
3716     if (numAddrTakenSpills > 0)
3717     {
3718         insertAddrTakenSpillFill(kernel, pointsToAnalysis);
3719         prunePointsTo(kernel, pointsToAnalysis);
3720     }
3721 
3722 #ifdef _DEBUG
3723     if (success)
3724     {
3725         // Verify that each spilled address taken has a spill/fill registers assigned
3726         for (const LiveRange* lr : *spilledLRs_)
3727         {
3728             if (lr->getDcl()->getAddressed())
3729                 MUST_BE_TRUE(lr->getDcl()->getAddrTakenSpillFill() != nullptr, "Spilled addr taken does not have assigned spill/fill GRF");
3730         }
3731     }
3732 #endif
3733 
3734     return success;
3735 }
3736 
handleAddrTakenLSSpills(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)3737 unsigned int SpillManagerGRF::handleAddrTakenLSSpills(
3738     G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
3739 {
3740     unsigned int numAddrTakenSpills = 0;
3741 
3742     for (LSLiveRange* lr : *spilledLSLRs_)
3743     {
3744         if (lvInfo_->isAddressSensitive(lr->getTopDcl()->getRegVar()->getId()))
3745         {
3746             numAddrTakenSpills++;
3747         }
3748     }
3749 
3750     if (numAddrTakenSpills > 0)
3751     {
3752         insertAddrTakenLSSpillFill(kernel, pointsToAnalysis);
3753         prunePointsToLS(kernel, pointsToAnalysis);
3754     }
3755 
3756 #ifdef _DEBUG
3757     if (numAddrTakenSpills)
3758     {
3759         // Verify that each spilled address taken has a spill/fill registers assigned
3760         for (LSLiveRange* lr : *spilledLSLRs_)
3761         {
3762             if (lr->getTopDcl()->getAddressed())
3763                 MUST_BE_TRUE(lr->getTopDcl()->getAddrTakenSpillFill() != NULL, "Spilled addr taken does not have assigned spill/fill GRF");
3764         }
3765     }
3766 #endif
3767 
3768     return numAddrTakenSpills;
3769 }
3770 
3771 // Insert spill and fill code for indirect GRF accesses
insertAddrTakenSpillAndFillCode(G4_Kernel * kernel,G4_BB * bb,INST_LIST::iterator inst_it,G4_Operand * opnd,PointsToAnalysis & pointsToAnalysis,bool spill,unsigned int bbid)3772 void SpillManagerGRF::insertAddrTakenSpillAndFillCode(
3773     G4_Kernel* kernel, G4_BB* bb,
3774     INST_LIST::iterator inst_it, G4_Operand* opnd,
3775     PointsToAnalysis& pointsToAnalysis, bool spill, unsigned int bbid)
3776 {
3777     curInst = (*inst_it);
3778     INST_LIST::iterator next_inst_it = ++inst_it;
3779     inst_it--;
3780 
3781     // Check whether spill operand points to any spilled range
3782     for (const LiveRange* lr : *spilledLRs_) {
3783         G4_RegVar* var = nullptr;
3784 
3785         if (opnd->isDstRegRegion() && opnd->asDstRegRegion()->getBase()->asRegVar())
3786             var = opnd->asDstRegRegion()->getBase()->asRegVar();
3787 
3788         if (opnd->isSrcRegRegion() && opnd->asSrcRegRegion()->getBase()->asRegVar())
3789             var = opnd->asSrcRegRegion()->getBase()->asRegVar();
3790 
3791         MUST_BE_TRUE(var != NULL, "Fill operand is neither a source nor dst region");
3792 
3793         if (var &&
3794             pointsToAnalysis.isPresentInPointsTo(var,
3795             lr->getVar()))
3796         {
3797             unsigned int numrows = lr->getDcl()->getNumRows();
3798             G4_Declare* temp = getOrCreateSpillFillDcl(lr->getDcl(), kernel);
3799 
3800             if (failSafeSpill_ &&
3801                 temp->getRegVar()->getPhyReg() == NULL)
3802             {
3803                 temp->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
3804                 spillRegOffset_ += numrows;
3805             }
3806 
3807             if (numrows > 1 || (lr->getDcl()->getNumElems() * lr->getDcl()->getElemSize() == getGRFSize()))
3808             {
3809                 if (useScratchMsg_ || useSplitSend())
3810                 {
3811                     G4_Declare * fillGRFRangeDcl = temp;
3812                     G4_Declare * mRangeDcl =
3813                         createAndInitMHeader(
3814                         (G4_RegVarTransient *)temp->getRegVar()->getBaseRegVar());
3815 
3816                     sendInSpilledRegVarPortions(
3817                         fillGRFRangeDcl, mRangeDcl, 0,
3818                         temp->getNumRows(), 0);
3819 
3820                     splice(bb, inst_it, builder_->instList, curInst->getCISAOff());
3821 
3822                     if (spill)
3823                     {
3824                         sendOutSpilledRegVarPortions (
3825                             temp, mRangeDcl, 0, temp->getNumRows(),
3826                             0);
3827 
3828                         splice(bb, next_inst_it, builder_->instList, curInst->getCISAOff());
3829                     }
3830                 }
3831                 else
3832                 {
3833 
3834                     for (unsigned int i = 0; i < numrows; i++)
3835                     {
3836                         G4_INST* inst;
3837                         const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
3838                         G4_ExecSize curExSize {numEltPerGRF<Type_UD>()};
3839 
3840                         if ((i + 1) < numrows)
3841                             curExSize = G4_ExecSize(numEltPerGRF<Type_UD>()*2);
3842 
3843                         G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getVar(), (short)i, 0, rd, Type_F);
3844 
3845                         G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), (short)i, 0, 1, Type_F);
3846 
3847                         inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3848 
3849                         bb->insertBefore(inst_it, inst);
3850 
3851                         if (spill)
3852                         {
3853                             // Also insert spill code
3854                             G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), (short)i, 0, rd, Type_F);
3855 
3856                             G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getVar(), (short)i, 0, 1, Type_F);
3857 
3858                             inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3859 
3860                             bb->insertBefore(next_inst_it, inst);
3861                         }
3862 
3863                         // If 2 rows were processed then increment induction var suitably
3864                         if (   curExSize == 16)
3865                             i++;
3866                     }
3867                 }
3868 
3869                 // Update points to
3870                 // Note: points2 set should be updated after inserting fill code,
3871                 // however, this sets a bit in liveness bit-vector that
3872                 // causes the temp variable to be marked as live-out from
3873                 // that BB. A general fix should treat address taken variables
3874                 // more accurately wrt liveness so they dont escape via
3875                 // unfeasible paths.
3876                 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
3877             }
3878             else if (numrows == 1)
3879             {
3880                 // Insert spill/fill when there decl uses a single row, that too not completely
3881                 G4_ExecSize curExSize = g4::SIMD16;
3882                 unsigned short numbytes = lr->getDcl()->getNumElems() * lr->getDcl()->getElemSize();
3883 
3884                 //temp->setAddressed();
3885                 short off = 0;
3886 
3887                 while (numbytes > 0)
3888                 {
3889                     G4_INST* inst;
3890                     G4_Type type = Type_W;
3891 
3892                     if (numbytes >= 16)
3893                         curExSize = g4::SIMD8;
3894                     else if (numbytes >= 8 && numbytes < 16)
3895                         curExSize = g4::SIMD4;
3896                     else if (numbytes >= 4 && numbytes < 8)
3897                         curExSize = g4::SIMD2;
3898                     else if (numbytes >= 2 && numbytes < 4)
3899                         curExSize = g4::SIMD1;
3900                     else if (numbytes == 1)
3901                     {
3902                         // If a region has odd number of bytes, copy last byte in final iteration
3903                         curExSize = g4::SIMD1;
3904                         type = Type_UB;
3905                     }
3906                     else {
3907                         MUST_BE_TRUE(false, "Cannot emit SIMD1 for byte");
3908                         curExSize = G4_ExecSize(0);
3909                     }
3910 
3911                     const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
3912 
3913                     G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getVar(), 0, off, rd, type);
3914 
3915                     G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), 0, off, 1, type);
3916 
3917                     inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3918 
3919                     bb->insertBefore(inst_it, inst);
3920 
3921                     if (spill)
3922                     {
3923                         // Also insert spill code
3924                         G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), 0, off, rd, type);
3925 
3926                         G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getVar(), 0, off, 1, type);
3927 
3928                         inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
3929 
3930                         bb->insertBefore(next_inst_it, inst);
3931                     }
3932 
3933                     off += curExSize;
3934                     numbytes -= curExSize*2;
3935                 }
3936 
3937                 // Update points to
3938                 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
3939             }
3940 
3941             if (!spill)
3942             {
3943                 // Insert pseudo_use node so that liveness keeps the
3944                 // filled variable live through the indirect access.
3945                 // Not required for spill because for spill we will
3946                 // anyway insert a ues of the variable to emit store.
3947                 const RegionDesc* rd = kernel->fg.builder->getRegionScalar();
3948 
3949                 G4_SrcRegRegion* pseudoUseSrc =
3950                     kernel->fg.builder->createSrc(temp->getRegVar(), 0, 0, rd, Type_F);
3951 
3952                 G4_INST* pseudoUseInst = kernel->fg.builder->createInternalIntrinsicInst(
3953                     nullptr, Intrinsic::Use, g4::SIMD1,
3954                     nullptr, pseudoUseSrc, nullptr, nullptr, InstOpt_NoOpt);
3955 
3956                 bb->insertBefore(next_inst_it, pseudoUseInst);
3957             }
3958 
3959         }
3960     }
3961 }
3962 
3963 // Insert spill and fill code for indirect GRF accesses
insertAddrTakenLSSpillAndFillCode(G4_Kernel * kernel,G4_BB * bb,INST_LIST::iterator inst_it,G4_Operand * opnd,PointsToAnalysis & pointsToAnalysis,bool spill,unsigned int bbid)3964 void SpillManagerGRF::insertAddrTakenLSSpillAndFillCode(
3965     G4_Kernel* kernel, G4_BB* bb,
3966     INST_LIST::iterator inst_it, G4_Operand* opnd,
3967     PointsToAnalysis& pointsToAnalysis, bool spill, unsigned int bbid)
3968 {
3969     curInst = (*inst_it);
3970     INST_LIST::iterator next_inst_it = ++inst_it;
3971     inst_it--;
3972 
3973     // Check whether spill operand points to any spilled range
3974     for (LSLiveRange* lr : *spilledLSLRs_)
3975     {
3976         G4_RegVar* var = nullptr;
3977 
3978         if (opnd->isDstRegRegion() && opnd->asDstRegRegion()->getBase()->asRegVar())
3979             var = opnd->asDstRegRegion()->getBase()->asRegVar();
3980 
3981         if (opnd->isSrcRegRegion() && opnd->asSrcRegRegion()->getBase()->asRegVar())
3982             var = opnd->asSrcRegRegion()->getBase()->asRegVar();
3983 
3984         MUST_BE_TRUE(var != NULL, "Fill operand is neither a source nor dst region");
3985 
3986         if (var &&
3987             pointsToAnalysis.isPresentInPointsTo(var,
3988                 lr->getTopDcl()->getRegVar()))
3989         {
3990             unsigned int numrows = lr->getTopDcl()->getNumRows();
3991             G4_Declare* temp = getOrCreateAddrSpillFillDcl(lr->getTopDcl(), kernel);
3992 
3993             if (failSafeSpill_ &&
3994                 temp->getRegVar()->getPhyReg() == NULL)
3995             {
3996                 temp->getRegVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegOffset_), 0);
3997                 spillRegOffset_ += numrows;
3998             }
3999 
4000             if (!lr->isActiveLR())
4001             {
4002                 lr->setActiveLR(true);
4003                 updateActiveList(lr, &activeLR_);
4004             }
4005 
4006             if (numrows > 1 || (lr->getTopDcl()->getNumElems() * lr->getTopDcl()->getElemSize() == getGRFSize()))
4007             {
4008                 if (useScratchMsg_ || useSplitSend())
4009                 {
4010                     G4_Declare* fillGRFRangeDcl = temp;
4011                     G4_Declare* mRangeDcl =
4012                         createAndInitMHeader(
4013                             (G4_RegVarTransient*)temp->getRegVar()->getBaseRegVar());
4014 
4015                     sendInSpilledRegVarPortions(
4016                         fillGRFRangeDcl, mRangeDcl, 0,
4017                         temp->getNumRows(), 0);
4018 
4019                     splice(bb, inst_it, builder_->instList, curInst->getCISAOff());
4020 
4021                     if (spill)
4022                     {
4023                         sendOutSpilledRegVarPortions(
4024                             temp, mRangeDcl, 0, temp->getNumRows(),
4025                             0);
4026 
4027                         splice(bb, next_inst_it, builder_->instList, curInst->getCISAOff());
4028                     }
4029                 }
4030                 else
4031                 {
4032 
4033                     for (unsigned int i = 0; i < numrows; i++)
4034                     {
4035                         G4_INST* inst;
4036                         const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
4037                         G4_ExecSize curExSize{ numEltPerGRF<Type_UD>() };
4038 
4039                         if ((i + 1) < numrows)
4040                             curExSize = G4_ExecSize(numEltPerGRF<Type_UD>() * 2);
4041 
4042                         G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getTopDcl()->getRegVar(), (short)i, 0, rd, Type_F);
4043 
4044                         G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), (short)i, 0, 1, Type_F);
4045 
4046                         inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4047 
4048                         bb->insertBefore(inst_it, inst);
4049 
4050                         if (spill)
4051                         {
4052                             // Also insert spill code
4053                             G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), (short)i, 0, rd, Type_F);
4054 
4055                             G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getTopDcl()->getRegVar(), (short)i, 0, 1, Type_F);
4056 
4057                             inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4058 
4059                             bb->insertBefore(next_inst_it, inst);
4060                         }
4061 
4062                         // If 2 rows were processed then increment induction var suitably
4063                         if (curExSize == 16)
4064                             i++;
4065                     }
4066                 }
4067 
4068                 // Update points to
4069                 // Note: points2 set should be updated after inserting fill code,
4070                 // however, this sets a bit in liveness bit-vector that
4071                 // causes the temp variable to be marked as live-out from
4072                 // that BB. A general fix should treat address taken variables
4073                 // more accurately wrt liveness so they dont escape via
4074                 // unfeasible paths.
4075                 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
4076             }
4077             else if (numrows == 1)
4078             {
4079                 // Insert spill/fill when there decl uses a single row, that too not completely
4080                 G4_ExecSize curExSize = g4::SIMD16;
4081                 unsigned short numbytes = lr->getTopDcl()->getNumElems() * lr->getTopDcl()->getElemSize();
4082 
4083                 //temp->setAddressed();
4084                 short off = 0;
4085 
4086                 while (numbytes > 0)
4087                 {
4088                     G4_INST* inst;
4089                     G4_Type type = Type_W;
4090 
4091                     if (numbytes >= 16)
4092                         curExSize = g4::SIMD8;
4093                     else if (numbytes >= 8 && numbytes < 16)
4094                         curExSize = g4::SIMD4;
4095                     else if (numbytes >= 4 && numbytes < 8)
4096                         curExSize = g4::SIMD2;
4097                     else if (numbytes >= 2 && numbytes < 4)
4098                         curExSize = g4::SIMD1;
4099                     else if (numbytes == 1)
4100                     {
4101                         // If a region has odd number of bytes, copy last byte in final iteration
4102                         curExSize = g4::SIMD1;
4103                         type = Type_UB;
4104                     }
4105                     else {
4106                         MUST_BE_TRUE(false, "Cannot emit SIMD1 for byte");
4107                         curExSize = G4_ExecSize(0);
4108                     }
4109 
4110                     const RegionDesc* rd = kernel->fg.builder->getRegionStride1();
4111 
4112                     G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(lr->getTopDcl()->getRegVar(), 0, off, rd, type);
4113 
4114                     G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(temp->getRegVar(), 0, off, 1, type);
4115 
4116                     inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4117 
4118                     bb->insertBefore(inst_it, inst);
4119 
4120                     if (spill)
4121                     {
4122                         // Also insert spill code
4123                         G4_SrcRegRegion* srcRex = kernel->fg.builder->createSrc(temp->getRegVar(), 0, off, rd, type);
4124 
4125                         G4_DstRegRegion* dstRex = kernel->fg.builder->createDst(lr->getTopDcl()->getRegVar(), 0, off, 1, type);
4126 
4127                         inst = kernel->fg.builder->createMov(curExSize, dstRex, srcRex, InstOpt_WriteEnable, false);
4128 
4129                         bb->insertBefore(next_inst_it, inst);
4130                     }
4131 
4132                     off += curExSize;
4133                     numbytes -= curExSize * 2;
4134                 }
4135 
4136                 // Update points to
4137                 //pointsToAnalysis.addFillToPointsTo(bbid, var, temp->getRegVar());
4138             }
4139 
4140             if (!spill)
4141             {
4142                 // Insert pseudo_use node so that liveness keeps the
4143                 // filled variable live through the indirect access.
4144                 // Not required for spill because for spill we will
4145                 // anyway insert a ues of the variable to emit store.
4146                 const RegionDesc* rd = kernel->fg.builder->getRegionScalar();
4147 
4148                 G4_SrcRegRegion* pseudoUseSrc =
4149                     kernel->fg.builder->createSrc(temp->getRegVar(), 0, 0, rd, Type_F);
4150 
4151                 G4_INST* pseudoUseInst = kernel->fg.builder->createInternalIntrinsicInst(
4152                     nullptr, Intrinsic::Use, g4::SIMD1,
4153                     nullptr, pseudoUseSrc, nullptr, nullptr, InstOpt_NoOpt);
4154 
4155                 bb->insertBefore(next_inst_it, pseudoUseInst);
4156             }
4157 
4158         }
4159     }
4160 }
4161 
4162 // Insert any spill/fills for address taken
insertAddrTakenSpillFill(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4163 void SpillManagerGRF::insertAddrTakenSpillFill(
4164     G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4165 {
4166     for (auto bb : kernel->fg)
4167     {
4168         for (INST_LIST_ITER inst_it = bb->begin();
4169             inst_it != bb->end();
4170             inst_it++)
4171         {
4172             G4_INST* curInst = (*inst_it);
4173 
4174             if (failSafeSpill_)
4175             {
4176                 spillRegOffset_ = indrSpillRegStart_;
4177             }
4178 
4179             // Handle indirect destination
4180             G4_DstRegRegion* dst = curInst->getDst();
4181 
4182             if (dst && dst->getRegAccess() == IndirGRF)
4183             {
4184                 insertAddrTakenSpillAndFillCode(kernel, bb, inst_it, dst, pointsToAnalysis, true, bb->getId());
4185             }
4186 
4187             for (int i = 0; i < G4_MAX_SRCS; i++)
4188             {
4189                 G4_Operand* src = curInst->getSrc(i);
4190 
4191                 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4192                 {
4193                     insertAddrTakenSpillAndFillCode(kernel, bb, inst_it, src, pointsToAnalysis, false, bb->getId());
4194                 }
4195             }
4196         }
4197     }
4198 }
4199 
insertAddrTakenLSSpillFill(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4200 void SpillManagerGRF::insertAddrTakenLSSpillFill(
4201     G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4202 {
4203     for (auto bb : kernel->fg)
4204     {
4205         for (INST_LIST_ITER inst_it = bb->begin();
4206             inst_it != bb->end();
4207             inst_it++)
4208         {
4209             G4_INST* curInst = (*inst_it);
4210 
4211             unsigned int instID = curInst->getLexicalId();
4212             if (instID != (unsigned int)-1)
4213             {
4214                 expireRanges(instID * 2, &activeLR_);
4215             }
4216 
4217             if (failSafeSpill_)
4218             {
4219                 spillRegOffset_ = indrSpillRegStart_;
4220             }
4221 
4222             // Handle indirect destination
4223             G4_DstRegRegion* dst = curInst->getDst();
4224 
4225             if (dst && dst->getRegAccess() == IndirGRF)
4226             {
4227                 insertAddrTakenLSSpillAndFillCode(kernel, bb, inst_it, dst, pointsToAnalysis, true, bb->getId());
4228             }
4229 
4230             for (int i = 0; i < G4_MAX_SRCS; i++)
4231             {
4232                 G4_Operand* src = curInst->getSrc(i);
4233 
4234                 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4235                 {
4236                     insertAddrTakenLSSpillAndFillCode(kernel, bb, inst_it, src, pointsToAnalysis, false, bb->getId());
4237                 }
4238             }
4239         }
4240     }
4241 
4242     if (activeLR_.size() > 0)
4243     {
4244         // Expire any remaining ranges
4245         LSLiveRange* lastActive = activeLR_.back();
4246         unsigned int endIdx;
4247 
4248         lastActive->getLastRef(endIdx);
4249 
4250         expireRanges(endIdx, &activeLR_);
4251     }
4252 
4253 }
4254 
4255 // For address spill/fill code inserted remove from points of each indirect operand
4256 // the original regvar that is spilled.
prunePointsTo(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4257 void SpillManagerGRF::prunePointsTo(
4258     G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4259 {
4260     for (auto bb : kernel->fg)
4261     {
4262         for (INST_LIST_ITER inst_it = bb->begin();
4263             inst_it != bb->end();
4264             inst_it++)
4265         {
4266             G4_INST* curInst = (*inst_it);
4267             std::stack<G4_Operand*> st;
4268 
4269             // Handle indirect destination
4270             G4_DstRegRegion* dst = curInst->getDst();
4271 
4272             if (dst && dst->getRegAccess() == IndirGRF)
4273             {
4274                 st.push(dst);
4275             }
4276 
4277             for (int i = 0; i < G4_MAX_SRCS; i++)
4278             {
4279                 G4_Operand* src = curInst->getSrc(i);
4280 
4281                 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4282                 {
4283                     st.push(src);
4284                 }
4285             }
4286 
4287             while (st.size() > 0)
4288             {
4289                 G4_Operand* cur = st.top();
4290                 st.pop();
4291 
4292                 // Check whether spill operand points to any spilled range
4293                 for (const LiveRange* lr : *spilledLRs_) {
4294                     G4_RegVar* var = nullptr;
4295 
4296                     if (cur->isDstRegRegion() && cur->asDstRegRegion()->getBase()->asRegVar())
4297                         var = cur->asDstRegRegion()->getBase()->asRegVar();
4298 
4299                     if (cur->isSrcRegRegion() && cur->asSrcRegRegion()->getBase()->asRegVar())
4300                         var = cur->asSrcRegRegion()->getBase()->asRegVar();
4301 
4302                     MUST_BE_TRUE(var != nullptr, "Operand is neither a source nor dst region");
4303 
4304                     if (var &&
4305                         pointsToAnalysis.isPresentInPointsTo(var,
4306                         lr->getVar()))
4307                     {
4308                         // Remove this from points to
4309                         pointsToAnalysis.removeFromPointsTo(var, lr->getVar());
4310                     }
4311                 }
4312             }
4313         }
4314     }
4315 }
4316 
prunePointsToLS(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4317 void SpillManagerGRF::prunePointsToLS(
4318     G4_Kernel* kernel, PointsToAnalysis& pointsToAnalysis)
4319 {
4320     for (auto bb : kernel->fg)
4321     {
4322         for (INST_LIST_ITER inst_it = bb->begin();
4323             inst_it != bb->end();
4324             inst_it++)
4325         {
4326             G4_INST* curInst = (*inst_it);
4327             std::stack<G4_Operand*> st;
4328 
4329             // Handle indirect destination
4330             G4_DstRegRegion* dst = curInst->getDst();
4331 
4332             if (dst && dst->getRegAccess() == IndirGRF)
4333             {
4334                 st.push(dst);
4335             }
4336 
4337             for (int i = 0; i < G4_MAX_SRCS; i++)
4338             {
4339                 G4_Operand* src = curInst->getSrc(i);
4340 
4341                 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
4342                 {
4343                     st.push(src);
4344                 }
4345             }
4346 
4347             while (st.size() > 0)
4348             {
4349                 G4_Operand* cur = st.top();
4350                 st.pop();
4351 
4352                 // Check whether spill operand points to any spilled range
4353                 for (LSLiveRange* lr : *spilledLSLRs_)
4354                 {
4355                     G4_RegVar* var = nullptr;
4356 
4357                     if (cur->isDstRegRegion() && cur->asDstRegRegion()->getBase()->asRegVar())
4358                         var = cur->asDstRegRegion()->getBase()->asRegVar();
4359 
4360                     if (cur->isSrcRegRegion() && cur->asSrcRegRegion()->getBase()->asRegVar())
4361                         var = cur->asSrcRegRegion()->getBase()->asRegVar();
4362 
4363                     MUST_BE_TRUE(var != NULL, "Operand is neither a source nor dst region");
4364 
4365                     if (var &&
4366                         pointsToAnalysis.isPresentInPointsTo(var,
4367                             lr->getTopDcl()->getRegVar()))
4368                     {
4369                         // Remove this from points to
4370                         pointsToAnalysis.removeFromPointsTo(var, lr->getTopDcl()->getRegVar());
4371                     }
4372                 }
4373             }
4374         }
4375     }
4376 }
4377 
runSpillAnalysis()4378 void SpillManagerGRF::runSpillAnalysis()
4379 {
4380     if (failSafeSpill_)
4381     {
4382         // ToDo: use the reserved GRFs to perform scalar immediate rematerialization
4383         return;
4384     }
4385 
4386     std::unordered_set<G4_Declare*> spilledDcl;
4387     scalarImmSpill.clear();
4388 
4389     for (auto bb : gra.kernel.fg)
4390     {
4391         for (auto inst : *bb)
4392         {
4393             auto dst = inst->getDst();
4394             auto dcl = dst && dst->getTopDcl() ? dst->getTopDcl()->getRootDeclare() : nullptr;
4395             if (!dcl || dcl->getAddressed() || dcl->getNumElems() != 1 || !shouldSpillRegister(dcl->getRegVar()))
4396             {
4397                 // declare must be a scalar without address taken
4398                 continue;
4399             }
4400             if (spilledDcl.count(dcl))
4401             {
4402                 // this spilled declare is defined more than once
4403                 scalarImmSpill.erase(dcl);
4404                 continue;
4405             }
4406             spilledDcl.insert(dcl);
4407             if (inst->opcode() == G4_mov && inst->getExecSize() == g4::SIMD1 && inst->getSrc(0)->isImm() && !inst->getPredicate() && !inst->getSaturate())
4408             {
4409                 scalarImmSpill[dcl] = inst->getSrc(0)->asImm();
4410             }
4411         }
4412     }
4413 }
4414 
4415 // Insert spill/fill code for all registers that have not been assigned
4416 // physical registers in the current iteration of the graph coloring
4417 // allocator.
4418 // returns false if spill fails somehow
insertSpillFillCode(G4_Kernel * kernel,PointsToAnalysis & pointsToAnalysis)4419 bool SpillManagerGRF::insertSpillFillCode(
4420     G4_Kernel * kernel, PointsToAnalysis& pointsToAnalysis)
4421 {
4422     //runSpillAnalysis();
4423     // Set the spill flag of all spilled regvars.
4424     for (const LiveRange* lr : *spilledLRs_) {
4425 
4426         // Ignore request to spill/fill the spill/fill ranges
4427         // as it does not help the allocator.
4428         if (shouldSpillRegister(lr->getVar()) == false)
4429         {
4430             bool needsEOTGRF = lr->getEOTSrc() && builder_->hasEOTGRFBinding();
4431             if (failSafeSpill_ && needsEOTGRF &&
4432                 (lr->getVar()->isRegVarTransient() ||
4433                     lr->getVar()->isRegVarTmp()))
4434             {
4435                 lr->getVar()->setPhyReg(builder_->phyregpool.getGreg(spillRegStart_ > (kernel->getNumRegTotal() - 16) ? spillRegStart_ : (kernel->getNumRegTotal() - 16)), 0);
4436                 continue;
4437             }
4438             return false;
4439         }
4440         else
4441         {
4442             lr->getVar()->getDeclare()->setSpillFlag();
4443         }
4444     }
4445 
4446     // Handle address taken spills
4447     bool success = handleAddrTakenSpills(kernel, pointsToAnalysis);
4448 
4449     if (!success)
4450     {
4451         DEBUG_MSG("Enough physical register not available for handling address taken spills" << std::endl);
4452         return false;
4453     }
4454 
4455     // Insert spill/fill code for all basic blocks.
4456     updateRMWNeeded();
4457     FlowGraph& fg = kernel->fg;
4458 
4459     unsigned int id = 0;
4460     for (BB_LIST_ITER it = fg.begin(); it != fg.end(); it++)
4461     {
4462         bbId_ = (*it)->getId();
4463         INST_LIST::iterator jt = (*it)->begin();
4464 
4465         while (jt != (*it)->end()) {
4466             INST_LIST::iterator kt = jt;
4467             ++kt;
4468             G4_INST* inst = *jt;
4469 
4470             curInst = inst;
4471             curInst->setLexicalId(id++);
4472 
4473             if (failSafeSpill_)
4474             {
4475                 spillRegOffset_ = spillRegStart_;
4476             }
4477 
4478             // Insert spill code, when the target is a spilled register.
4479 
4480             if (inst->getDst())
4481             {
4482                 G4_RegVar* regVar = nullptr;
4483                 if (inst->getDst()->getBase()->isRegVar())
4484                 {
4485                     regVar = getRegVar(inst->getDst());
4486                 }
4487 
4488                 if (regVar && shouldSpillRegister(regVar))
4489                 {
4490                     if (getRFType(regVar) == G4_GRF)
4491                     {
4492                         if (inst->isPseudoKill())
4493                         {
4494                             (*it)->erase(jt);
4495                             jt = kt;
4496                             continue;
4497                         }
4498 
4499                         insertSpillRangeCode(jt, (*it));
4500                     }
4501                     else
4502                     {
4503                         assert(0);
4504                     }
4505                 }
4506             }
4507 
4508 
4509             // Insert fill code, when the source is a spilled register.
4510 
4511             for (unsigned i = 0; i < G4_MAX_SRCS; i++)
4512             {
4513                 if (inst->getSrc(i) &&
4514                     inst->getSrc(i)->isSrcRegRegion ())
4515                 {
4516                     auto srcRR = inst->getSrc(i)->asSrcRegRegion();
4517                     G4_RegVar* regVar = nullptr;
4518                     if (srcRR->getBase()->isRegVar())
4519                     {
4520                         regVar = getRegVar(srcRR);
4521                     }
4522 
4523                     if (regVar && shouldSpillRegister(regVar))
4524                     {
4525                         if (inst->isLifeTimeEnd())
4526                         {
4527                             (*it)->erase(jt);
4528                             break;
4529                         }
4530                         bool mayExceedTwoGRF = (inst->isSend() && i == 0) ||
4531                             inst->isDpas() ||
4532                             (inst->isSplitSend() && i == 1);
4533 
4534                         if (mayExceedTwoGRF)
4535                         {
4536                             insertSendFillRangeCode(srcRR, jt, *it);
4537                         }
4538                         else if (getRFType(regVar) == G4_GRF)
4539                             insertFillGRFRangeCode(srcRR, jt, *it);
4540                         else
4541                             assert(0);
4542                     }
4543                 }
4544             }
4545 
4546             jt = kt;
4547         }
4548     }
4549 
4550     bbId_ = UINT_MAX;
4551 
4552     // Calculate the spill memory used in this iteration
4553 
4554     for (auto spill : *spilledLRs_)
4555     {
4556         unsigned disp = spill->getVar ()->getDisp ();
4557 
4558         if (spill->getVar ()->isSpilled ())
4559         {
4560             if (disp != UINT_MAX)
4561             {
4562                 nextSpillOffset_ = std::max(nextSpillOffset_, disp + getByteSize(spill->getVar()));
4563             }
4564         }
4565     }
4566 
4567     // Emit the instruction with the introduced spill/fill ranges in the
4568     // current iteration.
4569 
4570 #ifndef NDEBUG
4571 #ifdef DEBUG_VERBOSE_ON1
4572     std::stringstream fname;
4573     fname << "spill_code_" << iterationNo_++ << "_" << kernel->getName()
4574           << ends;
4575     std::ofstream sout;
4576     sout.open(fname.str());
4577     kernel->emitDeviceAsm(sout, true, 0);
4578     sout.close ();
4579 #endif
4580 #endif
4581 
4582     return true;
4583 }
4584 
4585 
expireRanges(unsigned int idx,std::list<LSLiveRange * > * liveList)4586 void SpillManagerGRF::expireRanges(
4587     unsigned int idx, std::list<LSLiveRange*> * liveList)
4588 {
4589     //active list is sorted in ascending order of starting index
4590 
4591     while (liveList->size() > 0)
4592     {
4593         unsigned int endIdx;
4594         LSLiveRange* lr = liveList->front();
4595 
4596         lr->getLastRef(endIdx);
4597 
4598         if (endIdx <= idx)
4599         {
4600 #ifdef DEBUG_VERBOSE_ON
4601             DEBUG_VERBOSE("Expiring range " << lr->getTopDcl()->getName() << std::endl);
4602 #endif
4603             // Remove range from active list
4604             liveList->pop_front();
4605             lr->setActiveLR(false);
4606         }
4607         else
4608         {
4609             // As soon as we find first range that ends after ids break loop
4610             break;
4611         }
4612     }
4613 
4614     return;
4615 }
4616 
updateActiveList(LSLiveRange * lr,std::list<LSLiveRange * > * liveList)4617 void SpillManagerGRF::updateActiveList(
4618     LSLiveRange * lr, std::list<LSLiveRange*> * liveList)
4619 {
4620     bool done = false;
4621     unsigned int newlr_end;
4622 
4623     lr->getLastRef(newlr_end);
4624 
4625     for (auto active_it = liveList->begin();
4626         active_it != liveList->end();
4627         active_it++)
4628     {
4629         unsigned int end_idx;
4630         LSLiveRange* active_lr = (*active_it);
4631 
4632         active_lr->getLastRef(end_idx);
4633 
4634         if (end_idx > newlr_end)
4635         {
4636             liveList->insert(active_it, lr);
4637             done = true;
4638             break;
4639         }
4640     }
4641 
4642     if (done == false)
4643         liveList->push_back(lr);
4644 }
4645 
spillLiveRanges(G4_Kernel * kernel)4646 bool SpillManagerGRF::spillLiveRanges(G4_Kernel * kernel)
4647 {
4648     // Set the spill flag of all spilled regvars.
4649     for (LSLiveRange* lr : *spilledLSLRs_) {
4650         lr->getTopDcl()->setSpillFlag();
4651     }
4652 
4653     // Handle address taken spills
4654     unsigned addrSpillNum = handleAddrTakenLSSpills(kernel, gra.pointsToAnalysis);
4655 
4656     if (addrSpillNum)
4657     {
4658         for (auto spill : *spilledLSLRs_)
4659         {
4660             unsigned disp = spill->getTopDcl()->getRegVar()->getDisp();
4661 
4662             if (spill->getTopDcl()->getRegVar()->isSpilled())
4663             {
4664                 if (disp != UINT_MAX)
4665                 {
4666                     nextSpillOffset_ = std::max(nextSpillOffset_, disp + getByteSize(spill->getTopDcl()->getRegVar()));
4667                 }
4668             }
4669         }
4670     }
4671 
4672     // Insert spill/fill code for all basic blocks.
4673     FlowGraph& fg = kernel->fg;
4674     for (BB_LIST_ITER it = fg.begin(); it != fg.end(); it++)
4675     {
4676         bbId_ = (*it)->getId();
4677         INST_LIST::iterator jt = (*it)->begin();
4678 
4679         while (jt != (*it)->end())
4680         {
4681             INST_LIST::iterator kt = jt;
4682             ++kt;
4683             G4_INST* inst = *jt;
4684             unsigned int instID = inst->getLexicalId();
4685             curInst = inst;
4686             if (instID != (unsigned int)-1)
4687             {
4688                 expireRanges(instID * 2, &activeLR_);
4689             }
4690 
4691             if (failSafeSpill_)
4692             {
4693                 spillRegOffset_ = spillRegStart_;
4694             }
4695 
4696             // Insert spill code, when the target is a spilled register.
4697             if (inst->getDst())
4698             {
4699                 G4_RegVar* regVar = nullptr;
4700                 if (inst->getDst()->getBase()->isRegVar())
4701                 {
4702                     regVar = getRegVar(inst->getDst());
4703                 }
4704 
4705                 if (regVar && regVar->getDeclare()->isSpilled())
4706                 {
4707                     G4_Declare* dcl = regVar->getDeclare();
4708                     while (dcl->getAliasDeclare())
4709                     {
4710                         dcl = dcl->getAliasDeclare();
4711                     }
4712                     LSLiveRange* lr = gra.getLSLR(dcl);
4713                     if (!lr->isActiveLR())
4714                     {
4715                         lr->setActiveLR(true);
4716                         updateActiveList(lr, &activeLR_);
4717                     }
4718 
4719                     if (getRFType(regVar) == G4_GRF)
4720                     {
4721                         if (inst->isPseudoKill())
4722                         {
4723                             (*it)->erase(jt);
4724                             jt = kt;
4725                             continue;
4726                         }
4727 
4728                         insertSpillRangeCode(jt, (*it));
4729                     }
4730                     else
4731                     {
4732                         assert(0);
4733                     }
4734                 }
4735             }
4736 
4737             // Insert fill code, when the source is a spilled register.
4738             for (unsigned i = 0; i < G4_MAX_SRCS; i++)
4739             {
4740                 if (inst->getSrc(i) &&
4741                     inst->getSrc(i)->isSrcRegRegion ())
4742                 {
4743                     auto srcRR = inst->getSrc(i)->asSrcRegRegion();
4744                     G4_RegVar* regVar = nullptr;
4745                     if (srcRR->getBase()->isRegVar())
4746                     {
4747                         regVar = getRegVar(srcRR);
4748                     }
4749 
4750                     if (regVar && regVar->getDeclare()->isSpilled())
4751                     {
4752                         G4_Declare* dcl = regVar->getDeclare();
4753                         while (dcl->getAliasDeclare())
4754                         {
4755                             dcl = dcl->getAliasDeclare();
4756                         }
4757                         LSLiveRange* lr = gra.getLSLR(dcl);
4758                         if (!lr->isActiveLR())
4759                         {
4760                             lr->setActiveLR(true);
4761                             updateActiveList(lr, &activeLR_);
4762                         }
4763 
4764                         if (inst->isLifeTimeEnd())
4765                         {
4766                             (*it)->erase(jt);
4767                             break;
4768                         }
4769                         bool mayExceedTwoGRF = (inst->isSend() && i == 0) ||
4770                             inst->isDpas() ||
4771                             (inst->isSplitSend() && i == 1);
4772 
4773                         if (mayExceedTwoGRF)
4774                         {
4775                             insertSendFillRangeCode(srcRR, jt, *it);
4776                         }
4777                         else if (getRFType(regVar) == G4_GRF)
4778                             insertFillGRFRangeCode(srcRR, jt, *it);
4779                         else
4780                             assert(0);
4781                     }
4782                 }
4783             }
4784 
4785             jt = kt;
4786         }
4787     }
4788 
4789     bbId_ = UINT_MAX;
4790 
4791     // Calculate the spill memory used in this iteration
4792     for (auto spill : (*spilledLSLRs_))
4793     {
4794         unsigned disp = spill->getTopDcl()->getRegVar()->getDisp();
4795 
4796         if (spill->getTopDcl()->getRegVar()->isSpilled ())
4797         {
4798             if (disp != UINT_MAX)
4799             {
4800                 nextSpillOffset_ = std::max(nextSpillOffset_, disp + getByteSize(spill->getTopDcl()->getRegVar()));
4801             }
4802         }
4803     }
4804 
4805     return true;
4806 }
4807 
4808 //
4809 // For XeHP_SDV+ scratch surface is used for the vISA stack.  This means when
4810 // the scratch message cannot be used for spill/fill (e.g., stack call),
4811 // a0.2 will be used as the message descriptor for the spill/fill.
4812 // As address RA is done before GRF, we don't know if a0.2 is live at the
4813 // point of the spill/fill inst and thus may need to preserve its value.
4814 // The good news is that all spill/fill may share the same A0, so we only
4815 // need to save/restore A0 when it's actually referenced in the BB.
4816 //
saveRestoreA0(G4_BB * bb)4817 void GlobalRA::saveRestoreA0(G4_BB * bb)
4818 {
4819     G4_Declare* tmpDcl = nullptr;
4820     unsigned int subReg = 0;
4821     if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
4822     {
4823         // Use r126.6:ud for storing old a0.2 as it isn't caller/callee save
4824         tmpDcl = builder.kernel.fg.getScratchRegDcl();
4825         subReg = 6;
4826     }
4827     else
4828     {
4829         MUST_BE_TRUE(builder.hasValidOldA0Dot2(), "old a0.2 not saved");
4830         tmpDcl = builder.getOldA0Dot2Temp();
4831         subReg = 0;
4832     }
4833 
4834     auto usesAddr = [](G4_INST* inst)
4835     {
4836         // ToDo: handle send with A0 msg desc better.
4837         if (inst->isSpillIntrinsic() || inst->isFillIntrinsic())
4838         {
4839             return false;
4840         }
4841         if (inst->getDst() && inst->getDst()->isAddress())
4842         {
4843             return true;
4844         }
4845         for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i)
4846         {
4847             if (inst->getSrc(i)->isAddress() || (inst->getSrc(i)->isSrcRegRegion() && inst->getSrc(i)->asSrcRegRegion()->isIndirect()))
4848             {
4849                 return true;
4850             }
4851         }
4852         return false;
4853     };
4854 
4855     // a0.2 is spilled to r126.6 (r126 is scratch reg reserved for stack call)
4856     auto a0SaveMov = [this, tmpDcl, subReg]()
4857     {
4858         auto dstSave = builder.createDst(tmpDcl->getRegVar(), 0, subReg, 1, Type_UD);
4859         auto srcSave = builder.createSrc(builder.getBuiltinA0Dot2()->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UD);
4860         auto saveInst = builder.createMov(g4::SIMD1, dstSave, srcSave, InstOpt_WriteEnable, false);
4861         return saveInst;
4862     };
4863 
4864     auto a0RestoreMov = [this, tmpDcl, subReg]()
4865     {
4866         auto dstRestore = builder.createDstRegRegion(builder.getBuiltinA0Dot2(), 1);
4867         auto srcRestore = builder.createSrc(tmpDcl->getRegVar(), 0, subReg, builder.getRegionScalar(), Type_UD);
4868         auto restoreInst = builder.createMov(g4::SIMD1, dstRestore, srcRestore, InstOpt_WriteEnable, false);
4869         return restoreInst;
4870     };
4871 
4872     auto a0SSOMove = [this]()
4873     {
4874         // shr (1) a0.2   SSO   0x4 {NM}
4875         // SSO is stored in r126.7
4876         auto dst = builder.createDstRegRegion(builder.getBuiltinA0Dot2(), 1);
4877         auto SSOsrc = builder.createSrc(builder.getSpillSurfaceOffset()->getRegVar(),
4878             0, 0, builder.getRegionScalar(), Type_UD);
4879         auto imm4 = builder.createImm(4, Type_UD);
4880 
4881         return builder.createBinOp(G4_shr, g4::SIMD1, dst, SSOsrc, imm4, InstOpt_WriteEnable, false);
4882     };
4883 
4884     auto isPrologOrEpilog = [this](G4_INST* inst)
4885     {
4886         // a0 is a caller save register. Dont save/restore it if it is used in callee save/restore sequence or
4887         // for frame descriptor spill instruction.
4888         if (inst == kernel.fg.builder->getFDSpillInst())
4889             return false;
4890 
4891         if (calleeSaveInsts.find(inst) != calleeSaveInsts.end() ||
4892             calleeRestoreInsts.find(inst) != calleeRestoreInsts.end())
4893             return false;
4894 
4895         return true;
4896     };
4897 
4898     bool hasActiveSpillFill = false;
4899 
4900     for (auto instIt = bb->begin(); instIt != bb->end(); ++instIt)
4901     {
4902         auto inst = (*instIt);
4903 
4904         if (inst->isSpillIntrinsic() || inst->isFillIntrinsic())
4905         {
4906             if (!hasActiveSpillFill)
4907             {
4908                 // save a0.2 to addrSpillLoc, then overwrite it with the scratch surface offset
4909                 if (isPrologOrEpilog(inst))
4910                 {
4911                     auto addrSpill = a0SaveMov();
4912                     bb->insertBefore(instIt, addrSpill);
4913                 }
4914                auto a0SSO = a0SSOMove();
4915                bb->insertBefore(instIt, a0SSO);
4916                hasActiveSpillFill = true;
4917             }
4918         }
4919         else if (hasActiveSpillFill && usesAddr(inst))
4920         {
4921             // restore A0
4922             auto addrFill = a0RestoreMov();
4923             bb->insertBefore(instIt, addrFill);
4924             hasActiveSpillFill = false;
4925         }
4926     }
4927 
4928     if (hasActiveSpillFill && !bb->isLastInstEOT() && !bb->isEndWithFRet())
4929     {
4930         // restore A0 before BB exit. BB is guaranteed to be non-empty as there's at least one spill/fill
4931         // If last inst is branch, insert restore before it. Otherwise insert it as last inst
4932         auto endIt = bb->back()->isCFInst() ? std::prev(bb->end()) : bb->end();
4933         bb->insertBefore(endIt, a0RestoreMov());
4934     }
4935 }
4936 
computeSpillMsgDesc(unsigned int payloadSize,unsigned int offsetInGrfUnits)4937 uint32_t computeSpillMsgDesc(unsigned int payloadSize, unsigned int offsetInGrfUnits)
4938 {
4939     // Compute msg descriptor given payload size and offset.
4940     unsigned headerPresent = 0x80000;
4941     uint32_t message = headerPresent;
4942     unsigned msgLength = SCRATCH_PAYLOAD_HEADER_MAX_HEIGHT;
4943     message |= (msgLength << getSendMsgLengthBitOffset());
4944     message |= (1 << SCRATCH_MSG_DESC_CATEORY);
4945     message |= (1 << SCRATCH_MSG_DESC_CHANNEL_MODE);
4946     message |= (1 << SCRATCH_MSG_DESC_OPERATION_MODE);
4947     unsigned blocksize_encoding = getScratchBlocksizeEncoding(payloadSize);
4948     message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
4949     int offset = offsetInGrfUnits;
4950     message |= offset;
4951 
4952     return message;
4953 }
4954 
computeFillMsgDesc(unsigned int payloadSize,unsigned int offsetInGrfUnits)4955 uint32_t computeFillMsgDesc(unsigned int payloadSize, unsigned int offsetInGrfUnits)
4956 {
4957     // Compute msg descriptor given payload size and offset.
4958     unsigned headerPresent = 0x80000;
4959     uint32_t message = headerPresent;
4960     unsigned msgLength = 1;
4961     message |= (msgLength << getSendMsgLengthBitOffset());
4962     message |= (1 << SCRATCH_MSG_DESC_CATEORY);
4963     message |= (0 << SCRATCH_MSG_INVALIDATE_AFTER_READ);
4964     unsigned blocksize_encoding = getScratchBlocksizeEncoding(payloadSize);
4965     message |= (blocksize_encoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
4966     message |= offsetInGrfUnits;
4967 
4968     return message;
4969 }
4970 
4971 // Returns payload size in units of GRF rows
// Picks the largest of 8, 4, 2 that does not exceed numRows; anything below 2
// (including 0) yields 1.
static unsigned int getPayloadSizeGRF(unsigned int numRows)
{
    for (unsigned int candidate = 8u; candidate > 1u; candidate /= 2u)
    {
        if (numRows >= candidate)
        {
            return candidate;
        }
    }

    return 1u;
}
4985 
// Returns the payload size, in owords, to use for numOwords: the largest of
// 8, 4, 2 that does not exceed numOwords, or 1 when numOwords is below 2
// (including 0).
static unsigned int getPayloadSizeOword(unsigned int numOwords)
{
    static const unsigned int kSizesDescending[] = { 8u, 4u, 2u };

    for (unsigned int size : kSizesDescending)
    {
        if (numOwords >= size)
        {
            return size;
        }
    }

    return 1u;
}
4999 
owordToGRFSize(unsigned int numOwords)5000 unsigned int GlobalRA::owordToGRFSize(unsigned int numOwords)
5001 {
5002     unsigned int GRFSize = numOwords / (2 * (numEltPerGRF<Type_UB>() / HWORD_BYTE_SIZE));
5003 
5004     return GRFSize;
5005 }
5006 
hwordToGRFSize(unsigned int numHwords)5007 unsigned int GlobalRA::hwordToGRFSize(unsigned int numHwords)
5008 {
5009     return owordToGRFSize(numHwords * 2);
5010 }
5011 
GRFToHwordSize(unsigned int numGRFs)5012 unsigned int GlobalRA::GRFToHwordSize(unsigned int numGRFs)
5013 {
5014     return GRFSizeToOwords(numGRFs) / 2;
5015 }
5016 
GRFSizeToOwords(unsigned int numGRFs)5017 unsigned int GlobalRA::GRFSizeToOwords(unsigned int numGRFs)
5018 {
5019     return numGRFs * (numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE);
5020 }
5021 
getHWordByteSize()5022 unsigned int GlobalRA::getHWordByteSize()
5023 {
5024     return HWORD_BYTE_SIZE;
5025 }
5026 
createSpillFillAddr(IR_Builder & builder,G4_Declare * addr,G4_Declare * fp,int offset)5027 static G4_INST* createSpillFillAddr(
5028     IR_Builder& builder, G4_Declare* addr, G4_Declare* fp, int offset)
5029 {
5030     auto imm = builder.createImm(offset, Type_UD);
5031     auto dst = builder.createDstRegRegion(addr, 1);
5032     if (fp)
5033     {
5034         auto src0 = builder.createSrcRegRegion(fp, builder.getRegionScalar());
5035         return builder.createBinOp(G4_add, g4::SIMD1, dst, src0, imm, InstOpt_WriteEnable, true);
5036     }
5037     else
5038     {
5039         return builder.createMov(g4::SIMD1, dst, imm, InstOpt_WriteEnable, true);
5040     }
5041 }
5042 
makeSpillFillComment(const char * spillFill,const char * toFrom,const char * base,uint32_t spillOffset,const char * of)5043 static std::string makeSpillFillComment(
5044     const char *spillFill,
5045     const char *toFrom,
5046     const char *base,
5047     uint32_t spillOffset,
5048     const char *of)
5049 {
5050     std::stringstream comment;
5051     comment << spillFill << " " <<  toFrom << " ";
5052     comment << base << "[" << spillOffset / getGRFSize() << "*" << (int)getGRFSize() << "]";
5053     if (!of || *of == 0) // some have "" as name
5054         of = "?";
5055     comment << " of " << of;
5056     return comment.str();
5057 }
5058 
expandSpillLSC(G4_BB * bb,INST_LIST_ITER & instIt)5059 void GlobalRA::expandSpillLSC(G4_BB* bb, INST_LIST_ITER& instIt)
5060 {
5061     auto& builder = kernel.fg.builder;
5062     auto inst = (*instIt)->asSpillIntrinsic();
5063     // offset into scratch surface in bytes
5064     auto spillOffset = inst->getOffsetInBytes();
5065     uint32_t numRows = inst->getNumRows();
5066     auto payload = inst->getSrc(1)->asSrcRegRegion();
5067     auto rowOffset = payload->getRegOff();
5068 
5069     LSC_OP op = LSC_STORE;
5070     LSC_SFID lscSfid = LSC_UGM;
5071     LSC_CACHE_OPTS cacheOpts{ LSC_CACHING_DEFAULT, LSC_CACHING_DEFAULT };
5072 
5073     LSC_ADDR addrInfo;
5074     addrInfo.type = LSC_ADDR_TYPE_SS; //Scratch memory
5075     addrInfo.immScale = 1;
5076     addrInfo.immOffset = 0;
5077     addrInfo.size = LSC_ADDR_SIZE_32b;
5078 
5079     builder->instList.clear();
5080     while (numRows > 0)
5081     {
5082         auto numGRFToWrite = getPayloadSizeGRF(numRows);
5083 
5084         G4_Declare* spillAddr = inst->getFP() ? kernel.fg.scratchRegDcl : inst->getHeader()->getTopDcl();
5085         {
5086             // need to calculate spill address
5087             createSpillFillAddr(*builder, spillAddr, inst->getFP(), spillOffset);
5088         }
5089 
5090         LSC_DATA_SHAPE dataShape;
5091         dataShape.size = LSC_DATA_SIZE_32b;
5092         dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
5093         dataShape.elems = builder->lscGetElementNum(numGRFToWrite * getGRFSize() / 4);
5094 
5095         auto src0Addr = builder->createSrcRegRegion(spillAddr, builder->getRegionStride1());
5096         auto payloadToUse = builder->createSrcWithNewRegOff(payload, rowOffset);
5097 
5098         auto surface = builder->createSrcRegRegion(builder->getSpillSurfaceOffset(),
5099             builder->getRegionScalar());
5100 
5101         G4_DstRegRegion* postDst = builder->createNullDst(Type_UD);
5102         G4_SendDescRaw* desc = builder->createLscMsgDesc(
5103             op,
5104             lscSfid,
5105             EXEC_SIZE_1,
5106             cacheOpts,
5107             addrInfo,
5108             dataShape,
5109             surface,
5110             0,
5111             1);
5112 
5113         auto sendInst = builder->createLscSendInst(
5114             nullptr,
5115             postDst,
5116             src0Addr,
5117             payloadToUse,
5118             g4::SIMD1,
5119             desc,
5120             inst->getOption(),
5121             LSC_ADDR_TYPE_SS,
5122             false);
5123 
5124         sendInst->addComment(makeSpillFillComment(
5125             "spill", "to",
5126             inst->getFP() ? "FP" : "offset",
5127             spillOffset,
5128             payload->getTopDcl()->getName()));
5129 
5130         numRows -= numGRFToWrite;
5131         rowOffset += numGRFToWrite;
5132         spillOffset += numGRFToWrite * getGRFSize();
5133     }
5134 
5135     if (getEUFusionWAInsts().count(inst) > 0)
5136     {
5137         removeEUFusionWAInst(inst);
5138         for (auto inst : builder->instList)
5139             addEUFusionWAInsts(inst);
5140     }
5141 
5142     splice(bb, instIt, builder->instList, inst->getCISAOff());
5143 }
5144 
expandFillLSC(G4_BB * bb,INST_LIST_ITER & instIt)5145 void GlobalRA::expandFillLSC(G4_BB* bb, INST_LIST_ITER& instIt)
5146 {
5147     auto& builder = kernel.fg.builder;
5148     auto inst = (*instIt)->asFillIntrinsic();
5149     // offset into scratch surface in bytes
5150     auto fillOffset = inst->getOffsetInBytes();
5151     uint32_t numRows = inst->getNumRows();
5152     auto rowOffset = inst->getDst()->getRegOff();
5153 
5154     LSC_OP op = LSC_LOAD;
5155     LSC_SFID lscSfid = LSC_UGM;
5156     LSC_CACHE_OPTS cacheOpts{ LSC_CACHING_DEFAULT, LSC_CACHING_DEFAULT };
5157 
5158     LSC_ADDR addrInfo;
5159     addrInfo.type = LSC_ADDR_TYPE_SS; //Scratch memory
5160     addrInfo.immScale = 1;
5161     addrInfo.immOffset = 0;
5162     addrInfo.size = LSC_ADDR_SIZE_32b;
5163 
5164     builder->instList.clear();
5165 
5166     while (numRows > 0)
5167     {
5168         unsigned responseLength = getPayloadSizeGRF(numRows);
5169         LSC_DATA_SHAPE dataShape;
5170         dataShape.size = LSC_DATA_SIZE_32b;
5171         dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
5172         dataShape.elems = builder->lscGetElementNum(responseLength * getGRFSize() / 4);
5173 
5174         G4_Declare* fillAddr = inst->getFP() ? kernel.fg.scratchRegDcl : inst->getHeader()->getTopDcl();
5175         {
5176             // need to calculate fill address
5177             createSpillFillAddr(*builder, fillAddr, inst->getFP(), fillOffset);
5178         }
5179         auto dstRead = builder->createDst(inst->getDst()->getTopDcl()->getRegVar(),
5180             (short)rowOffset, 0, 1, Type_UD);
5181 
5182         auto surface = builder->createSrcRegRegion(builder->getSpillSurfaceOffset(),
5183             builder->getRegionScalar());
5184 
5185         G4_SendDescRaw* desc = builder->createLscMsgDesc(
5186             op,
5187             lscSfid,
5188             EXEC_SIZE_1,
5189             cacheOpts,
5190             addrInfo,
5191             dataShape,
5192             surface,
5193             responseLength,
5194             1);
5195 
5196         auto sendInst = builder->createLscSendInst(
5197             nullptr,
5198             dstRead,
5199             builder->createSrcRegRegion(fillAddr, builder->getRegionScalar()),
5200             nullptr,
5201             g4::SIMD1,
5202             desc,
5203             inst->getOption(),
5204             LSC_ADDR_TYPE_SS,
5205             false);
5206 
5207         sendInst->addComment(makeSpillFillComment(
5208             "fill", "from",
5209             inst->getFP() ? "FP" : "offset",
5210             fillOffset,
5211             dstRead->getTopDcl()->getName()));
5212 
5213         numRows -= responseLength;
5214         rowOffset += responseLength;
5215         fillOffset += responseLength * getGRFSize();
5216     }
5217 
5218     if (getEUFusionWAInsts().count(inst) > 0)
5219     {
5220         removeEUFusionWAInst(inst);
5221         for (auto inst : builder->instList)
5222             addEUFusionWAInsts(inst);
5223     }
5224 
5225     splice(bb, instIt, builder->instList, inst->getCISAOff());
5226 }
5227 
expandSpillNonStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * header,G4_SrcRegRegion * payload,G4_BB * bb,INST_LIST_ITER & instIt)5228 void GlobalRA::expandSpillNonStackcall(
5229     uint32_t numRows, uint32_t offset, short rowOffset,
5230     G4_SrcRegRegion* header, G4_SrcRegRegion* payload, G4_BB* bb,
5231     INST_LIST_ITER& instIt)
5232 {
5233     auto& builder = kernel.fg.builder;
5234     auto inst = (*instIt);
5235 
5236     if (offset == G4_SpillIntrinsic::InvalidOffset)
5237     {
5238         // oword msg
5239         auto payloadToUse = builder->createSrcRegRegion(*payload);
5240         auto [spillMsgDesc, execSize] = SpillManagerGRF::createSpillSendMsgDescOWord(numRows);
5241         G4_INST* sendInst = nullptr;
5242         // Use bindless for XeHP_SDV+
5243         if (builder->hasScratchSurface())
5244         {
5245             G4_Imm* descImm = createMsgDesc(GRFSizeToOwords(numRows), true, true);
5246             // Update BTI to 251
5247             auto spillMsgDesc = descImm->getInt();
5248             spillMsgDesc = spillMsgDesc & 0xffffff00;
5249             spillMsgDesc |= 251;
5250 
5251             auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, (uint32_t)spillMsgDesc, numRows);
5252             G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5253 
5254             // a0 is set by saveRestoreA0()
5255             auto a0Src = builder->createSrcRegRegion(builder->getBuiltinA0Dot2(), builder->getRegionScalar());
5256             sendInst = builder->createInternalSplitSendInst(execSize, inst->getDst(),
5257                 header, payloadToUse, msgDescImm, inst->getOption(), msgDesc, a0Src);
5258         }
5259         else
5260         {
5261             G4_SendDescRaw * msgDesc =
5262                 kernel.fg.builder->createSendMsgDesc(
5263                     spillMsgDesc & 0x000FFFFFu, 0, 1, SFID::DP_DC0, numRows, 0, SendAccess::WRITE_ONLY);
5264             G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5265             G4_Imm* extDesc = builder->createImm(msgDesc->getExtendedDesc(), Type_UD);
5266             sendInst = builder->createInternalSplitSendInst(execSize,
5267                 inst->getDst(), header, payloadToUse, msgDescImm, inst->getOption(),
5268                 msgDesc, extDesc);
5269         }
5270         instIt = bb->insertBefore(instIt, sendInst);
5271     }
5272     else
5273     {
5274         while (numRows >= 1)
5275         {
5276             auto payloadToUse = builder->createSrcWithNewRegOff(payload, rowOffset);
5277 
5278             auto region = builder->getRegionStride1();
5279 
5280             uint32_t spillMsgDesc = computeSpillMsgDesc(getPayloadSizeGRF(numRows), offset);
5281             auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, spillMsgDesc, getPayloadSizeGRF(numRows));
5282             G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5283 
5284             G4_SrcRegRegion* headerOpnd = builder->createSrcRegRegion(builder->getBuiltinR0(), region);
5285             G4_Imm* extDesc = builder->createImm(msgDesc->getExtendedDesc(), Type_UD);
5286             G4_ExecSize execSize = numRows > 1 ? g4::SIMD16 : g4::SIMD8;
5287 
5288             auto sendInst = builder->createInternalSplitSendInst(execSize,
5289                 inst->getDst(), headerOpnd, payloadToUse, msgDescImm,
5290                 inst->getOption(), msgDesc, extDesc);
5291 
5292             std::stringstream comments;
5293             comments << "scratch space spill: " << payloadToUse->getTopDcl()->getName() << " from offset[" << offset << "x32]";
5294             sendInst->addComment(comments.str());
5295 
5296             instIt = bb->insertBefore(instIt, sendInst);
5297 
5298             numRows -= getPayloadSizeGRF(numRows);
5299             offset += getPayloadSizeGRF(numRows);
5300             rowOffset += getPayloadSizeGRF(numRows);
5301         }
5302     }
5303 }
5304 
expandSpillStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * payload,G4_BB * bb,INST_LIST_ITER & instIt)5305 void GlobalRA::expandSpillStackcall(
5306     uint32_t numRows, uint32_t offset, short rowOffset,
5307     G4_SrcRegRegion* payload, G4_BB* bb, INST_LIST_ITER& instIt)
5308 {
5309     auto& builder = kernel.fg.builder;
5310     auto inst = (*instIt);
5311 
5312     auto spillIt = instIt;
5313 
5314     // Use oword ld for stackcall. Lower intrinsic to:
5315     // (W)      add(1 | M0)         r126.2 < 1 > :ud  r125.7 < 0; 1, 0 > : ud  0x0 : ud
5316     // (W)      sends(8 | M0)         null : ud       r126              payload - src2                0x4A      0x20A02FF
5317     G4_Operand* src0 = nullptr;
5318     G4_Imm* src1 = nullptr;
5319     G4_Declare* scratchRegDcl = builder->kernel.fg.scratchRegDcl;
5320     G4_Declare* framePtr = inst->asSpillIntrinsic()->getFP();
5321 
5322     // convert hword to oword offset
5323     auto numRowsOword = numRows * 2;
5324     auto offsetOword = offset * 2;
5325     auto rowOffsetOword = rowOffset * 2;
5326 
5327     while (numRowsOword >= 1)
5328     {
5329         auto createOwordSpill = [&](unsigned int owordSize, G4_SrcRegRegion* payloadToUse)
5330         {
5331             G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
5332             G4_DstRegRegion* dst = builder->createNullDst((execSize > g4::SIMD8) ? Type_UW : Type_UD);
5333             auto sendSrc0 = builder->createSrc(scratchRegDcl->getRegVar(),
5334                 0, 0, builder->rgnpool.createRegion(8, 8, 1), Type_UD);
5335             unsigned messageLength = owordToGRFSize(owordSize);
5336             G4_Imm* descImm = createMsgDesc(owordSize, true, true);
5337             G4_INST* sendInst = nullptr;
5338             // Use bindless for XeHP_SDV+
5339             if (builder->getPlatform() >= XeHP_SDV)
5340             {
5341                 // Update BTI to 251
5342                 auto spillMsgDesc = descImm->getInt();
5343                 spillMsgDesc = spillMsgDesc & 0xffffff00;
5344                 spillMsgDesc |= 251;
5345 
5346                 auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, (uint32_t)spillMsgDesc, messageLength);
5347                 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5348 
5349                 // a0 is set by saveRestoreA0()
5350                 auto a0Src = builder->createSrcRegRegion(builder->getBuiltinA0Dot2(), builder->getRegionScalar());
5351                 sendInst = builder->createInternalSplitSendInst(execSize, inst->getDst(),
5352                     sendSrc0, payloadToUse, msgDescImm, inst->getOption(), msgDesc, a0Src);
5353             }
5354             else
5355             {
5356                 auto msgDesc = builder->createWriteMsgDesc(SFID::DP_DC0, (uint32_t)descImm->getInt(), messageLength);
5357                 G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5358                 G4_Imm* extDesc = builder->createImm(msgDesc->getExtendedDesc(), Type_UD);
5359                 sendInst = builder->createInternalSplitSendInst(execSize, dst, sendSrc0, payloadToUse,
5360                     msgDescImm, inst->getOption() | InstOpt_WriteEnable, msgDesc, extDesc);
5361             }
5362             return sendInst;
5363         };
5364 
5365         auto payloadSizeInOwords = getPayloadSizeOword(numRowsOword);
5366 
5367         auto payloadToUse = builder->createSrcWithNewRegOff(payload, rowOffsetOword / 2);
5368 
5369         G4_DstRegRegion* dst = builder->createDst(scratchRegDcl->getRegVar(), 0, 2, 1, Type_UD);
5370 
5371         G4_INST* hdrSetInst = nullptr;
5372         if (inst->asSpillIntrinsic()->isOffsetValid())
5373         {
5374             // Skip header if spill module emits its own header
5375             if (framePtr)
5376             {
5377                 src0 = builder->createSrc(framePtr->getRegVar(), 0, 0, builder->getRegionScalar(), Type_UD);
5378                 src1 = builder->createImm(offsetOword, Type_UD);
5379                 hdrSetInst = builder->createBinOp(G4_add, g4::SIMD1, dst, src0, src1, InstOpt_WriteEnable, false);
5380             }
5381             else
5382             {
5383                 src0 = builder->createImm(offsetOword, Type_UD);
5384                 hdrSetInst = builder->createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, false);
5385             }
5386 
5387             bb->insertBefore(spillIt, hdrSetInst);
5388         }
5389 
5390         auto spillSends = createOwordSpill(payloadSizeInOwords, payloadToUse);
5391         std::stringstream comments;
5392         comments <<  "stack spill: " << payload->getTopDcl()->getName() << " to FP[" << inst->asSpillIntrinsic()->getOffset() << "x32]";
5393         spillSends->addComment(comments.str());
5394 
5395         bb->insertBefore(spillIt, spillSends);
5396 
5397         if (getEUFusionWAInsts().count(inst) > 0)
5398         {
5399             removeEUFusionWAInst(inst);
5400             addEUFusionWAInsts(spillSends);
5401             if (hdrSetInst)
5402                 addEUFusionWAInsts(hdrSetInst);
5403         }
5404 
5405         if (kernel.getOption(vISA_GenerateDebugInfo))
5406         {
5407             kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asSpillIntrinsic(), hdrSetInst);
5408             kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asSpillIntrinsic(), spillSends);
5409         }
5410 
5411         numRowsOword -= payloadSizeInOwords;
5412         offsetOword += payloadSizeInOwords;
5413         rowOffsetOword += payloadSizeInOwords;
5414     }
5415 }
5416 
5417 // Non-stack call:
5418 //  sends <-- scratch - default, supported
5419 //  send  <-- scratch - disable split send using compiler option, not supported by intrinsic
5420 //  send  <-- non-scratch - used when scratch space usage is very high, supported
5421 
5422 //  Stack call :
5423 //  sends <-- non-scratch - default spill, supported
5424 //  send  <-- non-scratch - default fill, supported
expandSpillIntrinsic(G4_BB * bb)5425 void GlobalRA::expandSpillIntrinsic(G4_BB* bb)
5426 {
5427     // spill (1) null:ud   bitmask:ud   r0:ud   payload:ud
5428     for (auto instIt = bb->begin(); instIt != bb->end();)
5429     {
5430         auto inst = (*instIt);
5431         if (inst->isSpillIntrinsic())
5432         {
5433             bool isOffBP = inst->asSpillIntrinsic()->isOffBP();
5434             uint32_t numRows = inst->asSpillIntrinsic()->getNumRows();
5435             uint32_t offset = inst->asSpillIntrinsic()->getOffset() *
5436                 (numEltPerGRF<Type_UB>() / HWORD_BYTE_SIZE);
5437             auto header = inst->getSrc(0)->asSrcRegRegion();
5438             auto payload = inst->getSrc(1)->asSrcRegRegion();
5439             auto spillIt = instIt;
5440 
5441             auto rowOffset = payload->getRegOff();
5442             if (useLscForNonStackCallSpillFill || spillFillIntrinUsesLSC(inst)) {
5443                 expandSpillLSC(bb, instIt);
5444             }
5445             else
5446             {
5447                 if (!isOffBP)
5448                 {
5449                     expandSpillNonStackcall(numRows, offset, rowOffset, header, payload, bb, instIt);
5450                 }
5451                 else
5452                 {
5453                     expandSpillStackcall(numRows, offset, rowOffset, payload, bb, instIt);
5454                 }
5455             }
5456             numGRFSpill++;
5457             instIt = bb->erase(spillIt);
5458             continue;
5459         }
5460         instIt++;
5461     }
5462 }
5463 
expandFillNonStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * header,G4_DstRegRegion * resultRgn,G4_BB * bb,INST_LIST_ITER & instIt)5464  void GlobalRA::expandFillNonStackcall(uint32_t numRows, uint32_t offset, short rowOffset, G4_SrcRegRegion* header, G4_DstRegRegion* resultRgn, G4_BB* bb, INST_LIST_ITER& instIt)
5465  {
5466      auto& builder = kernel.fg.builder;
5467      auto inst = (*instIt);
5468 
5469      if (offset == G4_FillIntrinsic::InvalidOffset)
5470      {
5471          // oword msg
5472          G4_ExecSize execSize = g4::SIMD16;
5473          auto numRowsOword = GRFSizeToOwords(numRows);
5474          auto fillDst = builder->createDst(resultRgn->getBase(), rowOffset,
5475              0, resultRgn->getHorzStride(), resultRgn->getType());
5476          auto sendSrc0 = builder->createSrc(header->getBase(),
5477              0, 0, builder->rgnpool.createRegion(8, 8, 1), Type_UD);
5478          G4_Imm* desc = createMsgDesc(numRowsOword, false, false);
5479          G4_INST* sendInst = nullptr;
5480          auto sfId = SFID::DP_DC0;
5481 
5482          // Use bindless for XeHP_SDV+
5483          if (builder->hasScratchSurface())
5484          {
5485              // Update BTI to 251
5486              auto newDesc = desc->getInt() & 0xffffff00;
5487              newDesc |= 251;
5488              desc = builder->createImm(newDesc, Type_UD);
5489 
5490              auto msgDesc = builder->createReadMsgDesc(sfId, (uint32_t)desc->getInt());
5491              G4_Operand* msgDescOpnd = builder->createImm(msgDesc->getDesc(), Type_UD);
5492 
5493              // a0 is set by saveRestoreA0()
5494              auto src1 = builder->createSrc(builder->getBuiltinA0Dot2()->getRegVar(), 0, 0,
5495                  builder->getRegionScalar(), Type_UD);
5496 
5497              sendInst = builder->createInternalSplitSendInst(execSize, fillDst, sendSrc0,
5498                  nullptr, msgDescOpnd, InstOpt_WriteEnable, msgDesc, src1);
5499          }
5500          else
5501          {
5502              auto msgDesc = builder->createReadMsgDesc(sfId, (uint32_t)desc->getInt());
5503              G4_Operand* msgDescOpnd = builder->createImm(msgDesc->getDesc(), Type_UD);
5504              sendInst = builder->createInternalSendInst(nullptr, G4_send, execSize, fillDst, sendSrc0, msgDescOpnd,
5505                  InstOpt_WriteEnable, msgDesc);
5506          }
5507          instIt = bb->insertBefore(instIt, sendInst);
5508      }
5509      else
5510      {
5511          while (numRows >= 1)
5512          {
5513              auto fillDst = builder->createDst(resultRgn->getBase(), rowOffset,
5514                  0, resultRgn->getHorzStride(), resultRgn->getType());
5515 
5516              auto region = builder->getRegionStride1();
5517              G4_SrcRegRegion* headerOpnd = builder->createSrcRegRegion(builder->getBuiltinR0(), region);
5518 
5519              uint32_t fillMsgDesc = computeFillMsgDesc(getPayloadSizeGRF(numRows), offset);
5520 
5521              G4_SendDescRaw* msgDesc = kernel.fg.builder->createSendMsgDesc(fillMsgDesc,
5522                  getPayloadSizeGRF(numRows), 1, SFID::DP_DC0, 0, 0, SendAccess::READ_ONLY);
5523 
5524              G4_Imm* msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5525 
5526              auto sendInst = builder->createInternalSendInst(nullptr,
5527                  G4_send, g4::SIMD16, fillDst, headerOpnd, msgDescImm, inst->getOption(),
5528                  msgDesc);
5529 
5530              std::stringstream comments;
5531              comments << "scratch space fill: " << inst->getDst()->getTopDcl()->getName() << " from offset[" << offset << "x32]";
5532              sendInst->addComment(comments.str());
5533 
5534              instIt = bb->insertBefore(instIt, sendInst);
5535 
5536              numRows -= getPayloadSizeGRF(numRows);
5537              offset += getPayloadSizeGRF(numRows);
5538              rowOffset += getPayloadSizeGRF(numRows);
5539          }
5540      }
5541  }
5542 
expandFillStackcall(uint32_t numRows,uint32_t offset,short rowOffset,G4_SrcRegRegion * header,G4_DstRegRegion * resultRgn,G4_BB * bb,INST_LIST_ITER & instIt)5543 void GlobalRA::expandFillStackcall(uint32_t numRows, uint32_t offset, short rowOffset, G4_SrcRegRegion* header, G4_DstRegRegion* resultRgn, G4_BB* bb, INST_LIST_ITER& instIt)
5544 {
5545     auto& builder = kernel.fg.builder;
5546     auto inst = (*instIt);
5547     auto fillIt = instIt;
5548 
5549     // Use oword ld for stackcall. Lower intrinsic to:
5550     // add (1) r126.2<1>:d FP<0;1,0>:d offset
5551     //  send (16) r[startReg]<1>:uw r126 0xa desc:ud
5552     G4_Operand* src0 = nullptr;
5553     G4_Imm* src1 = nullptr;
5554     G4_Declare* scratchRegDcl = builder->kernel.fg.scratchRegDcl;
5555     G4_Declare* framePtr = inst->asFillIntrinsic()->getFP();
5556 
5557     // convert hword to oword offset
5558     auto numRowsOword = numRows * 2;
5559     auto offsetOword = offset * 2;
5560     auto rowOffsetOword = rowOffset * 2;
5561 
5562     while (numRowsOword >= 1)
5563     {
5564         auto createOwordFill = [&](unsigned int owordSize, G4_DstRegRegion* fillVar)
5565         {
5566             G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
5567             auto sendSrc0 = builder->createSrc(scratchRegDcl->getRegVar(),
5568                 0, 0, builder->rgnpool.createRegion(8, 8, 1), Type_UD);
5569             G4_Imm* desc = createMsgDesc(owordSize, false, false);
5570             G4_INST* sendInst = nullptr;
5571             auto sfId = SFID::DP_DC0;
5572 
5573             // Use bindless for XeHP_SDV+
5574             if (builder->getPlatform() >= XeHP_SDV)
5575             {
5576                 // Update BTI to 251
5577                 auto newDesc = desc->getInt() & 0xffffff00;
5578                 newDesc |= 251;
5579                 desc = builder->createImm(newDesc, Type_UD);
5580 
5581                 auto msgDesc = builder->createReadMsgDesc(sfId, (uint32_t)desc->getInt());
5582                 G4_Operand* msgDescOpnd = builder->createImm(msgDesc->getDesc(), Type_UD);
5583 
5584                 // a0 is set by saveRestoreA0()
5585                 auto src1 = builder->createSrc(builder->getBuiltinA0Dot2()->getRegVar(), 0, 0,
5586                     builder->getRegionScalar(), Type_UD);
5587 
5588                 sendInst = builder->createInternalSplitSendInst(
5589                     execSize, fillVar, sendSrc0,
5590                     nullptr, msgDescOpnd, InstOpt_WriteEnable, msgDesc, src1);
5591             }
5592             else
5593             {
5594                 auto msgDesc = builder->createReadMsgDesc(SFID::DP_DC0, (uint32_t)desc->getInt());
5595                 auto msgDescImm = builder->createImm(msgDesc->getDesc(), Type_UD);
5596                 sendInst = builder->createInternalSendInst(
5597                     nullptr, G4_send, execSize, fillVar, sendSrc0, msgDescImm,
5598                     InstOpt_WriteEnable, msgDesc);
5599             }
5600             return sendInst;
5601         };
5602 
5603         auto respSizeInOwords = getPayloadSizeOword(numRowsOword);
5604         auto fillDst = builder->createDst(resultRgn->getBase(), rowOffsetOword / 2,
5605             0, resultRgn->getHorzStride(), resultRgn->getType());
5606 
5607         G4_DstRegRegion* dst = builder->createDst(scratchRegDcl->getRegVar(), 0, 2, 1, Type_UD);
5608 
5609         G4_INST* hdrSetInst = nullptr;
5610         if (inst->asFillIntrinsic()->isOffsetValid())
5611         {
5612             // Skip header if spill module emits its own header
5613             if (framePtr)
5614             {
5615                 src0 = builder->createSrc(framePtr->getRegVar(), 0, 0, builder->getRegionScalar(), Type_UD);
5616                 src1 = builder->createImm(offsetOword, Type_UD);
5617                 hdrSetInst = builder->createBinOp(G4_add, g4::SIMD1, dst, src0, src1, InstOpt_WriteEnable, false);
5618             }
5619             else
5620             {
5621                 src0 = builder->createImm(offsetOword, Type_UD);
5622                 hdrSetInst = builder->createMov(g4::SIMD1, dst, src0, InstOpt_WriteEnable, false);
5623             }
5624 
5625             bb->insertBefore(fillIt, hdrSetInst);
5626         }
5627 
5628         auto fillSends = createOwordFill(respSizeInOwords, fillDst);
5629 
5630         if (getEUFusionWAInsts().count(inst) > 0)
5631         {
5632             removeEUFusionWAInst(inst);
5633             addEUFusionWAInsts(fillSends);
5634             if (hdrSetInst)
5635                 addEUFusionWAInsts(hdrSetInst);
5636         }
5637 
5638         std::stringstream comments;
5639         comments << "stack fill: " << resultRgn->getTopDcl()->getName() << " from FP[" << inst->asFillIntrinsic()->getOffset() << "x32]";
5640         fillSends->addComment(comments.str());
5641 
5642         bb->insertBefore(fillIt, fillSends);
5643 
5644         if (kernel.getOption(vISA_GenerateDebugInfo))
5645         {
5646             kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asFillIntrinsic(), hdrSetInst);
5647             kernel.getKernelDebugInfo()->updateExpandedIntrinsic(inst->asFillIntrinsic(), fillSends);
5648         }
5649 
5650         numRowsOword -= respSizeInOwords;
5651         offsetOword += respSizeInOwords;
5652         rowOffsetOword += respSizeInOwords;
5653     }
5654 }
5655 
spillFillIntrinUsesLSC(G4_INST * spillFillIntrin)5656 bool GlobalRA::spillFillIntrinUsesLSC(G4_INST* spillFillIntrin)
5657 {
5658     G4_Declare* headerDcl = nullptr;
5659     if (!spillFillIntrin)
5660         return false;
5661 
5662     if (spillFillIntrin->isFillIntrinsic())
5663         headerDcl = spillFillIntrin->asFillIntrinsic()->getHeader()->getTopDcl();
5664     else if (spillFillIntrin->isSpillIntrinsic())
5665         headerDcl = spillFillIntrin->asSpillIntrinsic()->getHeader()->getTopDcl();
5666 
5667     if (useLscForSpillFill && headerDcl != builder.getBuiltinR0()->getRootDeclare())
5668     {
5669         return true;
5670     }
5671     return false;
5672 }
5673 
expandFillIntrinsic(G4_BB * bb)5674 void GlobalRA::expandFillIntrinsic(G4_BB* bb)
5675 {
5676     // fill (1) fill_var:ud     bitmask:ud     offset:ud
5677     for (auto instIt = bb->begin(); instIt != bb->end();)
5678     {
5679         auto inst = (*instIt);
5680         if (inst->isFillIntrinsic())
5681         {
5682             bool isOffBP = inst->asFillIntrinsic()->isOffBP();
5683             uint32_t numRows = inst->asFillIntrinsic()->getNumRows();
5684             uint32_t offset = inst->asFillIntrinsic()->getOffset() *
5685                 (numEltPerGRF<Type_UB>() / HWORD_BYTE_SIZE);
5686             auto header = inst->getSrc(0)->asSrcRegRegion();
5687             auto resultRgn = inst->getDst();
5688             auto fillIt = instIt;
5689 
5690             auto rowOffset = resultRgn->getRegOff();
5691             if (useLscForNonStackCallSpillFill || spillFillIntrinUsesLSC(inst)) {
5692                 expandFillLSC(bb, instIt);
5693             }
5694             else
5695             {
5696                 if (!isOffBP)
5697                 {
5698                     expandFillNonStackcall(numRows, offset, rowOffset, header, resultRgn, bb, instIt);
5699                 }
5700                 else
5701                 {
5702                     expandFillStackcall(numRows, offset, rowOffset, header, resultRgn, bb, instIt);
5703                 }
5704             }
5705             numGRFFill++;
5706             instIt = bb->erase(fillIt);
5707             continue;
5708         }
5709         instIt++;
5710     }
5711 }
5712 
5713 
expandSpillFillIntrinsics(unsigned int spillSizeInBytes)5714 void GlobalRA::expandSpillFillIntrinsics(unsigned int spillSizeInBytes)
5715 {
5716     auto globalScratchOffset = kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
5717 
5718     for (auto bb : kernel.fg)
5719     {
5720         if (builder.hasScratchSurface() &&
5721             (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc() || kernel.fg.builder->hasValidOldA0Dot2()
5722                 || (useLscForSpillFill && (spillSizeInBytes + globalScratchOffset) > SCRATCH_MSG_LIMIT &&
5723                     spillSizeInBytes > 0)
5724                 || (useLscForNonStackCallSpillFill && spillSizeInBytes > 0)
5725             // Following cases exist:
5726             // a. XeHP_SDV without stackcall => use hword scratch msg
5727             // b. XeHP_SDV without stackcall => using oword block msg
5728             // c. XeHP_SDV with stackcall
5729             // d. DG2+ without stackcall => hword scratch msg
5730             // e. DG2+ without stackcall => using LSC
5731             // f. DG2+ with stackcall    => using LSC
5732             //
5733             // (a), (d) are similar to SKL with hword scratch msg.
5734             //
5735             // (c), (f):
5736             // a0.2 is saved/restored from r126.6:ud
5737             // SSO is saved in r126.7:ud (in replaceSSO function)
5738             // XeHP_SDV uses oword msg, DG2+ uses LSC msg
5739             // For DG2+, offset is computed in r126.0
5740             //
5741             // (b):
5742             // oword header is prepared in a temp variable, allocated by RA
5743             // a0.2 is saved/restored in oldA0Dot2(0,0) whenever required
5744             // SSO is allocated to a live-out temp (not tied to r126.7:ud)
5745             //
5746             // (e):
5747             // LSC msg is used for spill/fill
5748             // Spill offset is computed in spillHeader(0,0)
5749             // a0.2 is saved/restored in oldA0Dot2(0,0) whenever required
5750             // spillHeader is marked as live-out
5751             //
5752             // When needed:
5753             // SSO is marked as live-out
5754             // r0 is stored in r127
5755             //
5756         ))
5757         {
5758             saveRestoreA0(bb);
5759         }
5760         expandSpillIntrinsic(bb);
5761         expandFillIntrinsic(bb);
5762     }
5763     kernel.fg.builder->getcompilerStats().SetI64(CompilerStats::numGRFSpillStr(), numGRFSpill, kernel.getSimdSize());
5764     kernel.fg.builder->getcompilerStats().SetI64(CompilerStats::numGRFFillStr(), numGRFFill, kernel.getSimdSize());
5765 
5766 }
5767 
~SpillAnalysis()5768 SpillAnalysis::~SpillAnalysis()
5769 {
5770     if (Refs)
5771     {
5772         delete Refs;
5773         Refs = nullptr;
5774     }
5775 }
5776 
Dump(std::ostream & OS)5777 void SpillAnalysis::Dump(std::ostream& OS)
5778 {
5779     auto& GRA = GC->getGRA();
5780     auto& Kernel = GRA.kernel;
5781     auto& Loops = Kernel.fg.getLoops();
5782     const auto& Spills = GC->getSpilledLiveRanges();
5783     std::unordered_map<G4_INST*, G4_BB*> InstBBMap;
5784 
5785     for (auto* BB : Kernel.fg.getBBList())
5786         for (auto* Inst : BB->getInstList())
5787             InstBBMap[Inst] = BB;
5788 
5789     OS << "Name, Dcl Byte Size, Spill Cost, Degree, #Defs, #Uses, Distance, #BBs, All BBs Where Live" << std::endl;
5790 
5791     for (auto* Spill : Spills)
5792     {
5793         // dump - {Dcl size, Spill cost, Live BBs (loop annotation)}
5794         auto Dcl = Spill->getDcl();
5795         auto DclSizeBytes = Dcl->getByteSize();
5796         auto SpillCost = Spill->getSpillCost();
5797         auto Degree = DclDegree[Dcl];
5798         auto LiveBBs = GetLiveBBs(Dcl, InstBBMap);
5799         auto Distance = GetDistance(Dcl);
5800         auto NumDefs = Refs->getDefCount(Dcl);
5801         auto NumUses = Refs->getUseCount(Dcl);
5802 
5803         OS << Dcl->getName() << "," << DclSizeBytes << ", " << SpillCost << ", " << Degree << ", "
5804             << NumDefs << ", " << NumUses << ", "
5805             << Distance << ", " << LiveBBs.size() << ", ";
5806 
5807         for (auto* LiveBB : LiveBBs)
5808         {
5809             OS << "BB" << LiveBB->getId();
5810             auto* ClosestLoop = Loops.getInnerMostLoop(LiveBB);
5811             if (ClosestLoop)
5812             {
5813                 OS << " [L" << ClosestLoop->id << "]";
5814             }
5815             OS << " ";
5816         }
5817 
5818         OS << std::endl;
5819     }
5820 }
5821 
GetDistance(G4_Declare * Dcl)5822 unsigned int SpillAnalysis::GetDistance(G4_Declare* Dcl)
5823 {
5824     if (AugIntervals.count(Dcl) == 0)
5825     {
5826         // Construct distance in conventional way
5827         auto& Kernel = GC->getGRA().kernel;
5828         unsigned int Start = 0xffffffff, End = 0x0;
5829 
5830         auto* Defs = Refs->getDefs(Dcl);
5831         auto* Uses = Refs->getUses(Dcl);
5832 
5833         for (auto& Def : *Defs)
5834         {
5835             auto* DefInst = std::get<0>(Def);
5836             Start = std::min(Start, DefInst->getLexicalId());
5837             End = std::max(End, DefInst->getLexicalId());
5838         }
5839 
5840         for (auto& Use : *Uses)
5841         {
5842             auto* UseInst = std::get<0>(Use);
5843             Start = std::min(Start, UseInst->getLexicalId());
5844             End = std::max(End, UseInst->getLexicalId());
5845         }
5846 
5847         for (auto* BB : Kernel.fg.getBBList())
5848         {
5849             if (LA->isLiveAtEntry(BB, Dcl->getRegVar()->getId()))
5850                 Start = std::min(Start, BB->front()->getLexicalId());
5851             if (LA->isLiveAtExit(BB, Dcl->getRegVar()->getId()))
5852                 End = std::max(End, BB->back()->getLexicalId());
5853         }
5854 
5855         return End - Start;
5856     }
5857 
5858     // Return augmentation distance when available
5859     auto Distance = AugIntervals[Dcl];
5860     return Distance.second->getLexicalId() - Distance.first->getLexicalId();
5861 }
5862 
LoadAugIntervals(DECLARE_LIST & SortedIntervals,GlobalRA & GRA)5863 void SpillAnalysis::LoadAugIntervals(DECLARE_LIST& SortedIntervals, GlobalRA& GRA)
5864 {
5865     for (auto& LR : SortedIntervals)
5866     {
5867         auto* Start = GRA.getStartInterval(LR);
5868         auto* End = GRA.getEndInterval(LR);
5869         AugIntervals[LR] = std::make_pair(Start, End);
5870     }
5871 }
5872 
LoadDegree(G4_Declare * Dcl,unsigned int degree)5873 void SpillAnalysis::LoadDegree(G4_Declare* Dcl, unsigned int degree)
5874 {
5875     // This should be called after degree computation and before simplification.
5876     DclDegree[Dcl] = degree;
5877 }
5878 
Clear()5879 void SpillAnalysis::Clear()
5880 {
5881     if(Refs)
5882         delete Refs;
5883 
5884     Refs = nullptr;
5885     LA = nullptr;
5886     GC = nullptr;
5887     SM = nullptr;
5888     AugIntervals.clear();
5889     DclDegree.clear();
5890 
5891 }
5892 
DumpHistogram(std::ostream & OS)5893 void SpillAnalysis::DumpHistogram(std::ostream& OS)
5894 {
5895     // Compute and dump histogram
5896     std::map<unsigned int, unsigned int> SpillSizeHistogram;
5897     for (auto Spill : GC->getSpilledLiveRanges())
5898     {
5899         auto ByteSize = Spill->getDcl()->getByteSize();
5900         SpillSizeHistogram[ByteSize] += 1;
5901     }
5902 
5903     OS << "Spill Size Histogram For Iter#" << GC->getGRA().getIterNo() << " : " << std::endl;
5904     for (auto& Item : SpillSizeHistogram)
5905     {
5906         OS << "# vars of " << Item.first << " bytes spilled: " << Item.second << std::endl;
5907     }
5908 
5909     OS << std::endl;
5910 }
5911 
Do(LivenessAnalysis * L,GraphColor * C,SpillManagerGRF * S)5912 void SpillAnalysis::Do(LivenessAnalysis* L, GraphColor* C, SpillManagerGRF* S)
5913 {
5914     SetLivenessAnalysis(L);
5915     SetGraphColor(C);
5916     SetSpillManager(S);
5917 
5918     unsigned int LexId = 0;
5919     for (auto* BB : C->getGRA().kernel.fg.getBBList())
5920         for (auto* Inst : BB->getInstList())
5921             Inst->setLexicalId(LexId++);
5922 
5923     Refs = new VarReferences(C->getGRA().kernel);
5924 
5925     auto IterNo = C->getGRA().getIterNo();
5926 
5927     std::string FN = "spill-iter-";
5928     FN += std::to_string(IterNo);
5929     FN += std::string(".csv");
5930     std::ofstream OF;
5931     OF.open(FN, std::ofstream::out);
5932     Dump(OF);
5933     OF.close();
5934 
5935     FN = "misc-data";
5936     OF.open(FN, IterNo == 0 ? std::ofstream::out : std::ofstream::app);
5937     if (IterNo == 0)
5938     {
5939         ((vISA::Analysis*)&C->getGRA().kernel.fg.getLoops())->dump(OF);
5940     }
5941     DumpHistogram(OF);
5942     OF.close();
5943 }
5944 
GetLiveBBs(G4_Declare * Dcl,std::unordered_map<G4_INST *,G4_BB * > & InstBBMap)5945 std::vector<G4_BB*> SpillAnalysis::GetLiveBBs(G4_Declare* Dcl, std::unordered_map<G4_INST*, G4_BB*>& InstBBMap)
5946 {
5947     // Return all BBs over which Dcl is live. This includes augmentation data.
5948     auto Order = [](const G4_BB* First, const G4_BB* Second)
5949     {
5950         return First->getId() < Second->getId();
5951     };
5952     std::set<G4_BB*, decltype(Order)> BBs(Order);
5953     auto& Kernel = GC->getGRA().kernel;
5954 
5955     VarReferences VarRefs(Kernel);
5956     auto* Defs = VarRefs.getDefs(Dcl);
5957     auto* Uses = VarRefs.getUses(Dcl);
5958 
5959     for (auto Def : *Defs)
5960     {
5961         auto* BB = std::get<1>(Def);
5962         BBs.insert(BB);
5963     }
5964 
5965     for (auto Use : *Uses)
5966     {
5967         auto* BB = std::get<1>(Use);
5968         BBs.insert(BB);
5969     }
5970 
5971     for (auto BB : Kernel.fg.getBBList())
5972     {
5973         if (LA->isLiveAtEntry(BB, Dcl->getRegVar()->getId()) ||
5974             LA->isLiveAtExit(BB, Dcl->getRegVar()->getId()))
5975         {
5976             BBs.insert(BB);
5977         }
5978     }
5979 
5980     if (AugIntervals.count(Dcl))
5981     {
5982         auto& Interval = AugIntervals[Dcl];
5983         auto AugBBs = GetIntervalBBs(Interval.first, Interval.second, InstBBMap);
5984         std::for_each(AugBBs.begin(), AugBBs.end(), [&](G4_BB* BB) {BBs.insert(BB); });
5985     }
5986 
5987     std::vector<G4_BB*> RetBBs;
5988     std::for_each(BBs.begin(), BBs.end(), [&](G4_BB* BB) {RetBBs.push_back(BB); });
5989 
5990     return RetBBs;
5991 }
5992 
GetIntervalBBs(G4_INST * Start,G4_INST * End,std::unordered_map<G4_INST *,G4_BB * > & InstBBMap)5993 std::vector<G4_BB*> vISA::SpillAnalysis::GetIntervalBBs(G4_INST* Start, G4_INST* End, std::unordered_map<G4_INST*, G4_BB*>& InstBBMap)
5994 {
5995     // Return vector of BBs given Start/End G4_INST*s
5996     std::vector<G4_BB*> BBs;
5997     auto& Kernel = GC->getGRA().kernel;
5998     bool Started = false;
5999     for (auto* BB : Kernel.fg.getBBList())
6000     {
6001         bool BBAdded = false;
6002         for (auto* Inst : BB->getInstList())
6003         {
6004             if (Inst == Start)
6005                 Started = true;
6006 
6007             if (Started && !BBAdded)
6008             {
6009                 BBs.push_back(BB);
6010                 BBAdded = true;
6011             }
6012 
6013             if (Inst == End)
6014                 return BBs;
6015         }
6016     }
6017 
6018     return BBs;
6019 }
6020