1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "FlowGraph.h"
10 #include "GraphColor.h"
11 #include <list>
12 #include "SpillCleanup.h"
13 
// Helpers (defined elsewhere) that encode the scratch-block read/write
// send message descriptor for a fill/spill of `payloadSize` GRFs at `offset`.
uint32_t computeFillMsgDesc(unsigned int payloadSize, unsigned int offset);
uint32_t computeSpillMsgDesc(unsigned int payloadSize, unsigned int offset);
16 
17 namespace vISA
18 {
G4_SrcRegRegion* CoalesceSpillFills::generateCoalescedSpill(G4_SrcRegRegion* header, unsigned int scratchOffset, unsigned int payloadSize,
    bool useNoMask, G4_InstOption mask, G4_Declare* spillDcl, unsigned int row)
{
    // Create a single scratch-write (spill) intrinsic covering payloadSize
    // GRF rows starting at scratchOffset. The payload is read from spillDcl
    // beginning at GRF row `row`. Returns the payload src region; the caller
    // reaches the new instruction through getInst() on it.
    auto spillSrcPayload = kernel.fg.builder->createSrc(spillDcl->getRegVar(),
        row, 0, kernel.fg.builder->getRegionStride1(), Type_UD);

    // Create send instruction with payloadSize starting at scratch offset min.
    // Stack-call kernels/functions address scratch relative to the frame
    // pointer; otherwise fp stays null and the scratch offset is absolute.
    G4_Declare* fp = nullptr;
    if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
        fp = kernel.fg.getFramePtrDcl();
    unsigned int option = useNoMask ? InstOpt_WriteEnable : 0;
    auto spillInst = kernel.fg.builder->createSpill(
        kernel.fg.builder->createNullDst(Type_UW), header, spillSrcPayload, g4::SIMD16, payloadSize,
        GlobalRA::GRFToHwordSize(scratchOffset), fp, static_cast<G4_InstOption>(option), false);

    if (!useNoMask)
    {
        // Care about mask only for non-NoMask sends
        spillInst->setMaskOption(mask);
    }

    return spillSrcPayload;
}
43 
G4_INST* CoalesceSpillFills::generateCoalescedFill(G4_SrcRegRegion* header, unsigned int scratchOffset, unsigned int payloadSize,
    unsigned int dclSize, bool evenAlignDst)
{
    // Create a single scratch-read (fill) intrinsic that loads payloadSize GRF
    // rows starting at scratchOffset into a freshly created dcl of dclSize
    // rows (dclSize may exceed payloadSize when coalescing spans variables).
    // Returns the new fill instruction.
    const char* dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
        "COAL_FILL_%d", kernel.Declares.size());
    auto fillDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
        numEltPerGRF<Type_UD>(), dclSize, Type_UD, DeclareType::CoalescedFill);

    if (evenAlignDst)
    {
        // Propagate the even-GRF alignment requirement of the original fill
        // dst so RA honors it for the coalesced variable too.
        fillDcl->setEvenAlign();
        gra.setEvenAligned(fillDcl, true);
    }
    // The coalesced fill destination must never itself be spilled.
    fillDcl->setDoNotSpill();

    auto fillDst = kernel.fg.builder->createDst(fillDcl->getRegVar(), 0,
        0, 1, Type_UW);

    // Frame-pointer-relative addressing for stack-call kernels/functions.
    G4_Declare* fp = nullptr;
    if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
        fp = kernel.fg.getFramePtrDcl();

    auto fillInst = kernel.fg.builder->createFill(header, fillDst, g4::SIMD16, payloadSize,
        GlobalRA::GRFToHwordSize(scratchOffset), fp, InstOpt_WriteEnable, false);
    return fillInst;
}
72 
copyToOldFills(G4_DstRegRegion * coalescedFillDst,std::list<std::pair<G4_DstRegRegion *,std::pair<unsigned int,unsigned int>>> indFills,INST_LIST_ITER f,G4_BB * bb,int srcCISAOff)73 void CoalesceSpillFills::copyToOldFills(
74     G4_DstRegRegion* coalescedFillDst,
75     std::list<std::pair<G4_DstRegRegion*, std::pair<unsigned int, unsigned int>>> indFills,
76     INST_LIST_ITER f, G4_BB* bb, int srcCISAOff)
77 {
78     // Copy data from coalesced fill in to older fills.
79     // This way we dont carry entire coalesced payload
80     // till last fill.
81     for (auto oldFill : indFills)
82     {
83         unsigned int numGRFs = (oldFill.first->getRightBound() - oldFill.first->getLeftBound()
84             + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>();
85         unsigned int rowOff = 0;
86         // TODO: Check for > 2 GRF dst
87         while (numGRFs > 0)
88         {
89             G4_ExecSize simdSize = g4::SIMD8;
90 
91             unsigned int off = oldFill.second.first;
92             unsigned int size = oldFill.second.second;
93 
94             static const int BYTES_PER_HEXWORD = 32;
95             unsigned int scratchOff =
96                 coalescedFillDst->getInst()->getMsgDesc()->getOffset() / BYTES_PER_HEXWORD;
97 
98             // Scratch msg offset is always equal or lower than individual fills
99             unsigned int offToUse = off - scratchOff + rowOff;
100 
101             if (size > g4::SIMD8)
102                 simdSize = g4::SIMD16;
103 
104             G4_DstRegRegion* movDst = kernel.fg.builder->createDst(
105                 oldFill.first->getBase(), rowOff, 0, 1, Type_UD);
106 
107             G4_SrcRegRegion* src = kernel.fg.builder->createSrc(
108                 coalescedFillDst->getBase(), offToUse, 0, kernel.fg.builder->getRegionStride1(), Type_UD);
109 
110             G4_INST* copy = kernel.fg.builder->createMov(
111                 simdSize, movDst, src, InstOpt_WriteEnable, false);
112 
113             bb->insertBefore(f, copy);
114 
115             numGRFs -= simdSize == 8 ? 1 : 2;
116             rowOff += simdSize == 8 ? 1 : 2;
117         }
118     }
119 }
120 
createCoalescedSpillDcl(unsigned int payloadSize)121 G4_Declare* CoalesceSpillFills::createCoalescedSpillDcl(unsigned int payloadSize)
122 {
123     // Construct spill src
124     const char* dclName = nullptr;
125     G4_Declare* spillDcl = nullptr;
126 
127     dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
128         "COAL_SPILL_%d", kernel.Declares.size());
129     spillDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
130         numEltPerGRF<Type_UD>(), payloadSize, Type_UD, DeclareType::CoalescedSpill);
131 
132     spillDcl->setDoNotSpill();
133 
134     return spillDcl;
135 }
136 
void CoalesceSpillFills::coalesceSpills(
    std::list<INST_LIST_ITER>& coalesceableSpills, unsigned int min,
    unsigned int max, bool useNoMask, G4_InstOption mask, G4_BB* bb)
{
    // Replace all spills in coalesceableSpills with a single scratch write
    // covering GRF rows [min, max]. Candidates are erased from bb and the
    // coalesced spill is inserted at the last candidate's position.
    // Generate fill with minimum size = max-min. This should be compatible with
    // payload sizes supported by hardware.
    unsigned int payloadSize = (max - min) + 1;

    auto leadInst = *coalesceableSpills.front();

    MUST_BE_TRUE(payloadSize == 1 || payloadSize == 2 || payloadSize == 4 || payloadSize == 8,
        "Unsupported payload size");

    // Collect the distinct payload variables and the lowest GRF row spilled,
    // to decide whether a new combined payload variable is needed.
    std::set<G4_Declare*> declares;
    unsigned int minRow = UINT_MAX;
    for (auto d : coalesceableSpills)
    {
        auto src1Opnd = (*d)->getSrc(1)->asSrcRegRegion();
        auto curRow = src1Opnd->getLeftBound() / numEltPerGRF<Type_UB>();
        declares.insert(src1Opnd->getTopDcl());
        minRow = minRow > curRow ? curRow : minRow;
    }

    G4_Declare* dcl = nullptr;
    if (declares.size() == 1)
    {
        // All candidates spill the same variable: reuse it as the payload.
        dcl = (*declares.begin());
    }
    else
    {
        // Multiple variables: create a combined payload dcl; old variables
        // are redirected into it via replaceMap below.
        dcl = createCoalescedSpillDcl(payloadSize);
        minRow = 0;
    }

    auto coalescedSpillSrc = generateCoalescedSpill(kernel.fg.builder->duplicateOperand(leadInst->asSpillIntrinsic()->getHeader()),
        min, payloadSize, useNoMask, mask, dcl, minRow);
    coalescedSpillSrc->getInst()->inheritDIFrom(leadInst);

    if (declares.size() != 1)
    {
        // Record old payload dcl -> (coalesced dcl, GRF row offset) so later
        // defs/uses of the old variables can be rewritten.
        for (auto c : coalesceableSpills)
        {
            unsigned int scratchOffset, scratchSize;
            getScratchMsgInfo((*c), scratchOffset, scratchSize);

            unsigned int rowOff = scratchOffset - min;
            replaceMap.insert(std::make_pair((*c)->getSrc(1)->getTopDcl(),
                std::make_pair(coalescedSpillSrc->getTopDcl(), rowOff)));
        }
    }

    // Insertion point: immediately after the last coalesced spill. Computed
    // before erasing so the iterator stays valid.
    auto f = coalesceableSpills.back();
    f++;

    for (auto spill : coalesceableSpills)
    {
        bb->erase(spill);
    }
    coalesceableSpills.clear();
    bb->insertBefore(f, coalescedSpillSrc->getInst());
}
198 
void CoalesceSpillFills::coalesceFills(std::list<INST_LIST_ITER>& coalesceableFills, unsigned int min,
    unsigned int max, G4_BB* bb, int srcCISAOff)
{
    // Replace all fills in coalesceableFills with a single scratch read
    // covering GRF rows [min, max], rounded up to a legal payload size.
    // Candidates are erased from bb; the coalesced fill is inserted right
    // after the first candidate's position.
    // Generate fill with minimum size = max-min. This should be compatible with
    // payload sizes supported by hardware (1, 2, 4, or 8 GRF rows).
    unsigned int payloadSize = (max - min) + 1;
    if (payloadSize == 3)
        payloadSize = 4;
    else if (payloadSize > 4)
        payloadSize = 8;
    else if (payloadSize == 0)
        payloadSize = 1;

    MUST_BE_TRUE(payloadSize == 1 || payloadSize == 2 || payloadSize == 4 || payloadSize == 8,
        "Unsupported payload size");

    // dclSize could be larger than payload size when
    // 2 variables across scratch writes are coalesced.
    unsigned int dclSize = payloadSize;
    for (auto c : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        auto fill = (*c);
        getScratchMsgInfo((*c), scratchOffset, scratchSize);

        // Highest GRF row (relative to min) that the filled variable extends
        // to; the coalesced dcl must be at least that large.
        auto fillDst = fill->getDst();
        auto fillDstRegOff = fillDst->getRegOff();
        unsigned int dstDclRows = fillDst->getTopDcl()->getNumRows();
        unsigned int maxRow = dstDclRows + scratchOffset - fillDstRegOff - min;

        if (maxRow > dclSize)
            dclSize = maxRow;
    }

    auto leadInst = *coalesceableFills.front();

    auto newFill = generateCoalescedFill(kernel.fg.builder->duplicateOperand(leadInst->asFillIntrinsic()->getHeader()),
        min, payloadSize, dclSize, gra.isEvenAligned(leadInst->getDst()->getTopDcl()));
    newFill->inheritDIFrom(leadInst);

    // Record old fill dcl -> (coalesced dcl, GRF row offset) so later uses
    // of the old variables can be rewritten.
    for (auto c : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo((*c), scratchOffset, scratchSize);

        unsigned int rowOff = scratchOffset - min;
        replaceMap.insert(std::make_pair((*c)->getDst()->getTopDcl(),
            std::make_pair(newFill->getDst()->getTopDcl(), rowOff)));
    }

    // Insertion point: just after the first candidate. If that very
    // instruction is erased below, bump f past it to keep it valid.
    auto f = coalesceableFills.front();
    f++;

    for (auto fill : coalesceableFills)
    {
        if (fill == f)
        {
            f++;
        }
        bb->erase(fill);
    }

    coalesceableFills.clear();
    bb->insertBefore(f, newFill);

    //    copyToOldFills(coalescedFillDst, indFills, f, bb, srcCISAOff);
}
266 
// Return true if heuristic agrees to coalescing.
bool CoalesceSpillFills::fillHeuristic(std::list<INST_LIST_ITER>& coalesceableFills,
    std::list<INST_LIST_ITER>& instList, const std::list<INST_LIST_ITER>& origInstList,
    unsigned int& min, unsigned int& max)
{
#if 0
    std::bitset<8> bits(0);
    MUST_BE_TRUE(cMaxFillPayloadSize == 8, "Handle other max fill payload size");
#else
    std::bitset<4> bits(0);
    MUST_BE_TRUE(cMaxFillPayloadSize == 4, "Handle other max fill payload size");
#endif

    // Nothing to coalesce with fewer than two candidates.
    if (coalesceableFills.size() <= 1)
    {
        return false;
    }

    // Compute [min, max] scratch GRF range over all candidates and reject
    // combinations the heuristic considers unprofitable or illegal.
    min = 0xffffffff, max = 0;
    G4_Declare* header = (*coalesceableFills.front())->asFillIntrinsic()->getHeader()->getTopDcl();
    for (auto f : coalesceableFills)
    {
        // All candidates must share the same header unless FP-relative.
        if ((*f)->asFillIntrinsic()->getHeader()->getTopDcl() != header &&
            !(*f)->asFillIntrinsic()->getFP())
            return false;

        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*f, scratchOffset, scratchSize);

        // An 8-row fill is already at max payload; don't coalesce it.
        if (scratchSize == 8)
        {
            return false;
        }

        // Address-taken spill/fill variables are tied to indirect operands
        // whose offsets are patched after RA; leave them alone.
        if (addrTakenSpillFillDcl.find((*f)->getDst()->getTopDcl()) !=
            addrTakenSpillFillDcl.end())
        {
            return false;
        }

        // NOTE(review): bits are indexed relative to each fill's own offset
        // (i - scratchOffset), i.e. bits 0..scratchSize-1 are set for every
        // candidate, whereas the disabled block below uses (i - min).
        // Confirm which indexing the 1010/1001 pattern checks expect.
        for (auto i = scratchOffset; i < (scratchOffset + scratchSize); i++)
            bits.set(i - scratchOffset);

        if (min > scratchOffset)
            min = scratchOffset;

        if (max < (scratchOffset + scratchSize - 1))
            max = (scratchOffset + scratchSize - 1);
    }

    // Iterate over coalescable fills and ensure all rows of a variable
    // are fill candidates. If not, then dont fill. This helps cases like,
    // #1 FILL_V10(0,0) <-- load 0x10 ... (4 GRFs)
    // #2 FILL_V10(4,0) <-- load 0x14 ... (1 GRF)
    // #3 send ... FILL_V10(0,0)   ... (use 3 GRFs of FILL_V10)
    // #4 FILL_V11(0,0) <-- load 0x15 ... (1 GRF)
    //
    // Loads at #2 and #4 can be coalesced. But this requires a new coalesced
    // variable of size = 6 GRFs. This size can quickly increase for Cm where
    // payloads of 8 GRF are also present. So instead of allowing these cases
    // at the risk of spilling more, we require that all rows of fill range
    // are candidates in coalesceableFills list. So when only #2 and #4 are
    // fill candidates, we will not coalesce them. This makes us miss some
    // cases where subset rows of fills have been converted to movs.
    const int maxDclSize = 128;

    // Per-variable bitmap of which GRF rows are covered by some candidate.
    std::map<G4_Declare*, std::bitset<maxDclSize>> allRows;
    for (auto c : coalesceableFills)
    {
        auto topdcl = (*c)->getDst()->getTopDcl();

        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*c, scratchOffset, scratchSize);

        auto it = allRows.find(topdcl);
        if (it == allRows.end())
        {
            allRows.insert(std::make_pair(topdcl, std::bitset<maxDclSize>()));
            it = allRows.find(topdcl);
        }

        // Now mark bits corresponding to rows
        unsigned int regOff = (*c)->getDst()->getRegOff();
        for (unsigned int r = regOff;
            r < (regOff+ scratchSize); r++)
        {
            it->second.set(r);
        }
    }

    // Check whether all dcls in map have all rows filled
    for (auto&& r : allRows)
    {
        unsigned int numRows = r.first->getNumRows();

        for (unsigned int i = 0; i < numRows; i++)
        {
            if (r.second.test(i) == false)
            {
                // Found a row of variable that isnt captured in
                // list of candidate fills.
                return false;
            }
        }
    }

#if 0
    for (auto f : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*f, scratchOffset, scratchSize);

        for (auto i = scratchOffset; i < (scratchOffset + scratchSize); i++)
            bits.set(i - min);
    }
#endif

    if (max - min <= 3)
    {
        // Will emit at most 4GRF read
        if (bits[0] != bits[1] &&
            bits[2] != bits[3])
        {
            // Dont coalesce patterns like
            // 1010, 0101
            return false;
        }

        if ((bits[0] & bits[3]) &&
            !(bits[1] | bits[2]))
        {
            // 1001
            return false;
        }
    }

    return true;
}
405 
notOOB(unsigned int min,unsigned int max)406 bool CoalesceSpillFills::notOOB(unsigned int min, unsigned int max)
407 {
408     // Disallow coalescing if it exceeds spill size computed by RA
409     // as this may cause segmentation fault.
410     auto nextSpillOffset = spill.getNextOffset();
411     auto diff = max - min;
412     auto HWordByteSize = GlobalRA::getHWordByteSize();
413     if (diff == 2 &&
414         (min + 3) * HWordByteSize >= nextSpillOffset)
415     {
416         // Coalesce 1-1-
417         // Payload size 4 is needed as candidates are not back-to-back.
418         // Coalescing is illegal if last row exceeds spill size computed by RA.
419         return false;
420     }
421     else if (diff > 3 && diff < 7 &&
422         (min + 7) * HWordByteSize >= nextSpillOffset)
423     {
424         // payload size = 8 would be used
425         return false;
426     }
427 
428     return true;
429 }
430 
// instList contains all instructions (fills or spills) within window size.
// At exit, instList will contain instructions that will not be coalesced.
// coalescable list will contain instructions within min-max offset range.
// First instruction's offset in instList is set to be min. max is
// min + maxPayloadSize - 1.
void CoalesceSpillFills::sendsInRange(std::list<INST_LIST_ITER>& instList,
    std::list<INST_LIST_ITER>& coalescable,
    unsigned int maxPayloadSize, unsigned int& min, unsigned int& max)
{
    min = 0xffffffff;
    max = 0;
    bool isFirstNoMask = false;
    unsigned int mask = 0;
    for (auto iter = instList.begin();
        iter != instList.end();
        )
    {
        unsigned scratchOffset, sizeInGrfUnit, lastScratchOffset;
        auto inst = *(*iter);
        getScratchMsgInfo(inst, scratchOffset, sizeInGrfUnit);
        lastScratchOffset = scratchOffset + sizeInGrfUnit - 1;

        if (min == 0xffffffff && max == 0)
        {
            // First spill is definitely a candidate: it seeds the window.
            min = scratchOffset;
            max = lastScratchOffset;
            coalescable.push_back(*iter);
            iter = instList.erase(iter);
            isFirstNoMask = inst->isWriteEnableInst();
            mask = inst->getMaskOption();

            // Address-taken spill/fill dcls must not be coalesced; bail out
            // with just the seed in coalescable.
            if (addrTakenSpillFillDcl.find(inst->getDst()->getTopDcl()) !=
                addrTakenSpillFillDcl.end())
            {
                return;
            }

            continue;
        }

        if (min != 0xffffffff || max != 0)
        {
            // Candidate must match the seed's execution mask (or both be
            // NoMask) for the combined send to be legal.
            bool maskMatch = (isFirstNoMask && inst->isWriteEnableInst()) ||
                (mask == inst->getMaskOption());

            // don't coalesce if non-leading fill inst has alignment requirements,
            // as we may not be able to satisfy it
            bool fillDstisAligned = gra.isEvenAligned(inst->getDst()->getTopDcl());

            if (!maskMatch || fillDstisAligned)
            {
                iter++;
                continue;
            }

            // Check whether min/max can be extended:
            // case 1 — candidate extends the window downward (below min),
            // keeping total span within max payload and in bounds.
            if (scratchOffset <= min &&
                (min - scratchOffset) <= (cMaxFillPayloadSize - 1) &&
                (max - scratchOffset) <= (cMaxFillPayloadSize - 1) &&
                notOOB(scratchOffset, max))
            {
                // This instruction can be coalesced
                min = scratchOffset;
                if (max < lastScratchOffset)
                    max = lastScratchOffset;

                //MUST_BE_TRUE(max - min <= (cMaxFillPayloadSize - 1), "Unexpected fills coalesced. (max - min) is out of bounds - 1");

                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            // case 2 — candidate extends the window upward (above max).
            else if (scratchOffset >= max &&
                (lastScratchOffset - min) <= (cMaxFillPayloadSize - 1) &&
                (lastScratchOffset - max) <= (cMaxFillPayloadSize - 1) &&
                notOOB(min, scratchOffset))
            {
                max = lastScratchOffset;

                //MUST_BE_TRUE(max - min <= cMaxFillPayloadSize, "Unexpected spills coalesced. (max - min) is out of bounds - 2");

                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            // case 3 — candidate lies entirely inside the current window.
            else if (scratchOffset >= min &&
                lastScratchOffset <= max)
            {
                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            else
            {
                // Not coalescable into this window; leave it in instList.
                iter++;
            }
        }
    }
}
528 
// instList contains all spills seen in window.
// coalescable is empty and should contain consecutive spills.
// This function will prune spills so they write consecutive
// memory slots. First spill is first candidate to start window.
void CoalesceSpillFills::keepConsecutiveSpills(std::list<INST_LIST_ITER>& instList,
    std::list<INST_LIST_ITER>& coalescable,
    unsigned int maxPayloadSize, unsigned int& minOffset, unsigned int& maxOffset,
    bool& useNoMask, G4_InstOption& mask)
{
    // allowed list contains instructions to be coalesced in
    // ascending order of their spill slots.
    std::list<INST_LIST_ITER> allowed;
    auto origInstList = instList;
    allowed.push_back(instList.front());
    instList.pop_front();
    // Seed mask/offset state from the first spill; subsequent candidates
    // must agree with it.
    unsigned int maskOffset = (*allowed.front())->getMaskOption();
    mask = (G4_InstOption)(maskOffset & InstOpt_QuarterMasks);
    useNoMask = (maskOffset & InstOpt_WriteEnable) ? true : false;
    unsigned int size;
    getScratchMsgInfo(*allowed.front(), minOffset, size);
    maxOffset = minOffset + size - 1;

    // Whether the seed spill's payload variable was defined by a send.
    bool firstSpillFromSend = false;
    G4_Declare* sendDstTopDcl = (*allowed.front())->getSrc(1)->getTopDcl();
    if (sendDstDcl.find(sendDstTopDcl) != sendDstDcl.end())
        firstSpillFromSend = true;

    G4_Declare* header = (*instList.front())->asSpillIntrinsic()->getHeader()->getTopDcl();

    // All spills in the window must share a header (or be FP-relative), and
    // every one must be NoMask for coalescing to proceed at all.
    for (auto instIt : instList)
    {
        auto inst = (*instIt);

        if (inst->asSpillIntrinsic()->getHeader()->getTopDcl() != header &&
            !inst->asSpillIntrinsic()->getFP())
        {
            return;
        }

        useNoMask &= inst->isWriteEnableInst();

        if (!useNoMask)
            break;
    }

    if (useNoMask)
    {
        // Spill coalescing doesnt work as expected without NoMask.
        // Greedily grow the window: restart the scan whenever a spill is
        // absorbed, since it may enable earlier-rejected ones.
        bool redo;
        do
        {
            redo = false;
            for (auto spillIt = instList.begin();
                spillIt != instList.end();
                spillIt++)
            {
                unsigned int scratchOffset, scratchSize;
                getScratchMsgInfo(*(*spillIt), scratchOffset, scratchSize);

                auto src1 = (*(*spillIt))->getSrc(1);
                if (src1 &&
                    addrTakenSpillFillDcl.find(src1->getTopDcl()) !=
                    addrTakenSpillFillDcl.end())
                {
                    // Address taken dcls should not be coalesed with others.
                    // This is dangerous because nothing ties indirect opnd
                    // with fill/spill instructions for it. Only after RA do
                    // we update offset of address register holding the
                    // indirect operand, based on RA assigment to spill/fill
                    // address taken variable.
                    continue;
                }

                if (// Consecutive scratch offsets
                    scratchOffset == maxOffset + 1 &&
                    // Scratch offset + size is within max payload size
                    (scratchOffset + scratchSize - 1) <= (minOffset + maxPayloadSize - 1) &&
                    // Either both masks are same or both are NoMask
                    (((*(*spillIt))->getMaskOption() == (maskOffset & InstOpt_QuarterMasks)) ||
                    (useNoMask && (*(*spillIt))->isWriteEnableInst())))
                {
                    auto curInstDstTopDcl = (*(*spillIt))->getSrc(1)->getTopDcl();
                    // Check whether current inst's topdcl was spilled in a send.
                    // If it was and first instruction in instList wasnt then
                    // dont consider current instruction as coalescing candidate.
                    if (!firstSpillFromSend &&
                        sendDstDcl.find(curInstDstTopDcl) != sendDstDcl.end())
                    {
                        continue;
                    }

                    // This condition allows send coalescing iff
                    // a. Either none of the vars are defined in a send
                    // b. All vars defined in same send
                    if (!firstSpillFromSend ||
                        curInstDstTopDcl == sendDstTopDcl)
                    {
                        if (curInstDstTopDcl == sendDstTopDcl)
                        {
                            // Make sure src1 operands are consecutive
                            auto curSrc1Row = (*(*spillIt))->getSrc(1)->asSrcRegRegion()->getRegOff();
                            bool success = true;
                            for (auto candidate : allowed)
                            {
                                unsigned int candOffset, candSize;
                                getScratchMsgInfo(*candidate, candOffset, candSize);
                                auto prevSrc1Row = (*candidate)->getSrc(1)->asSrcRegRegion()->getRegOff();

                                unsigned int scratchOffDelta = scratchOffset - candOffset;
                                if ((prevSrc1Row + scratchOffDelta) != curSrc1Row)
                                {
                                    // Following is disallowed
                                    // send  (8) V10(1,0) ... <-- resLen = 4
                                    // sends (8) null r0 V10(1,0) 0x100 <-- extLen = 1
                                    // mov   (8) T2  V10(2,0)
                                    // sends (8) null r0 r10(3,0) 0x101 <-- extLen = 1
                                    // mov   (8) T4  V10(4,0)
                                    // Two scratch writes cannot be coalesced here
                                    // because their src1 regions arent consecutive.
                                    success = false;
                                    break;
                                }
                            }
                            if (!success)
                                continue;
                        }

                        // Absorb this spill and restart the scan.
                        allowed.push_back(*spillIt);
                        instList.erase(spillIt);
                        redo = true;
                        maxOffset += scratchSize;
                        break;
                    }
                }
            }
        } while (redo);
    }

    // Shrink the window from the top until it spans a legal payload size
    // (2 or 4 slots); with only one spill left there is nothing to coalesce.
    while (allowed.size() > 1)
    {
        unsigned int slots = maxOffset - minOffset + 1;
        if (slots == 2 || slots == 4)
        {
            // Insert coalescable spills in order of appearance
            for (auto origInst : origInstList)
            {
                for (auto allowedSpills : allowed)
                {
                    if (*origInst == *allowedSpills)
                    {
                        coalescable.push_back(origInst);
                        break;
                    }
                }
            }

            MUST_BE_TRUE(coalescable.size() == allowed.size(),
                "Coalesced spills list missing entries");
            break;
        }
        else
        {
            allowed.pop_back();
            unsigned int scratchOffset, scratchSize;
            getScratchMsgInfo(*allowed.back(), scratchOffset, scratchSize);
            maxOffset = scratchOffset + scratchSize - 1;
        }
    }

    // Rebuild instList as "everything not chosen for coalescing", preserving
    // the original order. coalescable and origInstList share relative order.
    instList = origInstList;
    for (auto coalIt = coalescable.begin(),
        instIt = instList.begin();
        coalIt != coalescable.end();
        coalIt++)
    {
        if (*instIt == *coalIt)
            instIt = instList.erase(instIt);
        else
        {
            while (*instIt != *coalIt)
            {
                instIt++;
            }
            instIt = instList.erase(instIt);
        }
    }
}
716 
INST_LIST_ITER CoalesceSpillFills::analyzeSpillCoalescing(std::list<INST_LIST_ITER>& instList,
    INST_LIST_ITER start, INST_LIST_ITER end, G4_BB* bb)
{
    // Check and perform coalescing, if possible, amongst spills in instList.
    // Return inst iter points to either last inst+1 in instList if all spills
    // were coalesced. Otherwise, it points to first spill that wasnt coalesced.
    // Spill coalescing is possible only when all slots in coalesced range
    // have a write.
    INST_LIST_ITER last = end;
    last++;
#if 0
    unsigned int startCISAOff = (*instList.front())->getCISAOff();
#endif
    // A single spill in the window: nothing to coalesce.
    if (instList.size() < 2)
    {
        return last;
    }

    // Partition the window: coalesceableSpills gets consecutive-slot spills,
    // instList keeps the rest.
    std::list<INST_LIST_ITER> coalesceableSpills;
    auto origInstList = instList;
    unsigned int min, max;
    G4_InstOption mask;
    bool useNoMask;
    keepConsecutiveSpills(instList, coalesceableSpills, cMaxSpillPayloadSize, min, max, useNoMask, mask);

#if 0
    printf("Start -- \n");
    if (coalesceableSpills.size() > 0)
    {
        printf("Will coalesce following spill (offset, size) pairs:\n");
        for (auto k : coalesceableSpills)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    if (instList.size() > 0)
    {
        printf("Will NOT coalesce following spill (offset, size) pairs:\n");
        for (auto k : instList)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    printf("End --\n");
#endif

    if (coalesceableSpills.size() > 1)
    {
        coalesceSpills(coalesceableSpills, min, max, useNoMask, mask, bb);
    }
    else
    {
        // When coalescing is not done, we want to
        // move to second instruction in instList in
        // next loop iteration.
        instList.pop_front();
    }

    if (instList.size() == 0)
    {
        return last;
    }
    else
    {
        // Resume scanning at the first spill that wasn't coalesced.
        return instList.front();
    }
}
788 
INST_LIST_ITER CoalesceSpillFills::analyzeFillCoalescing(std::list<INST_LIST_ITER>& instList,
    INST_LIST_ITER start, INST_LIST_ITER end, G4_BB* bb)
{
    // Check and perform coalescing, if possible, amongst fills in instList.
    // Return inst iter points to either last inst+1 in instList if all fills
    // were coalesced. Otherwise, it points to first fill that wasnt coalesced.
    // NOTE: 'start' is currently unused here; 'end' marks the last inst of the
    // window being analyzed.
    INST_LIST_ITER last = end;
    last++;
#if 0
    G4_INST* lastInst = nullptr;
    if (last != bb->end())
        lastInst = (*last);
#endif
    // Fewer than 2 fills means there is nothing to coalesce.
    if (instList.size() < 2)
    {
        return last;
    }

    std::list<INST_LIST_ITER> coalesceableFills;
    // Keep a copy so the list can be restored if the heuristic rejects.
    auto origInstList = instList;
    // [min, max] is the scratch row range the coalesced fill would load.
    unsigned int min, max;
    sendsInRange(instList, coalesceableFills, cMaxFillPayloadSize, min, max);

    // fillHeuristic() decides whether loading [min, max] in one payload is
    // profitable. On rejection, restore the original list and pop its head
    // so the caller's scan advances past the first fill.
    bool heuristic = fillHeuristic(coalesceableFills, instList, origInstList, min, max);
    if (!heuristic)
    {
        coalesceableFills.clear();
        instList = origInstList;
        instList.pop_front();
#if 0
        printf("Fill heuristic didnt agree to coalescing\n");
#endif
    }

#if 0
    printf("Start -- \n");
    if (coalesceableFills.size() > 0)
    {
        printf("Will coalesce following fill (offset, size) pairs:\n");
        for (auto k : coalesceableFills)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    if (instList.size() > 0)
    {
        printf("Will NOT coalesce following fill (offset, size) pairs:\n");
        for (auto k : instList)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    printf("End --\n");
#endif

    if (coalesceableFills.size() > 1)
    {
        coalesceFills(coalesceableFills, min, max, bb, (*coalesceableFills.front())->getCISAOff());
    }

    // Resume scanning at the first non-coalesced fill, or past the window
    // when everything was coalesced.
    if (instList.size() == 0)
    {
        return last;
    }
    else
    {
        return instList.front();
    }
}
862 
overlap(G4_INST * inst1,G4_INST * inst2,bool & isFullOverlap)863 bool CoalesceSpillFills::overlap(G4_INST* inst1, G4_INST* inst2, bool& isFullOverlap)
864 {
865     unsigned int scratchOffset1, scratchSize1, scratchOffset2, scratchSize2;
866     unsigned int scratchEnd1, scratchEnd2;
867     getScratchMsgInfo(inst1, scratchOffset1, scratchSize1);
868     getScratchMsgInfo(inst2, scratchOffset2, scratchSize2);
869 
870     // isFullOverlap is true only if inst1 full covers inst2
871     isFullOverlap = false;
872 
873     scratchEnd1 = scratchOffset1 + scratchSize1 - 1;
874     scratchEnd2 = scratchOffset2 + scratchSize2 - 1;
875 
876     if (scratchOffset1 <= scratchOffset2)
877     {
878         // inst1    |---------|     or     |----------|
879         // inst2         |------|             |---|
880         if (scratchEnd1 >= scratchOffset2)
881         {
882             if (scratchOffset1 <= scratchOffset2 &&
883                 (scratchOffset1 + scratchSize1) >= (scratchOffset2 + scratchSize2))
884             {
885                 isFullOverlap = true;
886             }
887 
888             return true;
889         }
890     }
891     else
892     {
893         // inst1          |------|      or       |-----|
894         // inst2       |-----|               |-----------|
895         if (scratchEnd2 >= scratchOffset1)
896         {
897             if (scratchOffset1 <= scratchOffset2 &&
898                 (scratchOffset1 + scratchSize1) >= (scratchOffset2 + scratchSize2))
899             {
900                 isFullOverlap = true;
901             }
902 
903             return true;
904         }
905     }
906 
907     return false;
908 }
909 
overlap(G4_INST * inst,std::list<INST_LIST_ITER> & allInsts)910 bool CoalesceSpillFills::overlap(G4_INST* inst, std::list<INST_LIST_ITER>& allInsts)
911 {
912     for (auto sp : allInsts)
913     {
914         bool t;
915         auto spillInst = (*sp);
916         if (overlap(inst, spillInst, t))
917             return true;
918     }
919 
920     return false;
921 }
922 
removeWARFills(std::list<INST_LIST_ITER> & fills,std::list<INST_LIST_ITER> & spills)923 void CoalesceSpillFills::removeWARFills(std::list<INST_LIST_ITER>& fills, std::list<INST_LIST_ITER>& spills)
924 {
925     for (auto flIt = fills.begin();
926         flIt != fills.end();
927         )
928     {
929         if (overlap((*(*flIt)), spills))
930         {
931             flIt = fills.erase(flIt);
932             continue;
933         }
934         flIt++;
935     }
936 }
937 
938 // Return true if an operand was replaced
replaceCoalescedOperands(G4_INST * inst)939 bool CoalesceSpillFills::replaceCoalescedOperands(G4_INST* inst)
940 {
941     bool IRChanged = false;
942     auto dst = inst->getDst();
943     if (dst &&
944         dst->getTopDcl())
945     {
946         auto dcl = dst->getTopDcl();
947         auto it = replaceMap.find(dcl);
948 
949         if (it != replaceMap.end())
950         {
951             auto dstRgn = dst->asDstRegRegion();
952             auto newDstRgn = kernel.fg.builder->createDst(it->second.first->getRegVar(),
953                 it->second.second + dstRgn->getRegOff(), dstRgn->getSubRegOff(), dstRgn->getHorzStride(), dstRgn->getType());
954 
955             newDstRgn->setAccRegSel(dstRgn->getAccRegSel());
956             inst->setDest(newDstRgn);
957             IRChanged = true;
958         }
959     }
960 
961     for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
962     {
963         auto opnd = inst->getSrc(i);
964 
965         if (opnd &&
966             opnd->getTopDcl())
967         {
968             auto dcl = opnd->getTopDcl();
969             auto it = replaceMap.find(dcl);
970 
971             if (it == replaceMap.end())
972                 continue;
973 
974             if (opnd->isSrcRegRegion())
975             {
976                 auto srcRgn = opnd->asSrcRegRegion();
977                 auto oldRgnDesc = srcRgn->getRegion();
978 
979                 auto newSrcRgn = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(), Direct,
980                     it->second.first->getRegVar(), it->second.second + srcRgn->getRegOff(),
981                     srcRgn->getSubRegOff(), oldRgnDesc,
982                     opnd->getType());
983                 newSrcRgn->setAccRegSel(srcRgn->getAccRegSel());
984                 inst->setSrc(newSrcRgn, i);
985                 IRChanged = true;
986             }
987         }
988     }
989 
990     return IRChanged;
991 }
992 
allSpillsSameVar(std::list<INST_LIST_ITER> & spills)993 bool CoalesceSpillFills::allSpillsSameVar(std::list<INST_LIST_ITER>& spills)
994 {
995     // Return true if all vars in spills list have same dcl
996     G4_Declare* dcl = nullptr;
997     for (auto s : spills)
998     {
999         auto topdcl = (*s)->getSrc(1)->getTopDcl();
1000 
1001         if (!dcl)
1002             dcl = topdcl;
1003 
1004         if (topdcl != dcl)
1005         {
1006             return false;
1007         }
1008     }
1009 
1010     // Allow only if all dcls are defined by same send
1011     if (sendDstDcl.find(dcl) != sendDstDcl.end())
1012         return true;
1013 
1014     return false;
1015 }
1016 
void CoalesceSpillFills::fills()
{
    // Iterate over all BBs, find fills that are closeby and coalesce
    // a bunch of them. Insert movs as required.
    for (auto bb : kernel.fg)
    {
        auto endIter = bb->end();
        std::list<INST_LIST_ITER> fillsToCoalesce;
        std::list<INST_LIST_ITER> spills;
        INST_LIST_ITER startIter = bb->begin();
        // w counts instructions seen since the window opened (first candidate
        // fill); the window closes when w reaches cWindowSize.
        unsigned int w = 0;
        const auto& splitInsts = LoopVarSplit::getSplitInsts(&gra, bb);
        for (auto instIter = startIter;
            instIter != endIter;)
        {
            auto inst = (*instIter);

            // Pseudo-kills and labels are skipped without advancing the window.
            if (inst->isPseudoKill() ||
                inst->isLabel())
            {
                instIter++;
                continue;
            }

            if (splitInsts.find(inst) != splitInsts.end())
            {
                // if inst was emitted by loop split transformation,
                // then dont optimize it. such instructions are
                // emitted in loop preheader/loop exit. if a split
                // variable spills, we need to erase all fills and
                // spills emitted for that split. if we coalesce
                // 2 fills from split sequence, we may end up with
                // fill loading data that wont be used. for eg,
                //
                // (W) mov (8|M0) SPLIT1     V10
                // (W) mov (8|M0) SPLIT2     V11
                // ==>
                //
                // (W) fill FILL_V10 @ 0x0
                // (W) mov (8|M0) SPLIT1     FILL_V10
                // (W) fill FILL_V11 @ 0x32
                // (W) mov (8|M0) SPLIT2     FILL_V11
                //
                // we need to be able to erase 2nd fill and copy in
                // case SPLIT2 spills. this cannot be done if the 2
                // fills are coalesced.

                instIter++;
                continue;
            }

            if (inst->isSpillIntrinsic())
            {
                // Track intervening spills; a later fill that overlaps one
                // of them cannot join the window (WAR hazard).
                spills.push_back(instIter);
            }
            else if (inst->isFillIntrinsic())
            {
                // Check if coalescing is possible
                if (fillsToCoalesce.size() == 0)
                {
                    // First candidate opens a fresh window.
                    w = 0;
                    startIter = instIter;
                    spills.clear();
                }

                if (!overlap(*instIter, spills))
                {
                    fillsToCoalesce.push_back(instIter);
                }
            }

            if (fillsToCoalesce.size() > 0 &&
                rpe.getRegisterPressure(inst) > fillWindowSizeThreshold)
            {
                // High register pressure region so reduce window size to 3
                w = (cWindowSize - w > 3) ? cWindowSize - 3 : w;
            }

            // Window closed or BB ends: try to coalesce what was collected.
            if (w == cWindowSize || inst == bb->back())
            {
                if (fillsToCoalesce.size() > 1)
                {
                    // Resume at the iterator analyzeFillCoalescing returns
                    // (first non-coalesced fill, or past the window).
                    instIter = analyzeFillCoalescing(fillsToCoalesce, startIter, instIter, bb);
                }
                else if (w == cWindowSize)
                {
                    startIter = instIter;
                }
                else if (inst == bb->back())
                {
                    break;
                }

                w = 0;
                fillsToCoalesce.clear();
                spills.clear();

                continue;
            }

            // Window only advances while at least one candidate is pending.
            if (fillsToCoalesce.size() > 0)
            {
                w++;
            }

            instIter++;
        }

        // One pass to replace old fills with coalesced dcl
        for (auto instIt = bb->begin();
            instIt != bb->end();
            )
        {
            auto inst = (*instIt);

            // Pseudo-kills of dcls that were coalesced away are stale; drop them.
            if (inst->isPseudoKill() &&
                replaceMap.find(inst->getDst()->getTopDcl()) != replaceMap.end())
            {
                instIt = bb->erase(instIt);
                continue;
            }

            while (replaceCoalescedOperands(inst))
            {
                // replaceCoalescedOperands() updates references to old
                // dcls in IR that are coalesced. Having a single pass to
                // replace operands is insufficient if a coalesced range
                // gets re-coalesced. This can happen rarely when window
                // heuristic hits boundary condition.
            };
            instIt++;
        }
    }
}
1151 
populateSendDstDcl()1152 void CoalesceSpillFills::populateSendDstDcl()
1153 {
1154     // Find and store all G4_Declares that are dest in sends
1155     // and are spilled. This is required when coalescing
1156     // scratch writes for such spills. We cannot mix coalescing
1157     // for G4_Declares from one and and other instructions.
1158     // Otherwise register pressure increases significantly.
1159     for (auto bb : kernel.fg)
1160     {
1161         for (auto inst : *bb)
1162         {
1163             if (inst->isSend() &&
1164                 inst->getDst())
1165             {
1166                 if (!inst->getDst()->isNullReg())
1167                 {
1168                     if (!inst->getMsgDesc()->isScratch())
1169                     {
1170                         auto topdcl = inst->getDst()->getTopDcl();
1171 
1172                         sendDstDcl.insert(topdcl);
1173                     }
1174                 }
1175             }
1176             else if (inst->isSpillIntrinsic() &&
1177                 inst->getSrc(1)->getBase()->asRegVar()->isRegVarCoalesced())
1178             {
1179                 auto topdcl = inst->getSrc(1)->getTopDcl();
1180 
1181                 sendDstDcl.insert(topdcl);
1182             }
1183         }
1184     }
1185 }
1186 
void CoalesceSpillFills::spills()
{
    populateSendDstDcl();

    // Iterate over all BBs, find spills that are closeby and coalesce
    // a bunch of them. Insert movs as required.
    for (auto bb : kernel.fg)
    {
        auto endIter = bb->end();
        std::list<INST_LIST_ITER> spillsToCoalesce;
        INST_LIST_ITER startIter = bb->begin();
        // w counts instructions seen since the window opened (first candidate
        // spill); the window closes when w reaches cWindowSize.
        unsigned int w = 0;
        const auto& splitInsts = LoopVarSplit::getSplitInsts(&gra, bb);
        for (auto instIter = startIter;
            instIter != endIter;)
        {
            auto inst = (*instIter);

            if (inst->isPseudoKill() ||
                inst->isLabel())
            {
                instIter++;
                continue;
            }

            // Skip instructions emitted by loop var splitting; they may have
            // to be erased wholesale later (see comment in fills()).
            if (splitInsts.find(inst) != splitInsts.end())
            {
                instIter++;
                continue;
            }

            bool earlyCoalesce = false;
            if (inst->isSpillIntrinsic())
            {
                // Check if coalescing is possible
                if (spillsToCoalesce.size() == 0)
                {
                    w = 0;
                    startIter = instIter;
                    spillsToCoalesce.clear();
                }

                // Remove earlier candidates that overlap this spill. A fully
                // covered earlier spill is dead and erased from the BB.
                for (auto coalIt = spillsToCoalesce.begin();
                    coalIt != spillsToCoalesce.end();
                   )
                {
                    bool fullOverlap = false;
                    if (overlap(*instIter, *(*coalIt), fullOverlap))
                    {
                        if (fullOverlap)
                        {
#if 0
                            printf("Deleting spill at $%d due to %d\n", (*(*coalIt))->getCISAOff(), (*instIter)->getCISAOff());
#endif
                            // Delete earlier spill since its made redundant
                            // by current spill.
                            bb->erase(*coalIt);
                        }

                        coalIt = spillsToCoalesce.erase(coalIt);
                        continue;
                    }
                    coalIt++;
                }
                spillsToCoalesce.push_back(instIter);
            }
            else if (inst->isFillIntrinsic())
            {
                // A fill reading a range with a pending spill forces the
                // window to coalesce now, before the read.
                for (auto coalIt = spillsToCoalesce.begin();
                    coalIt != spillsToCoalesce.end();
                    )
                {
                    bool temp = false;
                    if (overlap(*instIter, *(*coalIt), temp))
                    {
#if 1
                        // Instead of deleting scratch writes try coalescing
                        // at this point. This way maybe the fill can also
                        // be cleaned up in later phase.
                        earlyCoalesce = true;
                        break;
#else
                        coalIt = spillsToCoalesce.erase(coalIt);
                        continue;
#endif
                    }
                    coalIt++;
                }
            }

            if (spillsToCoalesce.size() > 0 &&
                rpe.getRegisterPressure(inst) > spillWindowSizeThreshold)
            {
                if (!allSpillsSameVar(spillsToCoalesce))
                {
                    // High register pressure region so reduce window size to 3
                    w = (cWindowSize - w > 3) ? cWindowSize - 3 : w;
                }
                else
                {
#if 0
                    printf("Found register pressure = %d at %d. Still coalescing spills because all spills are from same var.\n",
                        rpe.getRegisterPressure(inst), inst->getCISAOff());
#endif
                }
            }

            // Window closed, BB ends, or a fill forced early coalescing.
            if (w == cWindowSize || inst == bb->back() ||
                earlyCoalesce)
            {
                if (spillsToCoalesce.size() > 1)
                {
                    // Resume at the iterator analyzeSpillCoalescing returns
                    // (first non-coalesced spill, or past the window).
                    instIter = analyzeSpillCoalescing(spillsToCoalesce, startIter, instIter, bb);
                }
                else if (w == cWindowSize)
                {
                    startIter = instIter;
                }
                else if (inst == bb->back())
                {
                    break;
                }

                w = 0;
                spillsToCoalesce.clear();
                continue;
            }

            // Window only advances while at least one candidate is pending.
            if (spillsToCoalesce.size() > 0)
            {
                w++;
            }

            instIter++;
        }

        // One pass to replace old spills with coalesced dcl
        for (auto instIt = bb->begin();
            instIt != bb->end();
            )
        {
            auto inst = (*instIt);

            // Pseudo-kills of dcls that were coalesced away are stale; drop them.
            if (inst->isPseudoKill() &&
                replaceMap.find(inst->getDst()->getTopDcl()) != replaceMap.end())
            {
                instIt = bb->erase(instIt);
                continue;
            }

            while (replaceCoalescedOperands(inst))
            {
                // replaceCoalescedOperands() updates references to old
                // dcls in IR that are coalesced. Having a single pass to
                // replace operands is insufficient if a coalesced range
                // gets re-coalesced. This can happen rarely when window
                // heuristic hits boundary condition.
            };
            instIt++;
        }
    }
}
1349 
void CoalesceSpillFills::fixSendsSrcOverlap()
{
    // Overlap for sends src operands is not allowed.
    //
    // Fix for following code pattern after spill/fill coalescing:
    // send (16) COAL_FILL_373(0,0)<1>:ud r0 0xa 0x24c2001:ud{Align1, NoMask} // #??:$365:%657:&-1 // scratch read, resLen=4, msgLen=1
    // sends(1) null:ud COAL_FILL_373(0, 0) COAL_FILL_373(1, 0) 0x4c : ud 0x40680ff : ud{ Align1, Q1, NoMask } // #??:$365:%365:&-1 // a64 scatt
    // ered write, resLen = 0, msgLen = 2, extMsgLen = 1
    //
    // for CISA:
    // svm_scatter.1.1 (M1_NM, 1) V441.0 V449.0                                     /// $365
    //
    // where V441 and V449 are both scalars of type :uq and :ud respectively
    //
    for (auto bb : kernel.fg)
    {
        for (auto instIt = bb->begin();
            instIt != bb->end();
            instIt++)
        {
            auto inst = (*instIt);

            // Only split sends have two src payloads that could overlap.
            if (!inst->isSplitSend())
            {
                continue;
            }

            auto src0 = inst->getSrc(0);
            auto src1 = inst->getSrc(1);

            if (src0->getTopDcl() == src1->getTopDcl())
            {
                auto lb0 = src0->getLeftBound();
                auto rb0 = src0->getRightBound();
                auto lb1 = src1->getLeftBound();
                auto rb1 = src1->getRightBound();

                // NOTE(review): strict inequalities here miss the case where
                // the operands start at the same left bound (lb0 == lb1) yet
                // still overlap — confirm that case cannot arise or is benign.
                if ((lb0 < lb1 && rb0 > lb1) ||
                    (lb1 < lb0 && rb1 > lb0))
                {
                    // Ideally we should create copy of
                    // operand with less number of GRFs,
                    // but this is a really corner case
                    // and probably shows up only for
                    // force spills. So we simply choose
                    // src1 of sends.
                    const char* dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
                        "COPY_%d", kernel.Declares.size());
                    G4_Declare* copyDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
                        numEltPerGRF<Type_UD>(), src1->getTopDcl()->getNumRows(),
                        Type_UD);

                    // Copy src1's rows into the fresh dcl, one SIMD8 UD mov
                    // (i.e. one GRF row of UDs) per iteration.
                    unsigned int elems = copyDcl->getNumElems();
                    short row = 0;
                    while (elems > 0)
                    {
                        G4_SrcRegRegion* srcRgn = kernel.fg.builder->createSrc(src1->getTopDcl()->getRegVar(), row, 0,
                            kernel.fg.builder->getRegionStride1(), Type_UD);
                        G4_DstRegRegion* dstRgn = kernel.fg.builder->createDst(
                            copyDcl->getRegVar(), row, 0, 1, Type_UD);
                        G4_INST* copyInst = kernel.fg.builder->createMov(
                            g4::SIMD8, dstRgn, srcRgn, InstOpt_WriteEnable, false);
                        bb->insertBefore(instIt, copyInst);
                        elems -= 8;
                        row++;
                    }

                    // Redirect src1 of the send to the non-overlapping copy.
                    G4_SrcRegRegion* sendSrc1 = kernel.fg.builder->createSrc(copyDcl->getRegVar(), 0, 0,
                        kernel.fg.builder->getRegionStride1(), Type_UD);
                    inst->setSrc(sendSrc1, 1);
                }
            }
        }
    }
}
1425 
removeRedundantSplitMovs()1426 void CoalesceSpillFills::removeRedundantSplitMovs()
1427 {
1428     // send (8) V_SAMPLER   -- resLen = 3
1429     // COAL_0(0,0) = V_SAMPLE(0,0)
1430     // COAL_0(1,0) = V_SAMPLE(1,0)
1431     // send (8) <null>    COAL_0(0,0) <-- len = 2
1432     // TV0(0,0) = V_SAMPLE(2,0)
1433     // ===>
1434     // send (8) V_SAMPLER   -- resLen = 3
1435     // send (8) <null>    V_SAMPLE(0,0) <-- len = 2
1436     // TV0(0,0) = V_SAMPLE(2,0)
1437 
1438     // Look for scratch writes. Src1 is data to write to memory.
1439     // Iterate in bottom-order to check whether raw movs exist
1440     // that define src1 of scratch write and whether their
1441     // source operands are consecutive.
1442 
1443     // Store numUses for dcls replaced and location of defs.
1444     // This structure is used to eliminate redundant movs
1445     // later.
1446     typedef std::pair<G4_BB*, INST_LIST_ITER> MovLoc;
1447     typedef unsigned int NumRefs;
1448     std::map<G4_Declare*, std::pair<NumRefs, std::list<MovLoc>>> movs;
1449 
1450     for (auto bb : kernel.fg)
1451     {
1452         // Store all dcls defined by non scratch sends
1453         // as only they are candidates for this pass.
1454         // Without this, we might end up identifying
1455         // other raw movs coming from partial write like:
1456         // add (8) r8.0<1>:q r20.0<4;4,1>:q r4.0<0;1,0>:ud {Align1, Q1}
1457         // send(16) r27.0<1>:uw r26 0xa 0x22c1000 : ud{ Align1, NoMask } // scratch read, fill, offset = 0, resLen=2, msgLen=1
1458         // mov(8) r27.0<1> : q r8.0<4; 4, 1> : q{ Align1, Q1 }
1459         // sends(16) null : uw r26 r27 0x8a : ud 0x20f1000 : ud{ Align1, NoMask } // scratch write, spill, offset = 0, resLen=0, msgLen=1, extMsgLen=2
1460         //
1461         // Although there is a raw mov before scratch write,
1462         // it has to be preserved for correctness.
1463         std::set<G4_Declare*> sendDst;
1464         for (auto inst : *bb)
1465         {
1466             if (inst->isSend() &&
1467                 inst->getDst() &&
1468                 !inst->getDst()->isNullReg() &&
1469                 !inst->getMsgDesc()->isScratchRead() &&
1470                 !inst->getMsgDesc()->isScratchWrite())
1471             {
1472                 sendDst.insert(inst->getDst()->getTopDcl());
1473             }
1474         }
1475 
1476         for (auto instIt = bb->begin(), endIt = bb->end();
1477             instIt != endIt;
1478             instIt++)
1479         {
1480             auto inst = (*instIt);
1481 
1482             if (inst->isSpillIntrinsic())
1483             {
1484                 // Spill sends
1485                 auto src1Dcl = inst->getSrc(1)->getTopDcl();
1486                 unsigned int lb = inst->getSrc(1)->getLeftBound();
1487                 unsigned int rb = inst->getSrc(1)->getRightBound();
1488                 std::set<unsigned int> rows;
1489                 for (unsigned int k = lb / numEltPerGRF<Type_UB>(); k != (rb + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>(); k++)
1490                 {
1491                     rows.insert(k);
1492                 }
1493                 auto tmpIt = instIt;
1494                 tmpIt--;
1495                 G4_Declare* srcDcl = nullptr;
1496                 std::map<unsigned int, unsigned int> dstSrcRowMapping;
1497                 std::list<MovLoc> copies;
1498                 while (tmpIt != bb->begin())
1499                 {
1500                     auto pInst = (*tmpIt);
1501 
1502                     // Each copy should be a raw mov
1503                     if (!pInst->isRawMov())
1504                         break;
1505 
1506                     // Ensure src0 topdcl comes from a send dst in this BB
1507                     if (sendDst.find(pInst->getSrc(0)->getTopDcl()) ==
1508                         sendDst.end())
1509                         break;
1510 
1511                     // Check whether dcls match
1512                     auto pDstDcl = pInst->getDst()->getTopDcl();
1513                     if (pDstDcl != src1Dcl)
1514                         break;
1515 
1516                     unsigned int plb = pInst->getDst()->getLeftBound();
1517                     unsigned int prb = pInst->getDst()->getRightBound();
1518 
1519                     // Check whether complete row(s) defined
1520                     if ((prb - plb + 1) % numEltPerGRF<Type_UB>() != 0)
1521                         break;
1522 
1523                     unsigned int rowStart = plb / numEltPerGRF<Type_UB>();
1524                     unsigned int numRows = (prb - plb + 1) / numEltPerGRF<Type_UB>();
1525                     bool punt = false;
1526                     for (unsigned int k = rowStart; k != (rowStart + numRows); k++)
1527                     {
1528                         if (rows.find(k) == rows.end())
1529                         {
1530                             punt = true;
1531                             break;
1532                         }
1533                         dstSrcRowMapping.insert(std::make_pair(k, INT_MAX));
1534                     }
1535 
1536                     if (punt)
1537                         break;
1538 
1539                     auto pSrc0 = pInst->getSrc(0);
1540                     if (!pSrc0->isSrcRegRegion())
1541                         break;
1542 
1543                     auto pSrcDcl = pSrc0->getTopDcl();
1544                     if (!srcDcl)
1545                         srcDcl = pSrcDcl;
1546                     else if (srcDcl != pSrcDcl)
1547                         break;
1548 
1549                     // mov src should be GRF aligned
1550                     if (pSrc0->getLeftBound() % numEltPerGRF<Type_UB>() != 0)
1551                         break;
1552 
1553                     unsigned int src0lb = pSrc0->getLeftBound();
1554                     unsigned int src0rb = pSrc0->getRightBound();
1555 
1556                     // (rb - lb) should match dst (rb - lb)
1557                     if ((src0rb - src0lb) != (prb - plb))
1558                         break;
1559 
1560                     unsigned int pStartRow = pSrc0->getLeftBound() / numEltPerGRF<Type_UB>();
1561                     for (unsigned int k = rowStart; k != (rowStart + numRows); k++)
1562                     {
1563                         auto it = dstSrcRowMapping.find(k);
1564                         if (it == dstSrcRowMapping.end())
1565                         {
1566                             punt = true;
1567                             break;
1568                         }
1569 
1570                         it->second = pStartRow + (k - rowStart);
1571                     }
1572 
1573                     if (punt)
1574                         break;
1575 
1576                     copies.push_back(std::make_pair(bb, tmpIt));
1577                     tmpIt--;
1578                 }
1579 
1580                 if (dstSrcRowMapping.size() > 0)
1581                 {
1582                     // Now check whether each entry of src1 has a corresponding src offset
1583                     unsigned int dstRowStart = lb / numEltPerGRF<Type_UB>();
1584                     bool success = true;
1585                     auto baseIt = dstSrcRowMapping.find(dstRowStart);
1586                     if (baseIt != dstSrcRowMapping.end())
1587                     {
1588                         auto base = dstSrcRowMapping.find(dstRowStart)->second;
1589                         for (auto m : dstSrcRowMapping)
1590                         {
1591                             unsigned int curRow = m.first - dstRowStart;
1592                             if (m.second == INT_MAX)
1593                             {
1594                                 success = false;
1595                                 break;
1596                             }
1597 
1598                             if (m.second != (base + curRow))
1599                             {
1600                                 success = false;
1601                                 break;
1602                             }
1603                         }
1604 
1605                         if (success && srcDcl)
1606                         {
1607                             // Replace src1 of send with srcDcl
1608                             G4_SrcRegRegion* sendSrc1 = kernel.fg.builder->createSrc(srcDcl->getRegVar(),
1609                                 base, 0, kernel.fg.builder->getRegionStride1(), inst->getSrc(1)->getType());
1610                             inst->setSrc(sendSrc1, 1);
1611 
1612                             for (auto c : copies)
1613                             {
1614                                 auto defDcl = (*c.second)->getDst()->getTopDcl();
1615                                 auto it = movs.find(defDcl);
1616                                 if (it == movs.end())
1617                                 {
1618                                     std::list<MovLoc> t;
1619                                     t.push_back(c);
1620                                     movs.insert(std::make_pair(defDcl, std::make_pair(0, t)));
1621                                 }
1622                                 else
1623                                 {
1624                                     it->second.second.push_back(c);
1625                                 }
1626                             }
1627                         }
1628                     }
1629                 }
1630             }
1631         }
1632     }
1633 
1634     // Update number of uses of each dcl
1635     for (auto bb : kernel.fg)
1636     {
1637         for (auto instIt = bb->begin(), endIt = bb->end();
1638             instIt != endIt; instIt++)
1639         {
1640             auto inst = (*instIt);
1641 
1642             if (inst->isPseudoKill())
1643             {
1644                 auto dcl = inst->getDst()->getTopDcl();
1645                 auto it = movs.find(dcl);
1646                 if (it != movs.end())
1647                 {
1648                     it->second.second.push_back(std::make_pair(bb, instIt));
1649                 }
1650             }
1651 
1652             for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
1653             {
1654                 G4_Operand* opnd = inst->getSrc(i);
1655 
1656                 if (opnd &&
1657                     opnd->getTopDcl())
1658                 {
1659                     auto it = movs.find(opnd->getTopDcl());
1660                     if (it != movs.end())
1661                         it->second.first++;
1662                 }
1663             }
1664         }
1665     }
1666 
1667     for (auto mov : movs)
1668     {
1669         auto dcl = mov.first;
1670         auto numRefs = mov.second.first;
1671         auto& allMovs = mov.second.second;
1672 
1673         if (numRefs == 0 && !dcl->getAddressed())
1674         {
1675 #if 0
1676             printf("Removing movs/pseudoKill for dcl %s\n", dcl->getName());
1677 #endif
1678             for (auto m : allMovs)
1679             {
1680                 auto bb = m.first;
1681                 auto iter = m.second;
1682 #if 0
1683                 printf("\tFound %s occurence at $%d\n", (*iter)->opcode() == G4_mov ? "mov" : "pseudokill", (*iter)->getCISAOff());
1684 #endif
1685                 bb->erase(iter);
1686             }
1687         }
1688     }
1689 }
1690 
void CoalesceSpillFills::spillFillCleanup()
{
    // Eliminate redundant fills when a write
    // is close by:
    //
    // spill TV1 at offset = 1
    // ..
    // ..
    // ..
    // fill FP1 from offset = 1
    // = FP1
    // ===>
    // Remove fill and replace occurrence of FP1 with TV1
    //
    // Rather than rewriting uses of FP1 symbolically, the fill is replaced
    // by per-row GRF movs that copy from the spill's payload variable into
    // the fill's destination, and the fill instruction is erased.

    // Most recent spill seen (within the window) for each scratch row.
    std::map<unsigned int, G4_INST*> writesPerOffset;
    // Top dcls defined inside the scan window; used to detect WAR hazards.
    std::set<G4_Declare*> defs;
    for (auto bb : kernel.fg)
    {
        auto startIt = bb->begin();
        auto endIt = bb->end();
        const auto& splitInsts = LoopVarSplit::getSplitInsts(&gra, bb);
        for (auto instIt = startIt;
            instIt != endIt;
            instIt++)
        {
            auto inst = (*instIt);

            // Instructions created by loop-variable splitting are managed
            // by that transform; do not touch them here.
            if (splitInsts.find(inst) != splitInsts.end())
                continue;

            if (inst->isFillIntrinsic())
            {
                // Per-fill state; reset for every candidate fill.
                writesPerOffset.clear();
                defs.clear();

                // Store offset, spill inst pair
                unsigned int rowStart, numRows;
                getScratchMsgInfo(inst, rowStart, numRows);
                unsigned int lastRow = rowStart + numRows - 1;

                // Scan window of instruction above current inst
                // to check whether all rows read by current inst
                // have been written.
                auto pInstIt = instIt;
                pInstIt--;
                unsigned int w = cSpillFillCleanupWindowSize;
                // NOTE: the instruction at startIt itself is never examined
                // by this backward scan (loop stops when pInstIt == startIt).
                while (pInstIt != startIt &&
                    w > 0)
                {
                    auto pInst = (*pInstIt);

                    if (pInst->isSpillIntrinsic())
                    {
                        unsigned int pRowStart, pNumRows;
                        getScratchMsgInfo(pInst, pRowStart, pNumRows);

                        // If any def of src1 dcl is found then don't
                        // consider this write for optimization. Its
                        // value in memory could be different than
                        // one held in variable.
                        auto pSrc1Dcl = pInst->getSrc(1)->getTopDcl();
                        if (defs.find(pSrc1Dcl) != defs.end())
                        {
                            // When a WAR is detected, punt out because
                            // any older spill is going to be stale.
                            // An optimization here is to detect variable
                            // row on which WAR occurs. If the row is
                            // different from current fill, then it
                            // should be safe to ignore pInst and continue
                            // till end of window.
                            pInstIt--;
                            break;
                        }

                        for (unsigned int pRow = pRowStart;
                            pRow != (pRowStart + pNumRows);
                            pRow++)
                        {
                            auto writeIt = writesPerOffset.find(pRow);

                            // Check whether a more recent write was found for this row
                            // (scan runs newest-to-oldest, so first hit wins).
                            if (writeIt != writesPerOffset.end())
                                continue;

                            writesPerOffset.insert(std::make_pair(pRow, pInst));
                        }
                    }

                    if (pInst->getDst() &&
                        pInst->getDst()->getTopDcl())
                    {
                        // Store any defs seen to handle WAR
                        defs.insert(pInst->getDst()->getTopDcl());
                    }

                    w--;
                    pInstIt--;
                }

                // Check whether writes for all rows were found
                bool found = true;
                for (auto row = rowStart; row <= lastRow; row++)
                {
                    if (writesPerOffset.find(row) == writesPerOffset.end())
                    {
                        found = false;
                        break;
                    }
                }

                if (!found)
                {
                    continue;
                }

                // Writes for all rows found.
                // Clamp mov exec size at SIMD16 (i.e. at most 2 GRF rows of
                // UD per mov on 32-byte-GRF targets).
                G4_ExecSize execSize = kernel.getSimdSize() > g4::SIMD16 ? g4::SIMD16 : kernel.getSimdSize();

                for (auto row = rowStart; row <= lastRow;)
                {
                    if (execSize == g4::SIMD16 && row == lastRow)
                    {
                        // In case of odd rows in SIMD16
                        execSize = g4::SIMD8;
                    }
                    else if (execSize == g4::SIMD16)
                    {
                        // In SIMD16 kernel 2 consecutive rows should come from same spill
                        // (a single SIMD16 mov reads both rows from one payload).
                        if (writesPerOffset.find(row)->second != writesPerOffset.find(row + 1)->second)
                        {
                            execSize = g4::SIMD8;
                        }
                    }

                    auto type = Type_UD;
                    // In spill cleanup, all units are in hword units independent of platform.
                    // For PVC, GRF size is 2x of Gen9. Correction to PVC is postponed to code
                    // generation time when translating spill/fill intrinsics to actual send.
                    // Since we're emitting a mov here, we need to do this correction here.
                    if (getGRFSize() == 64)
                        type = Type_UQ;
                    // Insert SIMD8 mov per row
                    G4_DstRegRegion* nDst = kernel.fg.builder->createDst(
                        inst->getDst()->getBase(), row + inst->getDst()->asDstRegRegion()->getRegOff() - rowStart,
                        0, 1, type);

                    auto write = writesPerOffset.find(row)->second;
                    G4_SrcRegRegion* src1Write = write->getSrc(1)->asSrcRegRegion();
                    unsigned int writeRowStart = write->asSpillIntrinsic()->getOffset();
                    // Row offset of this fill row inside the spill's payload.
                    unsigned int diff = row - writeRowStart;
                    G4_SrcRegRegion* nSrc = kernel.fg.builder->createSrc(
                        src1Write->getBase(), diff + src1Write->getRegOff(), 0,
                        kernel.fg.builder->getRegionStride1(), type);

                    G4_INST* mov = kernel.fg.builder->createMov(
                        execSize, nDst, nSrc, InstOpt_WriteEnable, false);
                    bb->insertBefore(instIt, mov);

                    // Each SIMD8 of UD/UQ elements covers one "row" unit here.
                    row += execSize / 8;
                }

                // Erase the now-redundant fill; step the iterator back first
                // so the enclosing loop's ++ lands on the next instruction.
                auto tempIt = instIt;
                tempIt--;
                bb->erase(instIt);
                instIt = tempIt;
            }
        }
    }
}
1861 
removeRedundantWrites()1862 void CoalesceSpillFills::removeRedundantWrites()
1863 {
1864     typedef std::list<std::pair<G4_BB*, INST_LIST_ITER>> SPILLS;
1865     typedef std::list<std::pair<G4_BB*, INST_LIST_ITER>> FILLS;
1866     std::map<unsigned int, std::pair<SPILLS, FILLS>> scratchOffsetAccess;
1867     // Traverse bottom-up to detect and remove redundant writes.
1868     // Redundant writes include:
1869     // 1. Successive writes to same offset without a fill in between,
1870     // 2. Writes in program without any fill from that slot throughout
1871     for (auto bb : kernel.fg)
1872     {
1873         auto endIt = bb->end();
1874         endIt--;
1875         // Store spill slots that are written in to alongwith emask used
1876         std::map<unsigned int, unsigned int> scratchOffToMask;
1877         for (auto instIt = endIt;
1878             instIt != bb->begin();
1879             instIt--)
1880         {
1881             auto inst = (*instIt);
1882 
1883             if (inst->isSpillIntrinsic() || inst->isFillIntrinsic())
1884             {
1885                 unsigned int offset = 0, size = 0;
1886                 if (inst->isFillIntrinsic())
1887                 {
1888                     getScratchMsgInfo(inst, offset, size);
1889                     for (unsigned int k = offset; k != (offset + size); k++)
1890                     {
1891                         auto it = scratchOffToMask.find(k);
1892                         if (it != scratchOffToMask.end())
1893                         {
1894                             scratchOffToMask.erase(it);
1895                         }
1896                     }
1897                 }
1898                 else if (inst->isSpillIntrinsic())
1899                 {
1900                     getScratchMsgInfo(inst, offset, size);
1901                     bool allRowsFound = true;
1902                     unsigned int emask = inst->getMaskOption();
1903                     for (unsigned int k = offset; k != (offset + size); k++)
1904                     {
1905                         auto it = scratchOffToMask.find(k);
1906                         if (it != scratchOffToMask.end())
1907                         {
1908                             if (emask != it->second &&
1909                                 (it->second & InstOpt_WriteEnable) == 0)
1910                             {
1911                                 allRowsFound = false;
1912                                 break;
1913                             }
1914                         }
1915                         else
1916                         {
1917                             allRowsFound = false;
1918                             break;
1919                         }
1920                     }
1921 
1922                     if (allRowsFound)
1923                     {
1924 #if 0
1925                         printf("Removing redundant successive write at $%d\n", inst->getCISAOff());
1926 #endif
1927                         instIt = bb->erase(instIt);
1928                     }
1929                     else
1930                     {
1931                         unsigned int emask = inst->getOption();
1932                         for (unsigned int k = offset; k != (offset + size); k++)
1933                         {
1934                             scratchOffToMask.insert(std::make_pair(k, emask));
1935                         }
1936                     }
1937                 }
1938             }
1939         }
1940     }
1941 
1942     for (auto bb : kernel.fg)
1943     {
1944         auto endIt = bb->end();
1945         for (auto instIt = bb->begin();
1946             instIt != endIt;
1947             instIt++)
1948         {
1949             auto inst = (*instIt);
1950 
1951             if (inst->isFillIntrinsic() ||
1952                 inst->isSpillIntrinsic())
1953             {
1954                 unsigned int offset, size;
1955                 getScratchMsgInfo(inst, offset, size);
1956                 bool isRead = inst->isFillIntrinsic();
1957                 for (unsigned int i = offset; i != (offset + size); i++)
1958                 {
1959                     auto it = scratchOffsetAccess.find(i);
1960                     if (it != scratchOffsetAccess.end())
1961                     {
1962                         if (isRead)
1963                         {
1964                             auto& fill = it->second.second;
1965                             fill.push_back(std::make_pair(bb, instIt));
1966                         }
1967                         else
1968                         {
1969                             auto& spill = it->second.first;
1970                             spill.push_back(std::make_pair(bb, instIt));
1971                         }
1972                     }
1973                     else
1974                     {
1975                         SPILLS s;
1976                         FILLS f;
1977                         if (isRead)
1978                             f.push_back(std::make_pair(bb, instIt));
1979                         else
1980                             s.push_back(std::make_pair(bb, instIt));
1981                         scratchOffsetAccess.insert(std::make_pair(i, std::make_pair(s, f)));
1982                     }
1983                 }
1984             }
1985         }
1986     }
1987 
1988     std::map<G4_INST*, std::pair<INST_LIST_ITER, G4_BB*>> spillToRemove;
1989     for (auto scratchAccess : scratchOffsetAccess)
1990     {
1991         if (scratchAccess.second.second.size() == 0 &&
1992             scratchAccess.second.first.size() > 0)
1993         {
1994             // 0 fills for scratch slot
1995             // Check whether all spill slots have 0 fills
1996             // in case spills are coalesced.
1997             for (auto spill : scratchAccess.second.first)
1998             {
1999                 bool spillRequired = false;
2000                 unsigned int offset, size;
2001                 getScratchMsgInfo(*spill.second, offset, size);
2002 
2003                 // Verify that all slots from offset->(offset+size) have 0 fills
2004                 for (unsigned int slot = offset; slot != (offset + size); slot++)
2005                 {
2006                     auto it = scratchOffsetAccess.find(slot);
2007                     if (it->second.second.size() != 0)
2008                     {
2009                         spillRequired = true;
2010                         break;
2011                     }
2012                 }
2013 
2014                 if (!spillRequired)
2015                 {
2016                     spillToRemove.insert(std::make_pair(*spill.second, std::make_pair(spill.second, spill.first)));
2017                 }
2018 
2019             }
2020         }
2021         else if (scratchAccess.second.first.size() == 0 &&
2022             scratchAccess.second.second.size() > 0)
2023         {
2024             // 0 spills for scratch slot, non-zero fills
2025             // Check whether all fill slots have 0 spills
2026             // in case fills are coalesced.
2027             for (auto fill : scratchAccess.second.second)
2028             {
2029                 bool fillRequired = false;
2030                 unsigned int offset, size;
2031                 getScratchMsgInfo(*fill.second, offset, size);
2032 
2033                 // Verify that all slots from offset->(offset+size) have 0 spills
2034                 for (unsigned int slot = offset; slot != (offset + size); slot++)
2035                 {
2036                     auto it = scratchOffsetAccess.find(slot);
2037                     if (it->second.first.size() != 0)
2038                     {
2039                         fillRequired = true;
2040                         break;
2041                     }
2042                 }
2043 
2044                 if (!fillRequired)
2045                 {
2046                     spillToRemove.insert(std::make_pair(*fill.second, std::make_pair(fill.second, fill.first)));
2047                 }
2048 
2049             }
2050         }
2051     }
2052 
2053     for (auto removeSp : spillToRemove)
2054     {
2055         G4_BB* bb = removeSp.second.second;
2056 #if 0
2057         printf("Removing redundant scratch access at CISA $%d\n", removeSp.first->getCISAOff());
2058 #endif
2059         bb->erase(removeSp.second.first);
2060     }
2061 }
2062 
void CoalesceSpillFills::run()
{
    // Top-level driver: runs the cleanup phases in a fixed sequence.

    removeRedundantSplitMovs();

    // Coalesce fills, then spills. replaceMap holds per-phase operand
    // replacement state, so it is cleared between the two phases and
    // before spillFillCleanup().
    fills();
    replaceMap.clear();
    spills();
    replaceMap.clear();

    // Replace fills whose rows were all recently spilled with GRF movs.
    spillFillCleanup();

    // Drop spills that are never filled (and vice versa) plus
    // back-to-back overwrites of the same scratch rows.
    removeRedundantWrites();

    fixSendsSrcOverlap();

    // Dump the kernel for debugging, tagged with this cleanup iteration.
    kernel.dumpToFile("after.spillCleanup." + std::to_string(iterNo));
}
2079 
dumpKernel()2080 void CoalesceSpillFills::dumpKernel()
2081 {
2082     for (auto bb : kernel.fg)
2083     {
2084         for (auto inst : *bb)
2085         {
2086             inst->emit(std::cerr);
2087             std::cerr << "\t$" << inst->getCISAOff() << ", #" << rpe.getRegisterPressure(inst) << "\n";
2088         }
2089     }
2090 }
2091 
dumpKernel(unsigned int v1,unsigned int v2)2092 void CoalesceSpillFills::dumpKernel(unsigned int v1, unsigned int v2)
2093 {
2094     bool start = false, end = false, canEnd = false;
2095     for (auto bb : kernel.fg)
2096     {
2097         if (end)
2098             break;
2099 
2100         for (auto inst : *bb)
2101         {
2102             if (canEnd &&
2103                 inst->getCISAOff() > (int)v2)
2104             {
2105                 end = true;
2106                 break;
2107             }
2108 
2109             if (inst->getCISAOff() == v2)
2110             {
2111                 // This ensures invalid offsets
2112                 // are dumped till v2 is hit.
2113                 canEnd = true;
2114             }
2115 
2116             if (inst->getCISAOff() == v1)
2117                 start = true;
2118 
2119             if (start && !end)
2120             {
2121                 inst->dump();
2122                 printf(" // $%d, #%d\n", inst->getCISAOff(), rpe.getRegisterPressure(inst));
2123             }
2124         }
2125     }
2126 }
2127 
computeAddressTakenDcls()2128 void CoalesceSpillFills::computeAddressTakenDcls()
2129 {
2130     for (auto dcl : kernel.Declares)
2131     {
2132         auto addrSpillFill = dcl->getAddrTakenSpillFill();
2133         if (addrSpillFill)
2134             addrTakenSpillFillDcl.insert(addrSpillFill);
2135     }
2136 }
2137 }
2138