/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "FlowGraph.h"
#include "GraphColor.h"
#include <list>
#include "SpillCleanup.h"

uint32_t computeFillMsgDesc(unsigned int payloadSize, unsigned int offset);
uint32_t computeSpillMsgDesc(unsigned int payloadSize, unsigned int offset);

namespace vISA
{
G4_SrcRegRegion* CoalesceSpillFills::generateCoalescedSpill(G4_SrcRegRegion* header, unsigned int scratchOffset, unsigned int payloadSize,
    bool useNoMask, G4_InstOption mask, G4_Declare* spillDcl, unsigned int row)
{
    // Generate split send instruction with specified payload size and offset
    auto spillSrcPayload = kernel.fg.builder->createSrc(spillDcl->getRegVar(),
        row, 0, kernel.fg.builder->getRegionStride1(), Type_UD);

    // Create send instruction with payloadSize starting at scratch offset min
    G4_Declare* fp = nullptr;
    if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
        fp = kernel.fg.getFramePtrDcl();
    unsigned int option = useNoMask ? InstOpt_WriteEnable : 0;
    auto spillInst = kernel.fg.builder->createSpill(
        kernel.fg.builder->createNullDst(Type_UW), header, spillSrcPayload, g4::SIMD16, payloadSize,
        GlobalRA::GRFToHwordSize(scratchOffset), fp, static_cast<G4_InstOption>(option), false);

    if (!useNoMask)
    {
        // Care about mask only for non-NoMask sends
        spillInst->setMaskOption(mask);
    }

    return spillSrcPayload;
}

G4_INST* CoalesceSpillFills::generateCoalescedFill(G4_SrcRegRegion* header, unsigned int scratchOffset, unsigned int payloadSize,
    unsigned int dclSize, bool evenAlignDst)
{
    // Generate split send instruction with specified payload size and offset
    // Construct fillDst
    const char* dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
        "COAL_FILL_%d", kernel.Declares.size());
    auto fillDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
        numEltPerGRF<Type_UD>(), dclSize, Type_UD, DeclareType::CoalescedFill);

    if (evenAlignDst)
    {
        fillDcl->setEvenAlign();
        gra.setEvenAligned(fillDcl, true);
    }
    fillDcl->setDoNotSpill();

    auto fillDst = kernel.fg.builder->createDst(fillDcl->getRegVar(), 0,
        0, 1, Type_UW);

    G4_Declare* fp = nullptr;
    if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
        fp = kernel.fg.getFramePtrDcl();

    auto fillInst = kernel.fg.builder->createFill(header, fillDst, g4::SIMD16, payloadSize,
        GlobalRA::GRFToHwordSize(scratchOffset), fp, InstOpt_WriteEnable, false);
    return fillInst;
}

void CoalesceSpillFills::copyToOldFills(
    G4_DstRegRegion* coalescedFillDst,
    std::list<std::pair<G4_DstRegRegion*, std::pair<unsigned int, unsigned int>>> indFills,
    INST_LIST_ITER f, G4_BB* bb, int srcCISAOff)
{
    // Copy data from the coalesced fill into older fills.
    // This way we don't carry the entire coalesced payload
    // till the last fill.
    for (auto oldFill : indFills)
    {
        unsigned int numGRFs = (oldFill.first->getRightBound() - oldFill.first->getLeftBound()
            + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>();
        unsigned int rowOff = 0;
        // TODO: Check for > 2 GRF dst
        while (numGRFs > 0)
        {
            G4_ExecSize simdSize = g4::SIMD8;

            unsigned int off = oldFill.second.first;
            unsigned int size = oldFill.second.second;

            static const int BYTES_PER_HEXWORD = 32;
            unsigned int scratchOff =
                coalescedFillDst->getInst()->getMsgDesc()->getOffset() / BYTES_PER_HEXWORD;

            // Scratch msg offset is always equal to or lower than individual fills
            unsigned int offToUse = off - scratchOff + rowOff;
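            // Illustrative example with hypothetical offsets: if the coalesced
            // fill starts at scratch row scratchOff = 4 and this old fill
            // wanted row off = 6, its data lives (6 - 4) = 2 GRF rows into the
            // coalesced payload, plus rowOff for rows already copied.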

            if (size > g4::SIMD8)
                simdSize = g4::SIMD16;

            G4_DstRegRegion* movDst = kernel.fg.builder->createDst(
                oldFill.first->getBase(), rowOff, 0, 1, Type_UD);

            G4_SrcRegRegion* src = kernel.fg.builder->createSrc(
                coalescedFillDst->getBase(), offToUse, 0, kernel.fg.builder->getRegionStride1(), Type_UD);

            G4_INST* copy = kernel.fg.builder->createMov(
                simdSize, movDst, src, InstOpt_WriteEnable, false);

            bb->insertBefore(f, copy);

            numGRFs -= simdSize == 8 ? 1 : 2;
            rowOff += simdSize == 8 ? 1 : 2;
        }
    }
}

G4_Declare* CoalesceSpillFills::createCoalescedSpillDcl(unsigned int payloadSize)
{
    // Construct spill src
    const char* dclName = nullptr;
    G4_Declare* spillDcl = nullptr;

    dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
        "COAL_SPILL_%d", kernel.Declares.size());
    spillDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
        numEltPerGRF<Type_UD>(), payloadSize, Type_UD, DeclareType::CoalescedSpill);

    spillDcl->setDoNotSpill();

    return spillDcl;
}

void CoalesceSpillFills::coalesceSpills(
    std::list<INST_LIST_ITER>& coalesceableSpills, unsigned int min,
    unsigned int max, bool useNoMask, G4_InstOption mask, G4_BB* bb)
{
    // Generate spill with minimum size = max-min. This should be compatible with
    // payload sizes supported by hardware.
    unsigned int payloadSize = (max - min) + 1;
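    // Illustrative example with hypothetical rows: spills together covering
    // scratch rows 4..7 give min = 4, max = 7, so payloadSize = 4, one of
    // the legal scratch message sizes (1/2/4/8 GRFs) asserted below.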

    auto leadInst = *coalesceableSpills.front();

    MUST_BE_TRUE(payloadSize == 1 || payloadSize == 2 || payloadSize == 4 || payloadSize == 8,
        "Unsupported payload size");

    std::set<G4_Declare*> declares;
    unsigned int minRow = UINT_MAX;
    for (auto d : coalesceableSpills)
    {
        auto src1Opnd = (*d)->getSrc(1)->asSrcRegRegion();
        auto curRow = src1Opnd->getLeftBound() / numEltPerGRF<Type_UB>();
        declares.insert(src1Opnd->getTopDcl());
        minRow = minRow > curRow ? curRow : minRow;
    }

    G4_Declare* dcl = nullptr;
    if (declares.size() == 1)
    {
        dcl = (*declares.begin());
    }
    else
    {
        dcl = createCoalescedSpillDcl(payloadSize);
        minRow = 0;
    }

    auto coalescedSpillSrc = generateCoalescedSpill(kernel.fg.builder->duplicateOperand(leadInst->asSpillIntrinsic()->getHeader()),
        min, payloadSize, useNoMask, mask, dcl, minRow);
    coalescedSpillSrc->getInst()->inheritDIFrom(leadInst);

    if (declares.size() != 1)
    {
        for (auto c : coalesceableSpills)
        {
            unsigned int scratchOffset, scratchSize;
            getScratchMsgInfo((*c), scratchOffset, scratchSize);

            unsigned int rowOff = scratchOffset - min;
            replaceMap.insert(std::make_pair((*c)->getSrc(1)->getTopDcl(),
                std::make_pair(coalescedSpillSrc->getTopDcl(), rowOff)));
        }
    }

    auto f = coalesceableSpills.back();
    f++;

    for (auto spill : coalesceableSpills)
    {
        bb->erase(spill);
    }
    coalesceableSpills.clear();
    bb->insertBefore(f, coalescedSpillSrc->getInst());
}

void CoalesceSpillFills::coalesceFills(std::list<INST_LIST_ITER>& coalesceableFills, unsigned int min,
    unsigned int max, G4_BB* bb, int srcCISAOff)
{
    // Generate fill with minimum size = max-min. This should be compatible with
    // payload sizes supported by hardware.
    unsigned int payloadSize = (max - min) + 1;
    if (payloadSize == 3)
        payloadSize = 4;
    else if (payloadSize > 4)
        payloadSize = 8;
    else if (payloadSize == 0)
        payloadSize = 1;
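
    // Illustrative example with hypothetical rows: fills at scratch rows
    // {2, 4} give min = 2, max = 4, a raw span of 3 rows, which the code
    // above rounds up to the next legal scratch message size of 4 rows.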

    MUST_BE_TRUE(payloadSize == 1 || payloadSize == 2 || payloadSize == 4 || payloadSize == 8,
        "Unsupported payload size");

    // dclSize could be larger than payload size when
    // 2 variables across scratch writes are coalesced.
    unsigned int dclSize = payloadSize;
    for (auto c : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        auto fill = (*c);
        getScratchMsgInfo((*c), scratchOffset, scratchSize);

        auto fillDst = fill->getDst();
        auto fillDstRegOff = fillDst->getRegOff();
        unsigned int dstDclRows = fillDst->getTopDcl()->getNumRows();
        unsigned int maxRow = dstDclRows + scratchOffset - fillDstRegOff - min;

        if (maxRow > dclSize)
            dclSize = maxRow;
    }

    auto leadInst = *coalesceableFills.front();

    auto newFill = generateCoalescedFill(kernel.fg.builder->duplicateOperand(leadInst->asFillIntrinsic()->getHeader()),
        min, payloadSize, dclSize, gra.isEvenAligned(leadInst->getDst()->getTopDcl()));
    newFill->inheritDIFrom(leadInst);

    for (auto c : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo((*c), scratchOffset, scratchSize);

        unsigned int rowOff = scratchOffset - min;
        replaceMap.insert(std::make_pair((*c)->getDst()->getTopDcl(),
            std::make_pair(newFill->getDst()->getTopDcl(), rowOff)));
    }

    auto f = coalesceableFills.front();
    f++;

    for (auto fill : coalesceableFills)
    {
        if (fill == f)
        {
            f++;
        }
        bb->erase(fill);
    }

    coalesceableFills.clear();
    bb->insertBefore(f, newFill);

    // copyToOldFills(coalescedFillDst, indFills, f, bb, srcCISAOff);
}

// Return true if heuristic agrees to coalescing.
bool CoalesceSpillFills::fillHeuristic(std::list<INST_LIST_ITER>& coalesceableFills,
    std::list<INST_LIST_ITER>& instList, const std::list<INST_LIST_ITER>& origInstList,
    unsigned int& min, unsigned int& max)
{
#if 0
    std::bitset<8> bits(0);
    MUST_BE_TRUE(cMaxFillPayloadSize == 8, "Handle other max fill payload size");
#else
    std::bitset<4> bits(0);
    MUST_BE_TRUE(cMaxFillPayloadSize == 4, "Handle other max fill payload size");
#endif

    if (coalesceableFills.size() <= 1)
    {
        return false;
    }

    min = 0xffffffff, max = 0;
    G4_Declare* header = (*coalesceableFills.front())->asFillIntrinsic()->getHeader()->getTopDcl();
    for (auto f : coalesceableFills)
    {
        if ((*f)->asFillIntrinsic()->getHeader()->getTopDcl() != header &&
            !(*f)->asFillIntrinsic()->getFP())
            return false;

        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*f, scratchOffset, scratchSize);

        if (scratchSize == 8)
        {
            return false;
        }

        if (addrTakenSpillFillDcl.find((*f)->getDst()->getTopDcl()) !=
            addrTakenSpillFillDcl.end())
        {
            return false;
        }

        for (auto i = scratchOffset; i < (scratchOffset + scratchSize); i++)
            bits.set(i - scratchOffset);

        if (min > scratchOffset)
            min = scratchOffset;

        if (max < (scratchOffset + scratchSize - 1))
            max = (scratchOffset + scratchSize - 1);
    }

    // Iterate over coalesceable fills and ensure all rows of a variable
    // are fill candidates. If not, then don't fill. This helps cases like,
    // #1 FILL_V10(0,0) <-- load 0x10 ... (4 GRFs)
    // #2 FILL_V10(4,0) <-- load 0x14 ... (1 GRF)
    // #3 send ... FILL_V10(0,0) ... (use 3 GRFs of FILL_V10)
    // #4 FILL_V11(0,0) <-- load 0x15 ... (1 GRF)
    //
    // Loads at #2 and #4 can be coalesced. But this requires a new coalesced
    // variable of size = 6 GRFs. This size can quickly increase for Cm where
    // payloads of 8 GRF are also present. So instead of allowing these cases
    // at the risk of spilling more, we require that all rows of a fill range
    // are candidates in the coalesceableFills list. So when only #2 and #4 are
    // fill candidates, we will not coalesce them. This makes us miss some
    // cases where a subset of a fill's rows has been converted to movs.
    const int maxDclSize = 128;

    std::map<G4_Declare*, std::bitset<maxDclSize>> allRows;
    for (auto c : coalesceableFills)
    {
        auto topdcl = (*c)->getDst()->getTopDcl();

        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*c, scratchOffset, scratchSize);

        auto it = allRows.find(topdcl);
        if (it == allRows.end())
        {
            allRows.insert(std::make_pair(topdcl, std::bitset<maxDclSize>()));
            it = allRows.find(topdcl);
        }

        // Now mark bits corresponding to rows
        unsigned int regOff = (*c)->getDst()->getRegOff();
        for (unsigned int r = regOff;
            r < (regOff + scratchSize); r++)
        {
            it->second.set(r);
        }
    }

    // Check whether all dcls in map have all rows filled
    for (auto&& r : allRows)
    {
        unsigned int numRows = r.first->getNumRows();

        for (unsigned int i = 0; i < numRows; i++)
        {
            if (r.second.test(i) == false)
            {
                // Found a row of the variable that isn't captured in
                // the list of candidate fills.
                return false;
            }
        }
    }

#if 0
    for (auto f : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*f, scratchOffset, scratchSize);

        for (auto i = scratchOffset; i < (scratchOffset + scratchSize); i++)
            bits.set(i - min);
    }
#endif

    if (max - min <= 3)
    {
        // Will emit at most a 4 GRF read
        if (bits[0] != bits[1] &&
            bits[2] != bits[3])
        {
            // Don't coalesce patterns like
            // 1010, 0101
            return false;
        }

        if ((bits[0] & bits[3]) &&
            !(bits[1] | bits[2]))
        {
            // 1001
            return false;
        }
    }

    return true;
}

bool CoalesceSpillFills::notOOB(unsigned int min, unsigned int max)
{
    // Disallow coalescing if it exceeds spill size computed by RA
    // as this may cause segmentation fault.
    auto nextSpillOffset = spill.getNextOffset();
    auto diff = max - min;
    auto HWordByteSize = GlobalRA::getHWordByteSize();
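
    // Note: scratch offsets in this pass are in HWord rows (32 bytes each;
    // see the unit comment in spillFillCleanup()), so multiplying a row
    // index by HWordByteSize yields the byte offset that is compared
    // against nextSpillOffset below.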
    if (diff == 2 &&
        (min + 3) * HWordByteSize >= nextSpillOffset)
    {
        // Coalescing a 1-1- pattern:
        // payload size 4 is needed as the candidates are not back-to-back.
        // Coalescing is illegal if the last row exceeds the spill size
        // computed by RA.
        return false;
    }
    else if (diff > 3 && diff < 7 &&
        (min + 7) * HWordByteSize >= nextSpillOffset)
    {
        // payload size = 8 would be used
        return false;
    }

    return true;
}

// instList contains all instructions (fills or spills) within the window size.
// At exit, instList will contain instructions that will not be coalesced.
// The coalescable list will contain instructions within the min-max offset range.
// The first instruction's offset in instList is set to be min; max is
// min + maxPayloadSize - 1.
void CoalesceSpillFills::sendsInRange(std::list<INST_LIST_ITER>& instList,
    std::list<INST_LIST_ITER>& coalescable,
    unsigned int maxPayloadSize, unsigned int& min, unsigned int& max)
{
    min = 0xffffffff;
    max = 0;
    bool isFirstNoMask = false;
    unsigned int mask = 0;
    for (auto iter = instList.begin();
        iter != instList.end();
        )
    {
        unsigned scratchOffset, sizeInGrfUnit, lastScratchOffset;
        auto inst = *(*iter);
        getScratchMsgInfo(inst, scratchOffset, sizeInGrfUnit);
        lastScratchOffset = scratchOffset + sizeInGrfUnit - 1;

        if (min == 0xffffffff && max == 0)
        {
            // First spill is definitely a candidate
            min = scratchOffset;
            max = lastScratchOffset;
            coalescable.push_back(*iter);
            iter = instList.erase(iter);
            isFirstNoMask = inst->isWriteEnableInst();
            mask = inst->getMaskOption();

            if (addrTakenSpillFillDcl.find(inst->getDst()->getTopDcl()) !=
                addrTakenSpillFillDcl.end())
            {
                return;
            }

            continue;
        }

        if (min != 0xffffffff || max != 0)
        {
            bool maskMatch = (isFirstNoMask && inst->isWriteEnableInst()) ||
                (mask == inst->getMaskOption());

            // don't coalesce if a non-leading fill inst has alignment requirements,
            // as we may not be able to satisfy them
            bool fillDstisAligned = gra.isEvenAligned(inst->getDst()->getTopDcl());

            if (!maskMatch || fillDstisAligned)
            {
                iter++;
                continue;
            }

            // Check whether min/max can be extended
            if (scratchOffset <= min &&
                (min - scratchOffset) <= (cMaxFillPayloadSize - 1) &&
                (max - scratchOffset) <= (cMaxFillPayloadSize - 1) &&
                notOOB(scratchOffset, max))
            {
                // This instruction can be coalesced
                min = scratchOffset;
                if (max < lastScratchOffset)
                    max = lastScratchOffset;

                //MUST_BE_TRUE(max - min <= (cMaxFillPayloadSize - 1), "Unexpected fills coalesced. (max - min) is out of bounds - 1");

                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            else if (scratchOffset >= max &&
                (lastScratchOffset - min) <= (cMaxFillPayloadSize - 1) &&
                (lastScratchOffset - max) <= (cMaxFillPayloadSize - 1) &&
                notOOB(min, scratchOffset))
            {
                max = lastScratchOffset;

                //MUST_BE_TRUE(max - min <= cMaxFillPayloadSize, "Unexpected spills coalesced. (max - min) is out of bounds - 2");

                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            else if (scratchOffset >= min &&
                lastScratchOffset <= max)
            {
                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            else
            {
                iter++;
            }
        }
    }
}

// instList contains all spills seen in window.
// coalescable is empty and should contain consecutive spills.
// This function will prune spills so they write consecutive
// memory slots. First spill is first candidate to start window.
void CoalesceSpillFills::keepConsecutiveSpills(std::list<INST_LIST_ITER>& instList,
    std::list<INST_LIST_ITER>& coalescable,
    unsigned int maxPayloadSize, unsigned int& minOffset, unsigned int& maxOffset,
    bool& useNoMask, G4_InstOption& mask)
{
    // allowed list contains instructions to be coalesced in
    // ascending order of their spill slots.
    std::list<INST_LIST_ITER> allowed;
    auto origInstList = instList;
    allowed.push_back(instList.front());
    instList.pop_front();
    unsigned int maskOffset = (*allowed.front())->getMaskOption();
    mask = (G4_InstOption)(maskOffset & InstOpt_QuarterMasks);
    useNoMask = (maskOffset & InstOpt_WriteEnable) ? true : false;
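    // mask keeps only the quarter-control bits of the leading spill's
    // options; NoMask (write-enable) is tracked separately in useNoMask
    // and both must be compatible across candidates gathered below.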
    unsigned int size;
    getScratchMsgInfo(*allowed.front(), minOffset, size);
    maxOffset = minOffset + size - 1;

    bool firstSpillFromSend = false;
    G4_Declare* sendDstTopDcl = (*allowed.front())->getSrc(1)->getTopDcl();
    if (sendDstDcl.find(sendDstTopDcl) != sendDstDcl.end())
        firstSpillFromSend = true;

    G4_Declare* header = (*instList.front())->asSpillIntrinsic()->getHeader()->getTopDcl();

    for (auto instIt : instList)
    {
        auto inst = (*instIt);

        if (inst->asSpillIntrinsic()->getHeader()->getTopDcl() != header &&
            !inst->asSpillIntrinsic()->getFP())
        {
            return;
        }

        useNoMask &= inst->isWriteEnableInst();

        if (!useNoMask)
            break;
    }

    if (useNoMask)
    {
        // Spill coalescing doesn't work as expected without NoMask
        bool redo;
        do
        {
            redo = false;
            for (auto spillIt = instList.begin();
                spillIt != instList.end();
                spillIt++)
            {
                unsigned int scratchOffset, scratchSize;
                getScratchMsgInfo(*(*spillIt), scratchOffset, scratchSize);

                auto src1 = (*(*spillIt))->getSrc(1);
                if (src1 &&
                    addrTakenSpillFillDcl.find(src1->getTopDcl()) !=
                    addrTakenSpillFillDcl.end())
                {
                    // Address taken dcls should not be coalesced with others.
                    // This is dangerous because nothing ties an indirect opnd
                    // with the fill/spill instructions for it. Only after RA do
                    // we update the offset of the address register holding the
                    // indirect operand, based on the RA assignment to the
                    // spill/fill address taken variable.
                    continue;
                }

                if (// Consecutive scratch offsets
                    scratchOffset == maxOffset + 1 &&
                    // Scratch offset + size is within max payload size
                    (scratchOffset + scratchSize - 1) <= (minOffset + maxPayloadSize - 1) &&
                    // Either both masks are same or both are NoMask
                    (((*(*spillIt))->getMaskOption() == (maskOffset & InstOpt_QuarterMasks)) ||
                        (useNoMask && (*(*spillIt))->isWriteEnableInst())))
                {
                    auto curInstDstTopDcl = (*(*spillIt))->getSrc(1)->getTopDcl();
                    // Check whether current inst's topdcl was spilled in a send.
                    // If it was and the first instruction in instList wasn't, then
                    // don't consider the current instruction as a coalescing candidate.
                    if (!firstSpillFromSend &&
                        sendDstDcl.find(curInstDstTopDcl) != sendDstDcl.end())
                    {
                        continue;
                    }

                    // This condition allows send coalescing iff
                    // a. Either none of the vars are defined in a send
                    // b. All vars defined in same send
                    if (!firstSpillFromSend ||
                        curInstDstTopDcl == sendDstTopDcl)
                    {
                        if (curInstDstTopDcl == sendDstTopDcl)
                        {
                            // Make sure src1 operands are consecutive
                            auto curSrc1Row = (*(*spillIt))->getSrc(1)->asSrcRegRegion()->getRegOff();
                            bool success = true;
                            for (auto candidate : allowed)
                            {
                                unsigned int candOffset, candSize;
                                getScratchMsgInfo(*candidate, candOffset, candSize);
                                auto prevSrc1Row = (*candidate)->getSrc(1)->asSrcRegRegion()->getRegOff();

                                unsigned int scratchOffDelta = scratchOffset - candOffset;
                                if ((prevSrc1Row + scratchOffDelta) != curSrc1Row)
                                {
                                    // Following is disallowed
                                    // send (8) V10(1,0) ... <-- resLen = 4
                                    // sends (8) null r0 V10(1,0) 0x100 <-- extLen = 1
                                    // mov (8) T2 V10(2,0)
                                    // sends (8) null r0 r10(3,0) 0x101 <-- extLen = 1
                                    // mov (8) T4 V10(4,0)
                                    // Two scratch writes cannot be coalesced here
                                    // because their src1 regions aren't consecutive.
                                    success = false;
                                    break;
                                }
                            }
                            if (!success)
                                continue;
                        }

                        allowed.push_back(*spillIt);
                        instList.erase(spillIt);
                        redo = true;
                        maxOffset += scratchSize;
                        break;
                    }
                }
            }
        } while (redo);
    }

    while (allowed.size() > 1)
    {
        unsigned int slots = maxOffset - minOffset + 1;
        if (slots == 2 || slots == 4)
        {
            // Insert coalescable spills in order of appearance
            for (auto origInst : origInstList)
            {
                for (auto allowedSpills : allowed)
                {
                    if (*origInst == *allowedSpills)
                    {
                        coalescable.push_back(origInst);
                        break;
                    }
                }
            }

            MUST_BE_TRUE(coalescable.size() == allowed.size(),
                "Coalesced spills list missing entries");
            break;
        }
        else
        {
            allowed.pop_back();
            unsigned int scratchOffset, scratchSize;
            getScratchMsgInfo(*allowed.back(), scratchOffset, scratchSize);
            maxOffset = scratchOffset + scratchSize - 1;
        }
    }

    instList = origInstList;
    for (auto coalIt = coalescable.begin(),
        instIt = instList.begin();
        coalIt != coalescable.end();
        coalIt++)
    {
        if (*instIt == *coalIt)
            instIt = instList.erase(instIt);
        else
        {
            while (*instIt != *coalIt)
            {
                instIt++;
            }
            instIt = instList.erase(instIt);
        }
    }
}

INST_LIST_ITER CoalesceSpillFills::analyzeSpillCoalescing(std::list<INST_LIST_ITER>& instList,
    INST_LIST_ITER start, INST_LIST_ITER end, G4_BB* bb)
{
    // Check and perform coalescing, if possible, amongst spills in instList.
    // The returned inst iter points to last inst+1 in instList if all spills
    // were coalesced. Otherwise, it points to the first spill that wasn't coalesced.
    // Spill coalescing is possible only when all slots in the coalesced range
    // have a write.
    INST_LIST_ITER last = end;
    last++;
#if 0
    unsigned int startCISAOff = (*instList.front())->getCISAOff();
#endif
    if (instList.size() < 2)
    {
        return last;
    }

    std::list<INST_LIST_ITER> coalesceableSpills;
    auto origInstList = instList;
    unsigned int min, max;
    G4_InstOption mask;
    bool useNoMask;
    keepConsecutiveSpills(instList, coalesceableSpills, cMaxSpillPayloadSize, min, max, useNoMask, mask);

#if 0
    printf("Start -- \n");
    if (coalesceableSpills.size() > 0)
    {
        printf("Will coalesce following spill (offset, size) pairs:\n");
        for (auto k : coalesceableSpills)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    if (instList.size() > 0)
    {
        printf("Will NOT coalesce following spill (offset, size) pairs:\n");
        for (auto k : instList)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    printf("End --\n");
#endif

    if (coalesceableSpills.size() > 1)
    {
        coalesceSpills(coalesceableSpills, min, max, useNoMask, mask, bb);
    }
    else
    {
        // When coalescing is not done, we want to
        // move to the second instruction in instList in
        // the next loop iteration.
        instList.pop_front();
    }

    if (instList.size() == 0)
    {
        return last;
    }
    else
    {
        return instList.front();
    }
}

INST_LIST_ITER CoalesceSpillFills::analyzeFillCoalescing(std::list<INST_LIST_ITER>& instList,
    INST_LIST_ITER start, INST_LIST_ITER end, G4_BB* bb)
{
    // Check and perform coalescing, if possible, amongst fills in instList.
    // The returned inst iter points to last inst+1 in instList if all fills
    // were coalesced. Otherwise, it points to the first fill that wasn't coalesced.
    INST_LIST_ITER last = end;
    last++;
#if 0
    G4_INST* lastInst = nullptr;
    if (last != bb->end())
        lastInst = (*last);
#endif
    if (instList.size() < 2)
    {
        return last;
    }

    std::list<INST_LIST_ITER> coalesceableFills;
    auto origInstList = instList;
    unsigned int min, max;
    sendsInRange(instList, coalesceableFills, cMaxFillPayloadSize, min, max);

    bool heuristic = fillHeuristic(coalesceableFills, instList, origInstList, min, max);
    if (!heuristic)
    {
        coalesceableFills.clear();
        instList = origInstList;
        instList.pop_front();
#if 0
        printf("Fill heuristic didn't agree to coalescing\n");
#endif
    }

#if 0
    printf("Start -- \n");
    if (coalesceableFills.size() > 0)
    {
        printf("Will coalesce following fill (offset, size) pairs:\n");
        for (auto k : coalesceableFills)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    if (instList.size() > 0)
    {
        printf("Will NOT coalesce following fill (offset, size) pairs:\n");
        for (auto k : instList)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    printf("End --\n");
#endif

    if (coalesceableFills.size() > 1)
    {
        coalesceFills(coalesceableFills, min, max, bb, (*coalesceableFills.front())->getCISAOff());
    }

    if (instList.size() == 0)
    {
        return last;
    }
    else
    {
        return instList.front();
    }
}

bool CoalesceSpillFills::overlap(G4_INST* inst1, G4_INST* inst2, bool& isFullOverlap)
{
    unsigned int scratchOffset1, scratchSize1, scratchOffset2, scratchSize2;
    unsigned int scratchEnd1, scratchEnd2;
    getScratchMsgInfo(inst1, scratchOffset1, scratchSize1);
    getScratchMsgInfo(inst2, scratchOffset2, scratchSize2);

    // isFullOverlap is true only if inst1 fully covers inst2
    isFullOverlap = false;

    scratchEnd1 = scratchOffset1 + scratchSize1 - 1;
    scratchEnd2 = scratchOffset2 + scratchSize2 - 1;

    if (scratchOffset1 <= scratchOffset2)
    {
        // inst1 |---------|  or  |----------|
        // inst2    |------|         |---|
        if (scratchEnd1 >= scratchOffset2)
        {
            if (scratchOffset1 <= scratchOffset2 &&
                (scratchOffset1 + scratchSize1) >= (scratchOffset2 + scratchSize2))
            {
                isFullOverlap = true;
            }

            return true;
        }
    }
    else
    {
        // inst1    |------|  or      |-----|
        // inst2 |-----|          |-----------|
        if (scratchEnd2 >= scratchOffset1)
        {
            if (scratchOffset1 <= scratchOffset2 &&
                (scratchOffset1 + scratchSize1) >= (scratchOffset2 + scratchSize2))
            {
                isFullOverlap = true;
            }

            return true;
        }
    }

    return false;
}

bool CoalesceSpillFills::overlap(G4_INST* inst, std::list<INST_LIST_ITER>& allInsts)
{
    for (auto sp : allInsts)
    {
        bool t;
        auto spillInst = (*sp);
        if (overlap(inst, spillInst, t))
            return true;
    }

    return false;
}

void CoalesceSpillFills::removeWARFills(std::list<INST_LIST_ITER>& fills, std::list<INST_LIST_ITER>& spills)
{
    for (auto flIt = fills.begin();
        flIt != fills.end();
        )
    {
        if (overlap((*(*flIt)), spills))
        {
            flIt = fills.erase(flIt);
            continue;
        }
        flIt++;
    }
}

// Return true if an operand was replaced
bool CoalesceSpillFills::replaceCoalescedOperands(G4_INST* inst)
{
    bool IRChanged = false;
    auto dst = inst->getDst();
    if (dst &&
        dst->getTopDcl())
    {
        auto dcl = dst->getTopDcl();
        auto it = replaceMap.find(dcl);

        if (it != replaceMap.end())
        {
            auto dstRgn = dst->asDstRegRegion();
            auto newDstRgn = kernel.fg.builder->createDst(it->second.first->getRegVar(),
                it->second.second + dstRgn->getRegOff(), dstRgn->getSubRegOff(), dstRgn->getHorzStride(), dstRgn->getType());

            newDstRgn->setAccRegSel(dstRgn->getAccRegSel());
            inst->setDest(newDstRgn);
            IRChanged = true;
        }
    }

    for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
    {
        auto opnd = inst->getSrc(i);

        if (opnd &&
            opnd->getTopDcl())
        {
            auto dcl = opnd->getTopDcl();
            auto it = replaceMap.find(dcl);

            if (it == replaceMap.end())
                continue;

            if (opnd->isSrcRegRegion())
            {
                auto srcRgn = opnd->asSrcRegRegion();
                auto oldRgnDesc = srcRgn->getRegion();

                auto newSrcRgn = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(), Direct,
                    it->second.first->getRegVar(), it->second.second + srcRgn->getRegOff(),
                    srcRgn->getSubRegOff(), oldRgnDesc,
                    opnd->getType());
                newSrcRgn->setAccRegSel(srcRgn->getAccRegSel());
                inst->setSrc(newSrcRgn, i);
                IRChanged = true;
            }
        }
    }

    return IRChanged;
}

bool CoalesceSpillFills::allSpillsSameVar(std::list<INST_LIST_ITER>& spills)
{
    // Return true if all vars in spills list have same dcl
    G4_Declare* dcl = nullptr;
    for (auto s : spills)
    {
        auto topdcl = (*s)->getSrc(1)->getTopDcl();

        if (!dcl)
            dcl = topdcl;

        if (topdcl != dcl)
        {
            return false;
        }
    }

    // Allow only if all dcls are defined by same send
    if (sendDstDcl.find(dcl) != sendDstDcl.end())
        return true;

    return false;
}

void CoalesceSpillFills::fills()
{
    // Iterate over all BBs, find fills that are close by and coalesce
    // a bunch of them. Insert movs as required.
    for (auto bb : kernel.fg)
    {
        auto endIter = bb->end();
        std::list<INST_LIST_ITER> fillsToCoalesce;
        std::list<INST_LIST_ITER> spills;
        INST_LIST_ITER startIter = bb->begin();
        unsigned int w = 0;
        const auto& splitInsts = LoopVarSplit::getSplitInsts(&gra, bb);
        for (auto instIter = startIter;
            instIter != endIter;)
        {
            auto inst = (*instIter);

            if (inst->isPseudoKill() ||
                inst->isLabel())
            {
                instIter++;
                continue;
            }

            if (splitInsts.find(inst) != splitInsts.end())
            {
                // If inst was emitted by the loop split transformation,
                // then don't optimize it. Such instructions are
                // emitted in the loop preheader/loop exit. If a split
                // variable spills, we need to erase all fills and
                // spills emitted for that split. If we coalesce
                // 2 fills from a split sequence, we may end up with a
                // fill loading data that won't be used. For example,
                //
                // (W) mov (8|M0) SPLIT1 V10
                // (W) mov (8|M0) SPLIT2 V11
                // ==>
                //
                // (W) fill FILL_V10 @ 0x0
                // (W) mov (8|M0) SPLIT1 FILL_V10
                // (W) fill FILL_V11 @ 0x32
                // (W) mov (8|M0) SPLIT2 FILL_V11
                //
                // We need to be able to erase the 2nd fill and copy in
                // case SPLIT2 spills. This cannot be done if the 2
                // fills are coalesced.

                instIter++;
                continue;
            }

            if (inst->isSpillIntrinsic())
            {
                spills.push_back(instIter);
            }
            else if (inst->isFillIntrinsic())
            {
                // Check if coalescing is possible
                if (fillsToCoalesce.size() == 0)
                {
                    w = 0;
                    startIter = instIter;
                    spills.clear();
                }

                if (!overlap(*instIter, spills))
                {
                    fillsToCoalesce.push_back(instIter);
                }
            }

            if (fillsToCoalesce.size() > 0 &&
                rpe.getRegisterPressure(inst) > fillWindowSizeThreshold)
            {
                // High register pressure region so reduce window size to 3
                w = (cWindowSize - w > 3) ? cWindowSize - 3 : w;
            }

            if (w == cWindowSize || inst == bb->back())
            {
                if (fillsToCoalesce.size() > 1)
                {
                    instIter = analyzeFillCoalescing(fillsToCoalesce, startIter, instIter, bb);
                }
                else if (w == cWindowSize)
                {
                    startIter = instIter;
                }
                else if (inst == bb->back())
                {
                    break;
                }

                w = 0;
                fillsToCoalesce.clear();
                spills.clear();

                continue;
            }

            if (fillsToCoalesce.size() > 0)
            {
                w++;
            }

            instIter++;
        }

        // One pass to replace old fills with coalesced dcl
        for (auto instIt = bb->begin();
            instIt != bb->end();
            )
        {
            auto inst = (*instIt);

            if (inst->isPseudoKill() &&
                replaceMap.find(inst->getDst()->getTopDcl()) != replaceMap.end())
            {
                instIt = bb->erase(instIt);
                continue;
            }

            while (replaceCoalescedOperands(inst))
            {
                // replaceCoalescedOperands() updates references to old
                // dcls in IR that are coalesced. Having a single pass to
                // replace operands is insufficient if a coalesced range
                // gets re-coalesced. This can happen rarely when the window
                // heuristic hits a boundary condition.
            }
            instIt++;
        }
    }
}

void CoalesceSpillFills::populateSendDstDcl()
{
    // Find and store all G4_Declares that are dest in sends
    // and are spilled. This is required when coalescing
    // scratch writes for such spills. We cannot mix coalescing
    // for G4_Declares from one send with those from other
    // instructions. Otherwise register pressure increases
    // significantly.
    for (auto bb : kernel.fg)
    {
        for (auto inst : *bb)
        {
            if (inst->isSend() &&
                inst->getDst())
            {
                if (!inst->getDst()->isNullReg())
                {
                    if (!inst->getMsgDesc()->isScratch())
                    {
                        auto topdcl = inst->getDst()->getTopDcl();

                        sendDstDcl.insert(topdcl);
                    }
                }
            }
            else if (inst->isSpillIntrinsic() &&
                inst->getSrc(1)->getBase()->asRegVar()->isRegVarCoalesced())
            {
                auto topdcl = inst->getSrc(1)->getTopDcl();

                sendDstDcl.insert(topdcl);
            }
        }
    }
}

void CoalesceSpillFills::spills()
{
    populateSendDstDcl();

    // Iterate over all BBs, find spills that are close by and coalesce
    // a bunch of them. Insert movs as required.
    for (auto bb : kernel.fg)
    {
        auto endIter = bb->end();
        std::list<INST_LIST_ITER> spillsToCoalesce;
        INST_LIST_ITER startIter = bb->begin();
        unsigned int w = 0;
        const auto& splitInsts = LoopVarSplit::getSplitInsts(&gra, bb);
        for (auto instIter = startIter;
            instIter != endIter;)
        {
            auto inst = (*instIter);

            if (inst->isPseudoKill() ||
                inst->isLabel())
            {
                instIter++;
                continue;
            }

            if (splitInsts.find(inst) != splitInsts.end())
            {
                instIter++;
                continue;
            }

            bool earlyCoalesce = false;
            if (inst->isSpillIntrinsic())
            {
                // Check if coalescing is possible
                if (spillsToCoalesce.size() == 0)
                {
                    w = 0;
                    startIter = instIter;
                    spillsToCoalesce.clear();
                }

                for (auto coalIt = spillsToCoalesce.begin();
                    coalIt != spillsToCoalesce.end();
                    )
                {
                    bool fullOverlap = false;
                    if (overlap(*instIter, *(*coalIt), fullOverlap))
                    {
                        if (fullOverlap)
                        {
#if 0
                            printf("Deleting spill at $%d due to %d\n", (*(*coalIt))->getCISAOff(), (*instIter)->getCISAOff());
#endif
                            // Delete the earlier spill since it's made redundant
                            // by the current spill.
                            bb->erase(*coalIt);
                        }

                        coalIt = spillsToCoalesce.erase(coalIt);
                        continue;
                    }
                    coalIt++;
                }
                spillsToCoalesce.push_back(instIter);
            }
            else if (inst->isFillIntrinsic())
            {
                for (auto coalIt = spillsToCoalesce.begin();
                    coalIt != spillsToCoalesce.end();
                    )
                {
                    bool temp = false;
                    if (overlap(*instIter, *(*coalIt), temp))
                    {
#if 1
                        // Instead of deleting scratch writes try coalescing
                        // at this point. This way maybe the fill can also
                        // be cleaned up in a later phase.
                        earlyCoalesce = true;
                        break;
#else
                        coalIt = spillsToCoalesce.erase(coalIt);
                        continue;
#endif
                    }
                    coalIt++;
                }
            }

            if (spillsToCoalesce.size() > 0 &&
                rpe.getRegisterPressure(inst) > spillWindowSizeThreshold)
            {
                if (!allSpillsSameVar(spillsToCoalesce))
                {
                    // High register pressure region so reduce window size to 3
                    w = (cWindowSize - w > 3) ? cWindowSize - 3 : w;
                }
                else
                {
#if 0
                    printf("Found register pressure = %d at %d. Still coalescing spills because all spills are from same var.\n",
                        rpe.getRegisterPressure(inst), inst->getCISAOff());
#endif
                }
            }

            if (w == cWindowSize || inst == bb->back() ||
                earlyCoalesce)
            {
                if (spillsToCoalesce.size() > 1)
                {
                    instIter = analyzeSpillCoalescing(spillsToCoalesce, startIter, instIter, bb);
                }
                else if (w == cWindowSize)
                {
                    startIter = instIter;
                }
                else if (inst == bb->back())
                {
                    break;
                }

                w = 0;
                spillsToCoalesce.clear();
                continue;
            }

            if (spillsToCoalesce.size() > 0)
            {
                w++;
            }

            instIter++;
        }

        // One pass to replace old fills with coalesced dcl
        for (auto instIt = bb->begin();
            instIt != bb->end();
            )
        {
            auto inst = (*instIt);

            if (inst->isPseudoKill() &&
                replaceMap.find(inst->getDst()->getTopDcl()) != replaceMap.end())
            {
                instIt = bb->erase(instIt);
                continue;
            }

            while (replaceCoalescedOperands(inst))
            {
                // replaceCoalescedOperands() updates references to old
                // dcls in IR that are coalesced. Having a single pass to
                // replace operands is insufficient if a coalesced range
                // gets re-coalesced. This can happen rarely when the window
                // heuristic hits a boundary condition.
            }
            instIt++;
        }
    }
}

void CoalesceSpillFills::fixSendsSrcOverlap()
{
    // Overlap between sends src operands is not allowed.
    //
    // Fix for following code pattern after spill/fill coalescing:
    // send (16) COAL_FILL_373(0,0)<1>:ud r0 0xa 0x24c2001:ud{Align1, NoMask} // #??:$365:%657:&-1 // scratch read, resLen=4, msgLen=1
    // sends(1) null:ud COAL_FILL_373(0, 0) COAL_FILL_373(1, 0) 0x4c : ud 0x40680ff : ud{ Align1, Q1, NoMask } // #??:$365:%365:&-1 // a64 scattered write, resLen = 0, msgLen = 2, extMsgLen = 1
    //
    // for CISA:
    // svm_scatter.1.1 (M1_NM, 1) V441.0 V449.0 /// $365
    //
    // where V441 and V449 are scalars of type :uq and :ud respectively
    //
    for (auto bb : kernel.fg)
    {
        for (auto instIt = bb->begin();
            instIt != bb->end();
            instIt++)
        {
            auto inst = (*instIt);

            if (!inst->isSplitSend())
            {
                continue;
            }

            auto src0 = inst->getSrc(0);
            auto src1 = inst->getSrc(1);

            if (src0->getTopDcl() == src1->getTopDcl())
            {
                auto lb0 = src0->getLeftBound();
                auto rb0 = src0->getRightBound();
                auto lb1 = src1->getLeftBound();
                auto rb1 = src1->getRightBound();

                if ((lb0 < lb1 && rb0 > lb1) ||
                    (lb1 < lb0 && rb1 > lb0))
                {
                    // Ideally we should create a copy of the
                    // operand with the smaller number of GRFs,
                    // but this is a really corner case
                    // and probably shows up only for
                    // force spills. So we simply choose
                    // src1 of sends.
                    const char* dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
                        "COPY_%d", kernel.Declares.size());
                    G4_Declare* copyDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
                        numEltPerGRF<Type_UD>(), src1->getTopDcl()->getNumRows(),
                        Type_UD);

                    unsigned int elems = copyDcl->getNumElems();
                    short row = 0;
                    while (elems > 0)
                    {
                        G4_SrcRegRegion* srcRgn = kernel.fg.builder->createSrc(src1->getTopDcl()->getRegVar(), row, 0,
                            kernel.fg.builder->getRegionStride1(), Type_UD);
                        G4_DstRegRegion* dstRgn = kernel.fg.builder->createDst(
                            copyDcl->getRegVar(), row, 0, 1, Type_UD);
                        G4_INST* copyInst = kernel.fg.builder->createMov(
                            g4::SIMD8, dstRgn, srcRgn, InstOpt_WriteEnable, false);
                        bb->insertBefore(instIt, copyInst);
                        elems -= 8;
                        row++;
                    }

                    G4_SrcRegRegion* sendSrc1 = kernel.fg.builder->createSrc(copyDcl->getRegVar(), 0, 0,
                        kernel.fg.builder->getRegionStride1(), Type_UD);
                    inst->setSrc(sendSrc1, 1);
                }
            }
        }
    }
}

void CoalesceSpillFills::removeRedundantSplitMovs()
{
    // send (8) V_SAMPLER -- resLen = 3
    // COAL_0(0,0) = V_SAMPLE(0,0)
    // COAL_0(1,0) = V_SAMPLE(1,0)
    // send (8) <null> COAL_0(0,0) <-- len = 2
    // TV0(0,0) = V_SAMPLE(2,0)
    // ===>
    // send (8) V_SAMPLER -- resLen = 3
    // send (8) <null> V_SAMPLE(0,0) <-- len = 2
    // TV0(0,0) = V_SAMPLE(2,0)

    // Look for scratch writes. Src1 is the data to write to memory.
    // Iterate in bottom-up order to check whether raw movs exist
    // that define src1 of the scratch write and whether their
    // source operands are consecutive.

    // Store numUses for dcls replaced and location of defs.
    // This structure is used to eliminate redundant movs
    // later.
    typedef std::pair<G4_BB*, INST_LIST_ITER> MovLoc;
    typedef unsigned int NumRefs;
    std::map<G4_Declare*, std::pair<NumRefs, std::list<MovLoc>>> movs;

    for (auto bb : kernel.fg)
    {
        // Store all dcls defined by non scratch sends
        // as only they are candidates for this pass.
        // Without this, we might end up identifying
        // other raw movs coming from a partial write like:
        // add (8) r8.0<1>:q r20.0<4;4,1>:q r4.0<0;1,0>:ud {Align1, Q1}
        // send(16) r27.0<1>:uw r26 0xa 0x22c1000 : ud{ Align1, NoMask } // scratch read, fill, offset = 0, resLen=2, msgLen=1
        // mov(8) r27.0<1> : q r8.0<4; 4, 1> : q{ Align1, Q1 }
        // sends(16) null : uw r26 r27 0x8a : ud 0x20f1000 : ud{ Align1, NoMask } // scratch write, spill, offset = 0, resLen=0, msgLen=1, extMsgLen=2
        //
        // Although there is a raw mov before the scratch write,
        // it has to be preserved for correctness.
        std::set<G4_Declare*> sendDst;
        for (auto inst : *bb)
        {
            if (inst->isSend() &&
                inst->getDst() &&
                !inst->getDst()->isNullReg() &&
                !inst->getMsgDesc()->isScratchRead() &&
                !inst->getMsgDesc()->isScratchWrite())
            {
                sendDst.insert(inst->getDst()->getTopDcl());
            }
        }

        for (auto instIt = bb->begin(), endIt = bb->end();
            instIt != endIt;
            instIt++)
        {
            auto inst = (*instIt);

            if (inst->isSpillIntrinsic())
            {
                // Spill sends
                auto src1Dcl = inst->getSrc(1)->getTopDcl();
                unsigned int lb = inst->getSrc(1)->getLeftBound();
                unsigned int rb = inst->getSrc(1)->getRightBound();
                std::set<unsigned int> rows;
                for (unsigned int k = lb / numEltPerGRF<Type_UB>(); k != (rb + numEltPerGRF<Type_UB>() - 1) / numEltPerGRF<Type_UB>(); k++)
                {
                    rows.insert(k);
                }
                auto tmpIt = instIt;
                tmpIt--;
                G4_Declare* srcDcl = nullptr;
                std::map<unsigned int, unsigned int> dstSrcRowMapping;
                std::list<MovLoc> copies;
                while (tmpIt != bb->begin())
                {
                    auto pInst = (*tmpIt);

                    // Each copy should be a raw mov
                    if (!pInst->isRawMov())
                        break;

                    // Ensure src0 topdcl comes from a send dst in this BB
                    if (sendDst.find(pInst->getSrc(0)->getTopDcl()) ==
                        sendDst.end())
                        break;

                    // Check whether dcls match
                    auto pDstDcl = pInst->getDst()->getTopDcl();
                    if (pDstDcl != src1Dcl)
                        break;

                    unsigned int plb = pInst->getDst()->getLeftBound();
                    unsigned int prb = pInst->getDst()->getRightBound();

                    // Check whether complete row(s) defined
                    if ((prb - plb + 1) % numEltPerGRF<Type_UB>() != 0)
                        break;

                    unsigned int rowStart = plb / numEltPerGRF<Type_UB>();
                    unsigned int numRows = (prb - plb + 1) / numEltPerGRF<Type_UB>();
                    bool punt = false;
                    for (unsigned int k = rowStart; k != (rowStart + numRows); k++)
                    {
                        if (rows.find(k) == rows.end())
                        {
                            punt = true;
                            break;
                        }
                        dstSrcRowMapping.insert(std::make_pair(k, INT_MAX));
                    }

                    if (punt)
                        break;

                    auto pSrc0 = pInst->getSrc(0);
                    if (!pSrc0->isSrcRegRegion())
                        break;

                    auto pSrcDcl = pSrc0->getTopDcl();
                    if (!srcDcl)
                        srcDcl = pSrcDcl;
                    else if (srcDcl != pSrcDcl)
                        break;

                    // mov src should be GRF aligned
                    if (pSrc0->getLeftBound() % numEltPerGRF<Type_UB>() != 0)
                        break;

                    unsigned int src0lb = pSrc0->getLeftBound();
                    unsigned int src0rb = pSrc0->getRightBound();

                    // (rb - lb) should match dst (rb - lb)
                    if ((src0rb - src0lb) != (prb - plb))
                        break;

                    unsigned int pStartRow = pSrc0->getLeftBound() / numEltPerGRF<Type_UB>();
                    for (unsigned int k = rowStart; k != (rowStart + numRows); k++)
                    {
                        auto it = dstSrcRowMapping.find(k);
                        if (it == dstSrcRowMapping.end())
                        {
                            punt = true;
                            break;
                        }

                        it->second = pStartRow + (k - rowStart);
                    }

                    if (punt)
                        break;

                    copies.push_back(std::make_pair(bb, tmpIt));
                    tmpIt--;
                }

                if (dstSrcRowMapping.size() > 0)
                {
                    // Now check whether each entry of src1 has a corresponding src offset
                    unsigned int dstRowStart = lb / numEltPerGRF<Type_UB>();
                    bool success = true;
                    auto baseIt = dstSrcRowMapping.find(dstRowStart);
                    if (baseIt != dstSrcRowMapping.end())
                    {
                        auto base = dstSrcRowMapping.find(dstRowStart)->second;
                        for (auto m : dstSrcRowMapping)
                        {
                            unsigned int curRow = m.first - dstRowStart;
                            if (m.second == INT_MAX)
                            {
                                success = false;
                                break;
                            }

                            if (m.second != (base + curRow))
                            {
                                success = false;
                                break;
                            }
                        }

                        if (success && srcDcl)
                        {
                            // Replace src1 of send with srcDcl
                            G4_SrcRegRegion* sendSrc1 = kernel.fg.builder->createSrc(srcDcl->getRegVar(),
                                base, 0, kernel.fg.builder->getRegionStride1(), inst->getSrc(1)->getType());
                            inst->setSrc(sendSrc1, 1);

                            for (auto c : copies)
                            {
                                auto defDcl = (*c.second)->getDst()->getTopDcl();
                                auto it = movs.find(defDcl);
                                if (it == movs.end())
                                {
                                    std::list<MovLoc> t;
                                    t.push_back(c);
                                    movs.insert(std::make_pair(defDcl, std::make_pair(0, t)));
                                }
                                else
                                {
                                    it->second.second.push_back(c);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    // Update number of uses of each dcl
    for (auto bb : kernel.fg)
    {
        for (auto instIt = bb->begin(), endIt = bb->end();
            instIt != endIt; instIt++)
        {
            auto inst = (*instIt);

            if (inst->isPseudoKill())
            {
                auto dcl = inst->getDst()->getTopDcl();
                auto it = movs.find(dcl);
                if (it != movs.end())
                {
                    it->second.second.push_back(std::make_pair(bb, instIt));
                }
            }

            for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
            {
                G4_Operand* opnd = inst->getSrc(i);

                if (opnd &&
                    opnd->getTopDcl())
                {
                    auto it = movs.find(opnd->getTopDcl());
                    if (it != movs.end())
                        it->second.first++;
                }
            }
        }
    }

    for (auto mov : movs)
    {
        auto dcl = mov.first;
        auto numRefs = mov.second.first;
        auto& allMovs = mov.second.second;

        if (numRefs == 0 && !dcl->getAddressed())
        {
#if 0
            printf("Removing movs/pseudoKill for dcl %s\n", dcl->getName());
#endif
            for (auto m : allMovs)
            {
                auto bb = m.first;
                auto iter = m.second;
#if 0
                printf("\tFound %s occurrence at $%d\n", (*iter)->opcode() == G4_mov ? "mov" : "pseudokill", (*iter)->getCISAOff());
#endif
                bb->erase(iter);
            }
        }
    }
}

void CoalesceSpillFills::spillFillCleanup()
{
    // Eliminate redundant fills when a write
    // is close by:
    //
    // spill TV1 at offset = 1
    // ..
    // ..
    // ..
    // fill FP1 from offset = 1
    // = FP1
    // ===>
    // Remove fill and replace each occurrence of FP1 with TV1
    //

    std::map<unsigned int, G4_INST*> writesPerOffset;
    std::set<G4_Declare*> defs;
    for (auto bb : kernel.fg)
    {
        auto startIt = bb->begin();
        auto endIt = bb->end();
        const auto& splitInsts = LoopVarSplit::getSplitInsts(&gra, bb);
        for (auto instIt = startIt;
            instIt != endIt;
            instIt++)
        {
            auto inst = (*instIt);

            if (splitInsts.find(inst) != splitInsts.end())
                continue;

            if (inst->isFillIntrinsic())
            {
                writesPerOffset.clear();
                defs.clear();

                // Store offset, spill inst pair
                unsigned int rowStart, numRows;
                getScratchMsgInfo(inst, rowStart, numRows);
                unsigned int lastRow = rowStart + numRows - 1;

                // Scan the window of instructions above current inst
                // to check whether all rows read by current inst
                // have been written.
                auto pInstIt = instIt;
                pInstIt--;
                unsigned int w = cSpillFillCleanupWindowSize;
                while (pInstIt != startIt &&
                    w > 0)
                {
                    auto pInst = (*pInstIt);

                    if (pInst->isSpillIntrinsic())
                    {
                        unsigned int pRowStart, pNumRows;
                        getScratchMsgInfo(pInst, pRowStart, pNumRows);

                        // If any def of the src1 dcl is found then don't
                        // consider this write for optimization. Its
                        // value in memory could be different from the
                        // one held in the variable.
                        auto pSrc1Dcl = pInst->getSrc(1)->getTopDcl();
                        if (defs.find(pSrc1Dcl) != defs.end())
                        {
                            // When a WAR is detected, punt out because
                            // any older spill is going to be stale.
                            // An optimization here is to detect the variable
                            // row on which the WAR occurs. If the row is
                            // different from the current fill, then it
                            // should be safe to ignore pInst and continue
                            // till the end of the window.
                            pInstIt--;
                            break;
                        }

                        for (unsigned int pRow = pRowStart;
                            pRow != (pRowStart + pNumRows);
                            pRow++)
                        {
                            auto writeIt = writesPerOffset.find(pRow);

                            // Check whether a more recent write was found for this row
                            if (writeIt != writesPerOffset.end())
                                continue;

                            writesPerOffset.insert(std::make_pair(pRow, pInst));
                        }
                    }

                    if (pInst->getDst() &&
                        pInst->getDst()->getTopDcl())
                    {
                        // Store any defs seen to handle WAR
                        defs.insert(pInst->getDst()->getTopDcl());
                    }

                    w--;
                    pInstIt--;
                }

                // Check whether writes for all rows were found
                bool found = true;
                for (auto row = rowStart; row <= lastRow; row++)
                {
                    if (writesPerOffset.find(row) == writesPerOffset.end())
                    {
                        found = false;
                        break;
                    }
                }

                if (!found)
                {
                    continue;
                }

                // Writes for all rows found
                G4_ExecSize execSize = kernel.getSimdSize() > g4::SIMD16 ? g4::SIMD16 : kernel.getSimdSize();

                for (auto row = rowStart; row <= lastRow;)
                {
                    if (execSize == g4::SIMD16 && row == lastRow)
                    {
                        // In case of odd rows in SIMD16
                        execSize = g4::SIMD8;
                    }
                    else if (execSize == g4::SIMD16)
                    {
                        // In a SIMD16 kernel, 2 consecutive rows should come from the same spill
                        if (writesPerOffset.find(row)->second != writesPerOffset.find(row + 1)->second)
                        {
                            execSize = g4::SIMD8;
                        }
                    }
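
                    // Illustrative example with hypothetical rows: for a fill
                    // of rows 3..5, rows 3-4 can be copied with one SIMD16 mov
                    // only if both rows were written by the same spill; row 5,
                    // the odd last row, then drops to a SIMD8 mov.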

                    auto type = Type_UD;
                    // In spill cleanup, all units are in hword units independent of platform.
                    // For PVC, GRF size is 2x of Gen9. Correction for PVC is postponed to code
                    // generation time when translating spill/fill intrinsics to actual sends.
                    // Since we're emitting a mov here, we need to do this correction here.
                    if (getGRFSize() == 64)
                        type = Type_UQ;
                    // Insert SIMD8 mov per row
                    G4_DstRegRegion* nDst = kernel.fg.builder->createDst(
                        inst->getDst()->getBase(), row + inst->getDst()->asDstRegRegion()->getRegOff() - rowStart,
                        0, 1, type);

                    auto write = writesPerOffset.find(row)->second;
                    G4_SrcRegRegion* src1Write = write->getSrc(1)->asSrcRegRegion();
                    unsigned int writeRowStart = write->asSpillIntrinsic()->getOffset();
                    unsigned int diff = row - writeRowStart;
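                    // diff is how many hword rows into the spill's payload this
                    // fill row's data lives; adding it to src1's register offset
                    // below picks the matching row of the spilled variable.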
                    G4_SrcRegRegion* nSrc = kernel.fg.builder->createSrc(
                        src1Write->getBase(), diff + src1Write->getRegOff(), 0,
                        kernel.fg.builder->getRegionStride1(), type);

                    G4_INST* mov = kernel.fg.builder->createMov(
                        execSize, nDst, nSrc, InstOpt_WriteEnable, false);
                    bb->insertBefore(instIt, mov);

                    row += execSize / 8;
                }

                auto tempIt = instIt;
                tempIt--;
                bb->erase(instIt);
                instIt = tempIt;
            }
        }
    }
}

void CoalesceSpillFills::removeRedundantWrites()
{
    typedef std::list<std::pair<G4_BB*, INST_LIST_ITER>> SPILLS;
    typedef std::list<std::pair<G4_BB*, INST_LIST_ITER>> FILLS;
    std::map<unsigned int, std::pair<SPILLS, FILLS>> scratchOffsetAccess;
    // Traverse bottom-up to detect and remove redundant writes.
    // Redundant writes include:
    // 1. Successive writes to same offset without a fill in between,
    // 2. Writes in program without any fill from that slot throughout
    for (auto bb : kernel.fg)
    {
        auto endIt = bb->end();
        endIt--;
        // Store spill slots that are written to along with the emask used
        std::map<unsigned int, unsigned int> scratchOffToMask;
        for (auto instIt = endIt;
            instIt != bb->begin();
            instIt--)
        {
            auto inst = (*instIt);

            if (inst->isSpillIntrinsic() || inst->isFillIntrinsic())
            {
                unsigned int offset = 0, size = 0;
                if (inst->isFillIntrinsic())
                {
                    getScratchMsgInfo(inst, offset, size);
                    for (unsigned int k = offset; k != (offset + size); k++)
                    {
                        auto it = scratchOffToMask.find(k);
                        if (it != scratchOffToMask.end())
                        {
                            scratchOffToMask.erase(it);
                        }
                    }
                }
                else if (inst->isSpillIntrinsic())
                {
                    getScratchMsgInfo(inst, offset, size);
                    bool allRowsFound = true;
                    unsigned int emask = inst->getMaskOption();
                    for (unsigned int k = offset; k != (offset + size); k++)
                    {
                        auto it = scratchOffToMask.find(k);
                        if (it != scratchOffToMask.end())
                        {
                            if (emask != it->second &&
                                (it->second & InstOpt_WriteEnable) == 0)
                            {
                                allRowsFound = false;
                                break;
                            }
                        }
                        else
                        {
                            allRowsFound = false;
                            break;
                        }
                    }

                    if (allRowsFound)
                    {
#if 0
                        printf("Removing redundant successive write at $%d\n", inst->getCISAOff());
#endif
                        instIt = bb->erase(instIt);
                    }
                    else
                    {
                        unsigned int emask = inst->getOption();
                        for (unsigned int k = offset; k != (offset + size); k++)
                        {
                            scratchOffToMask.insert(std::make_pair(k, emask));
                        }
                    }
                }
            }
        }
    }

    for (auto bb : kernel.fg)
    {
        auto endIt = bb->end();
        for (auto instIt = bb->begin();
            instIt != endIt;
            instIt++)
        {
            auto inst = (*instIt);

            if (inst->isFillIntrinsic() ||
                inst->isSpillIntrinsic())
            {
                unsigned int offset, size;
                getScratchMsgInfo(inst, offset, size);
                bool isRead = inst->isFillIntrinsic();
                for (unsigned int i = offset; i != (offset + size); i++)
                {
                    auto it = scratchOffsetAccess.find(i);
                    if (it != scratchOffsetAccess.end())
                    {
                        if (isRead)
                        {
                            auto& fill = it->second.second;
                            fill.push_back(std::make_pair(bb, instIt));
                        }
                        else
                        {
                            auto& spill = it->second.first;
                            spill.push_back(std::make_pair(bb, instIt));
                        }
                    }
                    else
                    {
                        SPILLS s;
                        FILLS f;
                        if (isRead)
                            f.push_back(std::make_pair(bb, instIt));
                        else
                            s.push_back(std::make_pair(bb, instIt));
                        scratchOffsetAccess.insert(std::make_pair(i, std::make_pair(s, f)));
                    }
                }
            }
        }
    }

    std::map<G4_INST*, std::pair<INST_LIST_ITER, G4_BB*>> spillToRemove;
    for (auto scratchAccess : scratchOffsetAccess)
    {
        if (scratchAccess.second.second.size() == 0 &&
            scratchAccess.second.first.size() > 0)
        {
            // 0 fills for scratch slot
            // Check whether all spill slots have 0 fills
            // in case spills are coalesced.
            for (auto spill : scratchAccess.second.first)
            {
                bool spillRequired = false;
                unsigned int offset, size;
                getScratchMsgInfo(*spill.second, offset, size);

                // Verify that all slots from offset->(offset+size) have 0 fills
                for (unsigned int slot = offset; slot != (offset + size); slot++)
                {
                    auto it = scratchOffsetAccess.find(slot);
                    if (it->second.second.size() != 0)
                    {
                        spillRequired = true;
                        break;
                    }
                }

                if (!spillRequired)
                {
                    spillToRemove.insert(std::make_pair(*spill.second, std::make_pair(spill.second, spill.first)));
                }
            }
        }
        else if (scratchAccess.second.first.size() == 0 &&
            scratchAccess.second.second.size() > 0)
        {
            // 0 spills for scratch slot, non-zero fills
            // Check whether all fill slots have 0 spills
            // in case fills are coalesced.
            for (auto fill : scratchAccess.second.second)
            {
                bool fillRequired = false;
                unsigned int offset, size;
                getScratchMsgInfo(*fill.second, offset, size);

                // Verify that all slots from offset->(offset+size) have 0 spills
                for (unsigned int slot = offset; slot != (offset + size); slot++)
                {
                    auto it = scratchOffsetAccess.find(slot);
                    if (it->second.first.size() != 0)
                    {
                        fillRequired = true;
                        break;
                    }
                }

                if (!fillRequired)
                {
                    spillToRemove.insert(std::make_pair(*fill.second, std::make_pair(fill.second, fill.first)));
                }
            }
        }
    }

    for (auto removeSp : spillToRemove)
    {
        G4_BB* bb = removeSp.second.second;
#if 0
        printf("Removing redundant scratch access at CISA $%d\n", removeSp.first->getCISAOff());
#endif
        bb->erase(removeSp.second.first);
    }
}

void CoalesceSpillFills::run()
{
    removeRedundantSplitMovs();

    fills();
    replaceMap.clear();
    spills();
    replaceMap.clear();
    spillFillCleanup();

    removeRedundantWrites();

    fixSendsSrcOverlap();

    kernel.dumpToFile("after.spillCleanup." + std::to_string(iterNo));
}

void CoalesceSpillFills::dumpKernel()
{
    for (auto bb : kernel.fg)
    {
        for (auto inst : *bb)
        {
            inst->emit(std::cerr);
            std::cerr << "\t$" << inst->getCISAOff() << ", #" << rpe.getRegisterPressure(inst) << "\n";
        }
    }
}

void CoalesceSpillFills::dumpKernel(unsigned int v1, unsigned int v2)
{
    bool start = false, end = false, canEnd = false;
    for (auto bb : kernel.fg)
    {
        if (end)
            break;

        for (auto inst : *bb)
        {
            if (canEnd &&
                inst->getCISAOff() > (int)v2)
            {
                end = true;
                break;
            }

            if (inst->getCISAOff() == v2)
            {
                // This ensures invalid offsets
                // are dumped till v2 is hit.
                canEnd = true;
            }

            if (inst->getCISAOff() == v1)
                start = true;

            if (start && !end)
            {
                inst->dump();
                printf(" // $%d, #%d\n", inst->getCISAOff(), rpe.getRegisterPressure(inst));
            }
        }
    }
}

void CoalesceSpillFills::computeAddressTakenDcls()
{
    for (auto dcl : kernel.Declares)
    {
        auto addrSpillFill = dcl->getAddrTakenSpillFill();
        if (addrSpillFill)
            addrTakenSpillFillDcl.insert(addrSpillFill);
    }
}
}