1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "BuildIR.h"
10 #include "DebugInfo.h"
11 #include "FlowGraph.h"
12 #include "GraphColor.h"
13 #include "LocalRA.h"
14 #include "LinearScanRA.h"
15 #include "Optimizer.h"
16 #include "SCCAnalysis.h"
17 #include "SpillCleanup.h"
18 #include "SpillCode.h"
19 #include "Rematerialization.h"
20 #include "RPE.h"
21 #include "Timer.h"
22 
23 #include <algorithm>
24 #include <cmath>  // sqrt
25 #include <fstream>
26 #include <iostream>
27 #include <list>
28 #include <sstream>
29 #include "SplitAlignedScalars.h"
30 
31 using namespace vISA;
32 
33 #define GRAPH_COLOR_MEM_SIZE 16*1024
34 #define SCRATCH_MSG_LIMIT (128 * 1024)
35 #define FAIL_SAFE_RA_LIMIT 3
36 
37 const RAVarInfo GlobalRA::defaultValues;
38 const char GlobalRA::StackCallStr[] = "StackCall";
39 
40 static const unsigned IN_LOOP_REFERENCE_COUNT_FACTOR = 4;
41 
42 #define BANK_CONFLICT_HEURISTIC_INST   0.04
43 #define BANK_CONFLICT_HEURISTIC_REF_COUNT  0.25
44 #define BANK_CONFLICT_HEURISTIC_LOOP_ITERATION 5
#define BANK_CONFLICT_SEND_INST_CYCLE          60 // Some sends take ~200 cycles, some ~400; we use the smaller value
46 #define BANK_CONFLICT_SIMD8_OVERHEAD_CYCLE     1
47 #define BANK_CONFLICT_SIMD16_OVERHEAD_CYCLE    2
48 #define INTERNAL_CONFLICT_RATIO_HEURISTIC 0.25
49 
50 #define NOMASK_BYTE 0x80
51 
52 
// Interference-graph constructor: caches the liveness results and kernel
// state used while building interference. rowSize is the number of dwords
// needed to hold one row of the interference bit-matrix (maxId bits,
// rounded up via BITS_DWORD).
Interference::Interference(const LivenessAnalysis* l, LiveRange** const & lr, unsigned n, unsigned ns, unsigned nm,
    GlobalRA& g) : gra(g), kernel(g.kernel), lrs(lr),
    builder(*g.kernel.fg.builder), maxId(n), splitStartId(ns), splitNum(nm),
    liveAnalysis(l), rowSize(maxId / BITS_DWORD + 1)
{
}
59 
varSplitCheckBeforeIntf(unsigned v1,unsigned v2) const60 inline bool Interference::varSplitCheckBeforeIntf(unsigned v1, unsigned v2) const
61 {
62     const LiveRange * l1 = lrs[v1];
63     const LiveRange * l2 = lrs[v2];
64 
65     if (!l1->getIsPartialDcl() &&
66         !l2->getIsPartialDcl())
67     {
68         return false;
69     }
70 
71     //Don't do interference for two split declares
72     if (l1->getIsPartialDcl() &&
73         l2->getIsPartialDcl())
74     {
75         return true;
76     }
77 
78     unsigned p1 = v1;
79     unsigned p2 = v2;
80     //Don't do inteference for child and parent delcares
81     if (l1->getIsPartialDcl())
82     {
83         p1 = l1->getParentLRID();
84     }
85 
86     if (l2->getIsPartialDcl())
87     {
88         p2 = l2->getParentLRID();
89     }
90 
91     if (p1 == p2)
92     {
93         return true;
94     }
95 
96     return false;
97 }
98 
setupBankAccordingToSiblingOperand(BankConflict assignedBank,unsigned offset,bool oneGRFBank)99 BankConflict BankConflictPass::setupBankAccordingToSiblingOperand(BankConflict assignedBank, unsigned offset, bool oneGRFBank)
100 {
101     BankConflict tgtBank;
102 
103     MUST_BE_TRUE(assignedBank != BANK_CONFLICT_NONE, "sibling bank is not assigned");
104 
105     //Set according to sibling
106     tgtBank = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN || assignedBank == BANK_CONFLICT_FIRST_HALF_ODD) ?
107         (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_SECOND_HALF_EVEN) :
108         (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN);
109 
110     //Adjust according to the offset
111     if (oneGRFBank)
112     {
113         if (offset % 2)
114         {
115             if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
116                 tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
117             {
118                 tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
119             }
120             else
121             {
122                 tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_EVEN;
123             }
124         }
125     }
126     else
127     {
128         if (offset % 4 >= 2)
129         {
130             if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
131                 tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
132             {
133                 tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
134             }
135             else
136             {
137                 tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_EVEN;
138             }
139         }
140     }
141 
142     return tgtBank;
143 }
144 
// Order source indices 1 and 2 by descending reference count (ties favor
// src1, since the comparison is strict), then place src0 last. Writes
// exactly index[0..2]; refNum[0] is not consulted.
void refNumBasedSort(const unsigned *refNum, unsigned *index)
{
    const bool src2Hotter = refNum[2] > refNum[1];
    index[0] = src2Hotter ? 2u : 1u;
    index[1] = src2Hotter ? 1u : 2u;
    index[2] = 0;
}
162 
hasInternalConflict3Srcs(BankConflict * srcBC)163 bool BankConflictPass::hasInternalConflict3Srcs(BankConflict *srcBC)
164 {
165     if (((srcBC[0] == BANK_CONFLICT_SECOND_HALF_EVEN ||
166         srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
167         (srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
168             srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
169             (srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
170                 srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
171                 ((srcBC[0] == BANK_CONFLICT_SECOND_HALF_ODD ||
172                     srcBC[0] == BANK_CONFLICT_FIRST_HALF_ODD) &&
173                     (srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
174                         srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
175                         (srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
176                             srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD)))
177     {
178         return true;
179     }
180     if ((srcBC[0] < BANK_CONFLICT_SECOND_HALF_EVEN &&
181         srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
182         srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
183         (srcBC[0] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
184             srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
185             srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN))
186     {
187         return true;
188     }
189 
190     return false;
191 }
192 
setupEvenOddBankConflictsForDecls(G4_Declare * dcl_1,G4_Declare * dcl_2,unsigned offset1,unsigned offset2,BankConflict & srcBC1,BankConflict & srcBC2)193 void BankConflictPass::setupEvenOddBankConflictsForDecls(G4_Declare * dcl_1, G4_Declare * dcl_2,
194     unsigned offset1, unsigned offset2,
195     BankConflict &srcBC1, BankConflict &srcBC2)
196 {
197     ASSERT_USER(srcBC1 == BANK_CONFLICT_NONE, "Wrong Bank initial value");
198     ASSERT_USER(srcBC2 == BANK_CONFLICT_NONE, "Wrong Bank initial value");
199 
200     unsigned refNum1 = gra.getNumRefs(dcl_1);
201     unsigned refNum2 = gra.getNumRefs(dcl_2);
202 
203     BankConflict bank1 = BANK_CONFLICT_NONE;
204     BankConflict bank2 = BANK_CONFLICT_NONE;
205 
206     bank1 = (refNum1 >= refNum2) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_ODD;
207     bank2 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
208 
209     srcBC1 = bank1;
210     srcBC2 = bank2;
211 
212     //Adjust only for the single bank allocation
213     if ((offset1 + offset2) % 2)
214     {
215         if (refNum1 >= refNum2)
216         {
217             bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
218         }
219         else
220         {
221             bank1 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
222         }
223     }
224 
225     gra.setBankConflict(dcl_1, bank1);
226     gra.setBankConflict(dcl_2, bank2);
227 
228     return;
229 }
230 
231 
232 //
233 // inst opcode is G4_mad. This function sets up a simple state machine to prevent conflict
234 // between src 1 and 2 of mad inst. Following is how GRF file is divided in to banks:
235 // bank-block A = 0, 2, 4, 6, ..., 62
236 // bank-block B = 1, 3, 5, 7, ..., 63
237 // bank-block C = 64, 66, 68, ..., 126
238 // bank-block D = 65, 67, 69, ..., 127
239 //
240 // For ternary ops, if src1 and src2 are to the same bank then there will be an access collision.
241 // But unary and binary ops will have no collision, no matter what registers they use. The reason
242 // is second and third src operands are read in the same clock cycle, which is different than
// when the src0 operand is read. This is true up to pre-SKL.
244 //
// Bank Conflict Heuristics:
246 // 1. Try to balance the used registers in two banks for the potential conflicted registers.
247 // 2. reference number is used to decide which to be assigned first
248 // 3. When conflict detected, bank can be updated according to the reference count.
249 //
// Pre-RA bank assignment for a mad instruction on one-GRF-per-bank targets.
// Classifies src1/src2 (the operands read in the same cycle) into banks so
// the allocator can steer them apart.
//   bank1RegNum/bank2RegNum: in/out running count of registers steered to
//                            each bank, used to balance pressure.
//   GRFRatio:                bias factor; 1.0 selects the global-RA policy.
//   internalConflict:        out counter, bumped when both operands are
//                            already committed to conflicting banks.
void BankConflictPass::setupBankConflictsOneGRFOld(G4_INST* inst, int &bank1RegNum, int &bank2RegNum, float GRFRatio, unsigned &internalConflict)
{
    BankConflict srcBC[3];
    unsigned regNum[3];
    unsigned refNum[3];
    unsigned offset[3];
    G4_Declare * dcls[3];
    G4_Declare * opndDcls[3];
    int bank_num = 0;

    // Gather per-source declare, size, ref count, GRF offset, and the bank
    // classification recorded so far.
    for (int i = 0; i < 3; i++)
    {
        dcls[i] = nullptr;
        opndDcls[i] = nullptr;

        G4_Operand* src = inst->getSrc(i);
        if (!src || !src->isSrcRegRegion() || src->isAccReg())
        {
            // bank conflict not possible
            return;
        }

        dcls[i] = GetTopDclFromRegRegion(src);
        opndDcls[i] = src->getBase()->asRegVar()->getDeclare();

        regNum[i] = dcls[i]->getNumRows();
        refNum[i] = gra.getNumRefs(dcls[i]);
        offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
        srcBC[i] = gra.getBankConflict(dcls[i]);

        if (src->getBase()->asRegVar()->isPhyRegAssigned())
        {
            // Physical register already known: derive the bank directly from
            // reg + offset parity and which half of the file the reg sits in,
            // and account its rows against that half's pressure counter.
            unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
            if ((reg + offset[i]) < SECOND_HALF_BANK_START_GRF)
            {
                srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
            else
            {
                srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_SECOND_HALF_EVEN;
            }
            if (reg < SECOND_HALF_BANK_START_GRF)
            {
                bank1RegNum += regNum[i];
            }
            else
            {
                bank2RegNum += regNum[i];
            }
            gra.setBankConflict(dcls[i], srcBC[i]);
        }
        else if (srcBC[i] != BANK_CONFLICT_NONE)
        {
            if (offset[i] % 2)
            {
                // Get the operand's bank from the declare's bank: an odd
                // starting offset flips even/odd within the same half.
                if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN ||
                    srcBC[i] == BANK_CONFLICT_FIRST_HALF_ODD)
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                }
                else
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_SECOND_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_SECOND_HALF_EVEN;
                }
            }
        }

        if (i > 0)
        {
            // Accumulate the enum values of src1/src2; bank_num stays 0 only
            // when neither has a bank yet (relies on BANK_CONFLICT_NONE being
            // the zero enumerator — presumably so; verify against the enum).
            bank_num += srcBC[i];
        }
    }

    // In case src1 and src2 share the same declare, i.e. use the same
    // register: give both the bank that currently has more headroom.
    if (bank_num == 0 &&
        dcls[1] == dcls[2])
    {
        BankConflict bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum) ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;

        gra.setBankConflict(dcls[1], bank1);
        srcBC[1] = bank1;
        srcBC[2] = bank1;
        bank_num += bank1 * 2;
        if (bank1 < BANK_CONFLICT_SECOND_HALF_EVEN)
        {
            bank1RegNum += regNum[1];
        }
        else
        {
            bank2RegNum += regNum[1];
        }
    }

    // No bank assigned to src 1, 2:
    // assign the two declares into different bundles/banks.
    if (bank_num == 0)
    {
        BankConflict bank1 = BANK_CONFLICT_NONE;
        BankConflict bank2 = BANK_CONFLICT_NONE;
        bool bank1First = false;
        if (GRFRatio == 1.0)
        {
            // For global RA: try to reduce the size of bank 2. The operand
            // with the higher refs-per-row density gets the second half.
            if ((float)refNum[1] / regNum[1] >= (float)refNum[2] / regNum[2])
            {
                bank1 = BANK_CONFLICT_SECOND_HALF_EVEN;
                bank2 = BANK_CONFLICT_FIRST_HALF_ODD;
                bank1First = true;
            }
            else
            {
                bank2 = BANK_CONFLICT_SECOND_HALF_EVEN;
                bank1 = BANK_CONFLICT_FIRST_HALF_ODD;
            }
        }
        else
        {
            // For local RA: try to balance the two banks, steering the more
            // referenced operand to whichever bank is currently emptier.
            if (refNum[1] >= refNum[2])
            {
                bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum) ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;
                bank2 = (bank1 == BANK_CONFLICT_SECOND_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
                bank1First = true;
            }
            else
            {
                bank2 = (bank1RegNum * GRFRatio) > bank2RegNum ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;
                bank1 = (bank2 == BANK_CONFLICT_SECOND_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
            }
        }

        // Adjust only for the single-bank allocation: flip the second-chosen
        // operand's bank when the combined offsets have odd parity.
        if ((offset[1] + offset[2]) % 2)
        {
            if (bank1First)
            {
                bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_ODD) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_EVEN;
            }
            else
            {
                bank1 = (bank1 == BANK_CONFLICT_SECOND_HALF_ODD) ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
        }

        // Account the rows of both operands against the halves they landed in.
        if (bank1 >= BANK_CONFLICT_SECOND_HALF_EVEN)
        {
            bank2RegNum += regNum[1];
            bank1RegNum += regNum[2];
        }
        else
        {
            bank1RegNum += regNum[1];
            bank2RegNum += regNum[2];
        }

        gra.setBankConflict(dcls[1], bank1);
        gra.setBankConflict(dcls[2], bank2);
    }
    else
    {
        if (srcBC[1] == BANK_CONFLICT_NONE || srcBC[2] == BANK_CONFLICT_NONE)
        {
            // One source operand is assigned a bank already: derive the
            // other's from it via the sibling rule.
            if (srcBC[2] == BANK_CONFLICT_NONE)
            {
                srcBC[2] = setupBankAccordingToSiblingOperand(srcBC[1], offset[2], true);
                gra.setBankConflict(dcls[2], srcBC[2]);

                if (srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN)
                    bank1RegNum += regNum[2];
                else
                    bank2RegNum += regNum[2];
            }
            else
            {
                srcBC[1] = setupBankAccordingToSiblingOperand(srcBC[2], offset[1], true);
                gra.setBankConflict(dcls[1], srcBC[1]);
                if (srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN)
                    bank1RegNum += regNum[1];
                else
                    bank2RegNum += regNum[1];
            }
        }
        else if (dcls[1] != dcls[2])
        {
            // Both operands already committed: count conflicts (same parity,
            // and/or same half) for the caller's heuristic.
            if (((srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
                srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
                (srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
                    srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
                    ((srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
                        srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
                        (srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
                            srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD)))
            {
                internalConflict++;
            }
            if ((srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
                srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
                (srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
                    srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN))
            {
                internalConflict++;
            }
        }
    }

#ifdef DEBUG_VERBOSE_ON
    for (int i = 0; i < 3; i++)
    {
        if (opndDcls[i])
        {
            printf("%s, %s\n", opndDcls[i]->getName(), dcls[i]->getBankConflict() > 2 ?
                (dcls[i]->getBankConflict() == BANK_CONFLICT_SECOND_HALF_EVEN ? "HIGH_EVEN" : "HIGH_ODD") :
                dcls[i]->getBankConflict() > 0 ?
                (dcls[i]->getBankConflict() == BANK_CONFLICT_FIRST_HALF_EVEN ? "LOW_EVEN" : "LOW_ODD") : "NONE");
        }
    }
    printf("Bank1 number: %d; Bank2 number: %d\n", bank1RegNum, bank2RegNum);
#endif

    return;
}
473 
getBanks(G4_INST * inst,BankConflict * srcBC,G4_Declare ** dcls,G4_Declare ** opndDcls,unsigned * offset)474 void BankConflictPass::getBanks(G4_INST* inst, BankConflict *srcBC, G4_Declare **dcls, G4_Declare **opndDcls, unsigned *offset)
475 {
476     for (int i = 0; i < 3; i++)
477     {
478         dcls[i] = nullptr;
479         opndDcls[i] = nullptr;
480         srcBC[i] = BANK_CONFLICT_NONE;
481 
482         G4_Operand* src = inst->getSrc(i);
483         if (!src || !src->isSrcRegRegion() || src->isAccReg())
484         {
485             return;
486         }
487 
488         dcls[i] = GetTopDclFromRegRegion(src);
489         if (!dcls[i])
490         {
491             continue;
492         }
493         opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
494 
495         offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
496         srcBC[i] = gra.getBankConflict(dcls[i]);
497 
498         if (src->getBase()->asRegVar()->isPhyRegAssigned())
499         {
500             unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
501             srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
502         }
503         else if (srcBC[i] != BANK_CONFLICT_NONE)
504         {
505             if (offset[i] % 2)
506             {
507                 if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
508                 {
509                     srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
510                 }
511                 else
512                 {
513                     srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
514                 }
515             }
516         }
517     }
518 
519     return;
520 }
521 
getPrevBanks(G4_INST * inst,BankConflict * srcBC,G4_Declare ** dcls,G4_Declare ** opndDcls,unsigned * offset)522 void BankConflictPass::getPrevBanks(G4_INST* inst, BankConflict *srcBC, G4_Declare **dcls, G4_Declare **opndDcls, unsigned *offset)
523 {
524     int execSize[G4_MAX_SRCS];
525 
526     for (int i = 1; i < 3; i++)
527     {
528         dcls[i] = nullptr;
529         opndDcls[i] = nullptr;
530         srcBC[i] = BANK_CONFLICT_NONE;
531 
532         G4_Operand* src = inst->getSrc(i);
533         if (!src || !src->isSrcRegRegion())
534         {
535             return;
536         }
537         dcls[i] = GetTopDclFromRegRegion(src);
538         if (dcls[i]->getRegFile() != G4_GRF)
539         {
540             return;
541         }
542         execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;
543 
544         opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
545 
546         offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
547         srcBC[i] = gra.getBankConflict(dcls[i]);
548 
549         if (src->getBase()->asRegVar()->isPhyRegAssigned())
550         {
551             unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
552             srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
553         }
554         else if (srcBC[i] != BANK_CONFLICT_NONE)
555         {
556             if (offset[i] % 2)
557             {
558                 if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
559                 {
560                     srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
561                 }
562                 else
563                 {
564                     srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
565                 }
566             }
567         }
568         if (execSize[i] > 32)
569         {
570             srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
571         }
572     }
573 
574     return;
575 }
576 
577 
578 
setupBankForSrc0(G4_INST * inst,G4_INST * prevInst)579 void BankConflictPass::setupBankForSrc0(G4_INST* inst, G4_INST* prevInst)
580 {
581     BankConflict srcBC[3];
582     G4_Declare * dcls[3];
583     G4_Declare * opndDcls[3];
584     unsigned offset[3];
585 
586     BankConflict prevSrcBC[3];
587     G4_Declare * prevDcls[3];
588     G4_Declare * prevOpndDcls[3];
589     unsigned prevOffset[3];
590 
591     if (prevInst->isSend() ||
592         prevInst->isMath())
593     {
594         return;
595     }
596 
597     getBanks(inst, srcBC, dcls, opndDcls, offset);
598     getPrevBanks(prevInst, prevSrcBC, prevDcls, prevOpndDcls, prevOffset);
599 
600     if (dcls[0] != nullptr &&
601         srcBC[0] == BANK_CONFLICT_NONE &&
602         prevSrcBC[1] != BANK_CONFLICT_NONE &&
603         prevSrcBC[2] != BANK_CONFLICT_NONE)
604     {
605         if (prevSrcBC[1] == prevSrcBC[2])
606         {
607             if (prevSrcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN)
608             {
609                 srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_ODD;
610             }
611             else
612             {
613                 srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
614             }
615 
616             gra.setBankConflict(dcls[0], srcBC[0]);
617         }
618     }
619 
620     return;
621 }
622 
setupBankConflictsforTwoGRFs(G4_INST * inst)623 void BankConflictPass::setupBankConflictsforTwoGRFs(G4_INST* inst)
624 {
625     BankConflict srcBC[3];
626     unsigned refNum[3];
627     unsigned offset[3];
628     G4_Declare * dcls[3];
629     G4_Declare * opndDcls[3];
630     int bank_num = 0;
631     int execSize[3];
632 
633     for (int i = 0; i < 3; i++)
634     {
635         dcls[i] = nullptr;
636         opndDcls[i] = nullptr;
637         execSize[i] = 0;
638 
639         G4_Operand* src = inst->getSrc(i);
640         if (!src || !src->isSrcRegRegion() || src->isAccReg())
641         {
642             // bank conflict not possible
643             return;
644         }
645         execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;
646 
647         dcls[i] = GetTopDclFromRegRegion(src);
648         opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
649 
650         refNum[i] = gra.getNumRefs(dcls[i]);
651         offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
652         srcBC[i] = gra.getBankConflict(dcls[i]);
653 
654         if (src->getBase()->asRegVar()->isPhyRegAssigned())
655         {
656             unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
657             srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
658             gra.setBankConflict(dcls[i], srcBC[i]);
659         }
660         else if (srcBC[i] != BANK_CONFLICT_NONE)
661         {
662             if (offset[i] % 2)
663             {
664                 if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
665                 {
666                     srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
667                 }
668                 else
669                 {
670                     srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
671                 }
672             }
673         }
674         if (i != 0)
675         {
676             bank_num += srcBC[i];
677         }
678     }
679 
680     int simd8SrcNum = 0;
681     for (int i = 0; i < 3; i++)
682     {
683         if (execSize[i] <= 32)
684         {
685             simd8SrcNum++;
686         }
687     }
688 
689     //In case (src0) src1 and src2 use same declare, i.e. use same regsiter
690     if ((dcls[0] == dcls[1]) && (dcls[1] == dcls[2]))
691     {
692         return;
693     }
694 
695     //No bank assigned to src operands,
696     //assign the two delcares into different bundles/banks.
697     if (simd8SrcNum <= 1)  //All simd16, do even align
698     {
699         for (int i = 0; i < 3; i++)
700         {
701             if (execSize[i] > 32)
702             {
703                 srcBC[i] = offset[i] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
704                 gra.setBankConflict(dcls[i], srcBC[i]);
705             }
706         }
707     }
708     else if (bank_num == 0)
709     {
710         unsigned index[3];
711 
712         refNumBasedSort(refNum, index);
713 
714         if (dcls[index[0]] != dcls[index[1]])
715         {
716             setupEvenOddBankConflictsForDecls(dcls[index[0]], dcls[index[1]],
717                 offset[index[0]], offset[index[1]],
718                 srcBC[index[0]], srcBC[index[1]]);
719         }
720     }
721     else
722     {
723         if (srcBC[1] != BANK_CONFLICT_NONE)
724         {
725             srcBC[2] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
726             if (offset[2] % 2)
727             {
728                 srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
729             }
730             gra.setBankConflict(dcls[2], srcBC[2]);
731         }
732         else
733         {
734             srcBC[1] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
735             if (offset[1] % 2)
736             {
737                 srcBC[1] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
738             }
739             gra.setBankConflict(dcls[1], srcBC[1]);
740         }
741     }
742 
743 #ifdef DEBUG_VERBOSE_ON
744     for (int i = 0; i < 3; i++)
745     {
746         if (opndDcls[i])
747         {
748             printf("%s, %s\n", opndDcls[i]->getName(), dcls[i]->getBankConflict() > 2 ?
749                 (dcls[i]->getBankConflict() == BANK_CONFLICT_SECOND_HALF_EVEN ? "HIGH_EVEN" : "HIGH_ODD") :
750                 dcls[i]->getBankConflict() > 0 ?
751                 (dcls[i]->getBankConflict() == BANK_CONFLICT_FIRST_HALF_EVEN ? "LOW_EVEN" : "LOW_ODD") : "NONE");
752         }
753     }
754     printf("Bank1 number: %d; Bank2 number: %d\n", bank1RegNum, bank2RegNum);
755 #endif
756 
757     return;
758 }
759 
isOddOffset(unsigned offset) const760 bool BankConflictPass::isOddOffset(unsigned offset) const
761 {
762     if (gra.kernel.fg.builder->oneGRFBankDivision())
763     {
764         return (offset % 2);
765     }
766     else
767     {
768         return ((offset % 4) / 2);
769     }
770 }
771 
setupBankConflictsforDPAS(G4_INST * inst)772 void BankConflictPass::setupBankConflictsforDPAS(G4_INST* inst)
773 {
774     BankConflict srcBC[3];
775     unsigned refNum[3];
776     unsigned offset[3];
777     G4_Declare * dcls[3];
778     G4_Declare * opndDcls[3];
779     int bank_num = 0;
780 
781     if (!inst->isDpas())
782     {
783         return;
784     }
785 
786 
787     for (int i = 0; i < 3; i += 1)
788     {
789         opndDcls[i] = nullptr;
790 
791         G4_Operand* src = inst->getSrc(i);
792 
793         dcls[i] = GetTopDclFromRegRegion(src);
794         if (dcls[i])
795         {
796             opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
797 
798             refNum[i] = gra.getNumRefs(dcls[i]);
799             offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
800             srcBC[i] = gra.getBankConflict(dcls[i]);
801 
802             if (srcBC[i] != BANK_CONFLICT_NONE)
803             {
804                 if (isOddOffset(offset[i]))
805                 {
806                     if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
807                     {
808                         srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
809                     }
810                     else
811                     {
812                         srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
813                     }
814                 }
815                 if (i != 1)
816                 {
817                     bank_num++;
818                 }
819             }
820         }
821     }
822     if (dcls[0] && dcls[1])
823     {
824         gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
825         gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
826     }
827     if (dcls[1] && dcls[2])
828     {
829         gra.addBundleConflictDcl(dcls[2], dcls[1], offset[2] - offset[1]);
830         gra.addBundleConflictDcl(dcls[1], dcls[2], offset[1] - offset[2]);
831     }
832 #if 0
833     if (gra.kernel.getOption(vISA_forceBCR) && dcls[0] && dcls[2])
834     {
835         gra.addBundleConflictDcl(dcls[2], dcls[0], offset[2] - offset[0]);
836         gra.addBundleConflictDcl(dcls[0], dcls[2], offset[0] - offset[2]);
837     }
838 #endif
839 
840     //In case (src0) src1 and src2 use same declare, i.e. use same regsiter
841     if (dcls[0] == dcls[2] ||
842         !dcls[0] || !dcls[2])
843     {
844         return;
845     }
846 
847     if (bank_num == 0)
848     {
849         srcBC[0] = refNum[0] > refNum[2] ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_ODD;
850         srcBC[2] = refNum[0] > refNum[2] ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
851         if (isOddOffset(offset[0]))
852         {
853             srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
854         }
855         if (isOddOffset(offset[2]))
856         {
857             srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
858         }
859         gra.setBankConflict(dcls[0], srcBC[0]);
860         gra.setBankConflict(dcls[2], srcBC[2]);
861 
862     }
863     else if (bank_num == 1)
864     {
865         if (srcBC[0] != BANK_CONFLICT_NONE)
866         {
867             srcBC[2] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
868             if (isOddOffset(offset[2]))
869             {
870                 srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
871             }
872             gra.setBankConflict(dcls[2], srcBC[2]);
873         }
874         else
875         {
876             srcBC[0] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
877             if (offset[0] % 2)
878             {
879                 srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
880             }
881             gra.setBankConflict(dcls[0], srcBC[0]);
882         }
883     }
884 
885 #ifdef DEBUG_VERBOSE_ON
886     for (int i = 0; i < 3; i += 2)
887     {
888         if (opndDcls[i])
889         {
890             printf("%s, ", opndDcls[i]->getName());
891 
892             if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN)
893             {
894                 printf("%s\n", "EVEN");
895             }
896             else if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_SECOND_HALF_ODD)
897             {
898                 printf("%s\n", "ODD");
899             }
900             else
901             {
902                 printf("%s\n", "NONE");
903             }
904         }
905     }
906 #endif
907 
908     return;
909 }
910 
// Assign a bank preference (EVEN / ODD GRF alignment) to the top declares of
// the sources of a 3-src (mad-like) instruction, trying to give each source a
// different bank. BANK_CONFLICT_SECOND_HALF_EVEN is used as a sentinel meaning
// "both banks already taken for this instruction".
void BankConflictPass::setupBankConflictsforMad(G4_INST* inst)
{
    BankConflict srcBC[3];
    unsigned offset[3];
    G4_Declare * dcls[3];
    G4_Declare * opndDcls[3];
    BankConflict assignedBank = BANK_CONFLICT_NONE; //Flip for next

    // Pass 1: collect each source's top declare, its GRF offset, and any bank
    // already recorded for it; fold those pre-assigned banks into assignedBank.
    for (int i = 0; i < 3; i += 1)
    {
        dcls[i] = nullptr;
        opndDcls[i] = nullptr;

        G4_Operand* src = inst->getSrc(i);
        if (!src || !src->isSrcRegRegion() || src->isAccReg())
        {
            // bank conflict not possible
            continue;
        }

        dcls[i] = GetTopDclFromRegRegion(src);
        opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
        // GRF offset of the operand within its top declare.
        offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
        srcBC[i] = gra.getBankConflict(dcls[i]);

        if (srcBC[i] != BANK_CONFLICT_NONE)
        {
            // An odd GRF offset flips the effective bank of the access.
            if (isOddOffset(offset[i]))
            {
                if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                {
                    srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                }
                else
                {
                    srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                }
            }
            if (assignedBank != BANK_CONFLICT_SECOND_HALF_EVEN)
            {
                if (assignedBank == BANK_CONFLICT_NONE)
                {
                    assignedBank = srcBC[i];
                }
                else if (assignedBank != srcBC[i])
                {
                    assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;  //BANK_CONFLICT_SECOND_HALF_EVEN is used to represent all banks are assigned
                }
            }
        }
    }

    // Pass 2: assign banks to the not-yet-assigned declares, sources visited
    // src2, src1, src0. k==0 considers only local live ranges, k==1 only the
    // remaining (non-local) ones.
    for (int k = 0; k < 2; k++)
    {
        for (int i = 2; i != -1; i--)
        {
            if (!dcls[i])
            {
                continue;
            }

            LocalLiveRange* lr = gra.getLocalLR(dcls[i]);
            if (!lr ||
                (k == 0  && !lr->isLiveRangeLocal()))
            {
                continue;
            }

            if (k == 1 && lr->isLiveRangeLocal())
            {
                continue;
            }

            // Both banks already taken for this instruction; nothing to gain.
            if (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN)
            {
                continue;
            }

            // Re-read: an earlier iteration of this loop may have set it.
            srcBC[i] = gra.getBankConflict(dcls[i]);
            if (srcBC[i] != BANK_CONFLICT_NONE)
            {
                // Already assigned: just merge its (offset-adjusted) bank into
                // assignedBank, as in pass 1.
                if (isOddOffset(offset[i]))
                {
                    if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                    {
                        srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                    }
                    else
                    {
                        srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                    }
                }

                if (assignedBank == BANK_CONFLICT_NONE)
                {
                    assignedBank = srcBC[i];
                }
                else if (srcBC[i] != assignedBank)
                {
                    assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
                }

                continue;
            }

            if (assignedBank == BANK_CONFLICT_NONE)
            {
                // First assignment for this instruction: default to EVEN,
                // compensating for an odd operand offset.
                srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                assignedBank = srcBC[i];
                if (isOddOffset(offset[i]))
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                }
                gra.setBankConflict(dcls[i], srcBC[i]);
            }
            else
            {
                // One bank taken: give this declare the opposite one, again
                // compensating for offset parity; now both banks are used.
                srcBC[i] = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                if (isOddOffset(offset[i]))
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                }
                gra.setBankConflict(dcls[i], srcBC[i]);
                assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
            }
        }
    }

#ifdef DEBUG_VERBOSE_ON
    printf("$%d:\n", inst->getCISAOff());
    for (int i = 0; i < 3; i++)
    {
        if (dcls[i])
        {
            printf("%s, ", dcls[i]->getName());

            if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN)
            {
                printf("%s\n", "EVEN");
            }
            else if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_SECOND_HALF_ODD)
            {
                printf("%s\n", "ODD");
            }
            else
            {
                printf("%s\n", "NONE");
            }
        }
    }
    printf("\n");
#endif

    return;
}
1066 
setupBankConflictsForBB(G4_BB * bb,unsigned & threeSourceInstNum,unsigned & sendInstNum,unsigned numRegLRA,unsigned & internalConflict)1067 void BankConflictPass::setupBankConflictsForBB(
1068     G4_BB* bb,
1069     unsigned &threeSourceInstNum,
1070     unsigned &sendInstNum,
1071     unsigned numRegLRA,
1072     unsigned & internalConflict)
1073 {
1074     int bank1RegNum = 0;
1075     int bank2RegNum = 0;
1076     float GRFRatio = 0;
1077     G4_INST* prevInst = nullptr;
1078 
1079     if (numRegLRA)
1080     {
1081         GRFRatio = ((float)(numRegLRA - SECOND_HALF_BANK_START_GRF)) / SECOND_HALF_BANK_START_GRF;
1082     }
1083 
1084     for (auto i = bb->rbegin(), rend = bb->rend();
1085         i != rend;
1086         i++)
1087     {
1088         G4_INST* inst = (*i);
1089         if (inst->getNumSrc() == 3 && !inst->isSend())
1090         {
1091             threeSourceInstNum++;
1092             setupBankConflictsOneGRFOld(inst, bank1RegNum, bank2RegNum, GRFRatio, internalConflict);
1093         }
1094         if (inst->isSend() && !inst->isEOT())
1095         {
1096             //Why only data port read causes issue?
1097             if (inst->getMsgDesc()->isRead())
1098             {
1099                 sendInstNum++;
1100             }
1101         }
1102     }
1103 
1104     if ((float)threeSourceInstNum / bb->size() > 0.1)
1105     {
1106         if (!gra.kernel.fg.builder->lowHighBundle() && gra.kernel.fg.builder->hasEarlyGRFRead())
1107         {
1108             for (G4_INST* inst : *bb)
1109             {
1110                 if (prevInst && inst->getNumSrc() == 3 && !inst->isSend())
1111                 {
1112                     setupBankForSrc0(inst, prevInst);
1113                 }
1114                 prevInst = inst;
1115             }
1116         }
1117     }
1118 }
1119 
setupBankConflictsForBBTGL(G4_BB * bb,unsigned & threeSourceInstNum,unsigned & sendInstNum,unsigned numRegLRA,unsigned & internalConflict)1120 void BankConflictPass::setupBankConflictsForBBTGL(
1121     G4_BB* bb,
1122     unsigned& threeSourceInstNum,
1123     unsigned& sendInstNum,
1124     unsigned numRegLRA,
1125     unsigned& internalConflict)
1126 {
1127     float GRFRatio = 0;
1128     G4_INST* prevInst = nullptr;
1129 
1130     if (numRegLRA)
1131     {
1132         GRFRatio = ((float)(numRegLRA - SECOND_HALF_BANK_START_GRF)) / SECOND_HALF_BANK_START_GRF;
1133     }
1134 
1135     for (auto i = bb->rbegin(), rend = bb->rend();
1136         i != rend;
1137         i++)
1138     {
1139         G4_INST* inst = (*i);
1140         if (inst->isSend() || inst->isCFInst() || inst->isLabel() || inst->isOptBarrier())
1141         {
1142             if (inst->isSend() && !inst->isEOT())
1143             {
1144                 // Why only data port read causes issue?
1145                 if (inst->getMsgDesc()->isRead())
1146                 {
1147                     sendInstNum++;
1148                 }
1149             }
1150             continue;
1151         }
1152         if (inst->getNumSrc() == 3)
1153         {
1154             threeSourceInstNum++;
1155             if (inst->isDpas())
1156             {
1157                 hasDpasInst = true;
1158                 setupBankConflictsforDPAS(inst);
1159             }
1160             else
1161             {
1162                 setupBankConflictsforMad(inst);
1163             }
1164         }
1165         else if (gra.kernel.getOption(vISA_forceBCR) && !forGlobal && inst->getNumSrc() == 2)
1166         {
1167             threeSourceInstNum++;
1168             setupBankConflictsforMad(inst);
1169         }
1170     }
1171 
1172     if ((float)threeSourceInstNum / bb->size() > 0.1)
1173     {
1174         if (!gra.kernel.fg.builder->lowHighBundle() && gra.kernel.fg.builder->hasEarlyGRFRead())
1175         {
1176             for (G4_INST* inst : *bb)
1177             {
1178                 if (prevInst && inst->getNumSrc() == 3 && !inst->isSend())
1179                 {
1180                     setupBankForSrc0(inst, prevInst);
1181                 }
1182                 prevInst = inst;
1183             }
1184         }
1185     }
1186 }
1187 
1188 //Use for BB sorting according to the loop nest level and the BB size.
compareBBLoopLevel(G4_BB * bb1,G4_BB * bb2)1189 bool compareBBLoopLevel(G4_BB* bb1, G4_BB* bb2)
1190 {
1191     if (bb1->getNestLevel() > bb2->getNestLevel())
1192     {
1193         return true;
1194     }
1195     else if (bb1->getNestLevel() == bb2->getNestLevel())
1196     {
1197         return bb1->size() > bb2->size();
1198     }
1199 
1200     return false;
1201 }
1202 
1203 /*
1204  * output:
1205  *        threeSourceCandidate, if there are enough three source instructions
1206  *        return value, if do bank confliction reduction to RR RA.
1207  */
setupBankConflictsForKernel(bool doLocalRR,bool & threeSourceCandidate,unsigned numRegLRA,bool & highInternalConflict)1208 bool BankConflictPass::setupBankConflictsForKernel(bool doLocalRR, bool &threeSourceCandidate, unsigned numRegLRA, bool &highInternalConflict)
1209 {
1210     unsigned threeSourceInstNumInKernel = 0;
1211     unsigned internalConflict = 0;
1212     unsigned instNumInKernel = 0;
1213     unsigned sendInstNumInKernel = 0;
1214 
1215     std::vector<G4_BB *> orderedBBs(gra.kernel.fg.cbegin(), gra.kernel.fg.cend());
1216     std::sort(orderedBBs.begin(), orderedBBs.end(), compareBBLoopLevel);
1217 
1218     for (auto bb : orderedBBs)
1219     {
1220         unsigned instNum = 0;
1221         unsigned sendInstNum = 0;
1222         unsigned threeSourceInstNum = 0;
1223         unsigned conflicts = 0;
1224 
1225         unsigned loopNestLevel = 0;
1226 
1227         if (gra.kernel.fg.builder->lowHighBundle())
1228         {
1229             setupBankConflictsForBB(bb, threeSourceInstNum, sendInstNum, numRegLRA, conflicts);
1230         }
1231         else
1232         {
1233             setupBankConflictsForBBTGL(bb, threeSourceInstNum, sendInstNum, numRegLRA, conflicts);
1234         }
1235 
1236         loopNestLevel = bb->getNestLevel() + 1;
1237 
1238         if (threeSourceInstNum)
1239         {
1240             instNum = (uint32_t)bb->size() * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
1241             threeSourceInstNum = threeSourceInstNum * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
1242             sendInstNum = sendInstNum * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
1243             conflicts = conflicts * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
1244             internalConflict += conflicts;
1245             threeSourceInstNumInKernel += threeSourceInstNum;
1246             instNumInKernel += instNum;
1247             sendInstNumInKernel += sendInstNum;
1248         }
1249     }
1250 
1251     if (!threeSourceInstNumInKernel ||
1252         (float)threeSourceInstNumInKernel / instNumInKernel < BANK_CONFLICT_HEURISTIC_INST)
1253     {
1254         return false;
1255     }
1256 
1257     highInternalConflict = ((float)internalConflict / threeSourceInstNumInKernel) > INTERNAL_CONFLICT_RATIO_HEURISTIC;
1258 
1259     //Bank conflict reduction is done only when there is enough three source instructions.
1260     threeSourceCandidate = true;
1261 
1262     if (doLocalRR && sendInstNumInKernel)
1263     {
1264         if (!hasDpasInst && (sendInstNumInKernel > threeSourceInstNumInKernel))
1265         {
1266             return false;
1267         }
1268     }
1269 
1270     return true;
1271 }
1272 
areAllDefsNoMask(G4_Declare * dcl)1273 bool GlobalRA::areAllDefsNoMask(G4_Declare* dcl)
1274 {
1275     bool retval = true;
1276     auto& maskUsed = getMask(dcl);
1277     if (maskUsed.size() > 0 &&
1278         getAugmentationMask(dcl) != AugmentationMasks::NonDefault)
1279     {
1280         auto byteSize = dcl->getByteSize();
1281         for (unsigned i = 0; i < byteSize; i++)
1282         {
1283             if (maskUsed[i] != NOMASK_BYTE)
1284             {
1285                 retval = false;
1286                 break;
1287             }
1288         }
1289     }
1290     else
1291     {
1292         if (getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
1293             retval = true;
1294         else
1295             retval = false;
1296     }
1297     return retval;
1298 }
1299 
getBankAlign(const G4_Declare * dcl) const1300 BankAlign GlobalRA::getBankAlign(const G4_Declare* dcl) const
1301 {
1302     const IR_Builder* builder = kernel.fg.builder;
1303     switch (getBankConflict(dcl))
1304     {
1305     case BANK_CONFLICT_FIRST_HALF_EVEN:
1306     case BANK_CONFLICT_SECOND_HALF_EVEN:
1307         return builder->oneGRFBankDivision() ? BankAlign::Even : BankAlign::Even2GRF;
1308     case BANK_CONFLICT_FIRST_HALF_ODD:
1309     case BANK_CONFLICT_SECOND_HALF_ODD:
1310         return builder->oneGRFBankDivision() ? BankAlign::Odd : BankAlign::Odd2GRF;
1311     default:
1312         return BankAlign::Either;
1313     }
1314 }
1315 
emitFGWithLiveness(const LivenessAnalysis & liveAnalysis) const1316 void GlobalRA::emitFGWithLiveness(const LivenessAnalysis& liveAnalysis) const
1317 {
1318 #ifdef DEBUG_VERBOSE_ON
1319     for (G4_BB* bb : kernel.fg)
1320     {
1321         DEBUG_VERBOSE(std::endl << "-----------------------------------------------------------------");
1322         DEBUG_VERBOSE(std::endl << "BB" << bb->getId() << ":");
1323         DEBUG_VERBOSE(std::endl << "Preds: ");
1324         for (const G4_BB* pred : bb->Preds)
1325         {
1326             DEBUG_VERBOSE("BB" << pred->getId() << ", ");
1327         }
1328 
1329         DEBUG_VERBOSE(std::endl << "Succs: ");
1330         for (const G4_BB* succ : bb->Succs)
1331         {
1332             DEBUG_VERBOSE("BB" << succ->getId() << ", ");
1333         }
1334 
1335         if (kernel.getOption(vISA_LocalRA))
1336         {
1337             if (auto summary = kernel.fg.getBBLRASummary(bb))
1338             {
1339                 DEBUG_VERBOSE(std::endl << "Local RA: ");
1340                 {
1341                     for (unsigned i = 0; i < kernel.getNumRegTotal(); i++)
1342                     {
1343                         if (summary->isGRFBusy(i))
1344                         {
1345                             DEBUG_VERBOSE("r" << i << ", ");
1346                         }
1347                     }
1348                 }
1349             }
1350         }
1351 
1352         DEBUG_VERBOSE(std::endl << "Gen: ");
1353         for (const G4_Declare * dcl : kernel.Declares)
1354         {
1355             if (dcl->getAliasDeclare() != NULL)
1356                 continue;
1357 
1358             if (dcl->getRegVar()->isRegAllocPartaker())
1359             {
1360                 if (liveAnalysis.use_gen[bb->getId()].isSet(dcl->getRegVar()->getId()))
1361                 {
1362                     DEBUG_VERBOSE(dcl->getName() << ", ");
1363                 }
1364             }
1365         }
1366 
1367         DEBUG_VERBOSE(std::endl << "Kill: ");
1368         for (const G4_Declare * dcl : kernel.Declares)
1369         {
1370             if (dcl->getAliasDeclare() != NULL)
1371                 continue;
1372 
1373             if (dcl->getRegVar()->isRegAllocPartaker())
1374             {
1375                 if (liveAnalysis.use_kill[bb->getId()].isSet(dcl->getRegVar()->getId()))
1376                 {
1377                     DEBUG_VERBOSE(dcl->getName() << ", ");
1378                 }
1379             }
1380         }
1381 
1382         DEBUG_VERBOSE(std::endl << "Live-in: ");
1383         for (const G4_Declare * dcl : kernel.Declares)
1384         {
1385             if (dcl->getAliasDeclare() != NULL)
1386                 continue;
1387 
1388             if (dcl->getRegVar()->isRegAllocPartaker())
1389             {
1390                 if (liveAnalysis.isLiveAtEntry(bb, dcl->getRegVar()->getId()))
1391                 {
1392                     DEBUG_VERBOSE(dcl->getName() << ", ");
1393                 }
1394             }
1395         }
1396 
1397         DEBUG_VERBOSE(std::endl << "Live-out: ");
1398         for (const G4_Declare * dcl : kernel.Declares)
1399         {
1400             if (dcl->getAliasDeclare() != NULL)
1401                 continue;
1402 
1403             if (dcl->getRegVar()->isRegAllocPartaker())
1404             {
1405                 if (liveAnalysis.isLiveAtExit(bb, dcl->getRegVar()->getId()))
1406                 {
1407                     DEBUG_VERBOSE(dcl->getName() << ", ");
1408                 }
1409             }
1410         }
1411 
1412         DEBUG_VERBOSE(std::endl);
1413 
1414         bb->emit(COUT_ERROR);
1415     }
1416 #endif
1417 }
1418 
reportSpillInfo(const LivenessAnalysis & liveness,const GraphColor & coloring) const1419 void GlobalRA::reportSpillInfo(const LivenessAnalysis& liveness, const GraphColor& coloring) const
1420 {
1421     // Emit out interference graph of each spill candidate
1422     // and if a spill candidate is a local range, emit its
1423     // start and end line number in file
1424     std::ofstream optreport;
1425     getOptReportStream(optreport, coloring.getOptions());
1426     LiveRange** lrs = coloring.getLiveRanges();
1427 
1428     for (const vISA::LiveRange* slr : coloring.getSpilledLiveRanges())
1429     {
1430         if (slr->getRegKind() == G4_GRF) {
1431             const G4_RegVar* spillVar = slr->getVar();
1432             optreport << "Spill candidate " << spillVar->getName() << " intf:";
1433             optreport << "\t(" << spillVar->getDeclare()->getTotalElems() << "):" <<
1434                 TypeSymbol(spillVar->getDeclare()->getElemType()) << std::endl;
1435 
1436             if (getLocalLR(spillVar->getDeclare()) != NULL)
1437             {
1438                 if (getLocalLR(spillVar->getDeclare())->isLiveRangeLocal())
1439                 {
1440                     int start, end;
1441                     unsigned dummy;
1442                     start = getLocalLR(spillVar->getDeclare())->getFirstRef(dummy)->getLineNo();
1443                     end = getLocalLR(spillVar->getDeclare())->getLastRef(dummy)->getLineNo();
1444 
1445                     optreport << "(Liverange is local starting at line #" << start <<
1446                         " and ending at line #" << end << ")" << std::endl;
1447                 }
1448             }
1449 
1450             const Interference* intf = coloring.getIntf();
1451             unsigned spillVarId = slr->getVar()->getId();
1452 
1453             for (int i = 0; i < (int)liveness.getNumSelectedVar(); i++)
1454             {
1455                 if (intf->interfereBetween(spillVarId, i))
1456                 {
1457                     const G4_RegVar* intfRangeVar = lrs[i]->getVar();
1458 
1459                     optreport << "\t" << intfRangeVar->getName() << "(" <<
1460                         intfRangeVar->getDeclare()->getTotalElems() << "):" <<
1461                         TypeSymbol(intfRangeVar->getDeclare()->getElemType());
1462 
1463                     if (lrs[i]->getPhyReg() == NULL)
1464                     {
1465                         optreport << " --- spilled";
1466                     }
1467 
1468                     optreport << ", " << std::endl;
1469                 }
1470             }
1471 
1472             optreport << std::endl << std::endl;
1473         }
1474     }
1475 
1476     closeOptReportStream(optreport);
1477 }
1478 
1479 
LiveRange(G4_RegVar * v,GlobalRA & g)1480 LiveRange::LiveRange(G4_RegVar* v, GlobalRA& g) : var(v), dcl(v->getDeclare()), regKind(dcl->getRegFile()), gra(g)
1481 {
1482     isCandidate = true;
1483 
1484     if (getRegKind() == G4_ADDRESS)
1485         numRegNeeded = v->getDeclare()->getNumElems() * v->getDeclare()->getElemSize() / G4_WSIZE;
1486     else if (getRegKind() == G4_FLAG)
1487     {
1488         // number of elements are in words
1489         numRegNeeded = v->getDeclare()->getNumElems();
1490     }
1491     else
1492     {
1493         // number of GRFs
1494         numRegNeeded = v->getDeclare()->getNumRows();
1495     }
1496 }
1497 
// Walking bb bottom-up, decide whether this live range should be marked as
// having infinite spill cost (spilling it could never help because its def
// and only use are back-to-back). Mutates isCandidate/isInfiniteCost.
void LiveRange::checkForInfiniteSpillCost(G4_BB* bb, std::list<G4_INST*>::reverse_iterator& it)
{
    // G4_INST at *it defines liverange object (this ptr)
    // If next instruction of iterator uses same liverange then
    // it may be a potential infinite spill cost candidate.
    // To confirm, following requirements should be fulfilled:
    // a. this liverange is not a global
    // b. this liverange is defined/used in these 2 instructions only
    //
    // The idea is for ranges marked with infinite spill cost,
    // coloring will attempt to put them on top of stack so they
    // have higher chance of getting a color. If a range that should
    // be infinite spill cost is not marked as being so, the only
    // downside is extra compile time spent in inserting spill code
    // and then punting out when later spilled code will cause
    // even more spills.
    //
    // The assumption is that current live-range is a current register
    // allocation candidate.
    //
    G4_INST* curInst = (*it);

    // Skip the check if curInst is a pseudoKill
    // Otherwise, it may invalidate a previously marked infinite
    // spill cost candidate, e.g.,
    // pseudo_kill (1) P1(0,0)[1]:uw [Align1]
    // mov (1) P1(0,0)[1]:uw TV1(8,0)[0;1,0]:uw [Align1, NoMask]
    // (+P1.0) sel (16) V65(0,0)[1]:f TV0(0,0)[0;1,0]:f 0:f [Align1, H1]
    if (curInst->isPseudoKill())
    {
        return;
    }

    // Check whether dst variable is a global
    if (gra.isBlockLocal(this->getDcl()) == false)
    {
        // Globals can never qualify (requirement a).
        isCandidate = false;
        isInfiniteCost = false;

        return;
    }

    G4_DstRegRegion* dst = curInst->getDst();
    // If cur instruction dst is indirect write then return
    // (an indirect def through this variable is not a plain def of it).
    if (dst &&
        dst->getRegAccess() == IndirGRF &&
        dst->getBase()->asRegVar()->getId() == this->getVar()->getId())
    {
        return;
    }

    // isCandidate is set to true only for first definition ever seen.
    // If more than 1 def if found this gets set to false.
    const std::list<G4_INST*>::reverse_iterator rbegin = bb->rbegin();
    if (this->isCandidate == true && it != rbegin)
    {
        G4_INST* nextInst = NULL;
        if (this->getRefCount() != 2 ||
            (this->getRegKind() == G4_GRF && this->getDcl()->getAddressed() == true))
        {
            // If a liverange has > 2 refs then it
            // cannot be a candidate.
            // Also an address taken GRF is not a candidate.
            // This represents an early exit.
            isCandidate = false;
            isInfiniteCost = false;

            return;
        }

        // Skip all pseudo kills
        // (--next moves forward in program order since 'it' is a reverse
        // iterator; stop at the last instruction of the BB).
        std::list<G4_INST*>::reverse_iterator next = it;
        while (true)
        {
            if (next == rbegin)
            {
                isCandidate = isInfiniteCost = false;
                return;
            }
            --next;

            // This is not a pseudo-kill instruction, then find
            // the desired next instruction. Otherwise, continue.
            nextInst = *next;
            if (!(nextInst->isPseudoKill()))
                break;
        }

        // Check whether this liverange is used in nextInst
        // (either as a direct GRF source, or — for address variables —
        // as the base of an indirect source access).
        for (unsigned i = 0; i < G4_MAX_SRCS; i++)
        {
            G4_Operand* src = nextInst->getSrc(i);

            if (src &&
                src->isSrcRegRegion() &&
                src->getBase()->isRegAllocPartaker())
            {
                // src can be Direct/Indirect
                G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();

                if (srcRgn->getRegAccess() == Direct &&
                    srcRgn->getBase()->isRegVar() &&
                    srcRgn->getBase()->asRegVar()->getId() == this->getVar()->getId())
                {
                    // Def-use found back-to-back
                    isInfiniteCost = true;
                    // Identify no more candidates
                    isCandidate = false;
                }
                else if (this->getRegKind() == G4_ADDRESS &&
                    srcRgn->getRegAccess() == IndirGRF &&
                    srcRgn->getBase()->isRegVar() &&
                    srcRgn->getBase()->asRegVar()->getId() == this->getVar()->getId())
                {
                    // Def-use found back-to-back
                    isInfiniteCost = true;
                    // Identify no more candidates
                    isCandidate = false;
                }
            }
        }

        // Address variable defined then immediately used as an indirect
        // destination base also counts as back-to-back def-use.
        G4_DstRegRegion* nextDst = nextInst->getDst();
        if (isCandidate == true &&
            this->getRegKind() == G4_ADDRESS &&
            nextDst &&
            nextDst->getRegAccess() == IndirGRF &&
            nextDst->getBase()->isRegVar() &&
            nextDst->getBase()->asRegVar()->isRegAllocPartaker() &&
            nextDst->getBase()->asRegVar()->getId() == this->getVar()->getId())
        {
            // Pattern found:
            // A0=
            // r[A0]=
            isInfiniteCost = true;
            // Identify no more candidates
            isCandidate = false;
        }

        // Flag defined then immediately used as the next instruction's
        // predicate also counts.
        if (isCandidate == true &&
            this->getRegKind() == G4_FLAG &&
            nextInst->getPredicate() &&
            nextInst->getPredicate()->getBase() &&
            nextInst->getPredicate()->getBase()->isRegVar() &&
            nextInst->getPredicate()->getBase()->asRegVar()->isRegAllocPartaker() &&
            nextInst->getPredicate()->getBase()->asRegVar()->getId() == this->getVar()->getId())
        {
            // Pattern found:
            // P0 = or cmp.P0 = <-- P0 defined
            // (P0) ... <-- P0 used as predicate
            isInfiniteCost = true;
            // Identify no more candidates
            isCandidate = false;
        }

#ifdef DEBUG_VERBOSE_ON
        if (isInfiniteCost == true)
        {
            DEBUG_VERBOSE("Marking " << this->getDcl()->getName() <<
                " as having infinite spill cost due to back-to-back def-use" << std::endl);
        }
#endif

        // Once a def is seen, stop looking for more defs
        isCandidate = false;
    }
    else
    {
        // Either a second def was found, or the def is the last instruction
        // of the BB: demote any earlier infinite-cost marking.
#ifdef DEBUG_VERBOSE_ON
        if (isInfiniteCost == true)
        {
            DEBUG_VERBOSE("Unmarking " << this->getDcl()->getName() <<
                " as having infinite spill cost" << std::endl);
        }
#endif
        isCandidate = false;
        isInfiniteCost = false;
    }
}
1677 
1678 //
1679 // return true, if live ranges v1 and v2 interfere
1680 //
interfereBetween(unsigned v1,unsigned v2) const1681 bool Interference::interfereBetween(unsigned v1, unsigned v2) const
1682 {
1683     if (v1 > v2)
1684     {
1685         std::swap(v1, v2);
1686     }
1687 
1688     if (useDenseMatrix())
1689     {
1690         unsigned col = v2 / BITS_DWORD;
1691         return matrix[v1 * rowSize + col] & (1 << (v2 % BITS_DWORD));
1692     }
1693     else
1694     {
1695         auto&& set = sparseMatrix[v1];
1696         return set.find(v2) != set.end();
1697     }
1698 }
1699 
1700 //
1701 // init live vector with all live ranges that are live at the exit
1702 // also set the next seq use of any live range that is live across to be INT_MAX
1703 // to indicate that this live range does not have exclusive sequential uses and hence
1704 // is not a candidate for being marked with an infinite spill cost.
1705 //
buildInterferenceAtBBExit(const G4_BB * bb,BitSet & live)1706 void Interference::buildInterferenceAtBBExit(const G4_BB* bb, BitSet& live)
1707 {
1708 
1709     // live must be empty at this point
1710     live = liveAnalysis->use_out[bb->getId()];
1711     live &= liveAnalysis->def_out[bb->getId()];
1712 }
1713 
1714 //
1715 // Filter out partial or splitted declares in batch interference.
1716 //
inline void Interference::filterSplitDclares(unsigned startIdx, unsigned endIdx, unsigned n, unsigned col, unsigned &elt, bool is_partial)
{
    // Clears, within 'elt' (the dword at column 'col' of an interference
    // row), the bits of the split-declare family: [startIdx, endIdx) is the
    // id range of the child declares and 'n' is the id of the related
    // parent/current variable (see callers).

    if (is_partial)  //Don't interference with parent
    {
        // If bit n falls into this dword column, clear it.
        unsigned rowSplited = n / BITS_DWORD;
        if (rowSplited == col)
        {
            elt &= ~(1 << (n % BITS_DWORD));
        }
    }

    //if current is splitted dcl, don't interference with any of its child nodes.
    //if current is partial dcl, don't interference with any other child nodes.
    if (col >= startIdx / BITS_DWORD  && col < (endIdx / BITS_DWORD + 1))
    {
        // Build a mask of the child-id bits [start_id, end_id) that land in
        // this dword, then clear them; end_id == BITS_DWORD covers dwords
        // that lie entirely inside the child range.
        unsigned selt = 0;
        unsigned start_id = col * BITS_DWORD > startIdx ? 0 : startIdx % BITS_DWORD;
        unsigned end_id = (col + 1) * BITS_DWORD > endIdx ? endIdx % BITS_DWORD : BITS_DWORD;

        for (unsigned i = start_id; i < end_id; i++)
        {
            selt |= 1 << i;
        }
        elt &= ~selt;
    }

    return;
}
1746 
1747 //
1748 // set interference for all live ranges that are currently live
1749 // for partial declares, following rules are applied
1750 // a. current partial declare does not interference with any other partial declare
1751 // b. current parent declare does not interference with its children declares, can children declare interference with parent declare?
1752 // c. current partial declare does not interference with hybrid declares added by local RA, the reason is simple, these declares are assigned register already.
1753 //
void Interference::buildInterferenceWithLive(const BitSet& live, unsigned i)
{
    // Mark interference between variable i and every variable currently set
    // in "live". Split/partial declares are filtered per the rules in the
    // header comment: a splitted (parent) dcl does not interfere with its
    // own children, and a partial (child) dcl does not interfere with its
    // parent or with any other partial dcl.
    const LiveRange* lr = lrs[i];
    bool is_partial = lr->getIsPartialDcl();
    bool is_splitted = lr->getIsSplittedDcl();
    unsigned n = 0;  // parent variable id; only meaningful when is_partial

    // Number of dwords per row of the bit matrix, rounded up to cover maxId.
    unsigned numDwords = maxId / BITS_DWORD;
    unsigned numBits = maxId % BITS_DWORD;

    if (numBits)
    {
        numDwords++;
    }

    // [start_idx, end_idx) is the child-id range filterSplitDclares will
    // mask out of each live dword below.
    unsigned start_idx = 0;
    unsigned end_idx = 0;
    if (is_splitted) //if current is splitted dcl, don't interference with all its child nodes.
    {
        start_idx = lr->getDcl()->getSplitVarStartID();
        end_idx = start_idx + gra.getSplitVarNum(lr->getDcl());
    }

    if (is_partial)   //if current is partial dcl, don't interference with all other partial dcls, and it's parent dcl.
    {
        n = gra.getSplittedDeclare(lr->getDcl())->getRegVar()->getId();
        start_idx = splitStartId;
        end_idx = splitStartId + splitNum;
    }

    unsigned colEnd = i / BITS_DWORD;

    // Live variables with id < i: the triangular matrix stores the edge in
    // the smaller id's row, so set bit i in each such variable's row
    // (column-wise update).
    for (unsigned k = 0; k < colEnd; k++)
    {
        unsigned elt = live.getElt(k);

        if (elt != 0)
        {
            if (is_partial || is_splitted)
            {
                filterSplitDclares(start_idx, end_idx, n, k, elt, is_partial);
            }

            for (unsigned j = 0; j < BITS_DWORD; j++)
            {
                if (elt & (1 << j))
                {
                    unsigned curPos = j + (k * BITS_DWORD);
                    safeSetInterference(curPos, i);
                }
            }
        }
    }

    // Transition dword: may contain ids both below and above i, so each set
    // bit goes through checkAndSetIntf which orders the pair correctly.
    unsigned elt = live.getElt(colEnd);
    //checkAndSetIntf guarantee partial and splitted cases
    if (elt != 0)
    {
        for (unsigned j = 0; j < BITS_DWORD; j++)
        {
            if (elt & (1 << j))
            {
                unsigned curPos = j + (colEnd * BITS_DWORD);
                if (!varSplitCheckBeforeIntf(i, curPos))
                {
                    checkAndSetIntf(i, curPos);
                }
            }
        }
    }

    colEnd++;
    // Live variables with id > i: whole dwords can be OR-ed directly into
    // i's own row (row-wise block update).
    for (unsigned k = colEnd; k < numDwords; k++)
    {
        unsigned elt = live.getElt(k);

        if (is_partial || is_splitted)
        {
            filterSplitDclares(start_idx, end_idx, n, k, elt, is_partial);
        }

        if (elt != 0)
        {
            setBlockInterferencesOneWay(i, k, elt);
        }
    }
}
1845 
buildInterferenceWithSubDcl(unsigned lr_id,G4_Operand * opnd,BitSet & live,bool setLive,bool setIntf)1846 void Interference::buildInterferenceWithSubDcl(unsigned lr_id, G4_Operand *opnd, BitSet& live, bool setLive, bool setIntf)
1847 {
1848 
1849     const G4_Declare *dcl = lrs[lr_id]->getDcl();
1850     for (const G4_Declare *subDcl : gra.getSubDclList(dcl))
1851     {
1852         unsigned leftBound = gra.getSubOffset(subDcl);
1853         unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
1854         if (!(opnd->getRightBound() < leftBound || rightBound < opnd->getLeftBound()))
1855         {
1856             int subID = subDcl->getRegVar()->getId();
1857 
1858             if (setIntf)
1859             {
1860                 buildInterferenceWithLive(live, subID);
1861             }
1862             if (setLive)
1863             {
1864                 live.set(subID, true);
1865             }
1866         }
1867     }
1868 
1869     return;
1870 }
1871 
buildInterferenceWithAllSubDcl(unsigned v1,unsigned v2)1872 void Interference::buildInterferenceWithAllSubDcl(unsigned v1, unsigned v2)
1873 {
1874     const G4_Declare * d1 = lrs[v1]->getDcl();
1875     const G4_Declare * d2 = lrs[v2]->getDcl();
1876 
1877     if (d1->getIsSplittedDcl() && !d2->getIsPartialDcl())
1878     {
1879         for (const G4_Declare *subDcl : gra.getSubDclList(d1))
1880         {
1881             int subID = subDcl->getRegVar()->getId();
1882             checkAndSetIntf(v2, subID);
1883         }
1884     }
1885 
1886     if (d2->getIsSplittedDcl() && !d1->getIsPartialDcl())
1887     {
1888         for (const G4_Declare *subDcl : gra.getSubDclList(d2))
1889         {
1890             int subID = subDcl->getRegVar()->getId();
1891             checkAndSetIntf(v1, subID);
1892         }
1893     }
1894 
1895     return;
1896 }
1897 //
1898 // Bias the live ranges in "live" to be assigned the callee-save registers as they
1899 // are live through a stack call. Exclude file scope variables as they are always
1900 // save/restore before/after call and are better assigned to the caller-save space.
1901 //
addCalleeSaveBias(const BitSet & live)1902 void Interference::addCalleeSaveBias(const BitSet& live)
1903 {
1904     for (unsigned i = 0; i < maxId; i++)
1905     {
1906         if (live.isSet(i))
1907         {
1908             lrs[i]->setCallerSaveBias(false);
1909             lrs[i]->setCalleeSaveBias(true);
1910         }
1911     }
1912 }
1913 
buildInterferenceAmongLiveOuts()1914 void Interference::buildInterferenceAmongLiveOuts()
1915 {
1916     // Mark interference between dcls marked as Output.
1917     //
1918     // Interference computation marks interference for a
1919     // variable only when definition for that variable is
1920     // seen, not otherwise.
1921     //
1922     // This method is useful when definition of such
1923     // "Output" variables are emitted to program post RA.
1924     //
1925     // It is safe to mark interference between all "Output"
1926     // dcls even when their definition is present in the program.
1927 
1928     // First gather all Output dcls in a vector to avoid an O(N^2)
1929     // lookup. Number of OutputDcls should be small.
1930     std::vector<G4_Declare*> OutputDcls;
1931     for (auto dcl : kernel.Declares)
1932     {
1933         if (!dcl->getRegVar()->isRegAllocPartaker() ||
1934             !dcl->isOutput())
1935             continue;
1936 
1937         OutputDcls.push_back(dcl);
1938     }
1939 
1940     for (auto dcl1 : OutputDcls)
1941     {
1942         // dcl1 is RA partaker iter and is marked as Output
1943         for (auto dcl2 : OutputDcls)
1944         {
1945             if (dcl1 == dcl2)
1946                 continue;
1947 
1948             checkAndSetIntf(dcl1->getRegVar()->getId(), dcl2->getRegVar()->getId());
1949         }
1950     }
1951 }
1952 
void Interference::buildInterferenceAmongLiveIns()
{
    //
    // Build interference between all live-ins. If all live-ins are only
    // read then their interference will be skipped in earlier phase.
    // For eg, arg and globals are both live-in. And both may only have
    // uses in function and no def.
    //
    const G4_BB* entryBB = kernel.fg.getEntryBB();


    for (unsigned i = 0; i < liveAnalysis->getNumSelectedGlobalVar(); i++)
    {
        if (liveAnalysis->isLiveAtEntry(entryBB, i))
        {
            // The earlier split-marking pass cannot guarantee a variable is
            // local; a var live at entry must not be treated as a splitted
            // dcl, so clear the flag here (side effect on the dcl and LR).
            if (lrs[i]->getDcl()->getIsSplittedDcl())
            {
                lrs[i]->getDcl()->setIsSplittedDcl(false);
                lrs[i]->setIsSplittedDcl(false);
            }

            for (unsigned j = i + 1; j < liveAnalysis->getNumSelectedGlobalVar(); j++)
            {
                if (liveAnalysis->isLiveAtEntry(entryBB, j))
                {
                    // Two inputs that already carry physical registers need
                    // no interference edge: their assignment is fixed.
                    if (lrs[i]->getDcl()->getRegFile() == G4_INPUT &&
                        lrs[i]->getVar()->getPhyReg() != NULL &&
                        lrs[j]->getDcl()->getRegFile() == G4_INPUT &&
                        lrs[j]->getVar()->getPhyReg() != NULL)
                    {
                        continue;
                    }
                    else
                    {
                        if (!varSplitCheckBeforeIntf(i, j))
                        {
                            checkAndSetIntf(i, j);
                        }
                    }
                }
            }
        }
    }
}
1998 
markInterferenceForSend(G4_BB * bb,G4_INST * inst,G4_DstRegRegion * dst)1999 void Interference::markInterferenceForSend(G4_BB* bb,
2000     G4_INST* inst,
2001     G4_DstRegRegion* dst)
2002 {
2003     bool isDstRegAllocPartaker = false;
2004     bool isDstLocallyAssigned = false;
2005     unsigned dstId = 0;
2006     int dstPreg = 0, dstNumRows = 0;
2007 
2008     if (dst->getBase()->isRegVar())
2009     {
2010         if (dst->getBase()->isRegAllocPartaker())
2011         {
2012             G4_DstRegRegion* dstRgn = dst;
2013             isDstRegAllocPartaker = true;
2014             dstId = ((G4_RegVar*)dstRgn->getBase())->getId();
2015         }
2016         else if (kernel.getOption(vISA_LocalRA))
2017         {
2018             LocalLiveRange* localLR = NULL;
2019             G4_Declare* topdcl = GetTopDclFromRegRegion(dst);
2020 
2021             if (topdcl)
2022                 localLR = gra.getLocalLR(topdcl);
2023 
2024             if (localLR && localLR->getAssigned())
2025             {
2026                 int sreg;
2027                 G4_VarBase* preg = localLR->getPhyReg(sreg);
2028 
2029                 MUST_BE_TRUE(preg->isGreg(), "Register in dst was not GRF");
2030 
2031                 isDstLocallyAssigned = true;
2032                 dstPreg = preg->asGreg()->getRegNum();
2033                 dstNumRows = localLR->getTopDcl()->getNumRows();
2034             }
2035         }
2036 
2037         if (isDstRegAllocPartaker || isDstLocallyAssigned)
2038         {
2039             for (unsigned j = 0; j < G4_MAX_SRCS; j++)
2040             {
2041                 G4_Operand* src = inst->getSrc(j);
2042                 if (src != NULL &&
2043                     src->isSrcRegRegion() &&
2044                     src->asSrcRegRegion()->getBase()->isRegVar())
2045                 {
2046                     if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
2047                     {
2048                         unsigned srcId = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
2049 
2050                         if (isDstRegAllocPartaker)
2051                         {
2052                             if (!varSplitCheckBeforeIntf(dstId, srcId))
2053                             {
2054                                 checkAndSetIntf(dstId, srcId);
2055                                 buildInterferenceWithAllSubDcl(dstId, srcId);
2056                             }
2057                         }
2058                         else
2059                         {
2060                             for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++)
2061                             {
2062                                 int k = getGRFDclForHRA(j)->getRegVar()->getId();
2063                                 if (!varSplitCheckBeforeIntf(k, srcId))
2064                                 {
2065                                     checkAndSetIntf(k, srcId);
2066                                     buildInterferenceWithAllSubDcl(k, srcId);
2067                                 }
2068                             }
2069                         }
2070                     }
2071                     else if (kernel.getOption(vISA_LocalRA) && isDstRegAllocPartaker)
2072                     {
2073                         LocalLiveRange* localLR = nullptr;
2074                         const G4_Declare* topdcl = GetTopDclFromRegRegion(src);
2075 
2076                         if (topdcl)
2077                             localLR = gra.getLocalLR(topdcl);
2078 
2079                         if (localLR && localLR->getAssigned())
2080                         {
2081                             int sreg;
2082                             G4_VarBase* preg = localLR->getPhyReg(sreg);
2083                             int numrows = localLR->getTopDcl()->getNumRows();
2084 
2085                             MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");
2086 
2087                             int reg = preg->asGreg()->getRegNum();
2088 
2089                             for (int j = reg, sum = reg + numrows; j < sum; j++)
2090                             {
2091                                 int k = getGRFDclForHRA(j)->getRegVar()->getId();
2092                                 if (!varSplitCheckBeforeIntf(dstId, k))
2093                                 {
2094                                     checkAndSetIntf(dstId, k);
2095                                     buildInterferenceWithAllSubDcl(dstId, k);
2096                                 }
2097                             }
2098                         }
2099                     }
2100                 }
2101             }
2102         }
2103     }
2104 }
2105 
// Mark interference between a non-send instruction's dst and its GRF sources
// so that RA does not assign overlapping registers, except when the operands
// are laid out so the overlap is known to be harmless (both multi-row, both
// even-aligned, same GRF-offset parity). Indirect sources interfere with
// every variable in their points-to set.
void Interference::markInterferenceToAvoidDstSrcOverlap(G4_BB* bb,
    G4_INST* inst)
{
    bool isDstRegAllocPartaker = false;
    bool isDstLocallyAssigned = false;
    unsigned dstId = 0;
    int dstPreg = 0, dstNumRows = 0;
    // true when the dst operand spans more than one GRF row
    bool dstOpndNumRows = false;

    G4_DstRegRegion* dst = inst->getDst();
    if (dst->getBase()->isRegVar() && (dst->getTopDcl()->getRegFile() == G4_GRF))
    {
        G4_Declare* dstDcl = dst->getTopDcl();
        // GRF-row offset of the dst within its declare
        int dstOffset = dst->getLeftBound() / numEltPerGRF<Type_UB>();
        bool isDstEvenAlign = gra.isEvenAligned(dstDcl);

        if (dst->getBase()->isRegAllocPartaker())
        {
            isDstRegAllocPartaker = true;
            dstId = ((G4_RegVar*)dst->getBase())->getId();
            dstOpndNumRows = dst->getLinearizedEnd() - dst->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
        }
        else if (kernel.getOption(vISA_LocalRA))
        {
            // dst may already have a physical GRF range from local RA.
            LocalLiveRange* localLR = NULL;
            G4_Declare* topdcl = GetTopDclFromRegRegion(dst);

            if (topdcl)
                localLR = gra.getLocalLR(topdcl);
            if (localLR && localLR->getAssigned())
            {
                int sreg;
                G4_VarBase* preg = localLR->getPhyReg(sreg);

                MUST_BE_TRUE(preg->isGreg(), "Register in dst was not GRF");

                isDstLocallyAssigned = true;
                dstPreg = preg->asGreg()->getRegNum();
                dstNumRows = localLR->getTopDcl()->getNumRows();
                dstOpndNumRows = dst->getLinearizedEnd() - dst->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
                isDstEvenAlign = (dstPreg % 2 == 0);
            }
        }

        if (isDstRegAllocPartaker || isDstLocallyAssigned)
        {
            for (unsigned j = 0; j < G4_MAX_SRCS; j++)
            {
                // For dpas only src1 is subject to the overlap restriction.
                if (inst->isDpas() && j != 1)
                    continue;
                G4_Operand* src = inst->getSrc(j);
                if (src != NULL &&
                    src->isSrcRegRegion() &&
                    src->asSrcRegRegion()->getBase()->isRegVar() )
                {
                    G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
                    G4_Declare* srcDcl = src->getTopDcl();
                    if (srcRgn->getRegAccess() == Direct &&
                        (src->getTopDcl()->getRegFile() == G4_GRF || src->getTopDcl()->getRegFile() == G4_INPUT))
                    {
                        int srcOffset = src->getLeftBound() / numEltPerGRF<Type_UB>();
                        bool srcOpndNumRows = srcRgn->getLinearizedEnd() - srcRgn->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();

                        int srcReg = 0;
                        bool isSrcEvenAlign = gra.isEvenAligned(srcDcl);
                        if (!src->asSrcRegRegion()->getBase()->isRegAllocPartaker() &&
                            kernel.getOption(vISA_LocalRA))
                        {
                            // src assigned by local RA: alignment comes from
                            // the physical register number.
                            int sreg;
                            LocalLiveRange* localLR = NULL;
                            G4_Declare* topdcl = GetTopDclFromRegRegion(src);

                            if (topdcl)
                                localLR = gra.getLocalLR(topdcl);
                            if (localLR && localLR->getAssigned())
                            {
                                G4_VarBase* preg = localLR->getPhyReg(sreg);

                                MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");
                                srcReg = preg->asGreg()->getRegNum();
                                isSrcEvenAlign = (srcReg % 2 == 0);
                            }
                        }

                        // Pre-assigned inputs: alignment from the fixed GRF.
                        if (srcDcl->getRegFile() == G4_INPUT &&
                            srcDcl->getRegVar()->getPhyReg() != NULL &&
                            srcDcl->getRegVar()->getPhyReg()->isGreg())
                        {
                            srcReg = srcDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
                            isSrcEvenAlign = (srcReg % 2 == 0);
                        }

                        if (dstOpndNumRows || srcOpndNumRows)
                        {
                            // Overlap is tolerated only when both operands
                            // are multi-row, even-aligned, and share the
                            // same row-offset parity; otherwise interfere.
                            if (!(isDstEvenAlign && isSrcEvenAlign &&
                                srcOffset % 2 == dstOffset % 2 &&
                                dstOpndNumRows && srcOpndNumRows))
                            {
                                if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
                                {
                                    unsigned srcId = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
#ifdef DEBUG_VERBOSE_ON
                                    printf("Src%d  ", j);
                                    inst->dump();
#endif
                                    if (isDstRegAllocPartaker)
                                    {
                                        if (!varSplitCheckBeforeIntf(dstId, srcId))
                                        {
                                            checkAndSetIntf(dstId, srcId);
                                            buildInterferenceWithAllSubDcl(dstId, srcId);
                                        }
                                    }
                                    else
                                    {
                                        // dst was locally assigned: interfere
                                        // src with each physical row's HRA dcl.
                                        for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++)
                                        {
                                            int k = getGRFDclForHRA(j)->getRegVar()->getId();
                                            if (!varSplitCheckBeforeIntf(k, srcId))
                                            {
                                                checkAndSetIntf(k, srcId);
                                                buildInterferenceWithAllSubDcl(k, srcId);
                                            }
                                        }
                                    }
                                }
                                else if (kernel.getOption(vISA_LocalRA) && isDstRegAllocPartaker)
                                {
                                    // src was locally assigned: interfere dst
                                    // with each of the src's physical rows.
                                    LocalLiveRange* localLR = NULL;
                                    G4_Declare* topdcl = GetTopDclFromRegRegion(src);

                                    if (topdcl)
                                        localLR = gra.getLocalLR(topdcl);

                                    if (localLR && localLR->getAssigned())
                                    {
                                        int reg, sreg, numrows;
                                        G4_VarBase* preg = localLR->getPhyReg(sreg);
                                        numrows = localLR->getTopDcl()->getNumRows();

                                        MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");

                                        reg = preg->asGreg()->getRegNum();
#ifdef DEBUG_VERBOSE_ON
                                        printf("Src%d  ", j);
                                        inst->dump();
#endif
                                        for (int j = reg, sum = reg + numrows; j < sum; j++)
                                        {
                                            int k = getGRFDclForHRA(j)->getRegVar()->getId();
                                            if (!varSplitCheckBeforeIntf(dstId, k))
                                            {
                                                checkAndSetIntf(dstId, k);
                                                buildInterferenceWithAllSubDcl(dstId, k);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    else if (srcRgn->getRegAccess() == IndirGRF)
                    {
                        // make every var in points-to set live
                        const REGVAR_VECTOR& pointsToSet = liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRgn, bb);
                        for (auto pt : pointsToSet)
                        {
                            if (pt.var->isRegAllocPartaker())
                            {
                                unsigned srcId = pt.var->getId();
                                if (isDstRegAllocPartaker)
                                {
                                    if (!varSplitCheckBeforeIntf(dstId, srcId))
                                    {
                                        checkAndSetIntf(dstId, srcId);
                                        buildInterferenceWithAllSubDcl(dstId, srcId);
                                    }
                                }
                                else
                                {
                                    for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++)
                                    {
                                        int k = getGRFDclForHRA(j)->getRegVar()->getId();
                                        if (!varSplitCheckBeforeIntf(k, srcId))
                                        {
                                            checkAndSetIntf(k, srcId);
                                            buildInterferenceWithAllSubDcl(k, srcId);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
2304 
getRefCount(int loopNestLevel)2305 uint32_t GlobalRA::getRefCount(int loopNestLevel)
2306 {
2307     if (loopNestLevel == 0)
2308     {
2309         return 1;
2310     }
2311     return (uint32_t)std::pow(IN_LOOP_REFERENCE_COUNT_FACTOR, std::min(loopNestLevel, 8));
2312 }
2313 
2314 // handle return value interference for fcall
buildInterferenceForFcall(G4_BB * bb,BitSet & live,G4_INST * inst,std::list<G4_INST * >::reverse_iterator i,const G4_VarBase * regVar)2315 void Interference::buildInterferenceForFcall(G4_BB* bb, BitSet& live, G4_INST* inst, std::list<G4_INST*>::reverse_iterator i, const G4_VarBase* regVar)
2316 {
2317     assert(inst->opcode() == G4_pseudo_fcall && "expect fcall inst");
2318     unsigned refCount = GlobalRA::getRefCount(kernel.getOption(vISA_ConsiderLoopInfoInRA) ?
2319         bb->getNestLevel() : 0);
2320 
2321     if (regVar->isRegAllocPartaker())
2322     {
2323         unsigned id = static_cast<const G4_RegVar*>(regVar)->getId();
2324         lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
2325 
2326         buildInterferenceWithLive(live, id);
2327         updateLiveness(live, id, false);
2328     }
2329 }
2330 
isReRAPass()2331 bool GlobalRA::isReRAPass()
2332 {
2333     auto gtPinInfo = kernel.getGTPinData();
2334     bool reRAPass = gtPinInfo && gtPinInfo->isReRAPass();
2335     return reRAPass;
2336 }
2337 
// Process an instruction's dst during the backward interference walk:
// bump its reference count, interfere it with the current live set, and
// kill it from "live" when the write fully covers the region. Indirect
// GRF dsts instead interfere every variable in their points-to set.
void Interference::buildInterferenceForDst(G4_BB* bb, BitSet& live, G4_INST* inst, std::list<G4_INST*>::reverse_iterator i, G4_DstRegRegion* dst)
{
    unsigned refCount = GlobalRA::getRefCount(kernel.getOption(vISA_ConsiderLoopInfoInRA) ?
        bb->getNestLevel() : 0);

    if (dst->getBase()->isRegAllocPartaker())
    {
        unsigned id = ((G4_RegVar*)dst->getBase())->getId();
        //
        // In following code,
        // pseudo_kill V10
        // mov (8) V10, V11
        //
        // V10 and V11 do not interfere and can be assigned
        // same register.
        //
        // Following condition skips marking interference for
        // pseudo_kill nodes.
        //
        if (!inst->isPseudoKill() &&
            !inst->isLifeTimeEnd())
        {
            lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);  // update reference count

            buildInterferenceWithLive(live, id);
            if (lrs[id]->getIsSplittedDcl())
            {
                // A splitted dcl's overlapping children interfere with the
                // live set too (but are not made live here).
                buildInterferenceWithSubDcl(id, (G4_Operand *)dst, live, false, true);
            }
        }

        //
        // if the write does not cover the whole dst region, we should continue let the
        // liveness propagate upwards
        //
        if (liveAnalysis->writeWholeRegion(bb, inst, dst, builder.getOptions()) ||
            inst->isPseudoKill())
        {
            updateLiveness(live, id, false);

            if (lrs[id]->getIsSplittedDcl())
            {
                for (unsigned i = lrs[id]->getDcl()->getSplitVarStartID();
                    i < lrs[id]->getDcl()->getSplitVarStartID() + gra.getSplitVarNum(lrs[id]->getDcl());
                    i++)
                {
                    live.set(i, false);  //kill all childs, there may be not used childs generated due to splitting, killed also.
                }
            }
        }

        // Indirect defs are actually uses of address reg
        lrs[id]->checkForInfiniteSpillCost(bb, i);
    }
    else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF))
    {
        //
        // add interferences to the list of potential indirect destination accesses.
        //
        const REGVAR_VECTOR& pointsToSet = liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst, bb);
        for (auto pt : pointsToSet)
        {
            if (pt.var->isRegAllocPartaker())
            {
                buildInterferenceWithLive(live, pt.var->getId());
            }
        }
    }
}
2407 
2408 
buildInterferenceWithinBB(G4_BB * bb,BitSet & live)2409 void Interference::buildInterferenceWithinBB(G4_BB* bb, BitSet& live)
2410 {
2411     DebugInfoState state;
2412     unsigned refCount = GlobalRA::getRefCount(kernel.getOption(vISA_ConsiderLoopInfoInRA) ?
2413         bb->getNestLevel() : 0);
2414 
2415     for (auto i = bb->rbegin(); i != bb->rend(); i++)
2416     {
2417         G4_INST* inst = (*i);
2418 
2419         G4_DstRegRegion* dst = inst->getDst();
2420         if (dst)
2421         {
2422             buildInterferenceForDst(bb, live, inst, i, dst);
2423         }
2424 
2425         if (inst->opcode() == G4_pseudo_fcall)
2426         {
2427             if (liveAnalysis->livenessClass(G4_GRF))
2428             {
2429                 G4_FCALL* fcall = kernel.fg.builder->getFcallInfo(bb->back());
2430                 G4_Declare* arg = kernel.fg.builder->getStackCallArg();
2431                 G4_Declare* ret = kernel.fg.builder->getStackCallRet();
2432                 MUST_BE_TRUE(fcall != NULL, "fcall info not found");
2433                 uint16_t retSize = fcall->getRetSize();
2434                 uint16_t argSize = fcall->getArgSize();
2435                 if (ret && retSize > 0 && ret->getRegVar())
2436                 {
2437                     buildInterferenceForFcall(bb, live, inst, i, ret->getRegVar());
2438                 }
2439                 if (arg && argSize > 0 && arg->getRegVar())
2440                 {
2441                     auto id = arg->getRegVar()->getId();
2442                     updateLiveness(live, id, true);
2443                 }
2444             }
2445             else if (liveAnalysis->livenessClass(G4_ADDRESS))
2446             {
2447                 // assume callee will use A0
2448                 auto A0Dcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].A0;
2449                 buildInterferenceWithLive(live, A0Dcl->getRegVar()->getId());
2450             }
2451             else if (liveAnalysis->livenessClass(G4_FLAG))
2452             {
2453                 // assume callee will use both F0 and F1
2454                 auto flagDcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].Flag;
2455                 buildInterferenceWithLive(live, flagDcl->getRegVar()->getId());
2456             }
2457         }
2458 
2459         if ((inst->isSend() || inst->isFillIntrinsic()) && !dst->isNullReg() &&
2460             kernel.fg.builder->WaDisableSendSrcDstOverlap())
2461         {
2462             markInterferenceForSend(bb, inst, dst);
2463         }
2464         else if (kernel.fg.builder->avoidDstSrcOverlap() && dst && !dst->isNullReg())
2465         {
2466             markInterferenceToAvoidDstSrcOverlap(bb, inst);
2467         }
2468 
2469         if ((inst->isSend() || inst->isFillIntrinsic()) && !dst->isNullReg())
2470         {
2471             //r127 must not be used for return address when there is a src and dest overlap in send instruction.
2472             //This applies to split-send as well
2473             if (kernel.fg.builder->needsToReserveR127() && liveAnalysis->livenessClass(G4_GRF))
2474             {
2475                 if (dst->getBase()->isRegAllocPartaker() && !dst->getBase()->asRegVar()->isPhyRegAssigned())
2476                 {
2477                     int dstId = dst->getBase()->asRegVar()->getId();
2478                     lrs[dstId]->markForbidden(kernel.getNumRegTotal() - 1, 1);
2479                 }
2480             }
2481         }
2482 
2483         if (inst->isSplitSend() && !inst->getSrc(1)->isNullReg())
2484         {
2485             G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
2486             G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
2487 
2488             if (src0->getBase()->isRegAllocPartaker() && src1->getBase()->isRegAllocPartaker())
2489             {
2490                 // src0 and src1 of split send may not overlap. In normal cases this is handled automatically
2491                 // as we add interference edge when we reach src0/src1's def.  If one source is an
2492                 // undefined variable (this can happen for URB write payload) and the other an input, however,
2493                 // we could miss the interference edge between the two.  So we add it explicitly here
2494                 int src0Id = src0->getBase()->asRegVar()->getId();
2495                 int src1Id = src1->getBase()->asRegVar()->getId();
2496 
2497                 checkAndSetIntf(src0Id, src1Id);
2498                 buildInterferenceWithAllSubDcl(src0Id, src1Id);
2499             }
2500         }
2501 
2502         //DPAS: As part of same instruction, src1 should not have overlap with dst. Src0 and src2 are okay to have overlap
2503         if (inst->isDpas() && !inst->getSrc(1)->isNullReg())
2504         {
2505             G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
2506             if (dst->getBase()->isRegAllocPartaker() &&
2507                   src1->getBase()->isRegAllocPartaker())
2508             {
2509                 int dstId = dst->getBase()->asRegVar()->getId();
2510                 int src1Id = src1->getBase()->asRegVar()->getId();
2511                 checkAndSetIntf(dstId, src1Id);
2512                 buildInterferenceWithAllSubDcl(dstId, src1Id);
2513             }
2514         }
2515 
2516         //
2517         // process each source operand
2518         //
2519         for (unsigned j = 0; j < G4_MAX_SRCS; j++)
2520         {
2521             G4_Operand* src = inst->getSrc(j);
2522             if (src == NULL)
2523             {
2524                 continue;
2525             }
2526             if (src->isSrcRegRegion())
2527             {
2528                 G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
2529                 if (srcRegion->getBase()->isRegAllocPartaker())
2530                 {
2531                     unsigned id = ((G4_RegVar*)(srcRegion)->getBase())->getId();
2532                     lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount); // update reference count
2533 
2534                     if (!inst->isLifeTimeEnd())
2535                     {
2536                         updateLiveness(live, id, true);
2537                         if (lrs[id]->getIsSplittedDcl())
2538                         {
2539                             buildInterferenceWithSubDcl(id, src, live, true, false);
2540                         }
2541                     }
2542 
2543                     if (inst->isEOT() && liveAnalysis->livenessClass(G4_GRF))
2544                     {
2545                         //mark the liveRange as the EOT source
2546                         lrs[id]->setEOTSrc();
2547                         if (builder.hasEOTGRFBinding())
2548                         {
2549                             lrs[id]->markForbidden(0, kernel.getNumRegTotal() - 16);
2550                         }
2551                     }
2552 
2553                     if (inst->isReturn())
2554                     {
2555                         lrs[id]->setRetIp();
2556                     }
2557                 }
2558                 else if (srcRegion->isIndirect() && liveAnalysis->livenessClass(G4_GRF))
2559                 {
2560                     // make every var in points-to set live
2561                     const REGVAR_VECTOR& pointsToSet = liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRegion, bb);
2562                     for (auto pt : pointsToSet)
2563                     {
2564                         if (pt.var->isRegAllocPartaker())
2565                         {
2566                             updateLiveness(live, pt.var->getId(), true);
2567                         }
2568                     }
2569                 }
2570             }
2571         }
2572 
2573         //
2574         // Process register-indirect destination uses of ARF.
2575         //
2576         if (dst) {
2577             if (dst->getBase()->isRegAllocPartaker() &&
2578                 dst->getRegAccess() != Direct) {
2579                 live.set(dst->getBase()->asRegVar()->getId(), true);
2580             }
2581         }
2582 
2583         //
2584         // Process condMod
2585         //
2586         G4_CondMod* mod = inst->getCondMod();
2587         if (mod != NULL) {
2588             G4_VarBase *flagReg = mod->getBase();
2589             if (flagReg != NULL)
2590             {
2591                 unsigned id = flagReg->asRegVar()->getId();
2592                 if (flagReg->asRegVar()->isRegAllocPartaker())
2593                 {
2594                     lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount); // update reference count
2595                     buildInterferenceWithLive(live, id);
2596 
2597                     if (liveAnalysis->writeWholeRegion(bb, inst, flagReg))
2598                     {
2599                         updateLiveness(live, id, false);
2600                     }
2601 
2602                     lrs[id]->checkForInfiniteSpillCost(bb, i);
2603                 }
2604             }
2605             else
2606             {
2607                 MUST_BE_TRUE((inst->opcode() == G4_sel ||
2608                     inst->opcode() == G4_csel) &&
2609                     inst->getCondMod() != NULL,
2610                     "Invalid CondMod");
2611             }
2612         }
2613 
2614         //
2615         // Process predicate
2616         //
2617         G4_Predicate* predicate = inst->getPredicate();
2618         if (predicate != NULL) {
2619             G4_VarBase *flagReg = predicate->getBase();
2620             unsigned id = flagReg->asRegVar()->getId();
2621             if (flagReg->asRegVar()->isRegAllocPartaker())
2622             {
2623                 lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount); // update reference count
2624                 live.set(id, true);
2625             }
2626         }
2627 
2628         // Update debug info intervals based on live set
2629         if (builder.getOption(vISA_GenerateDebugInfo))
2630         {
2631             updateDebugInfo(kernel, inst, *liveAnalysis, lrs, live, &state, inst == bb->front());
2632         }
2633     }
2634 }
2635 
applyPartitionBias()2636 void Interference::applyPartitionBias()
2637 {
2638     // Any variable that interferes with a VCA dcl is live through an fcall.
2639     // This function makes such variables callee save biased to avoid save/restore
2640     // code around fcall. Save/restore may still be needed in case this is a
2641     // stack call function (vs kernel), but a single save/restore sequence can
2642     // free the callee save register throughout the function.
2643     for (unsigned int i = 0; i != liveAnalysis->getNumSelectedGlobalVar(); i++)
2644     {
2645         if (kernel.fg.isPseudoVCADcl(lrs[i]->getDcl()))
2646         {
2647             const auto& intfs = sparseIntf[i];
2648             for (const auto edge : intfs)
2649             {
2650                 // no point adding bias to any variable already assigned
2651                 if (lrs[edge]->getPhyReg())
2652                     continue;
2653 
2654                 lrs[edge]->setCalleeSaveBias(true);
2655                 lrs[edge]->setCallerSaveBias(false);
2656             }
2657         }
2658     }
2659 }
2660 
computeInterference()2661 void Interference::computeInterference()
2662 {
2663     startTimer(TimerID::INTERFERENCE);
2664     //
2665     // create bool vector, live, to track live ranges that are currently live
2666     //
2667     BitSet live(maxId, false);
2668 
2669     buildInterferenceAmongLiveOuts();
2670 
2671     for (G4_BB *bb : kernel.fg)
2672     {
2673         //
2674         // mark all live ranges dead
2675         //
2676         live.clear();
2677         //
2678         // start with all live ranges that are live at the exit of BB
2679         //
2680         buildInterferenceAtBBExit(bb, live);
2681         //
2682         // traverse inst in the reverse order
2683         //
2684 
2685         buildInterferenceWithinBB(bb, live);
2686     }
2687 
2688     buildInterferenceAmongLiveIns();
2689 
2690     //
2691     // Build interference with physical registers assigned by local RA
2692     //
2693     if (kernel.getOption(vISA_LocalRA))
2694     {
2695         for (auto curBB : kernel.fg)
2696         {
2697             buildInterferenceWithLocalRA(curBB);
2698         }
2699     }
2700 
2701     if (builder.getOption(vISA_RATrace))
2702     {
2703         RPE rpe(gra, liveAnalysis);
2704         rpe.run();
2705         std::cout << "\t--max RP: " << rpe.getMaxRP() << "\n";
2706     }
2707 
2708     // Augment interference graph to accomodate non-default masks
2709     Augmentation aug(kernel, *this, *liveAnalysis, lrs, gra);
2710     aug.augmentIntfGraph();
2711 
2712     generateSparseIntfGraph();
2713 
2714     // apply callee save bias after augmentation as interference graph is up-to-date.
2715     if (kernel.fg.getHasStackCalls())
2716     {
2717         applyPartitionBias();
2718     }
2719 }
2720 
2721 #define SPARSE_INTF_VEC_SIZE 64
2722 
generateSparseIntfGraph()2723 void Interference::generateSparseIntfGraph()
2724 {
2725     // Generate sparse intf graph from the dense one
2726     unsigned numVars = liveAnalysis->getNumSelectedVar();
2727 
2728     sparseIntf.resize(numVars);
2729 
2730     for (unsigned row = 0; row < numVars; row++)
2731     {
2732         sparseIntf[row].reserve(SPARSE_INTF_VEC_SIZE);
2733     }
2734 
2735     if (useDenseMatrix())
2736     {
2737         // Iterate over intf graph matrix
2738         for (unsigned row = 0; row < numVars; row++)
2739         {
2740             unsigned rowOffset = row * rowSize;
2741             unsigned colStart = (row + 1) / BITS_DWORD;
2742             for (unsigned j = colStart; j < rowSize; j++)
2743             {
2744                 unsigned intfBlk = getInterferenceBlk(rowOffset + j);
2745                 if (intfBlk != 0)
2746                 {
2747                     for (unsigned k = 0; k < BITS_DWORD; k++)
2748                     {
2749                         if (intfBlk & (1 << k))
2750                         {
2751                             unsigned v2 = (j*BITS_DWORD) + k;
2752                             if (v2 != row)
2753                             {
2754                                 sparseIntf[v2].emplace_back(row);
2755                                 sparseIntf[row].emplace_back(v2);
2756                             }
2757                         }
2758                     }
2759                 }
2760             }
2761         }
2762     }
2763     else
2764     {
2765         for (uint32_t v1 = 0; v1 < maxId; ++v1)
2766         {
2767             auto&& intfSet = sparseMatrix[v1];
2768             for (uint32_t v2 : intfSet)
2769             {
2770                 sparseIntf[v1].emplace_back(v2);
2771                 sparseIntf[v2].emplace_back(v1);
2772             }
2773         }
2774     }
2775 
2776     if (builder.getOption(vISA_RATrace))
2777     {
2778         uint32_t numNeighbor = 0;
2779         uint32_t maxNeighbor = 0;
2780         uint32_t maxIndex = 0;
2781         for (int i = 0, numVar = (int) sparseIntf.size(); i < numVar; ++i)
2782         {
2783             if (lrs[i]->getPhyReg() == nullptr)
2784             {
2785                 auto intf = sparseIntf[i];
2786                 numNeighbor += (uint32_t)intf.size();
2787                 maxNeighbor = std::max(maxNeighbor, (uint32_t)intf.size());
2788                 if (maxNeighbor == (uint32_t)intf.size())
2789                 {
2790                     maxIndex = i;
2791                 }
2792             }
2793         }
2794         float avgNeighbor = ((float)numNeighbor) / sparseIntf.size();
2795         std::cout << "\t--avg # neighbors: " << std::setprecision(6) << avgNeighbor << "\n";
2796         std::cout << "\t--max # neighbors: " << maxNeighbor << " (" << lrs[maxIndex]->getDcl()->getName() << ")\n";
2797     }
2798 
2799     stopTimer(TimerID::INTERFERENCE);
2800 }
2801 
2802 // This function can be invoked before local RA or after augmentation.
2803 // This function will update sub-reg data only for non-NoMask vars and
2804 // leave others unchanged, ie their value will be as per HW conformity
2805 // or earlier phase.
updateSubRegAlignment(G4_SubReg_Align subAlign)2806 void GlobalRA::updateSubRegAlignment(G4_SubReg_Align subAlign)
2807 {
2808     // Update alignment of all GRF declares to sub-align
2809     for (auto dcl : kernel.Declares)
2810     {
2811         if (dcl->getRegFile() & G4_GRF && !dcl->getIsPartialDcl())
2812         {
2813             G4_Declare* topdcl = dcl->getRootDeclare();
2814 
2815             if (!areAllDefsNoMask(topdcl) &&
2816                 getAugmentationMask(topdcl) != AugmentationMasks::NonDefault)
2817             {
2818                 dcl->setSubRegAlign(subAlign);
2819                 setSubRegAlign(dcl, subAlign);
2820             }
2821         }
2822     }
2823 }
2824 
evenAlignNeeded(G4_Declare * dcl)2825 bool GlobalRA::evenAlignNeeded(G4_Declare* dcl)
2826 {
2827     if (GlobalRA::useGenericAugAlign())
2828     {
2829         // Return true if even alignment is needed
2830         // Even align needed if for given SIMD size and elem type,
2831         // a complete def uses between 1-2 GRFs.
2832         auto kernelSimdSizeToUse = kernel.getSimdSizeWithSlicing();
2833         G4_Declare* topdcl = dcl->getRootDeclare();
2834         auto topdclAugMask = getAugmentationMask(topdcl);
2835 
2836         if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
2837             topdclAugMask != AugmentationMasks::NonDefault)
2838         {
2839             auto elemSizeToUse = topdcl->getElemSize();
2840             if (elemSizeToUse < 4 && topdclAugMask == AugmentationMasks::Default32Bit)
2841                 // :uw with hstride 2 can also be Default32Bit and hence needs even alignment
2842                 elemSizeToUse = 4;
2843             else if (elemSizeToUse < 8 && topdclAugMask == AugmentationMasks::Default64Bit)
2844                 elemSizeToUse = 8;
2845 
2846             if (// Even align if size is between 1-2 GRFs, for >2GRF sizes use weak edges
2847                 (elemSizeToUse * kernelSimdSizeToUse) > (unsigned)numEltPerGRF<Type_UB>() &&
2848                 (elemSizeToUse * kernelSimdSizeToUse) <= (unsigned)(2 * numEltPerGRF<Type_UB>()) &&
2849                 !(kernel.fg.builder->getOption(vISA_enablePreemption) &&
2850                     dcl == kernel.fg.builder->getBuiltinR0()))
2851             {
2852                 return true;
2853             }
2854         }
2855     }
2856     else
2857     {
2858         if (dcl->getRegFile() & G4_GRF)
2859         {
2860             G4_Declare* topdcl = dcl->getRootDeclare();
2861             auto topdclAugMask = getAugmentationMask(topdcl);
2862 
2863             if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
2864                 topdclAugMask != AugmentationMasks::NonDefault &&
2865                 topdclAugMask != AugmentationMasks::Default64Bit)
2866             {
2867                 if ((topdcl->getElemSize() >= 4 || topdclAugMask == AugmentationMasks::Default32Bit) &&
2868                     topdcl->getByteSize() >= numEltPerGRF<Type_UB>() &&
2869                     !(kernel.fg.builder->getOption(vISA_enablePreemption) &&
2870                         dcl == kernel.fg.builder->getBuiltinR0()))
2871                 {
2872                     return true;
2873                 }
2874             }
2875         }
2876     }
2877 
2878     return false;
2879 }
2880 
2881 // This function can be invoked before local RA or after augmentation.
evenAlign()2882 void GlobalRA::evenAlign()
2883 {
2884     // Update alignment of all GRF declares to align
2885     for (auto dcl : kernel.Declares)
2886     {
2887         if (dcl->getRegFile() & G4_GRF)
2888         {
2889             if (evenAlignNeeded(dcl))
2890             {
2891                 setEvenAligned(dcl, true);
2892             }
2893         }
2894     }
2895 }
2896 
getBankAlignment(LiveRange * lr,BankAlign & align)2897 void GlobalRA::getBankAlignment(LiveRange* lr, BankAlign &align)
2898 {
2899     G4_Declare *dcl = lr->getDcl();
2900     if (kernel.getSimdSize() < g4::SIMD16)
2901     {
2902         return;
2903     }
2904 
2905     if (dcl->getRegFile() & G4_GRF)
2906     {
2907         G4_Declare* topdcl = dcl->getRootDeclare();
2908         auto topdclBC = getBankConflict(topdcl);
2909 
2910         if (topdclBC != BANK_CONFLICT_NONE)
2911         {
2912             if (topdcl->getElemSize() >= 4 &&
2913                 topdcl->getNumRows() > 1 &&
2914                 !(kernel.fg.builder->getOption(vISA_enablePreemption) &&
2915                     dcl == kernel.fg.builder->getBuiltinR0()))
2916             {
2917                 if (topdclBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
2918                     topdclBC == BANK_CONFLICT_SECOND_HALF_ODD)
2919                 {
2920                     align = BankAlign::Odd;
2921                 }
2922             }
2923         }
2924     }
2925 }
2926 
// Construct the augmentation pass. All references are borrowed from the
// caller (GlobalRA); no analysis runs until augmentIntfGraph() is called.
Augmentation::Augmentation(G4_Kernel& k, Interference& i, const LivenessAnalysis& l, LiveRange* const ranges[], GlobalRA& g) :
    kernel(k), intf(i), gra(g), liveAnalysis(l), lrs(ranges), fcallRetMap(g.fcallRetMap), m(kernel.fg.mem)
{
}
2931 
2932 // For Scatter read, the channel is not handled as the block read.
2933 // Update the emask according to the definition of VISA
updateDstMaskForGather(G4_INST * inst,std::vector<unsigned char> & mask)2934 bool Augmentation::updateDstMaskForGather(G4_INST* inst, std::vector<unsigned char>& mask)
2935 {
2936     if (const G4_SendDescRaw *d = inst->getMsgDescRaw()) {
2937         return updateDstMaskForGatherRaw(inst, mask, d);
2938     } else if (const G4_SendDescLdSt *d = inst->getMsgDescLdSt()) {
2939         return updateDstMaskForGatherLdSt(inst, mask, d);
2940     } else {
2941         ASSERT_USER(false, "unexpected descriptor");
2942         return false;
2943     }
2944 }
2945 
updateMaskSIMT(unsigned char curEMBit,unsigned char execSize,std::vector<unsigned char> & mask,unsigned dataSizeBytes,unsigned vecElems)2946 static void updateMaskSIMT(
2947     unsigned char curEMBit,
2948     unsigned char execSize,
2949     std::vector<unsigned char>& mask,
2950     unsigned dataSizeBytes, unsigned vecElems)
2951 {
2952     unsigned blockSize = dataSizeBytes;
2953     unsigned blockNum = vecElems;
2954     for (unsigned i = 0; i < execSize; i++)
2955     {
2956         for (unsigned j = 0; j < blockNum; j++)
2957         {
2958             for (unsigned k = 0; k < blockSize; k++)
2959             {
2960                 mask[(j * execSize + i) * blockSize + k] = curEMBit;
2961             }
2962         }
2963         if (curEMBit != NOMASK_BYTE)
2964         {
2965             curEMBit++;
2966             ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
2967         }
2968     }
2969 }
2970 
// Compute per-byte emask channel ids for the destination of a gather-style
// send with a raw descriptor. Returns true when the message type is a
// recognized scatter/gather read and "mask" was filled; false means the
// caller should fall back to block-read handling.
// NOTE(review): the descriptor bit fields decoded below (HDC message types,
// LSC desc[5:0]/[11:9]/[14:12]/[15]) follow the HW message encoding — verify
// against the PRM for the targeted platform when modifying.
bool Augmentation::updateDstMaskForGatherRaw(
    G4_INST* inst, std::vector<unsigned char>& mask, const G4_SendDescRaw* msgDesc)
{
    unsigned char execSize = inst->getExecSize();
    const G4_DstRegRegion* dst = inst->getDst();
    unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
    unsigned short elemSize = dst->getElemSize();

    // WriteEnable (NoMask) instructions write regardless of EM; mark every
    // byte with the NOMASK marker instead of a channel id.
    if (inst->isWriteEnableInst())
    {
        curEMBit = NOMASK_BYTE;
    }

    SFID funcID = msgDesc->getFuncId();

    switch (funcID)
    {
    case SFID::DP_DC1:
        switch (msgDesc->getHdcMessageType())
        {
        case DC1_A64_SCATTERED_READ:   //a64 scattered read: svm_gather
        {
            unsigned blockNum = msgDesc->getBlockNum();
            unsigned blockSize = msgDesc->getBlockSize();

            // Layout: block-major, lane-interleaved within each block.
            for (unsigned i = 0; i < execSize; i++)
            {
                for (unsigned j = 0; j < blockNum; j++)
                {
                    for (unsigned k = 0; k < blockSize; k++)
                    {
                        mask[(j * execSize + i) * blockSize + k] = curEMBit;
                    }
                }
                if (curEMBit != NOMASK_BYTE)
                {
                    curEMBit++;
                    ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
                }
            }
            return true;
        }
        break;

        case DC1_A64_UNTYPED_SURFACE_READ:  //SVM gather 4
        case DC1_UNTYPED_SURFACE_READ:   //VISA gather 4
        case DC1_TYPED_SURFACE_READ:   //Gather 4 typed
        {
            unsigned channelNum = msgDesc->getEnabledChannelNum();
            if (channelNum == 0)
            {
                return false;
            }
            // gather4 returns dword channels; clamp element size up to 4.
            if (elemSize < 4)
            {
                elemSize = 4;
            }

            // Layout: channel-major; EM bits restart for each channel.
            for (unsigned i = 0; i < channelNum; i++)
            {
                for (unsigned j = 0; j < execSize; j++)
                {
                    for (unsigned k = 0; k < elemSize; k++)
                    {
                        mask[(i * execSize + j)*elemSize + k] = curEMBit;
                    }
                    if (curEMBit != NOMASK_BYTE)
                    {
                        curEMBit++;
                        ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
                    }
                }
                curEMBit = (unsigned char)inst->getMaskOffset();
            }
            return true;
        }
        break;

        default: return false;
        }
        break;
    case SFID::DP_DC2:
        switch (msgDesc->getHdcMessageType())
        {
        case DC2_UNTYPED_SURFACE_READ:   //gather 4 scaled
        case DC2_A64_UNTYPED_SURFACE_READ: //SVM gather 4 scaled
        {
            unsigned channelNum = msgDesc->getEnabledChannelNum();
            if (channelNum == 0)
            {
                return false;
            }
            if (elemSize < 4)
            {
                elemSize = 4;
            }

            // Same channel-major layout as the DP_DC1 gather4 cases above.
            for (unsigned i = 0; i < channelNum; i++)
            {
                for (unsigned j = 0; j < execSize; j++)
                {
                    for (unsigned k = 0; k < elemSize; k++)
                    {
                        mask[(i * execSize + j)*elemSize + k] = curEMBit;
                    }
                    if (curEMBit != NOMASK_BYTE)
                    {
                        curEMBit++;
                        ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
                    }
                }
                curEMBit = (unsigned char)inst->getMaskOffset();
            }
            return true;
        }

        case DC2_BYTE_SCATTERED_READ:   //scaled byte scattered read: gather_scaled, handled as block read write
        default: return false;
        }
        break;
    case SFID::DP_DC0:
        switch (msgDesc->getHdcMessageType())
        {
        case DC_DWORD_SCATTERED_READ:   //dword scattered read: gather(dword), handled as block read write
        case DC_BYTE_SCATTERED_READ:       //byte scattered read:   gather(byte), handled as block read write
        default: return false;
        }
        break;

    case SFID::SAMPLER:
    {
        unsigned respLength = msgDesc->ResponseLength();
        if (respLength * numEltPerGRF<Type_UB>() != dst->getTopDcl()->getByteSize() &&
            msgDesc->isFence())
        {
            // since send dst size is not exactly equal to ResponseLength encoded in
            // the descriptor, conservatively treat the send as being non-default
            auto sz = dst->getTopDcl()->getByteSize();
            for (unsigned int i = 0; i != sz; ++i)
                mask[i] = NOMASK_BYTE;
            return true;
        }
        // Sampler returns either 16-bit or 32-bit channels; warpNum is the
        // number of full execSize-wide rows in the response.
        unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
        elemSize = msgDesc->is16BitReturn() ? 2 : 4;
        unsigned warpNum = respLength * numEltPerGRF<Type_UB>() / (execSize * elemSize);
        if (inst->isWriteEnableInst())
        {
            curEMBit = NOMASK_BYTE;
        }
        for (unsigned i = 0; i < warpNum; i++)
        {
            for (unsigned j = 0; j < execSize; j++)
            {
                for (unsigned k = 0; k < elemSize; k++)
                {
                    mask[(i * execSize + j)*elemSize + k] = curEMBit;
                }
                if (curEMBit != NOMASK_BYTE)
                {
                    curEMBit++;
                    ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
                }
            }
            curEMBit = (unsigned char)inst->getMaskOffset();
        }
        return true;
    }

    break;

    case SFID::UGM:
    case SFID::UGML:
    case SFID::SLM:
    {
        // LSC message: decode opcode, data size, vector size and transpose
        // flag directly from the raw descriptor bits.
        uint32_t desc = msgDesc->getDesc();
        uint32_t op = (desc & 0x3F); // [5:0]
        uint32_t dszEncd = (desc >> 9) & 0x7; // [11:9]
        bool isTranspose = ((desc >> 15) & 0x1) != 0; // [15]
        if (op == LSC_LOAD && !isTranspose) { // transpose not supported yet
            int dataSzReg = 0;
            switch (dszEncd) { // dat size [11:9]
            case 0: dataSzReg = 1; break; // d8
            case 1: dataSzReg = 2; break; // d16
            default: dataSzReg = 4; break; // d32, d8u32, d16u32, d16u32h
            case 3: dataSzReg = 8; break; // d64
            }
            int vecSz = 0;
            int vecSzEncd = (desc >> 12) & 0x7; // [14:12]
            if (vecSzEncd <= 3) {
                vecSz = vecSzEncd + 1; // V1, V2, V3, V4
            } else {
                vecSz = 4 << (vecSzEncd - 3); // V8, V16, V32, V64
            }
            updateMaskSIMT(curEMBit, execSize, mask,
                (unsigned)dataSzReg,
                (unsigned)vecSz);
            return true;
        }
    }
    // Unhandled LSC ops (stores, transposed loads) fall through to default.
    default: return false;
    }

    return false;
}
3175 
updateDstMaskForGatherLdSt(G4_INST * inst,std::vector<unsigned char> & mask,const G4_SendDescLdSt * msgDesc)3176 bool Augmentation::updateDstMaskForGatherLdSt(
3177     G4_INST* inst, std::vector<unsigned char>& mask, const G4_SendDescLdSt *msgDesc)
3178 {
3179     // as in the raw case only support SIMT
3180     if (msgDesc->op != LdStOp::LOAD || msgDesc->order == LdStOrder::SCALAR) {
3181         return false;
3182     }
3183     unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
3184     unsigned char execSize = inst->getExecSize();
3185     updateMaskSIMT(curEMBit, execSize, mask,
3186         msgDesc->elemBitsReg, msgDesc->elemPerAddr);
3187 
3188     return true;
3189 }
3190 
3191 // Value stored at each byte in mask determines which bits
3192 // of EM enable that byte for writing. When checkCmodOnly
3193 // is set dst is ignored and mask only for cmod is set. For
3194 // flag declares, mask is at bit granularity rather than byte.
3195 // Function updates mask field in declaration of correspoing
3196 // variable - dst or cmod.
// Update the per-byte (per-bit for flags) write mask recorded on the dst
// variable's (or condMod flag's) root declare. Each written byte is OR'ed
// with the EM channel id that enables it; WriteEnable writes use the
// NOMASK_BYTE marker. When checkCmodOnly is true, dst is ignored and only
// the condMod flag's mask is updated.
void Augmentation::updateDstMask(G4_INST* inst, bool checkCmodOnly)
{
    G4_DstRegRegion* dst = inst->getDst();
    G4_CondMod* cmod = inst->getCondMod();

    // Proceed only if the selected operand (dst or cmod) actually names a
    // register variable.
    if ((checkCmodOnly == false && dst &&
        dst->getBase() &&
        dst->getBase()->isRegVar()) ||
        (checkCmodOnly == true && cmod != NULL && cmod->getBase() != NULL))
    {
        int dclOffset = 0;
        G4_Declare* topdcl = NULL;

        if (checkCmodOnly == false)
        {
            topdcl = dst->getBase()->asRegVar()->getDeclare();
        }
        else
        {
            topdcl = cmod->asCondMod()->getTopDcl();
        }

        // Walk up the alias chain to the root declare, accumulating the
        // byte offset of this dcl within the root.
        while (topdcl->getAliasDeclare() != nullptr)
        {
            dclOffset += topdcl->getAliasOffset();
            topdcl = topdcl->getAliasDeclare();
        }

        auto& mask = const_cast<std::vector<unsigned char>&>(gra.getMask(topdcl));

        // Flags and cmods are tracked at bit granularity, not byte.
        unsigned size = topdcl->getByteSize();
        if (checkCmodOnly == true || dst->isFlag())
        {
            size *= BITS_PER_BYTE;
        }

        // Lazily allocate the mask on first use (zero-initialized).
        if (mask.size() == 0)
        {
            mask.resize(size);
        }

        MUST_BE_TRUE(mask.size() > 0, "Valid mask not found for dcl " << topdcl->getName());

        unsigned short hstride, elemSize;
        short row, subReg;
        unsigned startByte;

        if (checkCmodOnly == false)
        {
            hstride = dst->getHorzStride();

            row = dst->getRegOff();
            subReg = dst->getSubRegOff();
            elemSize = dst->getElemSize();

            // Gather-style sends have a channel layout of their own; if the
            // helper recognized the message it has filled the mask already.
            if (inst->isSend() && !inst->isEOT())
            {
                if (updateDstMaskForGather(inst, mask))
                {
                    return;
                }
            }

            if (dst->isFlag())
            {
                elemSize = 1;
            }

            startByte = (row * getGRFSize()) + (subReg * elemSize);

            // Flag registers: 32 bits per row, 8 bits per subreg unit.
            if (dst->isFlag())
            {
                startByte = (row * 32) + (subReg * 8);
            }
        }
        else
        {
            // condMod: treat as a densely-packed bit range starting at the
            // cmod's left bound.
            hstride = 1;
            row = 0;
            elemSize = 1;
            startByte = cmod->asCondMod()->getLeftBound();
        }

        unsigned rb = 0xffffffff;

        if (checkCmodOnly == true)
        {
            rb = cmod->asCondMod()->getRightBound();
        }
        else
        {
            rb = dst->getRightBound();
        }

        unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
        if (inst->isWriteEnableInst())
        {
            curEMBit = NOMASK_BYTE;
        }

        // OR each written element's bytes with the enabling channel id;
        // the channel advances per element unless this is a NoMask write.
        for (unsigned i = dclOffset + startByte;
            i <= rb;
            i += (hstride * elemSize))
        {
            for (int j = 0; j < elemSize; j++)
            {
                MUST_BE_TRUE2(i + j < size, "updateDstMask writing past end of mask array size:" << size, inst);
                mask[i + j] |= curEMBit;
            }
            if (curEMBit != NOMASK_BYTE)
            {
                curEMBit++;
            }
        }
    }
}
3313 
getByteSizeFromMask(AugmentationMasks type)3314 unsigned Augmentation::getByteSizeFromMask(AugmentationMasks type)
3315 {
3316     if (type == AugmentationMasks::Default16Bit)
3317     {
3318         return 2;
3319     }
3320     else if (type == AugmentationMasks::Default32Bit)
3321     {
3322         return 4;
3323     }
3324     else if (type == AugmentationMasks::Default64Bit)
3325     {
3326         return 8;
3327     }
3328 
3329     MUST_BE_TRUE(false, "Unexpected type of mask");
3330 
3331     return 0;
3332 }
3333 
// Return true if every recorded mask entry of dcl matches the "default"
// pattern implied by the given mask type, i.e. the variable is written
// channel-by-channel in execution-mask order.
bool Augmentation::isDefaultMaskDcl(G4_Declare* dcl, unsigned simdSize, AugmentationMasks type)
{
    // default mask is one where dst's hstride is 1 and
    // elem size is 4
    bool isDefault = false;
    auto& mask = gra.getMask(dcl);

    unsigned byteSize = getByteSizeFromMask(type);

    // treat simd32 as simd16 as the instruction is always split to 2 simd16
    if (simdSize == 32)
    {
        simdSize = 16;
    }
    if (mask.size() > 0)
    {
        G4_Declare* topdcl = dcl->getRootDeclare();
        bool isFlagDcl = (topdcl->getRegFile() == G4_FLAG);

        unsigned size = topdcl->getByteSize();
        unsigned char curEMBit = 0;
        bool found = true;
        unsigned wrapAround = simdSize*byteSize;

        if (isFlagDcl == true)
        {
            // Flag masks are tracked per bit; a flag's channel ids wrap
            // every 16 entries.
            size *= BITS_PER_BYTE;
            wrapAround = 16;
        }

        for (unsigned i = 0; i < size; i += 1)
        {
            // Advance the expected channel id: every entry for flags,
            // every byteSize bytes for GRF dcls. The increment happens
            // before the wrap-around reset below, so i == 0 starts at 0.
            if (isFlagDcl == true)
            {
                curEMBit++;
            }
            else
            {
                if (byteSize && i%byteSize == 0)
                {
                    curEMBit++;
                }
            }

            if (i%wrapAround == 0)
            {
                // Wrap around based on simd size
                // For SIMD8 wrap around each row,
                // for SIMD16 wrap around every other row
                curEMBit = 0;
            }

            if (mask[i] != curEMBit &&
                // For flags, we set bytesize = 2 although
                // the kernel is SIMD8. This means higher 8
                // bits of mask will be set to 0 since those
                // bits are never defined. Such masks need
                // not be considered non-default.
                !(isFlagDcl == true && mask[i] == 0))
            {
                found = false;
                break;
            }
        }

        if (found == true)
        {
            isDefault = true;
        }
    }

    return isDefault;
}
3407 
// Return true if the portion of the parent mask covered by sub-declare dcl
// matches the default channel-ordered pattern. lb/rb are the parent dcl's
// bounds; the sub-dcl's own byte range is derived from its sub-offset.
// Unlike isDefaultMaskDcl, the element size is fixed at 4 bytes here.
bool Augmentation::isDefaultMaskSubDeclare(unsigned char* mask, unsigned lb, unsigned rb, G4_Declare* dcl, unsigned simdSize)
{
    bool isDefault = false;

    // treat simd32 as simd16 as the instruction is always split to 2 simd16
    if (simdSize == 32)
    {
        simdSize = 16;
    }

    if (mask != NULL)
    {
        unsigned size = dcl->getByteSize();
        unsigned char curEMBit = 0;
        bool found = true;
        unsigned wrapAround = simdSize * 4;
        unsigned leftBound = gra.getSubOffset(dcl);
        unsigned rightBound = leftBound + size - 1;

        ASSERT_USER(rightBound <= rb, "Wrong sub declare right bound!");

        // Walk from the parent's left bound so the expected channel id is
        // computed relative to the parent, but only compare entries that
        // fall inside this sub-declare's [leftBound, rightBound] range.
        for (unsigned i = lb; i < rightBound + 1; i += 1)
        {
            // One channel per 4 bytes; increment precedes the wrap reset,
            // so the first iteration still starts at channel 0.
            if ((i - lb) % 4 == 0)
            {
                curEMBit++;
            }

            if ((i - lb) % wrapAround == 0)
            {
                curEMBit = 0;
            }

            if (i >= leftBound)
            {
                if (mask[i] != curEMBit)
                {
                    found = false;
                    break;
                }
            }
        }

        if (found == true)
        {
            isDefault = true;
        }
    }

    return isDefault;
}
3459 
3460 
verifyMaskIfInit(G4_Declare * dcl,AugmentationMasks mask)3461 bool Augmentation::verifyMaskIfInit(G4_Declare* dcl, AugmentationMasks mask)
3462 {
3463     // Return true if dcl mask is either undetermined or same as mask
3464     auto m = gra.getAugmentationMask(dcl);
3465     if (m == mask ||
3466         m == AugmentationMasks::Undetermined)
3467     {
3468         return true;
3469     }
3470 
3471     return false;
3472 }
3473 
// Check for the strided (hstride > 1) variant of the default dst pattern,
// where each logical channel occupies a wider slot than the element type
// (e.g. a W write with hstride 2 occupies DW channels). On a match the dcl
// is classified (or marked NonDefault if it conflicts with an earlier
// classification) and true is returned; otherwise returns false.
bool Augmentation::checkGRFPattern2(G4_Declare* dcl, G4_DstRegRegion* dst, unsigned maskOff,
    unsigned lb, unsigned rb, unsigned execSize)
{
    auto opndByteSize = dst->getTypeSize();
    unsigned modWith = opndByteSize*kernel.getSimdSize();
    // NOTE(review): both comparisons below rely on unsigned arithmetic —
    // when the subtracted term exceeds the minuend the difference wraps to a
    // huge value and the test fails, which appears to be the intended
    // rejection path. Confirm before restructuring this condition.
    if (lb % modWith - (maskOff * opndByteSize * dst->getHorzStride()) <= opndByteSize)
    {
        if ((lb + (execSize * opndByteSize * dst->getHorzStride() - dst->getHorzStride()) - rb) < opndByteSize)
        {
            // A strided 16-bit element maps to 32-bit channels and a strided
            // 32-bit element maps to 64-bit channels; any other size, or a
            // conflict with a previously recorded mask, is non-default.
            if (opndByteSize == 2 &&
                verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit))
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
                return true;
            }
            else if (opndByteSize == 4 &&
                verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit))
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
                return true;
            }
            else
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                return true;
            }
        }
    }

    return false;
}
3505 
3506 // Returns true if dcl mask deemed to be non-default, false otherwise.
checkGRFPattern1(G4_Declare * dcl,G4_DstRegRegion * dst,unsigned maskOff,unsigned lb,unsigned rb,unsigned execSize)3507 bool Augmentation::checkGRFPattern1(G4_Declare* dcl, G4_DstRegRegion* dst, unsigned maskOff,
3508     unsigned lb, unsigned rb, unsigned execSize)
3509 {
3510     auto opndByteSize = dst->getTypeSize();
3511     unsigned modWith = opndByteSize*kernel.getSimdSize();
3512     if (dst->getHorzStride() == 1)
3513     {
3514         if ((lb%modWith == (maskOff * opndByteSize) &&
3515             rb == (lb + (execSize * opndByteSize) - 1)))
3516         {
3517             // This will be taken only when hstride = 1
3518             if (opndByteSize == 2 &&
3519                 verifyMaskIfInit(dcl, AugmentationMasks::Default16Bit))
3520             {
3521                 gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
3522                 return true;
3523             }
3524             else if (opndByteSize == 4 &&
3525                 verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit))
3526             {
3527                 gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
3528                 return true;
3529             }
3530             else if (opndByteSize == 8 &&
3531                 verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit))
3532             {
3533                 gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
3534                 return true;
3535             }
3536             else
3537             {
3538                 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3539                 return true;
3540             }
3541         }
3542     }
3543 
3544     return false;
3545 }
3546 
// Classify the augmentation mask of the dcl written by opnd (a dst region
// or a condition modifier of inst): default (channel-ordered) or
// non-default. Pseudo-kills are ignored.
void Augmentation::markNonDefaultDstRgn(G4_INST* inst, G4_Operand* opnd)
{
    if (inst->isPseudoKill())
    {
        return;
    }

    G4_DstRegRegion* dst = nullptr;
    G4_CondMod* condMod = nullptr;
    if (opnd->isDstRegRegion())
    {
        dst = opnd->asDstRegRegion();
    }
    else if (opnd->isCondMod())
    {
        condMod = opnd->asCondMod();
    }
    else
    {
        MUST_BE_TRUE(false, "Dont know how to handle this type of operand");
    }

    // Handle condMod
    if (condMod && condMod->getBase())
    {
        G4_Declare* dcl = condMod->getTopDcl();
        dcl = dcl->getRootDeclare();

        // A NoMask write, or a write not starting at the instruction's
        // channel offset, makes the flag non-default.
        if (inst->isWriteEnableInst() ||
            opnd->getLeftBound() != inst->getMaskOffset())
        {
            gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
            return;
        }

        if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask))
        {
            gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
        }
        return;
    }

    // Handle dst
    // Calls (and caller-save pseudo ops) clobber their dst unpredictably,
    // so the dst dcl is always non-default.
    if (inst->isCall() || inst->isCallerSave())
    {
        const G4_Declare* dcl = dst->getBase()->asRegVar()->getDeclare();
        if (dcl && liveAnalysis.livenessClass(dcl->getRegFile()))
        {
            gra.setAugmentationMask(dcl->getRootDeclare(), AugmentationMasks::NonDefault);
        }
        return;
    }

    bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);
    if (dst &&
        dst->getBase() &&
        dst->getBase()->isRegVar())
    {
        G4_Declare* dcl = dst->getBase()->asRegVar()->getDeclare();
        if (!liveAnalysis.livenessClass(dcl->getRegFile()))
        {
            return;
        }
        // Resolve aliases to the root dcl, accumulating the alias offset so
        // bounds below are relative to the root.
        unsigned offTopDcl = 0;
        while (dcl->getAliasDeclare())
        {
            offTopDcl += dcl->getAliasOffset();
            dcl = dcl->getAliasDeclare();
        }

        // NoMask instructions's dst is always non-default
        if (inst->isWriteEnableInst())
        {
            gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
            return;
        }

        // Once non-default, a dcl can never become default again.
        if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
            return;

        unsigned maskOff = inst->getMaskOffset();
        unsigned lb = dst->getLeftBound() + offTopDcl;
        unsigned rb = dst->getRightBound() + offTopDcl;
        unsigned execSize = inst->getExecSize();

        // Indirectly addressed dcls cannot be proven default.
        if (dcl->getAddressed())
        {
            gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
            return;
        }

        if (!isFlagRA)
        {
            // Treat send as special case because update mask for scatter
            // has some special checks.
            if (inst->isSend())
            {
                if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
                {
                    return;
                }

                // Record the send's write pattern, then try each default
                // element size from narrowest to widest.
                updateDstMask(inst, false);
                if (isDefaultMaskDcl(dcl, kernel.getSimdSize(), AugmentationMasks::Default16Bit))
                {
                    gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
                }
                else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(), AugmentationMasks::Default32Bit))
                {
                    gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
                }
                else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(), AugmentationMasks::Default64Bit))
                {
                    // A 64-bit default mask is only trusted for dcls small
                    // enough to fit the pattern; larger ones are treated as
                    // non-default.
                    bool useNonDefault = false;
                    useNonDefault |= (kernel.getSimdSize() >= g4::SIMD16 && dcl->getTotalElems() > 8);
                    useNonDefault |= (kernel.getSimdSize() == g4::SIMD8 && dcl->getTotalElems() > 4);

                    if (useNonDefault)
                    {
                        gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                    }
                    else
                    {
                        gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
                    }
                }
                else
                {
                    gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                    return;
                }
            }
            else
            {
                bool found = false;
                // default one
                found |= checkGRFPattern1(dcl, dst, maskOff, lb, rb, execSize);
                if (!found ||
                    gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined)
                {
                    // hstride = 2 case
                    found |= checkGRFPattern2(dcl, dst, maskOff, lb, rb, execSize);
                }

                if (!found ||
                    gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined)
                {
                    gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                }
            }
        }
        else
        {
            // Handle flag register as destination here
            if (!(lb == maskOff && rb == (lb + execSize - 1)))
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                return;
            }

            if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask))
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
            }
        }
    }
}
3714 
3715 // Returns true if any inst found using non-default mask.
3716 // This function sets up lexical id of all instructions.
bool Augmentation::markNonDefaultMaskDef()
{
    // Iterate dcls list and mark obvious ones as non-default.
    // Obvious non-default is 1 element, ie uniform dcl.
    for (auto dcl : kernel.Declares)
    {
        auto dclRegFile = dcl->getRegFile();
        if (!liveAnalysis.livenessClass(dclRegFile))
            continue;

        if (dclRegFile == G4_GRF || dclRegFile == G4_INPUT || dclRegFile == G4_ADDRESS)
        {
            // Small dcls and kernel inputs cannot follow the per-channel
            // default write pattern.
            if (dcl->getTotalElems() < 8 || dclRegFile == G4_INPUT)
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
            }
        }
        else if (dclRegFile == G4_FLAG)
        {
            // Flags are processed when processing instructions
        }
    }

    // Assign lexical ids and classify every dst (and, for flag RA, every
    // condition modifier) in program order.
    unsigned id = 0;
    bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);

    for (auto bb : kernel.fg)
    {
        for (auto inst : *bb)
        {
            inst->setLexicalId(id++);

            G4_DstRegRegion* dst = inst->getDst();

            if (dst)
            {
                markNonDefaultDstRgn(inst, dst);
            }

            if (isFlagRA &&
                inst->getCondMod())
            {
                markNonDefaultDstRgn(inst, inst->getCondMod());
            }
        }
    }

    // Update whether each dcl is default/not
    AugmentationMasks prevAugMask = AugmentationMasks::Undetermined;
    bool nonDefaultMaskDefFound = false;

    for (auto dcl : kernel.Declares)
    {
        if (liveAnalysis.livenessClass(dcl->getRegFile()))
        {
            // Anything never classified is conservatively non-default.
            if (gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined)
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                nonDefaultMaskDefFound = true;
            }

            // Forced bank-conflict-reduction candidates are non-default too.
            if(kernel.getOption(vISA_forceBCR) && gra.getBankConflict(dcl) != BANK_CONFLICT_NONE)
            {
                gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                nonDefaultMaskDefFound = true;
            }

            // A mix of two different (determined) masks across dcls also
            // counts as "non-default masks exist".
            if (!nonDefaultMaskDefFound &&
                gra.getAugmentationMask(dcl) != prevAugMask &&
                prevAugMask != AugmentationMasks::Undetermined)
            {
                nonDefaultMaskDefFound = true;
            }

            prevAugMask = gra.getAugmentationMask(dcl);
        }

        bool checkLRAAlign = false;
        if (liveAnalysis.livenessClass(G4_GRF))
        {
            if ((GlobalRA::useGenericAugAlign() && gra.evenAlignNeeded(dcl)))
                checkLRAAlign = true;
            else if (gra.getAugmentationMask(dcl) == AugmentationMasks::Default32Bit &&
                kernel.getSimdSize() > numEltPerGRF<Type_UD>())
                checkLRAAlign = true;
        }

        if (checkLRAAlign)
        {
            auto dclLR = gra.getLocalLR(dcl);
            if (dclLR)
            {
                int s;
                auto phyReg = dclLR->getPhyReg(s);
                if (phyReg && phyReg->asGreg()->getRegNum() % 2 != 0)
                {
                    // If LRA assignment is not 2GRF aligned for then
                    // mark it as non-default. GRA candidates cannot fully
                    // overlap with such ranges. Partial overlap is illegal.
                    gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
                    nonDefaultMaskDefFound = true;
                }
            }
        }
    }

    return nonDefaultMaskDefFound;
}
3825 
// DFS from src looking for the BB with the smallest (topmost) lexical id
// reachable without passing through "end". Returns nullptr when the path
// loops back to the original source without ever reaching "end".
// "traversal" is a monotonically increasing token used to mark visited BBs.
G4_BB* Augmentation::getTopmostBBDst(G4_BB* src, G4_BB* end, G4_BB* origSrc, unsigned traversal)
{
    // Start from src BB and do a DFS. If any back-edges
    // are found then recursively invoke itself with dst
    // of back-edge. Any path that reaches BB "end"
    // will not be propagated forward.
    unsigned topLexId = src->front()->getLexicalId();
    G4_BB* topmostBB = src;

    if (src != end)
    {
        src->markTraversed(traversal);
        src->setNestLevel();

        for (G4_BB* succ : src->Succs)
        {
            if (succ == origSrc)
            {
                // Src of traversal traversed again without
                // ever traversing end node. So abort this path.
                return nullptr;
            }

            if (succ->isAlreadyTraversed(traversal) == true)
                continue;

            G4_BB* recursiveTopMostBB = getTopmostBBDst(succ, end, origSrc, traversal);

            if (recursiveTopMostBB != NULL)
            {
                // Keep the lexically earliest BB found on any path.
                unsigned recursiveTopMostBBLexId = recursiveTopMostBB->front()->getLexicalId();

                if (recursiveTopMostBBLexId < topLexId)
                {
                    topmostBB = recursiveTopMostBB;
                    topLexId = recursiveTopMostBBLexId;
                }
            }
            else
            {
                // A failing sub-path invalidates this node's candidate,
                // unless we are at the original source itself.
                if (src != origSrc)
                {
                    topmostBB = NULL;
                    topLexId = 0;
                }
            }

            succ->markTraversed(traversal);
            succ->setNestLevel();
        }
    }

    return topmostBB;
}
3880 
updateStartIntervalForSubDcl(G4_Declare * dcl,G4_INST * curInst,G4_Operand * opnd)3881 void Augmentation::updateStartIntervalForSubDcl(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3882 {
3883     for (const G4_Declare *subDcl : gra.getSubDclList(dcl))
3884     {
3885         unsigned leftBound = gra.getSubOffset(subDcl);
3886         unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
3887         if (!(opnd->getRightBound() < leftBound || rightBound < opnd->getLeftBound()))
3888         {
3889             auto subDclStartInterval = gra.getStartInterval(subDcl);
3890             if (subDclStartInterval == NULL ||
3891                 (subDclStartInterval->getLexicalId() > curInst->getLexicalId()))
3892             {
3893                 gra.setStartInterval(subDcl, curInst);
3894             }
3895 
3896             auto subDclEndIntrval = gra.getEndInterval(subDcl);
3897             if (subDclEndIntrval == NULL ||
3898                 (subDclEndIntrval->getLexicalId() < curInst->getLexicalId()))
3899             {
3900                 gra.setEndInterval(subDcl, curInst);
3901             }
3902         }
3903     }
3904 
3905     return;
3906 }
3907 
updateEndIntervalForSubDcl(G4_Declare * dcl,G4_INST * curInst,G4_Operand * opnd)3908 void Augmentation::updateEndIntervalForSubDcl(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3909 {
3910     for (const G4_Declare *subDcl : gra.getSubDclList(dcl))
3911     {
3912         unsigned leftBound = gra.getSubOffset(subDcl);
3913         unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
3914         if (!(opnd->getRightBound() < leftBound || rightBound < opnd->getLeftBound()))
3915         {
3916             auto subDclEndInterval = gra.getEndInterval(subDcl);
3917             if (subDclEndInterval == NULL ||
3918                 (subDclEndInterval->getLexicalId() < curInst->getLexicalId()))
3919             {
3920                 gra.setEndInterval(subDcl, curInst);
3921             }
3922 
3923             auto subDclStartInterval = gra.getStartInterval(subDcl);
3924             if (subDclStartInterval == NULL ||
3925                 (subDclStartInterval->getLexicalId() > curInst->getLexicalId()))
3926             {
3927                 gra.setStartInterval(subDcl, curInst);
3928             }
3929         }
3930     }
3931 
3932     return;
3933 }
3934 
updateStartInterval(const G4_Declare * dcl,G4_INST * curInst)3935 void Augmentation::updateStartInterval(const G4_Declare* dcl, G4_INST* curInst)
3936 {
3937     auto dclStartInterval = gra.getStartInterval(dcl);
3938     if (dclStartInterval == NULL ||
3939         (dclStartInterval->getLexicalId() > curInst->getLexicalId()))
3940     {
3941         gra.setStartInterval(dcl, curInst);
3942     }
3943 
3944     auto dclEndInterval = gra.getEndInterval(dcl);
3945     if (dclEndInterval == NULL ||
3946         (dclEndInterval->getLexicalId() < curInst->getLexicalId()))
3947     {
3948         gra.setEndInterval(dcl, curInst);
3949     }
3950 }
3951 
updateEndInterval(const G4_Declare * dcl,G4_INST * curInst)3952 void Augmentation::updateEndInterval(const G4_Declare* dcl, G4_INST* curInst)
3953 {
3954     auto dclEndInterval = gra.getEndInterval(dcl);
3955     if (dclEndInterval == NULL ||
3956         (dclEndInterval->getLexicalId() < curInst->getLexicalId()))
3957     {
3958         gra.setEndInterval(dcl, curInst);
3959     }
3960 
3961     auto dclStartInterval = gra.getStartInterval(dcl);
3962     if (dclStartInterval == NULL ||
3963         (dclStartInterval->getLexicalId() > curInst->getLexicalId()))
3964     {
3965         gra.setStartInterval(dcl, curInst);
3966     }
3967 }
3968 
updateStartIntervalForLocal(G4_Declare * dcl,G4_INST * curInst,G4_Operand * opnd)3969 void Augmentation::updateStartIntervalForLocal(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3970 {
3971     updateStartInterval(dcl, curInst);
3972     if (dcl->getIsSplittedDcl())
3973     {
3974         updateStartIntervalForSubDcl(dcl, curInst, opnd);
3975     }
3976 }
3977 
updateEndIntervalForLocal(G4_Declare * dcl,G4_INST * curInst,G4_Operand * opnd)3978 void Augmentation::updateEndIntervalForLocal(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3979 {
3980     updateEndInterval(dcl, curInst);
3981     if (dcl->getIsSplittedDcl())
3982     {
3983         updateEndIntervalForSubDcl(dcl, curInst, opnd);
3984     }
3985 }
3986 
3987 
3988 
printLiveIntervals()3989 void GlobalRA::printLiveIntervals()
3990 {
3991     for (const G4_Declare * dcl : kernel.Declares)
3992     {
3993         if (getStartInterval(dcl) != nullptr ||
3994             getEndInterval(dcl) != nullptr)
3995         {
3996             DEBUG_VERBOSE(dcl->getName() << " (");
3997 
3998             if (getStartInterval(dcl) != nullptr)
3999             {
4000                 DEBUG_VERBOSE(getStartInterval(dcl)->getLexicalId());
4001             }
4002             else
4003             {
4004                 DEBUG_VERBOSE("*");
4005             }
4006 
4007             DEBUG_VERBOSE(", ");
4008 
4009             if (getEndInterval(dcl) != nullptr)
4010             {
4011                 DEBUG_VERBOSE(getEndInterval(dcl)->getLexicalId());
4012             }
4013             else
4014             {
4015                 DEBUG_VERBOSE("*");
4016             }
4017 
4018             DEBUG_VERBOSE("] " << std::endl);
4019         }
4020     }
4021 }
4022 
4023 #ifdef DEBUG_VERBOSE_ON
// Debug-only statistics: scan bb backwards counting, over all non-send
// 3-source instructions, register-bank conflicts between the assigned
// physical registers of src0/src1/src2. Outputs via the reference params:
// even_odd_num (src1/src2 in different parity banks), low_high_num
// (src1/src2 in different GRF halves), threeSourceNum (3-src insts seen).
// Returns the number of detected conflicts.
static int calculateBankConflictsInBB(G4_BB* bb, int &even_odd_num, int &low_high_num, int &threeSourceNum, bool twoSrcsBank)
{
    int conflict_num = 0;

    for (std::list<G4_INST*>::reverse_iterator i = bb->rbegin();
        i != bb->rend();
        i++)
    {
        bool hasSrc0 = false;
        int regNum0 = 0;
        int regNum1 = 0;
        int regNum2 = 0;

        const G4_INST* inst = (*i);

        // Only 3-source ALU instructions can hit this conflict pattern.
        if (!(inst->getNumSrc() == 3 && !inst->isSend()))
            continue;

        const G4_Operand* src0 = inst->getSrc(0);
        const G4_Operand* src1 = inst->getSrc(1);
        const G4_Operand* src2 = inst->getSrc(2);


        // Collect physical GRF numbers for sources that have been assigned;
        // unassigned sources keep reg number 0.
        if (src1 && src1->isSrcRegRegion() &&
            src1->getBase() && src1->getBase()->asRegVar()->isPhyRegAssigned())
        {
            regNum1 = src1->getBase()->asRegVar()->getPhyReg()->getRegNum();
        }
        if (src2 && src2->isSrcRegRegion() &&
            src2->getBase() && src2->getBase()->asRegVar()->isPhyRegAssigned())
        {
            regNum2 = src2->getBase()->asRegVar()->getPhyReg()->getRegNum();
        }

        if ((src0 && src0->isSrcRegRegion()) &&
            src0->getBase() && src0->getBase()->asRegVar()->isPhyRegAssigned())
        {
            regNum0 = src0->getBase()->asRegVar()->getPhyReg()->getRegNum();
        }

        // All three sources in the same register: no conflict possible.
        if (regNum1 == regNum2 && regNum0 == regNum1)
            continue;

        if (!twoSrcsBank)
        {
            // Note: && binds tighter than ||, so this reads as
            // (all three in the low half) || (all three in the high half).
            if (regNum0 < SECOND_HALF_BANK_START_GRF &&
                regNum1 < SECOND_HALF_BANK_START_GRF &&
                regNum2 < SECOND_HALF_BANK_START_GRF ||
                regNum0 >= SECOND_HALF_BANK_START_GRF &&
                regNum1 >= SECOND_HALF_BANK_START_GRF &&
                regNum2 >= SECOND_HALF_BANK_START_GRF)
            {
                // Same parity for all three sources means they all target
                // the same bank.
                if (regNum1 % 2 == regNum2 % 2 &&
                    regNum0 % 2 == regNum1 % 2)
                {
                    conflict_num++;
                }
            }
        }
        else
        {
            // Two-source collision model: only src1/src2 matter.
            if ((regNum1 % 2) == (regNum2 % 2))
            {
                // Same parity comment as above applies to the || here.
                if (regNum1 < SECOND_HALF_BANK_START_GRF &&
                    regNum2 < SECOND_HALF_BANK_START_GRF ||
                    regNum1 >= SECOND_HALF_BANK_START_GRF &&
                    regNum2 >= SECOND_HALF_BANK_START_GRF)
                {
                    conflict_num++;
                }
                else
                {
                    low_high_num++;
                }
            }
            else
            {
                even_odd_num++;
            }
        }
        threeSourceNum++;
    }

    return conflict_num;
}
4109 
calculateBankConflicts(G4_Kernel & kernel)4110 static int calculateBankConflicts(G4_Kernel& kernel)
4111 {
4112     bool SIMD16 = (kernel.getSimdSize() >= 16);
4113     bool twoSrcsConflict = kernel.fg.builder->twoSourcesCollision();
4114 
4115     for (G4_BB* curBB : kernel.fg)
4116     {
4117         int even_odd_num = 0;
4118         int low_high_num = 0;
4119         int threeSourceNum = 0;
4120 
4121         int conflict_num = calculateBankConflictsInBB(curBB, even_odd_num, low_high_num, threeSourceNum, twoSrcsConflict);
4122         if (threeSourceNum)
4123         {
4124             if (SIMD16)
4125             {
4126                 printf("SIMD16, BB: %d,  Even_odd: %d, low_high: %d, Conflicts: %d, Three: %d, Insts: %d,  kernel: %s\n", curBB->getId(), even_odd_num, low_high_num, conflict_num, threeSourceNum, curBB->size(), kernel.getName());
4127             }
4128             else
4129             {
4130                 printf("SIMD8, BB: %d,  Even_odd: %d, low_high: %d, Conflicts: %d, Three: %d, Insts: %d,  kernel: %s\n", curBB->getId(), even_odd_num, low_high_num, conflict_num, threeSourceNum, curBB->size(), kernel.getName());
4131             }
4132         }
4133     }
4134 
4135     return 0;
4136 }
4137 #endif
4138 
buildLiveIntervals()4139 void Augmentation::buildLiveIntervals()
4140 {
4141     // Treat variables live-in to program first
4142     G4_BB* entryBB = kernel.fg.getEntryBB();
4143 
4144     // Live-in variables have their start interval start with
4145     // first instruction of entry BB
4146     for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
4147     {
4148         if (liveAnalysis.isLiveAtEntry(entryBB, i))
4149         {
4150             const G4_Declare* dcl = lrs[i]->getDcl()->getRootDeclare();
4151 
4152             updateStartInterval(dcl, entryBB->front());
4153         }
4154     }
4155 
4156     unsigned funcCnt = 0;
4157 
4158     for (G4_BB* curBB : kernel.fg)
4159     {
4160         for (G4_INST* inst : *curBB)
4161         {
4162             if (inst->isPseudoKill() == true)
4163             {
4164                 continue;
4165             }
4166 
4167             G4_DstRegRegion* dst = inst->getDst();
4168 
4169             if (inst->isCall() == true)
4170             {
4171                 const char* name = kernel.fg.builder->getNameString(kernel.fg.builder->mem, 32, "SCALL_%d", funcCnt++);
4172                 G4_Declare* scallDcl = kernel.fg.builder->createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UD);
4173 
4174                 updateStartInterval(scallDcl, inst);
4175                 updateEndInterval(scallDcl, inst);
4176 
4177                 FuncInfo* callee = curBB->getCalleeInfo();
4178                 std::pair<G4_INST*, FuncInfo*> callInfo(inst, callee);
4179                 callDclMap.emplace(scallDcl, callInfo);
4180 
4181                 continue;
4182             }
4183 
4184             if (dst &&
4185                 dst->getRegAccess() == Direct &&
4186                 dst->getBase())
4187             {
4188                 // Destination
4189                 G4_Declare* defdcl = GetTopDclFromRegRegion(dst);
4190 
4191                 if (dst->getBase()->isRegAllocPartaker())
4192                 {
4193                     if (defdcl &&
4194                         gra.getLocalLR(defdcl))
4195                     {
4196                         updateStartIntervalForLocal(defdcl, inst, dst);
4197                     }
4198                     else
4199                     {
4200                         updateStartInterval(defdcl, inst);
4201                     }
4202                 }
4203                 else if (liveAnalysis.livenessClass(G4_GRF))
4204                 {
4205                     LocalLiveRange* defdclLR;
4206 
4207                     // Handle ranges allocated by local RA
4208                     if (defdcl &&
4209                         (defdclLR = gra.getLocalLR(defdcl)) &&
4210                         defdclLR->getAssigned() == true &&
4211                         !defdclLR->isEOT())
4212                     {
4213                         updateStartInterval(defdcl, inst);
4214                     }
4215                 }
4216             }
4217             else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
4218                 dst &&
4219                 dst->getRegAccess() == IndirGRF &&
4220                 dst->getBase() &&
4221                 dst->getBase()->isRegVar())
4222             {
4223                 // Destination is indirect
4224                 G4_Declare* defdcl = dst->getBaseRegVarRootDeclare();
4225 
4226                 updateEndInterval(defdcl, inst);
4227             }
4228 
4229             if (liveAnalysis.livenessClass(G4_FLAG))
4230             {
4231                 G4_CondMod* cmod = inst->getCondMod();
4232 
4233                 if (cmod != nullptr &&
4234                     cmod->getBase() != nullptr)
4235                 {
4236                     // Conditional modifier
4237                     G4_Declare* dcl = cmod->getBaseRegVarRootDeclare();
4238 
4239                     updateStartInterval(dcl, inst);
4240                 }
4241             }
4242 
4243             for (unsigned i = 0; i < G4_MAX_SRCS; i++)
4244             {
4245                 G4_Operand* src = inst->getSrc(i);
4246                 if (!src || !src->isSrcRegRegion())
4247                 {
4248                     continue;
4249                 }
4250                 G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
4251 
4252                 if (srcRegion->getRegAccess() == Direct && srcRegion->getBase())
4253                 {
4254                     G4_Declare* usedcl = GetTopDclFromRegRegion(src);
4255 
4256                     if (srcRegion->getBase()->isRegAllocPartaker())
4257                     {
4258                         if (gra.getLocalLR(usedcl))
4259                         {
4260                             updateEndIntervalForLocal(usedcl, inst, src);
4261                         }
4262                         else
4263                         {
4264                             updateEndInterval(usedcl, inst);
4265                         }
4266                     }
4267                     else if (liveAnalysis.livenessClass(G4_GRF))
4268                     {
4269                         LocalLiveRange* usedclLR = nullptr;
4270                         if (usedcl &&
4271                             (usedclLR = gra.getLocalLR(usedcl)) &&
4272                             usedclLR->getAssigned() == true &&
4273                             !usedclLR->isEOT())
4274                         {
4275                             updateEndInterval(usedcl, inst);
4276                         }
4277                     }
4278                 }
4279                 else if (liveAnalysis.livenessClass(G4_GRF) && srcRegion->isIndirect())
4280                 {
4281                     const REGVAR_VECTOR& pointsToSet = liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRegion, curBB);
4282                     for (auto pointsToVar : pointsToSet)
4283                     {
4284                         if (pointsToVar.var->isRegAllocPartaker())
4285                         {
4286                             updateEndInterval(pointsToVar.var->getDeclare()->getRootDeclare(), inst);
4287                         }
4288                     }
4289                 }
4290                 else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
4291                     srcRegion->getRegAccess() == IndirGRF &&
4292                     srcRegion->getBase() &&
4293                     srcRegion->getBase()->isRegVar())
4294                 {
4295                     G4_Declare* usedcl = src->getBaseRegVarRootDeclare();
4296 
4297                     updateEndInterval(usedcl, inst);
4298                 }
4299             }
4300 
4301             if (liveAnalysis.livenessClass(G4_FLAG))
4302             {
4303                 G4_Predicate* pred = inst->getPredicate();
4304 
4305                 if (pred != NULL)
4306                 {
4307                     // Predicate
4308                     G4_Declare* dcl = pred->getBaseRegVarRootDeclare();
4309 
4310                     updateEndInterval(dcl, inst);
4311                 }
4312             }
4313         }
4314     }
4315 
4316     // extend all variables that are live at bb entry to the given inst
4317     // ToDo: this seems very slow when # variable is large, should look for sparse implementation
4318     auto extendVarLiveness = [this](G4_BB* bb, G4_INST* inst)
4319     {
4320         for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
4321         {
4322             if (liveAnalysis.isLiveAtEntry(bb, i) == true)
4323             {
4324                 // Extend ith live-interval
4325                 G4_Declare* dcl = lrs[i]->getDcl()->getRootDeclare();
4326 
4327 #ifdef DEBUG_VERBOSE_ON
4328                 unsigned oldStart = dcl->getStartInterval()->getLexicalId();
4329 #endif
4330 
4331                 updateStartInterval(dcl, inst);
4332 
4333 #ifdef DEBUG_VREBOSE_ON
4334                 if (oldStart > dcl->getStartInterval()->getLexicalId())
4335                 {
4336                     std::cout << "Extending " << dcl->getName() << " from old start " <<
4337                         oldStart << " to " <<
4338                         startInst->getLexicalId() <<
4339                         " due to back-edge" <<
4340                         std::endl;
4341                 }
4342 #endif
4343             }
4344         }
4345     };
4346 
4347     if (!kernel.fg.isReducible())
4348     {
4349         //use SCC instead
4350         //FIXME: does augmentation work in the presence of subroutine? neither SCCAnalysis nor findNaturalLoops
4351         //considers the call graph
4352         SCCAnalysis SCCFinder(kernel.fg);
4353         SCCFinder.run();
4354         for (auto iter = SCCFinder.SCC_begin(), iterEnd = SCCFinder.SCC_end(); iter != iterEnd; ++iter)
4355         {
4356             auto&& anSCC = *iter;
4357             std::unordered_set<G4_BB*> SCCSucc; // any successor BB of the SCC
4358             G4_BB* headBB = anSCC.getEarliestBB();
4359             for (auto BI = anSCC.body_begin(), BIEnd = anSCC.body_end(); BI != BIEnd; ++BI)
4360             {
4361                 G4_BB* bb = *BI;
4362                 for (auto succ : bb->Succs)
4363                 {
4364                     if (!anSCC.isMember(succ))
4365                     {
4366                         SCCSucc.insert(succ);
4367                     }
4368                 }
4369             }
4370             for (auto exitBB : SCCSucc)
4371             {
4372                 extendVarLiveness(exitBB, headBB->front());
4373             }
4374         }
4375     }
4376     else
4377     {
4378         // process each natural loop
4379         for (auto&& iter : kernel.fg.naturalLoops)
4380         {
4381             auto&& backEdge = iter.first;
4382             G4_INST* startInst = (backEdge.second)->front();
4383             const std::set<G4_BB*>& loopBody = iter.second;
4384 
4385             for (auto block : loopBody)
4386             {
4387                 // FIXME: this may process a BB multiple times
4388                 for (auto succBB : block->Succs)
4389                 {
4390                     if (loopBody.find(succBB) == loopBody.end())
4391                     {
4392                         G4_BB* exitBB = succBB;
4393 
4394                         unsigned latchBBId = (backEdge.first)->getId();
4395                         unsigned exitBBId = succBB->getId();
4396                         if (exitBBId < latchBBId &&
4397                             succBB->Succs.size() == 1)
4398                         {
4399                             exitBB = succBB->Succs.front();
4400                         }
4401 
4402 #ifdef DEBUG_VERBOSE_ON
4403                         std::cout << "==> Extend live-in for BB" << exitBB->getId() << std::endl;
4404                         exitBB->emit(std::cout);
4405 #endif
4406                         extendVarLiveness(exitBB, startInst);
4407                     }
4408                 }
4409             }
4410 
4411             G4_BB* startBB = backEdge.second;
4412             G4_BB* EndBB = backEdge.first;
4413             for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
4414             {
4415                 if (liveAnalysis.isLiveAtEntry(startBB, i) == true &&
4416                     liveAnalysis.isLiveAtExit(EndBB, i) == true)
4417                 {
4418                     const G4_Declare* dcl = lrs[i]->getDcl()->getRootDeclare();
4419 
4420 #ifdef DEBUG_VERBOSE_ON
4421                     unsigned oldEnd = dcl->getEndInterval()->getLexicalId();
4422 #endif
4423 
4424                     updateEndInterval(dcl, EndBB->back());
4425 
4426 #ifdef DEBUG_VERBOSE_ON
4427                     if (oldEnd < dcl->getEndInterval()->getLexicalId())
4428                     {
4429                         std::cout << "Extending " << dcl->getName() << " from old end " <<
4430                             oldEnd << " to " <<
4431                             dcl->getEndInterval()->getLexicalId() <<
4432                             " due to back-edge" <<
4433                             std::endl;
4434                     }
4435 #endif
4436                 }
4437             }
4438 
4439         }
4440     }
4441 
4442 #ifdef DEBUG_VERBOSE_ON
4443     // Print calculated live-ranges
4444     gra.printLiveIntervals();
4445 #endif
4446 }
4447 
clearIntervalInfo()4448 void Augmentation::clearIntervalInfo()
4449 {
4450     // Clear out calculated information so that subsequent RA
4451     // iterations dont have stale information
4452     for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin(), end = kernel.Declares.end();
4453         dcl_it != end;
4454         dcl_it++)
4455     {
4456         gra.setStartInterval(*dcl_it, nullptr);
4457         gra.setEndInterval(*dcl_it, nullptr);
4458         gra.setMask(*dcl_it, {});
4459         gra.setAugmentationMask(*dcl_it, AugmentationMasks::Undetermined);
4460     }
4461 }
4462 
4463 class compareInterval
4464 {
4465 public:
4466     GlobalRA& gra;
4467 
compareInterval(GlobalRA & g)4468     compareInterval(GlobalRA& g) : gra(g)
4469     {
4470     }
4471 
operator ()(G4_Declare * dcl1,G4_Declare * dcl2)4472     bool operator()(G4_Declare* dcl1, G4_Declare* dcl2)
4473     {
4474         return gra.getStartInterval(dcl1)->getLexicalId() < gra.getStartInterval(dcl2)->getLexicalId();
4475     }
4476 };
4477 
sortLiveIntervals()4478 void Augmentation::sortLiveIntervals()
4479 {
4480     // Sort all intervals in kernel based on their starting point in
4481     // ascending order and return them in sortedIntervals vector
4482     // This is actually more efficient (at least according to vTune) than the O(N)
4483     // bucket sort algorithm below, since it avoids most of the malloc/free overhead from the vector.resize()
4484     for (G4_Declare* dcl : kernel.Declares)
4485     {
4486         if (gra.getStartInterval(dcl) != NULL)
4487         {
4488             sortedIntervals.push_back(dcl);
4489         }
4490     }
4491 
4492     std::sort(sortedIntervals.begin(), sortedIntervals.end(), compareInterval(gra));
4493 
4494 #ifdef DEBUG_VERBOSE_ON
4495     DEBUG_VERBOSE("Live-intervals in sorted order: " << std::endl);
4496     for (const G4_Declare* dcl : sortedIntervals)
4497     {
4498         DEBUG_VERBOSE(dcl->getName() << " - " <<
4499             "(" << dcl->getStartInterval()->getLexicalId() <<
4500             ", " << dcl->getEndInterval()->getLexicalId() <<
4501             "]" << std::endl);
4502     }
4503 #endif
4504 }
4505 
getEnd(const G4_Declare * dcl) const4506 unsigned Augmentation::getEnd(const G4_Declare* dcl) const
4507 {
4508     return gra.getEndInterval(dcl)->getLexicalId();
4509 }
4510 
4511 // Mark interference between dcls. Either one of dcls may have
4512 // register assigned by local RA so handle those cases too.
4513 // Re-entrant function.
void Augmentation::handleSIMDIntf(G4_Declare* firstDcl, G4_Declare* secondDcl, bool isCall)
{
    // Helper: firstDcl is an RA partaker but lraAssigned already holds a
    // physical GRF range from local RA. Mark interference between firstDcl
    // and the pseudo dcl standing in for each GRF row of that assignment.
    auto markIntfWithLRAAssignment = [](const G4_Declare* firstDcl, const G4_Declare* lraAssigned, Interference& intf)
    {
        unsigned numRows = lraAssigned->getNumRows();
        const G4_VarBase* preg = lraAssigned->getRegVar()->getPhyReg();
        MUST_BE_TRUE(preg->isGreg(), "Expecting a physical register during building interference among incompatible masks");
        unsigned start = preg->asGreg()->getRegNum();

        for (unsigned i = start; i < (start + numRows); i++)
        {
            auto GRFDcl = intf.getGRFDclForHRA(i);
            intf.checkAndSetIntf(firstDcl->getRegVar()->getId(), GRFDcl->getRegVar()->getId());

#ifdef DEBUG_VERBOSE_ON
            DEBUG_VERBOSE("Marking interference between " << firstDcl->getName() <<
                " and " << GRFDcl->getName() << std::endl);
#endif
        }
    };

    // Two pre-assigned inputs can never be reallocated, so no edge is needed.
    if (firstDcl->getRegFile() == G4_INPUT &&
        firstDcl->getRegVar()->getPhyReg() != NULL &&
        secondDcl->getRegFile() == G4_INPUT &&
        secondDcl->getRegVar()->getPhyReg() != NULL)
    {
        return;
    }

    auto contain = [](const auto& C, auto pred)
    {
        return std::find_if(C.cbegin(), C.cend(), pred) != C.cend();
    };

    bool isFirstDcl = true;

    // Matches when either dcl is a VCA (caller-save) pseudo node of some
    // fcall; side effect: records which of the two it was.
    auto pred = [firstDcl, secondDcl, &isFirstDcl](const auto& el)
    {
        if (el.second.VCA == firstDcl) return true;
        if (el.second.VCA == secondDcl)
        {
            isFirstDcl = false;
            return true;
        }
        return false;
    };

    if (contain(kernel.fg.fcallToPseudoDclMap, pred))
    {
        // Mark intf for following pattern:
        // V33 =
        // ...
        // if
        //     = V33
        //     fcall
        // ...
        // else
        //     = V33
        // endif
        //
        // V33 will interfere with VCA_SAVE pseudo node.
        // It also needs to interfere with retval to
        // ensure V33 and retval dont get same allocation.
        // Note that if V33 is actually live after fcall
        // then graph coloring will do this for us. In this
        // case however we need to rely on augmentation.
        FCALL_RET_MAP_ITER retIter = isFirstDcl ? fcallRetMap.find(firstDcl) : fcallRetMap.find(secondDcl);
        if (retIter != fcallRetMap.end())
        {
            G4_Declare* retVar = retIter->second;
            LocalLiveRange* otherDclLR;
            G4_Declare* otherDcl = isFirstDcl ? secondDcl : firstDcl;
            // Edge with the return value: direct if otherDcl is an RA
            // partaker, via its local-RA GRF assignment otherwise.
            if (otherDcl->getRegVar()->isRegAllocPartaker())
                intf.checkAndSetIntf(otherDcl->getRegVar()->getId(), retVar->getRegVar()->getId());
            else if ((otherDclLR = gra.getLocalLR(otherDcl)) &&
                otherDclLR->getAssigned() &&
                !otherDclLR->isEOT())
            {
                markIntfWithLRAAssignment(retVar, otherDcl, intf);
            }
        }
    }

    if (firstDcl->getRegVar()->isRegAllocPartaker() &&
        secondDcl->getRegVar()->isRegAllocPartaker())
    {
        // Both are graph-coloring candidates: ordinary edge, unless the
        // variable-split heuristic says the pair must stay mergeable.
        if (!intf.varSplitCheckBeforeIntf(firstDcl->getRegVar()->getId(), secondDcl->getRegVar()->getId()))
        {
            intf.checkAndSetIntf(firstDcl->getRegVar()->getId(), secondDcl->getRegVar()->getId());
            if (isCall)
            {
                intf.buildInterferenceWithAllSubDcl(firstDcl->getRegVar()->getId(), secondDcl->getRegVar()->getId());
            }
#ifdef DEBUG_VERBOSE_ON
            DEBUG_VERBOSE("Marking interference between " << firstDcl->getName() <<
                " and " << secondDcl->getName() << std::endl);
#endif
        }
    }
    else if (liveAnalysis.livenessClass(G4_GRF))
    {
        // Mixed case: exactly one of the two may carry a local-RA GRF
        // assignment; mark edges against those physical rows.
        LocalLiveRange* secondDclLR = nullptr, *firstDclLR = nullptr;

        if (firstDcl->getRegVar()->isRegAllocPartaker() &&
            (secondDclLR = gra.getLocalLR(secondDcl)) &&
            secondDclLR->getAssigned() &&
            !secondDclLR->isEOT())
        {
            // secondDcl was assigned by local RA and it uses
            markIntfWithLRAAssignment(firstDcl, secondDcl, intf);
        }
        else if (secondDcl->getRegVar()->isRegAllocPartaker() &&
            (firstDclLR = gra.getLocalLR(firstDcl)) &&
            firstDclLR->getAssigned() &&
            !firstDclLR->isEOT())
        {
            // Call self with reversed parameters instead of re-implementing
            // above code
            handleSIMDIntf(secondDcl, firstDcl, isCall);
        }
    }
}
4636 
isNoMask(const G4_Declare * dcl,unsigned size) const4637 bool Augmentation::isNoMask(const G4_Declare* dcl, unsigned size) const
4638 {
4639     auto& mask = gra.getMask(dcl);
4640     bool result = false;
4641 
4642     if (mask.size() > 0)
4643     {
4644         result = true;
4645 
4646         for (unsigned i = 0; i < size; i++)
4647         {
4648             if (mask[i] != NOMASK_BYTE)
4649             {
4650                 result = false;
4651             }
4652         }
4653     }
4654 
4655     return result;
4656 }
4657 
isConsecutiveBits(const G4_Declare * dcl,unsigned size) const4658 bool Augmentation::isConsecutiveBits(const G4_Declare* dcl, unsigned size) const
4659 {
4660     auto& mask = gra.getMask(dcl);
4661     bool result = false;
4662 
4663     if (mask.size() > 0)
4664     {
4665         result = true;
4666 
4667         for (unsigned i = 0; i < size; i++)
4668         {
4669             if (mask[i] != i)
4670             {
4671                 result = false;
4672             }
4673         }
4674     }
4675 
4676     return result;
4677 }
4678 
isCompatible(const G4_Declare * testDcl,const G4_Declare * biggerDcl) const4679 bool Augmentation::isCompatible(const G4_Declare* testDcl, const G4_Declare* biggerDcl) const
4680 {
4681     bool compatible = false;
4682 
4683     unsigned testSize = testDcl->getRegVar()->isFlag() ? testDcl->getNumberFlagElements() : testDcl->getByteSize();
4684     unsigned biggerSize = biggerDcl->getRegVar()->isFlag() ? biggerDcl->getNumberFlagElements() : biggerDcl->getByteSize();
4685     unsigned size = (testSize < biggerSize ? testSize : biggerSize);
4686 
4687     // Masks are compatible when:
4688     // i. Both decls have exactly 1 EM bit defining each byte
4689     //  (This means a dcl with Q1 in one inst and Q2 in another
4690     //   instruction writing same subregisters is not a candidate
4691     //   for next step).
4692     // ii. Bytes at common indices are enabled by same EM bit
4693     //  (This means NoMask dcl is compatible with NoMask dcl and
4694     //   not with any other dcl).
4695     // UPDATE: (ii) above is now altered such that NoMask dcls
4696     // that overlap are considered to be incompatible. This is to
4697     // handle removal of JIP edges (then->else edge).
4698 
4699     auto& testMask = gra.getMask(testDcl);
4700     auto& biggerMask = gra.getMask(biggerDcl);
4701 
4702     if (testMask.size() > 0 && biggerMask.size() > 0)
4703     {
4704         // Lets pattern match
4705         if (testDcl->getRegFile() == G4_FLAG)
4706         {
4707             if (isConsecutiveBits(testDcl, size) &&
4708                 isConsecutiveBits(biggerDcl, size))
4709             {
4710                 compatible = true;
4711             }
4712         }
4713         else
4714         {
4715             // Add another pattern to check here
4716         }
4717     }
4718 
4719     return compatible;
4720 }
4721 
expireIntervals(unsigned startIdx)4722 void Augmentation::expireIntervals(unsigned startIdx)
4723 {
4724     // Expire elements from both lists
4725     while (defaultMask.size() > 0)
4726     {
4727         if (gra.getEndInterval(defaultMask.front())->getLexicalId() <= startIdx)
4728         {
4729 #ifdef DEBUG_VERBOSE_ON
4730             DEBUG_VERBOSE("Expiring " << defaultMask.front()->getName() << std::endl);
4731 #endif
4732             defaultMask.pop_front();
4733         }
4734         else
4735         {
4736             break;
4737         }
4738     }
4739 
4740     while (nonDefaultMask.size() > 0)
4741     {
4742         if (gra.getEndInterval(nonDefaultMask.front())->getLexicalId() <= startIdx)
4743         {
4744 #ifdef DEBUG_VERBOSE_ON
4745             DEBUG_VERBOSE("Expiring " << nonDefaultMask.front()->getName() << std::endl);
4746 #endif
4747             nonDefaultMask.pop_front();
4748         }
4749         else
4750         {
4751             break;
4752         }
4753     }
4754 }
4755 
4756 // Return true if edge between dcl1 and dcl2 is strong.
isStrongEdgeBetween(const G4_Declare * dcl1,const G4_Declare * dcl2) const4757 bool Interference::isStrongEdgeBetween(const G4_Declare* dcl1, const G4_Declare* dcl2) const
4758 {
4759     auto dcl1RegVar = dcl1->getRegVar();
4760     auto dcl2RegVar = dcl2->getRegVar();
4761     auto dcl1RAPartaker = dcl1RegVar->isRegAllocPartaker();
4762     auto dcl2RAPartaker = dcl2RegVar->isRegAllocPartaker();
4763 
4764     if (dcl1RAPartaker &&
4765         dcl2RAPartaker)
4766     {
4767         if (interfereBetween(dcl1RegVar->getId(),
4768             dcl2RegVar->getId()))
4769         {
4770             return true;
4771         }
4772         else
4773         {
4774             return false;
4775         }
4776     }
4777 
4778     if (dcl1RAPartaker)
4779     {
4780         auto dcl2NumRows = dcl2->getNumRows();
4781         auto startPhyReg = dcl2RegVar->getPhyReg()->asGreg()->getRegNum();
4782         auto dcl2LR = gra.getLocalLR(dcl2);
4783 
4784         if (dcl2LR &&
4785             dcl2LR->getAssigned())
4786         {
4787             bool allEdgesStrong = true;
4788             for (unsigned i = startPhyReg; i < (startPhyReg + dcl2NumRows); i++)
4789             {
4790                 const G4_Declare* lraPreg = getGRFDclForHRA(i);
4791                 allEdgesStrong &= interfereBetween(lraPreg->getRegVar()->getId(), dcl1RegVar->getId());
4792             }
4793 
4794             if (allEdgesStrong)
4795                 return true;
4796         }
4797     }
4798     else
4799     {
4800         return isStrongEdgeBetween(dcl2, dcl1);
4801     }
4802 
4803     return false;
4804 }
4805 
weakEdgeNeeded(AugmentationMasks defaultDclMask,AugmentationMasks newDclMask)4806 bool Augmentation::weakEdgeNeeded(AugmentationMasks defaultDclMask, AugmentationMasks newDclMask)
4807 {
4808     if (GlobalRA::useGenericAugAlign())
4809     {
4810         // Weak edge needed in case #GRF exceeds 2
4811         if (newDclMask == AugmentationMasks::Default64Bit)
4812             return (TypeSize(Type_Q) * kernel.getSimdSizeWithSlicing()) > (unsigned)(2 * numEltPerGRF<Type_UB>());
4813 
4814         if (newDclMask == AugmentationMasks::Default32Bit)
4815         {
4816             // Even align up to 2 GRFs size variable, use weak edges beyond
4817             return (TypeSize(Type_D) * kernel.getSimdSizeWithSlicing()) > (unsigned)(2 * numEltPerGRF<Type_UB>());
4818         }
4819     }
4820     else
4821     {
4822         return (defaultDclMask == AugmentationMasks::Default64Bit &&
4823             newDclMask == AugmentationMasks::Default64Bit);
4824     }
4825 
4826     return false;
4827 }
4828 
4829 //
4830 // Mark interference between newDcl and other incompatible dcls in current active lists.
4831 //
addSIMDIntfDclForCallSite(MaskDeclares * maskDeclares)4832 void Augmentation::addSIMDIntfDclForCallSite(MaskDeclares* maskDeclares)
4833 {
4834     for (auto defaultDcl : defaultMask)
4835     {
4836         maskDeclares->first.set(defaultDcl->getRegVar()->getId(), true);
4837     }
4838 
4839     for (auto nonDefaultDcl : nonDefaultMask)
4840     {
4841         maskDeclares->second.set(nonDefaultDcl->getRegVar()->getId(), true);
4842     }
4843 }
4844 
addSIMDIntfForRetDclares(G4_Declare * newDcl)4845 void Augmentation::addSIMDIntfForRetDclares(G4_Declare* newDcl)
4846 {
4847     auto dclIt = retDeclares.find(newDcl);
4848     MaskDeclares* mask = nullptr;
4849     if (dclIt == retDeclares.end())
4850     {
4851         MaskDeclares newMask;
4852         newMask.first.resize(liveAnalysis.getNumSelectedGlobalVar());
4853         newMask.second.resize(liveAnalysis.getNumSelectedGlobalVar());
4854         retDeclares[newDcl] = std::move(newMask);
4855         mask = &retDeclares[newDcl];
4856     }
4857     else
4858     {
4859         mask = &dclIt->second;
4860     }
4861     addSIMDIntfDclForCallSite(mask);
4862 }
4863 
4864 //
4865 // Mark interference between newDcl and other incompatible dcls in current active lists.
4866 //
buildSIMDIntfDcl(G4_Declare * newDcl,bool isCall)4867 void Augmentation::buildSIMDIntfDcl(G4_Declare* newDcl, bool isCall)
4868 {
4869     auto newDclAugMask = gra.getAugmentationMask(newDcl);
4870 
4871     for (auto defaultDcl : defaultMask)
4872     {
4873         if (gra.getAugmentationMask(defaultDcl) != newDclAugMask)
4874         {
4875             handleSIMDIntf(defaultDcl, newDcl, isCall);
4876         }
4877         else
4878         {
4879             if (liveAnalysis.livenessClass(G4_GRF) &&
4880                 // Populate compatible sparse intf data structure
4881                 // only for weak edges.
4882                 weakEdgeNeeded(gra.getAugmentationMask(defaultDcl), newDclAugMask))
4883             {
4884                 if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
4885                     newDcl->getRegVar()->isPhyRegAssigned())
4886                 {
4887                     continue;
4888                 }
4889 
4890                 if (intf.isStrongEdgeBetween(defaultDcl, newDcl))
4891                 {
4892                     // No need to add weak edge
4893                     continue;
4894                 }
4895 
4896                 // defaultDcl and newDcl are compatible live-ranges and can have weak edge in intf graph
4897                 auto it = intf.compatibleSparseIntf.find(defaultDcl);
4898                 if (it != intf.compatibleSparseIntf.end())
4899                 {
4900                     it->second.push_back(newDcl);
4901                 }
4902                 else
4903                 {
4904                     std::vector<G4_Declare*> v(1, newDcl);
4905                     intf.compatibleSparseIntf.insert(
4906                         std::make_pair(defaultDcl, v));
4907                 }
4908 
4909                 it = intf.compatibleSparseIntf.find(newDcl);
4910                 if (it != intf.compatibleSparseIntf.end())
4911                 {
4912                     it->second.push_back(defaultDcl);
4913                 }
4914                 else
4915                 {
4916                     std::vector<G4_Declare*> v(1, defaultDcl);
4917                     intf.compatibleSparseIntf.insert(
4918                         std::make_pair(newDcl, v));
4919                 }
4920             }
4921         }
4922     }
4923 
4924     // Mark interference among non-default mask variables
4925     for (auto nonDefaultDcl : nonDefaultMask)
4926     {
4927         // Non-default masks are different so mark interference
4928         handleSIMDIntf(nonDefaultDcl, newDcl, isCall);
4929     }
4930 }
4931 
4932 //
4933 // Mark interference between newDcl and other incompatible dcls in current active lists.
4934 // If newDcl was created for a subroutine call, do this for all varaibles in function summary.
4935 //
buildSIMDIntfAll(G4_Declare * newDcl)4936 void Augmentation::buildSIMDIntfAll(G4_Declare* newDcl)
4937 {
4938     auto callDclMapIt = callDclMap.find(newDcl);
4939     if (callDclMapIt != callDclMap.end())
4940     {
4941 
4942         G4_Declare* varDcl = NULL;
4943 
4944         if (liveAnalysis.livenessClass(G4_GRF)) //For return value
4945         {
4946             G4_INST* callInst = callDclMapIt->second.first;
4947             varDcl = callInst->getDst()->getBase()->asRegVar()->getDeclare();
4948             addSIMDIntfForRetDclares(varDcl);
4949         }
4950 
4951         auto& func = callDclMapIt->second.second;
4952         addSIMDIntfDclForCallSite(&callsiteDeclares[func]);
4953 
4954         return;
4955     }
4956 
4957     buildSIMDIntfDcl(newDcl, false);
4958     return;
4959 }
4960 
buildSIMDIntfAllOld(G4_Declare * newDcl)4961 void Augmentation::buildSIMDIntfAllOld(G4_Declare* newDcl)
4962 {
4963     auto callDclMapIt = callDclMap.find(newDcl);
4964     if (callDclMapIt != callDclMap.end())
4965     {
4966 
4967         G4_Declare* varDcl = NULL;
4968 
4969         if (liveAnalysis.livenessClass(G4_GRF)) //For return value
4970         {
4971             G4_INST* callInst = callDclMapIt->second.first;
4972             varDcl = callInst->getDst()->getBase()->asRegVar()->getDeclare();
4973             buildSIMDIntfDcl(varDcl, false);
4974         }
4975 
4976         auto& func = callDclMapIt->second.second;
4977         for (unsigned i = 0; i < liveAnalysis.getNumSelectedVar(); i++)
4978         {
4979             auto maydef = liveAnalysis.subroutineMaydef.find(func);
4980             if (maydef != liveAnalysis.subroutineMaydef.end() && maydef->second.isSet(i))
4981             {
4982                 varDcl = lrs[i]->getDcl();
4983                 buildSIMDIntfDcl(varDcl, true);
4984             }
4985         }
4986     }
4987     else
4988     {
4989         buildSIMDIntfDcl(newDcl, false);
4990     }
4991 }
4992 
updateActiveList(G4_Declare * newDcl,std::list<G4_Declare * > * dclMaskList)4993 void Augmentation::updateActiveList(G4_Declare* newDcl, std::list<G4_Declare*>* dclMaskList)
4994 {
4995     bool done = false;
4996 
4997     for (auto defaultIt = dclMaskList->begin();
4998         defaultIt != dclMaskList->end();
4999         defaultIt++)
5000     {
5001         G4_Declare* defaultDcl = (*defaultIt);
5002 
5003         if (gra.getEndInterval(defaultDcl)->getLexicalId() >= gra.getEndInterval(newDcl)->getLexicalId())
5004         {
5005             dclMaskList->insert(defaultIt, newDcl);
5006             done = true;
5007             break;
5008         }
5009     }
5010 
5011     if (done == false)
5012     {
5013         dclMaskList->push_back(newDcl);
5014     }
5015  }
5016 
5017 //
5018 // Perform linear scan and mark interference between conflicting dcls with incompatible masks.
5019 //
buildInterferenceIncompatibleMask()5020 void Augmentation::buildInterferenceIncompatibleMask()
5021 {
5022     // Create 2 active lists - 1 for holding active live-intervals
5023     // with non-default mask and other for default mask
5024 
5025     for (G4_Declare *newDcl : sortedIntervals)
5026     {
5027         unsigned startIdx = gra.getStartInterval(newDcl)->getLexicalId();
5028 #ifdef DEBUG_VERBOSE_ON
5029         DEBUG_VERBOSE("New idx " << startIdx << std::endl);
5030 #endif
5031 
5032         expireIntervals(startIdx);
5033         if (!kernel.fg.builder->getOption(vISA_UseOldSubRoutineAugIntf))
5034         {
5035             buildSIMDIntfAll(newDcl);
5036         }
5037         else
5038         {
5039             buildSIMDIntfAllOld(newDcl);
5040         }
5041 
5042         // Add newDcl to correct list
5043         if (gra.getHasNonDefaultMaskDef(newDcl) || newDcl->getAddressed() == true)
5044         {
5045             updateActiveList(newDcl, &nonDefaultMask);
5046 #ifdef DEBUG_VERBOSE_ON
5047             DEBUG_VERBOSE("Adding " << newDcl->getName() <<
5048                 " to non-default list" << std::endl);
5049 #endif
5050         }
5051         else
5052         {
5053             updateActiveList(newDcl, &defaultMask);
5054 #ifdef DEBUG_VERBOSE_ON
5055             DEBUG_VERBOSE("Adding " << newDcl->getName() <<
5056                 " to default list" << std::endl);
5057 #endif
5058         }
5059     }
5060 
5061     if (!kernel.fg.builder->getOption(vISA_UseOldSubRoutineAugIntf))
5062     {
5063         for (auto func : kernel.fg.funcInfoTable)
5064         {
5065             buildInteferenceForCallsite(func);
5066         }
5067         buildInteferenceForRetDeclares();
5068     }
5069 }
5070 
buildInteferenceForCallSiteOrRetDeclare(G4_Declare * newDcl,MaskDeclares * mask)5071 void Augmentation::buildInteferenceForCallSiteOrRetDeclare(G4_Declare* newDcl, MaskDeclares* mask)
5072 {
5073 
5074     for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
5075     {
5076         auto newDclAugMask = gra.getAugmentationMask(newDcl);
5077 
5078         if (mask->first.isSet(i))
5079         {
5080             G4_Declare* defaultDcl = lrs[i]->getDcl();
5081             if (gra.getAugmentationMask(defaultDcl) != newDclAugMask)
5082             {
5083                 handleSIMDIntf(defaultDcl, newDcl, true);
5084             }
5085             else
5086             {
5087                 if (liveAnalysis.livenessClass(G4_GRF) &&
5088                     // Populate compatible sparse intf data structure
5089                     // only for weak edges.
5090                     weakEdgeNeeded(gra.getAugmentationMask(defaultDcl), newDclAugMask))
5091                 {
5092                     if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
5093                         newDcl->getRegVar()->isPhyRegAssigned())
5094                     {
5095                         continue;
5096                     }
5097 
5098                     if (intf.isStrongEdgeBetween(defaultDcl, newDcl))
5099                     {
5100                         // No need to add weak edge
5101                         continue;
5102                     }
5103 
5104                     // defaultDcl and newDcl are compatible live-ranges and can have weak edge in intf graph
5105                     auto it = intf.compatibleSparseIntf.find(defaultDcl);
5106                     if (it != intf.compatibleSparseIntf.end())
5107                     {
5108                         it->second.push_back(newDcl);
5109                     }
5110                     else
5111                     {
5112                         std::vector<G4_Declare*> v(1, newDcl);
5113                         intf.compatibleSparseIntf.insert(
5114                             std::make_pair(defaultDcl, v));
5115                     }
5116 
5117                     it = intf.compatibleSparseIntf.find(newDcl);
5118                     if (it != intf.compatibleSparseIntf.end())
5119                     {
5120                         it->second.push_back(defaultDcl);
5121                     }
5122                     else
5123                     {
5124                         std::vector<G4_Declare*> v(1, defaultDcl);
5125                         intf.compatibleSparseIntf.insert(
5126                             std::make_pair(newDcl, v));
5127                     }
5128                 }
5129             }
5130         }
5131 
5132         // Mark interference among non-default mask variables
5133         if (mask->second.isSet(i))
5134         {
5135             G4_Declare* nonDefaultDcl = lrs[i]->getDcl();
5136             // Non-default masks are different so mark interference
5137             handleSIMDIntf(nonDefaultDcl, newDcl, true);
5138         }
5139     }
5140 }
5141 
buildInteferenceForCallsite(FuncInfo * func)5142 void Augmentation::buildInteferenceForCallsite(FuncInfo* func)
5143 {
5144     for (unsigned i = 0; i < liveAnalysis.getNumSelectedVar(); i++)
5145     {
5146         auto maydef = liveAnalysis.subroutineMaydef.find(func);
5147         if (maydef != liveAnalysis.subroutineMaydef.end() && maydef->second.isSet(i))
5148         {
5149             G4_Declare* varDcl = lrs[i]->getDcl();
5150             buildInteferenceForCallSiteOrRetDeclare(varDcl, &callsiteDeclares[func]);
5151         }
5152     }
5153     if (kernel.getOption(vISA_LocalRA))
5154     {
5155         for (uint32_t j = 0; j < kernel.getNumRegTotal(); j++)
5156         {
5157             if (localSummaryOfCallee[func].isGRFBusy(j))
5158             {
5159                 G4_Declare* varDcl = gra.getGRFDclForHRA(j);
5160                 buildInteferenceForCallSiteOrRetDeclare(varDcl, &callsiteDeclares[func]);
5161             }
5162         }
5163     }
5164 }
5165 
buildInteferenceForRetDeclares()5166 void Augmentation::buildInteferenceForRetDeclares()
5167 {
5168     for (auto retDclIt : retDeclares)
5169     {
5170         buildInteferenceForCallSiteOrRetDeclare(retDclIt.first, &retDclIt.second);
5171     }
5172 }
5173 
buildSummaryForCallees()5174 void Augmentation::buildSummaryForCallees()
5175 {
5176     int totalGRFNum = kernel.getNumRegTotal();
5177 
5178     for (auto func : kernel.fg.sortedFuncTable)
5179     {
5180         unsigned fid = func->getId();
5181         if (fid == UINT_MAX)
5182         {
5183             // entry kernel
5184             continue;
5185         }
5186         PhyRegSummary funcSummary(totalGRFNum);
5187         for (auto&& bb : func->getBBList())
5188         {
5189             if (auto summary = kernel.fg.getBBLRASummary(bb))
5190             {
5191                 for (int i = 0; i < totalGRFNum; i++)
5192                 {
5193                     if (summary->isGRFBusy(i))
5194                     {
5195                         funcSummary.setGRFBusy(i);
5196                     }
5197                 }
5198             }
5199         }
5200 
5201         for (auto&& callee : func->getCallees())
5202         {
5203             PhyRegSummary* summary = &localSummaryOfCallee[callee];
5204             if (summary)
5205             {
5206                 for (int i = 0; i < totalGRFNum; i++)
5207                 {
5208                     if (summary->isGRFBusy(i))
5209                     {
5210                         funcSummary.setGRFBusy(i);
5211                     }
5212                 }
5213             }
5214         }
5215         localSummaryOfCallee[func] = funcSummary;
5216     }
5217 }
5218 
//
// Top-level driver for interference-graph augmentation: when any variable has
// a definition under a non-default execution mask, build global live-intervals,
// run the linear scan that adds mask-incompatibility edges, and adjust GRF
// alignment for wide-SIMD kernels.
//
void Augmentation::augmentIntfGraph()
{
    // Augmentation is performed only for 3D targets, non-address liveness
    // classes, and flow graphs with more than 2 BBs. Otherwise bail out —
    // unless a register chart dump was requested, which needs the interval
    // data computed below.
    if (!(kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
        !liveAnalysis.livenessClass(G4_ADDRESS) &&
        kernel.fg.size() > 2))
    {
        if (!kernel.getOption(vISA_DumpRegChart))
        {
            return;
        }
    }

    // Pre-size the per-call-site mask bit sets to the number of selected
    // global variables.
    for (auto func : kernel.fg.funcInfoTable)
    {
        auto& item = callsiteDeclares[func];
        item.first.resize(liveAnalysis.getNumSelectedGlobalVar());
        item.second.resize(liveAnalysis.getNumSelectedGlobalVar());
    }

    if (kernel.getOption(vISA_LocalRA))
    {
        buildSummaryForCallees();
    }

    // First check whether any definitions exist with incompatible mask
    bool nonDefaultMaskDef = markNonDefaultMaskDef();

    if (nonDefaultMaskDef == true)
    {
        // Atleast one definition with non-default mask was found so
        // perform steps to augment intf graph with such defs

        // Now build live-intervals globally. This function will
        // calculate live-intervals and assign start/end inst
        // for respective declares.
        buildLiveIntervals();

        // Sort live-intervals based on their start
        sortLiveIntervals();

        if (kernel.getOption(vISA_DumpRegChart))
        {
            gra.regChart = std::make_unique<RegChartDump>(gra);
            gra.regChart->recordLiveIntervals(sortedIntervals);
        }

        if (gra.verifyAugmentation)
        {
            gra.verifyAugmentation->loadAugData(sortedIntervals, lrs, intf.liveAnalysis->getNumSelectedVar(), &intf, gra);
        }

        if (kernel.getOption(vISA_SpillAnalysis))
        {
            if (gra.spillAnalysis.get())
                gra.spillAnalysis->LoadAugIntervals(sortedIntervals, gra);
        }

        if (kernel.fg.builder->getOption(vISA_GenerateDebugInfo))
        {
            // Following is done to prevent passing GlobalRA to debug info function
            // for clear interface.
            std::vector<std::tuple<G4_Declare*, G4_INST*, G4_INST*>> dclIntervals;
            dclIntervals.reserve(sortedIntervals.size());
            for (auto& dcl : sortedIntervals)
            {
                dclIntervals.push_back(std::make_tuple(dcl, gra.getStartInterval(dcl), gra.getEndInterval(dcl)));
            }
            updateDebugInfo(kernel, dclIntervals);
        }

        // Perform linear scan to augment graph
        buildInterferenceIncompatibleMask();

        if (liveAnalysis.livenessClass(G4_GRF))
        {
            if ((GlobalRA::useGenericAugAlign() && kernel.getSimdSize() >= numEltPerGRF<Type_UD>()) ||
                (!GlobalRA::useGenericAugAlign() && kernel.getSimdSize() > numEltPerGRF<Type_UD>()))
            {
                // Set alignment of all GRF candidates
                // to 2GRF except for NoMask variables
#ifdef DEBUG_VERBOSE_ON
                DEBUG_VERBOSE("Kernel size is SIMD" << kernel.getSimdSize() << " so updating all GRFs to be 2GRF aligned" << std::endl);
#endif
                gra.evenAlign();
            }
            gra.updateSubRegAlignment(GRFALIGN);
        }

        // Clear information calculated in this iteration of RA so
        // a later RA iteration does not use stale information
        clearIntervalInfo();
    }
}
5312 
//
// Build interference for a BB that local RA summarized. The BB is walked
// bottom-up, tracking two sets:
//   - 'cur': one bit per physical GRF; a SET bit means the GRF is currently
//     available (not busy) at this point of the backward walk.
//   - 'live': one bit per RA candidate; set when the variable is live.
// Physical GRFs used by locally-allocated ranges are represented by their
// HRA dummy declares (getGRFDclForHRA) so interference can be recorded
// against them like ordinary candidates.
//
void Interference::buildInterferenceWithLocalRA(G4_BB* bb)
{
    auto LRASummary = kernel.fg.getBBLRASummary(bb);
    if (LRASummary == nullptr)
    {
        // BB was not handled by local RA; nothing to do here.
        return;
    }

    BitSet cur(kernel.getNumRegTotal(), true);
    BitSet live(maxId, false);
    std::vector<int> curUpdate;

    // Seed 'live' with variables live at the BB exit (backward walk start).
    buildInterferenceAtBBExit(bb, live);

#ifdef DEBUG_VERBOSE_ON
    DEBUG_VERBOSE("BB" << bb->getId() << std::endl);
#endif

    for (INST_LIST_RITER rit = bb->rbegin(), rend = bb->rend();
        rit != rend;
        rit++)
    {
        bool update = false;
        G4_INST* inst = (*rit);
        curUpdate.clear();

#ifdef DEBUG_VERBOSE_ON
        inst->emit(COUT_ERROR);
        DEBUG_VERBOSE("    //" << inst->getLineNo() << ":$" << inst->getCISAOff());
        DEBUG_VERBOSE(std::endl);
#endif

        // Any physical registers defined will be marked available if
        // current inst is first def or if complete region is written
        G4_DstRegRegion* dst = inst->getDst();

        if (dst &&
            dst->getBase()->isRegVar())
        {
            LocalLiveRange* localLR = NULL;
            G4_Declare* topdcl = GetTopDclFromRegRegion(dst);
            unsigned t;  // out-param for getFirstRef/getLastRef below

            if (topdcl)
                localLR = gra.getLocalLR(topdcl);

            if (localLR && localLR->getAssigned() && !localLR->isEOT())
            {
                // Destination is a locally-allocated range with a physical
                // register assignment.
                int reg, sreg, numrows;
                G4_VarBase* preg = localLR->getPhyReg(sreg);
                numrows = localLR->getTopDcl()->getNumRows();

                MUST_BE_TRUE(preg->isGreg(), "Register in dst was not GRF");

                reg = preg->asGreg()->getRegNum();

                // Check whether the dst physical register is busy/available.
                // If it is available, and we still see a def that means there was no
                // corresponding use. In such cases mark the physical register as
                // busy, so interference building can take place correctly.
                for (int j = reg, sum = reg + numrows; j < sum; j++)
                {
                    int k = getGRFDclForHRA(j)->getRegVar()->getId();

                    if (cur.isSet(j) == true)
                    {
                        buildInterferenceWithLive(live, k);
#ifdef DEBUG_VERBOSE_ON
                        DEBUG_VERBOSE("Found no use for r" << j << ".0 so marking it as interfering with live set" << std::endl);
#endif
                    }
                }

                if ((localLR->getFirstRef(t) == inst) ||
                    liveAnalysis->writeWholeRegion(bb, inst, dst, builder.getOptions()))
                {
                    // Last row may be only partially used by the current dcl
                    // so we still need to pessimistically mark last range as
                    // busy. Because some other src opnd that is live may still
                    // be using the remaining GRF.
                    if (localLR->getSizeInWords() % numEltPerGRF<Type_UW>() != 0)
                        numrows--;

                    for (int j = reg, sum = reg + numrows; j < sum; j++)
                    {
                        cur.set(j, true);
#ifdef DEBUG_VERBOSE_ON
                        DEBUG_VERBOSE("Setting r" << j << ".0 available" << std::endl);
#endif
                    }

                    // Build interference only for point ranges, ideally which shouldnt exist
                    // These are ranges that have a def, but no use
                    if (localLR->getFirstRef(t) == localLR->getLastRef(t))
                    {
                        for (int j = reg; j < reg + localLR->getTopDcl()->getNumRows(); j++)
                        {
                            int k = getGRFDclForHRA(j)->getRegVar()->getId();
                            buildInterferenceWithLive(live, k);
                        }
                    }
                }
            }
            else if (dst->getBase()->isRegAllocPartaker()) {
                // Global range

                // In bottom-up order if the live-range has not started then
                // a use was not seen for this def. But we need to ensure this
                // variable interferes with all other live vars.
                bool isPointRange = !live.isSet(dst->getBase()->asRegVar()->getId());

                if (isPointRange)
                {
                    // Mark interference with all busy physical registers
                    for (unsigned i = 0; i < kernel.getNumRegTotal(); i++)
                    {
                        if (cur.isSet(i) == false)
                        {
                            int k = getGRFDclForHRA(i)->getRegVar()->getId();
                            checkAndSetIntf(dst->getBase()->asRegVar()->getId(), k);
                        }
                    }
                }

                if (liveAnalysis->writeWholeRegion(bb, inst, dst, builder.getOptions()) ||
                    inst->isPseudoKill())
                {
                    // Whole write or first def found so mark this operand as not live for earlier instructions
                    auto id = dst->getBase()->asRegVar()->getId();
                    updateLiveness(live, id, false);
                }
            }
        }

        // Any physical registers used by src opnds will be busy before the current inst
        for (int i = 0; i < G4_MAX_SRCS; i++)
        {
            G4_Operand* src = inst->getSrc(i);

            if (src &&
                src->isSrcRegRegion() &&
                src->asSrcRegRegion()->getBase()->isRegVar())
            {
                LocalLiveRange* localLR = NULL;
                G4_Declare* topdcl = GetTopDclFromRegRegion(src);

                if (topdcl)
                    localLR = gra.getLocalLR(topdcl);

                if (localLR && localLR->getAssigned() && !localLR->isEOT())
                {
                    // Source uses a locally-allocated range: its GRFs become
                    // busy for everything above this instruction.
                    int sreg;
                    G4_VarBase* preg = localLR->getPhyReg(sreg);
                    int numrows = localLR->getTopDcl()->getNumRows();

                    MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");

                    int reg = preg->asGreg()->getRegNum();

                    for (int j = reg, sum = reg + numrows; j < sum; j++)
                    {
                        int k = getGRFDclForHRA(j)->getRegVar()->getId();

                        if (cur.isSet(j) == true)
                        {
                            // G4_RegVar with id k was marked free, but becomes
                            // busy at this instruction. For incremental updates
                            // push this to a vector and use it while updating
                            // interference graph incrementally.
                            curUpdate.push_back(k);
                        }

                        cur.set(j, false);
#ifdef DEBUG_VERBOSE_ON
                        DEBUG_VERBOSE("Setting r" << j << ".0 busy" << std::endl);
#endif
                    }
                }
                else if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
                {
                    if (live.isSet(src->asSrcRegRegion()->getBase()->asRegVar()->getId()) == false)
                        update = true;

                    // Mark operand as live from this inst upwards
                    auto id = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
                    updateLiveness(live, id, true);
                }
            }
        }

        if (update == true)
        {
            // A new global range became live at this inst:
            // Mark interference with all live
            for (unsigned i = 0; i < kernel.getNumRegTotal(); i++)
            {
                if (cur.isSet(i) == false)
                {
                    int k = getGRFDclForHRA(i)->getRegVar()->getId();
                    buildInterferenceWithLive(live, k);
                }
            }
        }
        else {
            if (curUpdate.size() > 0)
            {
                // Perform incremental update. This code is executed when:
                // 1) live set is unchanged, ie no new global range was started in inst
                // 2) cur set has changed, ie an earlier free GRF has become busy
                // Any new busy GRFs will have to be marked as interfering with
                // currently live-ranges. There is no need to iterate over all
                // busy GRFs. Instead only those GRFs that have got busy in this iteration
                // can be considered for incremental updates.
                for (int k : curUpdate)
                {
                    buildInterferenceWithLive(live, k);
                }
            }
        }
    }

    for (unsigned i = 0; i < maxId; i++)
    {
        bool isAddrSensitive = liveAnalysis->isAddressSensitive(i);

        // If a range is Address taken AND (live-in or live-out or killed)
        // mark it to interfere with all physical registers used by local RA
        // FIXME: need to check if this is actually needed
        if (isAddrSensitive)
        {
            bool assigned = (lrs[i]->getVar()->getPhyReg() != NULL);
            if (!assigned)
            {
                bool isLiveIn = liveAnalysis->isLiveAtEntry(bb, i);
                bool isLiveOut = liveAnalysis->isLiveAtExit(bb, i);
                bool isKilled = liveAnalysis->use_kill[bb->getId()].isSet(i);
                if (isLiveIn || isLiveOut || isKilled)
                {
                    // Make it to interfere with all physical registers used in the BB
                    for (uint32_t j = 0, numReg = kernel.getNumRegTotal(); j < numReg; j++)
                    {
                        if (LRASummary->isGRFBusy(j))
                        {
                            int k = getGRFDclForHRA(j)->getRegVar()->getId();
                            checkAndSetIntf(i, k);
                        }
                    }
                }
            }
        }
    }
}
5564 
5565 
interferenceVerificationForSplit() const5566 void Interference::interferenceVerificationForSplit() const
5567 {
5568 
5569     std::cout << "\n\n **** Interference Verification Table ****\n";
5570     for (unsigned i = 0; i < maxId; i++)
5571     {
5572         std::cout << "(" << i << ") ";
5573         //lrs[i]->dump();
5574         for (unsigned j = 0; j < maxId; j++)
5575         {
5576             if (interfereBetween(i, j))
5577             {
5578                 if (!interfereBetween(gra.getSplittedDeclare(lrs[i]->getDcl())->getRegVar()->getId(), j) &&
5579                     (gra.getSplittedDeclare(lrs[i]->getDcl()) != lrs[j]->getDcl()))
5580                 {
5581                     std::cout << "\t";
5582                     lrs[j]->getVar()->emit(std::cout);
5583                 }
5584             }
5585         }
5586         std::cout << "\n";
5587     }
5588 }
5589 
linearScanVerify() const5590 bool Interference::linearScanVerify() const
5591 {
5592      std::cout << "--------------- " << kernel.getName() << " ----------------" << "\n";
5593 
5594     for (unsigned i = 0; i < maxId; i++)
5595     {
5596         G4_VarBase* phyReg_i = lrs[i]->getVar()->getPhyReg();
5597         if (!phyReg_i || !phyReg_i->isGreg() || gra.isUndefinedDcl(lrs[i]->getDcl()) || lrs[i]->getDcl()->getRegVar()->isNullReg())
5598         {
5599             continue;
5600         }
5601         unsigned regOff_i = lrs[i]->getVar()->getPhyRegOff() * lrs[i]->getVar()->getDeclare()->getElemSize();
5602         unsigned GRFStart_i = phyReg_i->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() + regOff_i;
5603         unsigned elemsSize_i = lrs[i]->getVar()->getDeclare()->getNumElems() * lrs[i]->getVar()->getDeclare()->getElemSize();
5604         unsigned GRFEnd_i = GRFStart_i + elemsSize_i - 1;
5605 
5606         for (unsigned j = 0; j < maxId; j++)
5607         {
5608             if (interfereBetween(i, j))
5609             {
5610                 if (gra.isUndefinedDcl(lrs[j]->getDcl()) || builder.kernel.fg.isPseudoDcl(lrs[j]->getDcl()) || lrs[j]->getDcl()->getRegVar()->isNullReg())
5611                 {
5612                     continue;
5613                 }
5614 
5615                 G4_VarBase* phyReg_j = lrs[j]->getVar()->getPhyReg();
5616                 unsigned regOff_j = lrs[j]->getVar()->getPhyRegOff() * lrs[j]->getVar()->getDeclare()->getElemSize();
5617                 unsigned GRFStart_j = phyReg_j->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() + regOff_j;
5618                 unsigned elemsSize_j = lrs[j]->getVar()->getDeclare()->getNumElems() * lrs[j]->getVar()->getDeclare()->getElemSize();
5619                 unsigned GRFEnd_j = GRFStart_j + elemsSize_j - 1;
5620                 if (!(GRFEnd_i < GRFStart_j || GRFEnd_j < GRFStart_i))
5621                 {
5622                     LSLiveRange* i_LSLR = gra.getLSLR(lrs[i]->getDcl());
5623                     LSLiveRange* j_LSLR = gra.getLSLR(lrs[j]->getDcl());
5624                     unsigned i_start = 0;
5625                     unsigned i_end = 0;
5626                     if (i_LSLR)  //For the stack call or some other function which will add extra declares after allocation
5627                     {
5628                         i_LSLR->getFirstRef(i_start);
5629                         i_LSLR->getLastRef(i_end);
5630                     }
5631 
5632                     unsigned j_start = 0;
5633                     unsigned j_end = 0;
5634                     if (j_LSLR)
5635                     {
5636                         j_LSLR->getFirstRef(j_start);
5637                         j_LSLR->getLastRef(j_end);
5638                     }
5639 
5640                     std::cout << "(" << i << "," << j << ")" << lrs[i]->getDcl()->getName() << "(" << GRFStart_i << ":" << GRFEnd_i << ")[" << i_start << "," << i_end << "] vs "
5641                         << lrs[j]->getDcl()->getName() << "(" << GRFStart_i << ":" << GRFEnd_j << ")[" << j_start << "," << j_end << "]" << "\n";
5642                 }
5643             }
5644         }
5645     }
5646 
5647     return true;
5648 }
5649 
dumpInterference() const5650 void Interference::dumpInterference() const
5651 {
5652 
5653     std::cout << "\n\n **** Interference Table ****\n";
5654     for (unsigned i = 0; i < maxId; i++)
5655     {
5656         std::cout << "(" << i << ") ";
5657         lrs[i]->dump();
5658         std::cout << "\n";
5659         for (unsigned j = 0; j < maxId; j++)
5660         {
5661             if (interfereBetween(i, j))
5662             {
5663                 std::cout << "\t";
5664                 lrs[j]->getVar()->emit(std::cout);
5665             }
5666         }
5667         std::cout << "\n\n";
5668     }
5669 }
5670 
dumpVarInterference() const5671 void Interference::dumpVarInterference() const
5672 {
5673 
5674     std::cout << "\n\n **** Var Interference Table ****\n";
5675     for (G4_Declare* decl : gra.kernel.Declares)
5676     {
5677         if (decl->getRegVar()->isRegAllocPartaker())
5678         {
5679             unsigned i = decl->getRegVar()->getId();
5680             //std::cout << "(" << i << ") ";
5681             lrs[i]->dump();
5682             std::cout << "\n";
5683             for (G4_Declare* decl : gra.kernel.Declares)
5684             {
5685                 if (decl->getRegVar()->isRegAllocPartaker())
5686                 {
5687                     unsigned j = decl->getRegVar()->getId();
5688                     if (interfereBetween(i, j))
5689                     {
5690                         std::cout << "\t";
5691                         lrs[j]->getVar()->emit(std::cout);
5692                     }
5693                 }
5694             }
5695             std::cout << "\n\n";
5696         }
5697     }
5698 }
5699 
//
// Construct a graph-coloring allocator over the given liveness analysis.
// 'hybrid' selects the hybrid (local+global) RA flavor; 'forceSpill_' forces
// spilling of candidates. Variable counts and the interference matrix are
// sized from the liveness analysis results.
//
GraphColor::GraphColor(LivenessAnalysis& live, unsigned totalGRF, bool hybrid, bool forceSpill_) :
    gra(live.gra), totalGRFRegCount(totalGRF), numVar(live.getNumSelectedVar()), numSplitStartID(live.getNumSplitStartID()), numSplitVar(live.getNumSplitVar()),
    intf(&live, lrs, live.getNumSelectedVar(), live.getNumSplitStartID(), live.getNumSplitVar(), gra), regPool(gra.regPool),
    builder(gra.builder), isHybrid(hybrid),
    forceSpill(forceSpill_), mem(GRAPH_COLOR_MEM_SIZE),
    kernel(gra.kernel), liveAnalysis(live)
{
    // One unsigned slot per address register, arena-allocated alongside the
    // allocator's other scratch data.
    spAddrRegSig = (unsigned*)mem.alloc(getNumAddrRegisters() * sizeof(unsigned));
    m_options = builder.getOptions();
}
5710 
5711 //
5712 // lrs[i] gives the live range whose id is i
5713 //
createLiveRanges(unsigned reserveSpillSize)5714 void GraphColor::createLiveRanges(unsigned reserveSpillSize)
5715 {
5716     lrs = (LiveRange**)mem.alloc(sizeof(LiveRange*)*numVar);
5717     bool hasStackCall = builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc();
5718     // Modification For Alias Dcl
5719     for (auto dcl : gra.kernel.Declares)
5720     {
5721         G4_RegVar* var = dcl->getRegVar();
5722         // Do not include alias var in liverange creation
5723         if (!var->isRegAllocPartaker() || dcl->getAliasDeclare() != NULL)
5724         {
5725             continue;
5726         }
5727         lrs[var->getId()] = new (mem)LiveRange(var, this->gra);
5728         unsigned reservedGRFNum = m_options->getuInt32Option(vISA_ReservedGRFNum);
5729 
5730         if (builder.kernel.fg.isPseudoDcl(dcl))
5731         {
5732             lrs[var->getId()]->setIsPseudoNode();
5733         }
5734         if (dcl->getIsPartialDcl())
5735         {
5736             if (G4_Declare * parentDcl = this->gra.getSplittedDeclare(dcl))
5737             {
5738                 lrs[var->getId()]->setParentLRID(parentDcl->getRegVar()->getId());
5739                 lrs[var->getId()]->setIsPartialDcl();
5740             }
5741         }
5742         if (dcl->getIsSplittedDcl())
5743         {
5744             lrs[var->getId()]->setIsSplittedDcl(true);
5745         }
5746         lrs[var->getId()]->setBC(gra.getBankConflict(dcl));
5747 
5748         lrs[var->getId()]->allocForbidden(mem, hasStackCall, reserveSpillSize, reservedGRFNum);
5749         lrs[var->getId()]->setCallerSaveBias(hasStackCall);
5750         G4_Declare* varDcl = lrs[var->getId()]->getDcl();
5751         if (builder.kernel.fg.isPseudoVCADcl(varDcl))
5752         {
5753             lrs[var->getId()]->allocForbiddenCallerSave(mem, &builder.kernel);
5754         }
5755         else if (builder.kernel.fg.isPseudoVCEDcl(varDcl))
5756         {
5757             lrs[var->getId()]->allocForbiddenCalleeSave(mem, &builder.kernel);
5758         }
5759         else if (varDcl == gra.getOldFPDcl())
5760         {
5761             lrs[var->getId()]->allocForbiddenCallerSave(mem, &builder.kernel);
5762         }
5763     }
5764 }
5765 
computeDegreeForGRF()5766 void GraphColor::computeDegreeForGRF()
5767 {
5768     for (unsigned i = 0; i < numVar; i++)
5769     {
5770         unsigned degree = 0;
5771 
5772         if (!(lrs[i]->getIsPseudoNode()) &&
5773             !(lrs[i]->getIsPartialDcl()))
5774         {
5775             const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(i);
5776             unsigned bankDegree = 0;
5777             auto lraBC = lrs[i]->getBC();
5778             bool isOdd = (lraBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
5779                 lraBC == BANK_CONFLICT_SECOND_HALF_ODD);
5780 
5781 
5782             auto computeDegree = [&](LiveRange* lr1)
5783             {
5784                 if (!lr1->getIsPartialDcl())
5785                 {
5786                     unsigned edgeDegree = edgeWeightGRF(lrs[i], lr1);
5787 
5788                     degree += edgeDegree;
5789 
5790                     auto lrsitBC = lr1->getBC();
5791                     bool isOddBC = (lrsitBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
5792                         lrsitBC == BANK_CONFLICT_SECOND_HALF_ODD);
5793 
5794                     if ((isOdd && isOddBC) ||
5795                         (!isOdd && !isOddBC))
5796                     {
5797                         bankDegree += edgeDegree;
5798                     }
5799                 }
5800             };
5801 
5802             for (auto it : intfs)
5803             {
5804                 computeDegree(lrs[it]);
5805             }
5806 
5807             // consider weak edges in degree computation
5808             auto* weakEdges = intf.getCompatibleSparseIntf(lrs[i]->getDcl());
5809             if (weakEdges)
5810             {
5811                 for (auto weakNeighbor : *weakEdges)
5812                 {
5813                     if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
5814                         continue;
5815 
5816                     computeDegree(lrs[weakNeighbor->getRegVar()->getId()]);
5817                 }
5818             }
5819 
5820             if (isOdd)
5821             {
5822                 oddTotalDegree += bankDegree; //std::max(bankDegree, oddMaxDegree);
5823                 oddTotalRegNum += lrs[i]->getNumRegNeeded();
5824                 oddMaxRegNum = std::max(oddMaxRegNum, lrs[i]->getNumRegNeeded());
5825             }
5826             else
5827             {
5828                 evenTotalDegree += bankDegree; //std::max(bankDegree, evenMaxDegree);
5829                 evenTotalRegNum += lrs[i]->getNumRegNeeded();
5830                 evenMaxRegNum = std::max(evenMaxRegNum, lrs[i]->getNumRegNeeded());
5831             }
5832         }
5833 
5834         lrs[i]->setDegree(degree);
5835     }
5836 
5837     if (kernel.getOption(vISA_SpillAnalysis))
5838     {
5839         for (unsigned int i = 0; i != numVar; ++i)
5840         {
5841             auto dcl = lrs[i]->getDcl();
5842             auto degree = lrs[i]->getDegree();
5843             gra.spillAnalysis->LoadDegree(dcl, degree);
5844         }
5845     }
5846 }
5847 
computeDegreeForARF()5848 void GraphColor::computeDegreeForARF()
5849 {
5850     for (unsigned i = 0; i < numVar; i++)
5851     {
5852         unsigned degree = 0;
5853 
5854         if (!(lrs[i]->getIsPseudoNode()))
5855         {
5856             const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(i);
5857             for (auto it : intfs)
5858             {
5859                 degree += edgeWeightARF(lrs[i], lrs[it]);
5860             }
5861         }
5862 
5863         lrs[i]->setDegree(degree);
5864     }
5865 }
5866 
//
// Assign a spill cost to every live range; the cost drives the coloring
// order (higher-cost ranges are colored earlier and so are less likely to
// spill). Address-sensitive ranges are biased above all normal ranges at
// the end so they get colored first among regular nodes.
//
void GraphColor::computeSpillCosts(bool useSplitLLRHeuristic)
{
    std::vector <LiveRange *> addressSensitiveVars;
    float maxNormalCost = 0.0f;

    for (unsigned i = 0; i < numVar; i++)
    {
        G4_Declare* dcl = lrs[i]->getDcl();

        // Partial dcls live inside their parent's allocation; no own cost.
        if (dcl->getIsPartialDcl())
        {
            continue;
        }
        //
        // The spill cost of pseudo nodes inserted to aid generation of save/restore code
        // must be the minimum so that such nodes go to the bottom of the color stack.
        //
        if (builder.kernel.fg.isPseudoDcl(dcl))
        {
            if (builder.kernel.fg.isPseudoVCADcl(dcl))
            {
                lrs[i]->setSpillCost(MINSPILLCOST + 1);
            }
            else
            {
                lrs[i]->setSpillCost(MINSPILLCOST);
            }
        }

        // NOTE(review): this 'if' chains (via else-if) with the branches below,
        // but NOT with the pseudo-dcl 'if' above; a pseudo dcl without a split
        // local LR can therefore still enter the else-if chain and have its
        // cost overwritten — confirm this ordering is intentional.
        auto dclLR = gra.getLocalLR(dcl);
        if (dclLR != NULL &&
            dclLR->getSplit())
        {
            lrs[i]->setSpillCost(MINSPILLCOST + 2);
        }
        //
        // Give the tiny spill/fill ranges an infinite spill cost, so that they are
        // picked first for coloring.
        // Also ARF live ranges with exclusively sequential references within the code are
        // assigned an infinite spill cost as spilling them will not lower the register
        // pressure in the region they are referenced. This does not necessarily hold for
        // GRF live ranges as these are potentially large in size but the portions
        // accessed by each sequential use are limited to 2 registers for general instructions
        // and 8 registers for SEND instructions.
        //
        else if (gra.isAddrFlagSpillDcl(dcl) ||
            lrs[i]->isRetIp() ||
            lrs[i]->getIsInfiniteSpillCost() == true ||
            ((lrs[i]->getVar()->isRegVarTransient() == true ||
                lrs[i]->getVar()->isRegVarTmp() == true) &&
                lrs[i]->getVar()->isSpilled() == false) ||
            dcl == gra.getOldFPDcl() ||
            (m_options->getOption(vISA_enablePreemption) &&
                dcl == builder.getBuiltinR0()))
        {
            lrs[i]->setSpillCost(MAXSPILLCOST);
        }
        else if (dcl->isDoNotSpill())
        {
            // Explicitly marked unspillable — treat like infinite cost.
            lrs[i]->setSpillCost(MAXSPILLCOST);
        }
        //
        // Calculate spill costs of regular nodes.
        //
        else
        {
            float spillCost = 0.0f;
            // NOTE: Add 1 to degree to avoid divide-by-0, as a live range may have no neighbors
            if (builder.kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D)
            {
                if (useSplitLLRHeuristic)
                {
                    // Simple refcount/degree ratio when split-LLR heuristic is on.
                    spillCost = 1.0f*lrs[i]->getRefCount() / (lrs[i]->getDegree() + 1);
                }
                else
                {
                    assert(lrs[i]->getDcl()->getTotalElems() > 0);
                    // Weight by refcount^2 and size; damp by degree and row count.
                    unsigned short numRows = lrs[i]->getDcl()->getNumRows();
                    spillCost = 1.0f * lrs[i]->getRefCount() * lrs[i]->getRefCount() * lrs[i]->getDcl()->getByteSize() *
                        (float)sqrt(lrs[i]->getDcl()->getByteSize())
                        / ((float)sqrt(lrs[i]->getDegree() + 1) * (float)(sqrt(sqrt(numRows))));
                }
            }
            else
            {
                // Non-3D targets: GRF cost is just the degree; otherwise
                // refcount^2 over degree.
                spillCost =
                    liveAnalysis.livenessClass(G4_GRF) ?
                    lrs[i]->getDegree() : 1.0f*lrs[i]->getRefCount()*lrs[i]->getRefCount() / (lrs[i]->getDegree() + 1);
            }

            lrs[i]->setSpillCost(spillCost);

            // Track address sensitive live range.
            if (liveAnalysis.isAddressSensitive(i))
            {
                addressSensitiveVars.push_back(lrs[i]);
            }
            else
            {
                // Set the spill cost of all other normal live ranges, and
                // track the max normal cost.
                if (maxNormalCost < spillCost)
                {
                    maxNormalCost = spillCost;
                }
            }
        }
    }

    //
    // Set the spill cost of address sensitive live ranges above all the
    // normal live ranges, so that they get colored before all the normal
    // live ranges.
    //
    for (LiveRange *lr : addressSensitiveVars)
    {
        if (lr->getSpillCost() != MAXSPILLCOST)
        {
            lr->setSpillCost(maxNormalCost + lr->getSpillCost());
        }
    }
}
5989 
5990 
5991 //
5992 // subtract lr's neighbors that are still in work list
5993 //
relaxNeighborDegreeGRF(LiveRange * lr)5994 void GraphColor::relaxNeighborDegreeGRF(LiveRange* lr)
5995 {
5996     if (!(lr->getIsPseudoNode()) &&
5997         !(lr->getIsPartialDcl()))
5998     {
5999         unsigned lr_id = lr->getVar()->getId();
6000 
6001         // relax degree between 2 nodes
6002         auto relaxDegree = [&](LiveRange* lr1)
6003         {
6004             if (lr1->getActive() &&
6005                 !lr1->getIsPseudoNode() &&
6006                 !(lr1->getIsPartialDcl()))
6007             {
6008                 unsigned w = edgeWeightGRF(lr1, lr);
6009 
6010 #ifdef DEBUG_VERBOSE_ON
6011                 DEBUG_VERBOSE("\t relax ");
6012                 lr1->dump();
6013                 DEBUG_VERBOSE(" degree(" << lr1->getDegree() << ") - " << w << std::endl);
6014 #endif
6015                 lr1->subtractDegree(w);
6016 
6017                 unsigned availColor = numColor;
6018                 availColor = numColor - lr1->getNumForbidden();
6019 
6020                 if (lr1->getDegree() + lr1->getNumRegNeeded() <= availColor)
6021                 {
6022                     unconstrainedWorklist.push_back(lr1);
6023                     lr1->setActive(false);
6024                 }
6025             }
6026         };
6027 
6028         const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(lr_id);
6029         for (auto it : intfs)
6030         {
6031             LiveRange* lrs_it = lrs[it];
6032 
6033             relaxDegree(lrs_it);
6034         }
6035 
6036         auto* weakEdges = intf.getCompatibleSparseIntf(lr->getDcl());
6037         if (weakEdges)
6038         {
6039             for (auto weakNeighbor : *weakEdges)
6040             {
6041                 if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
6042                     continue;
6043                 auto lr1 = lrs[weakNeighbor->getRegVar()->getId()];
6044                 relaxDegree(lr1);
6045             }
6046         }
6047     }
6048 }
relaxNeighborDegreeARF(LiveRange * lr)6049 void GraphColor::relaxNeighborDegreeARF(LiveRange* lr)
6050 {
6051     if (!(lr->getIsPseudoNode()))
6052     {
6053         unsigned lr_id = lr->getVar()->getId();
6054         const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(lr_id);
6055         for (auto it : intfs)
6056         {
6057             LiveRange* lrs_it = lrs[it];
6058 
6059             if (lrs_it->getActive() &&
6060                 !lrs_it->getIsPseudoNode())
6061             {
6062                 unsigned w = edgeWeightARF(lrs_it, lr);
6063 
6064 #ifdef DEBUG_VERBOSE_ON
6065                 DEBUG_VERBOSE("\t relax ");
6066                 lrs_it->dump();
6067                 DEBUG_VERBOSE(" degree(" << lrs_it->getDegree() << ") - " << w << std::endl);
6068 #endif
6069                 lrs_it->subtractDegree(w);
6070 
6071                 unsigned availColor = numColor;
6072 
6073                 if (lrs_it->getDegree() + lrs_it->getNumRegNeeded() <= availColor)
6074                 {
6075                     unconstrainedWorklist.push_back(lrs_it);
6076                     lrs_it->setActive(false);
6077                 }
6078             }
6079         }
6080     }
6081 }
6082 
6083 
compareSpillCost(LiveRange * lr1,LiveRange * lr2)6084 static bool compareSpillCost(LiveRange* lr1, LiveRange* lr2)
6085 {
6086     return lr1->getSpillCost() < lr2->getSpillCost() ||
6087         (lr1->getSpillCost() == lr2->getSpillCost() && lr1->getVar()->getId() < lr2->getVar()->getId());
6088 }
6089 
//
// All nodes in the work list are constrained (degree + registers needed exceed
// the available colors); pick one constrained node and move it to the color
// order list.
//
removeConstrained()6094 void GraphColor::removeConstrained()
6095 {
6096     if (!constrainedWorklist.empty())
6097     {
6098         LiveRange* lr = constrainedWorklist.front();
6099         constrainedWorklist.pop_front();
6100 
6101         if (lr->getActive())
6102         {
6103 
6104 #ifdef DEBUG_VERBOSE_ON
6105             DEBUG_VERBOSE(".... Remove Constrained ");
6106             lr->dump();
6107             DEBUG_VERBOSE(std::endl);
6108 #endif
6109 
6110             if (liveAnalysis.livenessClass(G4_GRF))
6111             {
6112                 relaxNeighborDegreeGRF(lr);
6113             }
6114             else
6115             {
6116                 relaxNeighborDegreeARF(lr);
6117             }
6118             colorOrder.push_back(lr);
6119             lr->setActive(false);
6120         }
6121     }
6122 }
6123 
6124 
determineColorOrdering()6125 void GraphColor::determineColorOrdering()
6126 {
6127     numColor = 0;
6128     if (liveAnalysis.livenessClass(G4_GRF))
6129         numColor = totalGRFRegCount;
6130     else if (liveAnalysis.livenessClass(G4_ADDRESS))
6131         numColor = getNumAddrRegisters();
6132     else if (liveAnalysis.livenessClass(G4_FLAG))
6133         numColor = builder.getNumFlagRegisters();
6134 
6135     unsigned numUnassignedVar = liveAnalysis.getNumUnassignedVar();
6136 
6137     //
6138     // create an array for sorting live ranges
6139     //
6140     std::vector<LiveRange*> sorted;
6141     sorted.reserve(numUnassignedVar);
6142     unsigned j = 0;
6143     for (unsigned i = 0; i < numVar; i++)
6144     {
6145         if (lrs[i]->getPhyReg() == nullptr && !lrs[i]->getIsPartialDcl())
6146         {
6147             sorted.push_back(lrs[i]);
6148             j++;
6149         }
6150     }
6151     MUST_BE_TRUE(j == numUnassignedVar, ERROR_GRAPHCOLOR);
6152 
6153     //
6154     // sort the live range array
6155     //
6156     std::sort(sorted.begin(), sorted.end(), compareSpillCost);
6157 
6158     for (unsigned i = 0; i < numUnassignedVar; i++)
6159     {
6160         LiveRange* lr = sorted[i];
6161         unsigned availColor = numColor;
6162         availColor = numColor - lr->getNumForbidden();
6163 
6164         if (lr->getDegree() + lr->getNumRegNeeded() <= availColor)
6165         {
6166             unconstrainedWorklist.push_back(lr);
6167             lr->setActive(false);
6168         }
6169         else
6170         {
6171             constrainedWorklist.push_back(lr);
6172             lr->setActive(true);
6173         }
6174     }
6175 
6176 #ifdef DEBUG_VERBOSE_ON
6177     DEBUG_VERBOSE("\nSPILL COST" << std::endl);
6178     for (unsigned i = 0; i < numUnassignedVar; i++)
6179     {
6180         sorted[i]->dump();
6181         DEBUG_VERBOSE("\t spillCost=" << sorted[i]->getSpillCost());
6182         DEBUG_VERBOSE("\t degree=" << sorted[i]->getDegree());
6183         DEBUG_VERBOSE("\t refCnt=" << sorted[i]->getRefCount());
6184         DEBUG_VERBOSE("\t size=" << sorted[i]->getDcl()->getByteSize());
6185         DEBUG_VERBOSE(std::endl);
6186     }
6187     DEBUG_VERBOSE(std::endl);
6188 #endif
6189 
6190     while (!constrainedWorklist.empty() ||
6191         !unconstrainedWorklist.empty())
6192     {
6193         while (!unconstrainedWorklist.empty())
6194         {
6195             LiveRange* lr = unconstrainedWorklist.front();
6196             unconstrainedWorklist.pop_front();
6197 
6198 #ifdef DEBUG_VERBOSE_ON
6199             DEBUG_VERBOSE(".... Remove Unconstrained ");
6200             lr->dump();
6201             DEBUG_VERBOSE(std::endl);
6202 #endif
6203 
6204             if (liveAnalysis.livenessClass(G4_GRF))
6205             {
6206                 relaxNeighborDegreeGRF(lr);
6207             }
6208             else
6209             {
6210                 relaxNeighborDegreeARF(lr);
6211             }
6212             colorOrder.push_back(lr);
6213         }
6214 
6215         removeConstrained();
6216     }
6217 }
6218 
updateRegUsage(LiveRange * lr)6219 void PhyRegUsage::updateRegUsage(LiveRange* lr)
6220 {
6221     G4_Declare* dcl = lr->getDcl();
6222     G4_VarBase* pr;
6223     if (lr->getIsPartialDcl())
6224     {
6225         pr = lrs[lr->getParentLRID()]->getPhyReg();
6226     }
6227     else
6228     {
6229         pr = lr->getPhyReg();
6230     }
6231 
6232     if (!pr)
6233     {
6234         return;
6235     }
6236     if (pr->isGreg())
6237     {
6238         if (dcl->getIsPartialDcl())
6239         {
6240             //Assumptions:
6241             // 1. the offset of the sub declare must be G4_WSIZE aligned
6242             // 2. the size of the subdeclare must be G4_WSIZE aligned
6243             markBusyForDclSplit(G4_GRF,
6244                 ((G4_Greg*)pr)->getRegNum(),
6245                 (lrs[lr->getParentLRID()]->getPhyRegOff() * TypeSize(dcl->getElemType()) + gra.getSubOffset(dcl)) / G4_WSIZE,
6246                 dcl->getByteSize() / G4_WSIZE,
6247                 dcl->getNumRows());
6248         }
6249         else
6250         {
6251             markBusyGRF(((G4_Greg*)pr)->getRegNum(),
6252                 PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
6253                 dcl->getWordSize(),
6254                 lr->getNumRegNeeded(), dcl->isPreDefinedVar());
6255         }
6256     }
6257     else if (pr->isFlag())
6258     {
6259         auto flagWordOffset = lr->getPhyReg()->asAreg()->getFlagNum() * 2;
6260         markBusyFlag(0,
6261             PhyRegUsage::offsetAllocUnit(
6262                 flagWordOffset + lr->getPhyRegOff(),
6263                 dcl->getElemType()),
6264             PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
6265             dcl->getNumRows());
6266     }
6267     else if (pr->isAreg())
6268     {
6269         markBusyAddress(0,
6270             PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
6271             PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
6272             dcl->getNumRows());
6273     }
6274     else
6275     {
6276         MUST_BE_TRUE(false, ERROR_GRAPHCOLOR); // un-handled reg type
6277     }
6278 }
6279 
assignColors(ColorHeuristic colorHeuristicGRF,bool doBankConflict,bool highInternalConflict,bool honorHints)6280 bool GraphColor::assignColors(ColorHeuristic colorHeuristicGRF, bool doBankConflict, bool highInternalConflict, bool honorHints)
6281 {
6282     if (builder.getOption(vISA_RATrace))
6283     {
6284         std::cout << "\t--" << (colorHeuristicGRF == ROUND_ROBIN ? "round-robin" : "first-fit") <<
6285             (doBankConflict ? " BCR" : "") << " graph coloring\n";
6286     }
6287 
6288     unsigned startARFReg = 0;
6289     unsigned startFLAGReg = 0;
6290     unsigned startGRFReg = 0;
6291     unsigned bank1_end = 0;
6292     unsigned bank2_end = totalGRFRegCount - 1;
6293     unsigned bank1_start = 0;
6294     unsigned bank2_start = totalGRFRegCount - 1;
6295     unsigned totalGRFNum = kernel.getNumRegTotal();
6296     bool oneGRFBankDivision = gra.kernel.fg.builder->oneGRFBankDivision();
6297     bool allocFromBanks = liveAnalysis.livenessClass(G4_GRF) && builder.lowHighBundle() &&
6298         !builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum) &&
6299         doBankConflict &&
6300         ((oneGRFBankDivision && gra.kernel.getSimdSize() >= g4::SIMD16) || (!oneGRFBankDivision && highInternalConflict));
6301 
6302     if (allocFromBanks &&
6303         (colorHeuristicGRF == ROUND_ROBIN))
6304     {
6305         bank1_end = (unsigned)((totalGRFRegCount - 1) * (((float)evenTotalDegree / evenTotalRegNum) / (((float)evenTotalDegree / evenTotalRegNum) + ((float)oddTotalDegree / oddTotalRegNum))));
6306         if (bank1_end < evenMaxRegNum ||
6307             totalGRFRegCount - bank1_end < oddMaxRegNum ||
6308             bank1_end == totalGRFRegCount - 1 ||
6309             bank1_end == 0)
6310         {
6311             return false;
6312         }
6313 
6314         bank2_end = bank1_end + 1;
6315     }
6316 
6317     bool* availableGregs = (bool *)mem.alloc(sizeof(bool)* totalGRFNum);
6318     uint32_t* availableSubRegs = (uint32_t *)mem.alloc(sizeof(uint32_t)* totalGRFNum);
6319     bool* availableAddrs = (bool *)mem.alloc(sizeof(bool)* getNumAddrRegisters());
6320     bool* availableFlags = (bool *)mem.alloc(sizeof(bool)* builder.getNumFlagRegisters());
6321     uint8_t* weakEdgeUsage = (uint8_t*)mem.alloc(sizeof(uint8_t)*totalGRFNum);
6322     G4_RegFileKind rFile = G4_GRF;
6323     if (liveAnalysis.livenessClass(G4_FLAG))
6324         rFile = G4_FLAG;
6325     else if (liveAnalysis.livenessClass(G4_ADDRESS))
6326         rFile = G4_ADDRESS;
6327 
6328     unsigned maxGRFCanBeUsed = totalGRFRegCount;
6329     PhyRegUsageParms parms(gra, lrs, rFile, maxGRFCanBeUsed, startARFReg, startFLAGReg, startGRFReg, bank1_start, bank1_end, bank2_start, bank2_end,
6330         doBankConflict, availableGregs, availableSubRegs, availableAddrs, availableFlags, weakEdgeUsage);
6331     bool noIndirForceSpills = builder.getOption(vISA_NoIndirectForceSpills);
6332 
6333     auto& varSplitPass = *gra.getVarSplitPass();
6334 
6335     // Returns true when valid assignment is found or when lr is added to spilled set.
6336     // Adding to spill set happens only if heuristic is not round_robin (FF may not spill).
6337     // Parameter returnFalseOnFail is set when the function is required to return false on
6338     // assignment failure.
6339     // When parameter spillAllowed is set to true, this function adds lr to spilled set. If
6340     // spillAllowed is false, the lr is not added to spill set. This logic is useful to
6341     // try re-allocation of a child/parent dcl when split is enabled.
6342     // ignoreChildrenIntf is set to true when all children are assigned to consecutive ranges
6343     // and we want to get fully coalesceable assignment for parent. In such circumstance, we
6344     // dont want to account for interference between parent/child since doing so cannot result
6345     // in a coalesceable assignment.
6346     auto assignColor = [&](LiveRange* lr, bool ignoreChildrenIntf = false, bool spillAllowed = true, bool returnFalseOnFail = false)
6347     {
6348         auto lrVar = lr->getVar();
6349 
6350         //
6351         // assign register to live ranges
6352         //
6353         if (lr->getPhyReg() == NULL && !lrVar->isSpilled() && !lr->getIsPartialDcl()) // no assigned register yet and not spilled
6354         {
6355             G4_Declare* parentDcl = nullptr;
6356             bool skipParentIntf = false;
6357             if (lr->hasAllocHint())
6358             {
6359                 parms.startGRFReg = (lr->getAllocHint() >= maxGRFCanBeUsed ? 0 : lr->getAllocHint());
6360                 if (varSplitPass.isPartialDcl(lr->getDcl()))
6361                 {
6362                     parentDcl = varSplitPass.getParentDcl(lr->getDcl());
6363                     if (parentDcl)
6364                     {
6365                         auto parentGRF = parentDcl->getRegVar()->getPhyReg();
6366                         if (!parentGRF && parentDcl->getRegVar()->isRegAllocPartaker())
6367                         {
6368                             parentGRF = lrs[parentDcl->getRegVar()->getId()]->getPhyReg();
6369                         }
6370                         if (parentGRF)
6371                         {
6372                             // mark interference between partial lr and all
6373                             // other GRFs allocated to parent dcl. this logic
6374                             // allows either coalesceable allocation or a
6375                             // fully non-overlapping assignment.
6376                             auto siblingNum = varSplitPass.getSiblingNum(lr->getDcl());
6377                             auto parentGRFNum = parentGRF->asGreg()->getRegNum();
6378                             auto parentNumRows = parentDcl->getNumRows();
6379                             auto numRows = lr->getDcl()->getNumRows();
6380                             for (unsigned i = parentGRFNum; i != (parentGRFNum + parentNumRows); i += numRows)
6381                             {
6382                                 if ((i - parentGRFNum) == siblingNum * numRows)
6383                                     continue;
6384                                 lr->markForbidden(i, numRows);
6385                             }
6386                             skipParentIntf = true;
6387                         }
6388                     }
6389                 }
6390             }
6391 
6392             unsigned lr_id = lrVar->getId();
6393             //
6394             // compute what registers are already assigned
6395             //
6396             PhyRegUsage regUsage(parms);
6397 
6398             const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(lr_id);
6399             auto weakEdgeSet = intf.getCompatibleSparseIntf(lrVar->getDeclare()->getRootDeclare());
6400             for (auto it : intfs)
6401             {
6402                 LiveRange* lrTemp = lrs[it];
6403                 if (lrTemp->getPhyReg() != nullptr || lrTemp->getIsPartialDcl())
6404                 {
6405                     if (lrTemp->getIsSplittedDcl())  //Only interfere with children declares
6406                     {
6407                         continue;
6408                     }
6409 
6410                     if (skipParentIntf && lrTemp->getDcl() == parentDcl)
6411                         continue;
6412 
6413                     if (ignoreChildrenIntf && varSplitPass.isParentChildRelation(lr->getDcl(), lrTemp->getDcl()))
6414                         continue;
6415 
6416                     regUsage.updateRegUsage(lrTemp);
6417                 }
6418             }
6419 
6420             if (weakEdgeSet)
6421             {
6422                 regUsage.runOverlapTest(true);
6423                 for (auto weakDcl : *weakEdgeSet)
6424                 {
6425                     auto regVar = weakDcl->getRootDeclare()->getRegVar();
6426                     unsigned pvar = 0, numRegs = 0;
6427                     if (regVar->isPhyRegAssigned())
6428                     {
6429                         // This branch will be taken for dcls assigned
6430                         // regs by LRA.
6431                         pvar = regVar->getPhyReg()->asGreg()->getRegNum();
6432                         numRegs = weakDcl->getNumRows();
6433                     }
6434                     else
6435                     {
6436                         // For dcls not assigned regs by LRA, lookup temp
6437                         // registers assigned to LiveRange instances.
6438                         auto id = regVar->getId();
6439                         auto lr = lrs[id];
6440                         auto phyReg = lr->getPhyReg();
6441                         if (phyReg)
6442                         {
6443                             pvar = phyReg->asGreg()->getRegNum();
6444                             numRegs = weakDcl->getNumRows();
6445                         }
6446                     }
6447 
6448                     // For now it is assumed only 8-byte types will appear
6449                     // here. If other sized types will also appear then
6450                     // augmentation mask also needs to be sent in
6451                     // weak edge data structure below.
6452                     for (unsigned r = pvar; r < (pvar + numRegs); r++)
6453                     {
6454                         auto use = regUsage.getWeakEdgeUse(r);
6455                         if (use == 0 || use == (r - pvar + 1))
6456                         {
6457                             regUsage.setWeakEdgeUse(r, r - pvar + 1);
6458                         }
6459                         else
6460                         {
6461                             // Indiates two neighbors use a physical
6462                             // register with different overlap.
6463                             regUsage.setWeakEdgeUse(r, 0xff);
6464                         }
6465                     }
6466                 }
6467             }
6468 
6469             ColorHeuristic heuristic = colorHeuristicGRF;
6470 
6471             bool failed_alloc = false;
6472             G4_Declare* dcl = lrVar->getDeclare();
6473 
6474             if (!(noIndirForceSpills &&
6475                 liveAnalysis.isAddressSensitive(lr_id)) &&
6476                 forceSpill &&
6477                 (dcl->getRegFile() == G4_GRF || dcl->getRegFile() == G4_FLAG) &&
6478                 lr->getRefCount() != 0 &&
6479                 lr->getSpillCost() != MAXSPILLCOST)
6480             {
6481                 failed_alloc = true;
6482             }
6483 
6484             if (dcl->getNumRows() > totalGRFNum)
6485             {
6486                 // we sure as hell won't get an assignment
6487                 failed_alloc = true;
6488             }
6489 
6490             if (!failed_alloc)
6491             {
6492                 // When evenAlignNeeded is true, it is binding for correctness
6493                 bool evenAlignNeeded = gra.isEvenAligned(lrVar->getDeclare());
6494                 BankAlign align = evenAlignNeeded ? BankAlign::Even : BankAlign::Either;
6495                 if (allocFromBanks && !lr->hasAllocHint())
6496                 {
6497 
6498                     if (!isHybrid && oneGRFBankDivision &&
6499                         (!evenAlignNeeded || getPlatformGeneration(builder.getPlatform()) == PlatformGen::GEN9))
6500                     {
6501                         gra.getBankAlignment(lr, align);
6502                     }
6503                     failed_alloc |= !regUsage.assignGRFRegsFromBanks(lr, align, lr->getForbidden(),
6504                         heuristic, oneGRFBankDivision);
6505                 }
6506                 else
6507                 {
6508                     failed_alloc |= !regUsage.assignRegs(highInternalConflict, lr, lr->getForbidden(),
6509                         align, gra.getSubRegAlign(lrVar->getDeclare()), heuristic, lr->getSpillCost(),
6510                         lr->hasAllocHint());
6511                 }
6512             }
6513 
6514             //
6515             // assign unused color
6516             //
6517             if (failed_alloc)
6518             {
6519                 //
6520                 // for GRF register assignment, if we are performing round-robin (1st pass) then abort on spill
6521                 //
6522                 if ((heuristic == ROUND_ROBIN || (doBankConflict && !kernel.getOption(vISA_forceBCR))) &&
6523                     (lr->getRegKind() == G4_GRF || lr->getRegKind() == G4_FLAG))
6524                 {
6525                     return false;
6526                 }
6527                 else if (kernel.fg.isPseudoDcl(dcl))
6528                 {
6529                     // these pseudo dcls are not (and cannot be) spilled, but instead save/restore code will
6530                     // be inserted in stack call prolog/epilog
6531                 }
6532                 else
6533                 {
6534                     // for first-fit register assignment track spilled live ranges
6535                     if (spillAllowed)
6536                     {
6537                         // When retrying a coalesceable assignment, dont spill
6538                         // if there is no GRF available.
6539                         spilledLRs.push_back(lr);
6540                         lr->setSpilled(true);
6541                     }
6542                 }
6543 
6544                 if (returnFalseOnFail)
6545                 {
6546                     return false;
6547                 }
6548             }
6549             else
6550             {
6551                 // Allocation succeeded, set hint if this is a split/child dcl
6552                 if (!ignoreChildrenIntf &&
6553                     (varSplitPass.isSplitDcl(dcl) || varSplitPass.isPartialDcl(dcl)))
6554                 {
6555                     varSplitPass.writeHints(dcl, lrs);
6556                 }
6557             }
6558         }
6559 #ifdef DEBUG_VERBOSE_ON
6560         lr->dump();
6561         COUT_ERROR << std::endl;
6562 #endif
6563         return true;
6564     };
6565 
6566     // colorOrder is in reverse order (unconstrained at front)
6567     for (auto iter = colorOrder.rbegin(), iterEnd = colorOrder.rend(); iter != iterEnd; ++iter)
6568     {
6569         auto lr = (*iter);
6570 
6571         // in case child/parent was already spilled earlier, dont recolor
6572         if (lr->isSpilled())
6573             continue;
6574 
6575         bool ret = assignColor(lr);
6576 
6577         // early exit
6578         if (!ret)
6579             return false;
6580 
6581         if (lr->getSpillCost() == MAXSPILLCOST &&
6582             !lr->getPhyReg() &&
6583             honorHints)
6584         {
6585             // infinite spill cost range spilled
6586             // undo all allocations done to split vars
6587             // and skip adhering to hints for preserving
6588             // correctness.
6589             resetTemporaryRegisterAssignments();
6590             return assignColors(colorHeuristicGRF, doBankConflict, highInternalConflict, false);
6591         }
6592 
6593         if (honorHints && gra.getIterNo() == 0)
6594         {
6595             // attempt coalescing in non-spill iteration only
6596             if (varSplitPass.isSplitDcl(lr->getDcl()))
6597             {
6598                 // Try allocating children, out of order in hopes
6599                 // of getting a coalesceable assignment
6600                 auto children = varSplitPass.getChildren(lr->getDcl());
6601                 for (auto child : *children)
6602                 {
6603                     if (child->getRegVar()->isRegAllocPartaker())
6604                     {
6605                         auto childLR = lrs[child->getRegVar()->getId()];
6606                         if (!childLR->getPhyReg())
6607                         {
6608                             auto isChildSpilled = childLR->isSpilled();
6609                             assignColor(childLR, false, !isChildSpilled);
6610                             // if allocated GRF is different than hint, then
6611                             // undo allocation and let coloring take its course.
6612                             // this can be done only if the childLR wasnt
6613                             // already processed in colorOrder.
6614                             if (!isChildSpilled && childLR->getPhyReg())
6615                             {
6616                                 auto hint = childLR->getAllocHint();
6617                                 if (childLR->getPhyReg()->asGreg()->getRegNum() != hint)
6618                                 {
6619                                     // this is executed only if childLR is guaranteed to be
6620                                     // processed later on in colorOrder.
6621                                     childLR->resetPhyReg();
6622                                 }
6623                             }
6624                             else if (isChildSpilled && childLR->getPhyReg())
6625                             {
6626                                 // was spilled earlier, got allocation now
6627                                 spilledLRs.remove(childLR);
6628                             }
6629                         }
6630                         else
6631                         {
6632                             // retry allocating as per hint
6633                             auto oldPhyReg = childLR->getPhyReg();
6634                             auto oldPhySubReg = childLR->getPhyRegOff();
6635                             auto hint = childLR->getAllocHint();
6636                             if (oldPhyReg->asGreg()->getRegNum() == hint)
6637                                 continue;
6638                             childLR->resetPhyReg();
6639                             bool success = assignColor(childLR, false, false, true);
6640                             if (!success || childLR->getPhyReg()->asGreg()->getRegNum() != hint)
6641                                 childLR->setPhyReg(oldPhyReg, oldPhySubReg);
6642                         }
6643                     }
6644                 }
6645             }
6646 
6647             // if all children are assigned consecutive GRFs but parent isnt
6648             // then try re-assigning parent
6649             if (varSplitPass.isPartialDcl(lr->getDcl()) &&
6650                 varSplitPass.reallocParent(lr->getDcl(), getLiveRanges()))
6651             {
6652                 auto parentDcl = varSplitPass.getParentDcl(lr->getDcl());
6653                 auto parentLR = getLiveRanges()[parentDcl->getRegVar()->getId()];
6654                 auto oldPhyReg = parentLR->getPhyReg();
6655                 auto oldPhySubReg = parentLR->getPhyRegOff();
6656                 bool isParentSpilled = parentLR->isSpilled();
6657                 parentLR->resetPhyReg();
6658                 varSplitPass.writeHints(lr->getDcl(), getLiveRanges());
6659                 assignColor(parentLR, true, !isParentSpilled);
6660                 // If parent's assigned GRF is non-coalesceable assignment then
6661                 // undo it as it is risky to keep this because parent's intf
6662                 // doesnt include children.
6663                 auto newParentAssignment = parentLR->getPhyReg();
6664                 if ((newParentAssignment && newParentAssignment->asGreg()->getRegNum() != parentLR->getAllocHint()) ||
6665                     !newParentAssignment)
6666                     parentLR->setPhyReg(oldPhyReg, oldPhySubReg);
6667 
6668                 if (isParentSpilled && parentLR->getPhyReg())
6669                 {
6670                     // remove parent from spill list since it got an allocation this time
6671                     spilledLRs.remove(parentLR);
6672                     parentLR->setSpilled(false);
6673                 }
6674             }
6675         }
6676     }
6677 
6678     // record RA type
6679     if (liveAnalysis.livenessClass(G4_GRF))
6680     {
6681         if (colorHeuristicGRF == ROUND_ROBIN)
6682         {
6683             kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_RR_BC_RA : RA_Type::GRAPH_COLORING_RR_RA);
6684         }
6685         else
6686         {
6687             kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_FF_BC_RA : RA_Type::GRAPH_COLORING_FF_RA);
6688         }
6689     }
6690 
6691 #ifdef _DEBUG
6692     // Verify that spilledLRs has no duplicate
6693     for (auto item : spilledLRs)
6694     {
6695         unsigned count = 0;
6696         for (auto checkItem : spilledLRs)
6697         {
6698             if (checkItem == item)
6699             {
6700                 MUST_BE_TRUE(count == 0, "Duplicate entry found in spilledLRs");
6701                 count++;
6702             }
6703         }
6704     }
6705 
6706     // Verify that none of spilledLRs have an allocation
6707     for (auto lr : spilledLRs)
6708     {
6709         MUST_BE_TRUE(lr->getPhyReg() == nullptr, "Spilled LR contains valid allocation");
6710     }
6711 
6712     // Verify that all spilled LRs are synced
6713     for (auto lr : spilledLRs)
6714     {
6715         MUST_BE_TRUE(lr->isSpilled(), "LR not marked as spilled, but inserted in spilledLRs list");
6716     }
6717 
6718     // Verify if all LRs have either an allocation or are spilled
6719     for (auto lr : colorOrder)
6720     {
6721         if (!kernel.fg.isPseudoDcl(lr->getDcl()))
6722         {
6723             MUST_BE_TRUE(lr->isSpilled() || lr->getPhyReg() || lr->getDcl()->isSpilled(), "Range without allocation and not spilled");
6724         }
6725     }
6726 #endif
6727 
6728     return true;
6729 }
6730 
6731 template <class REGION_TYPE>
getRegionDisp(REGION_TYPE * region)6732 unsigned GlobalRA::getRegionDisp(
6733     REGION_TYPE * region
6734 )
6735 {
6736     unsigned rowOffset = numEltPerGRF<Type_UB>() * region->getRegOff();
6737     unsigned columnOffset = region->getSubRegOff() * region->getElemSize();
6738     return rowOffset + columnOffset;
6739 }
6740 
addEUFusionWAInsts(G4_INST * inst)6741 void GlobalRA::addEUFusionWAInsts(G4_INST* inst)
6742 {
6743     if(EUFusionWANeeded())
6744         EUFusionWAInsts.insert(inst);
6745 }
6746 
getRegionByteSize(G4_DstRegRegion * region,unsigned execSize)6747 unsigned GlobalRA::getRegionByteSize(
6748     G4_DstRegRegion * region,
6749     unsigned          execSize
6750 )
6751 {
6752     unsigned size = region->getHorzStride() * region->getElemSize() *
6753         (execSize - 1) + region->getElemSize();
6754 
6755     return size;
6756 }
6757 
6758 #define OWORD_BYTE_SIZE 16
6759 
6760 template <class REGION_TYPE>
isUnalignedRegion(REGION_TYPE * region,unsigned execSize)6761 bool GlobalRA::isUnalignedRegion(
6762     REGION_TYPE * region,
6763     unsigned      execSize
6764 )
6765 {
6766     unsigned regionDisp = getRegionDisp(region);
6767     unsigned regionByteSize = getRegionByteSize(region, execSize);
6768 
6769     if (regionDisp%numEltPerGRF<Type_UB>() == 0 && regionByteSize%numEltPerGRF<Type_UB>() == 0)
6770     {
6771         return
6772             regionByteSize / numEltPerGRF<Type_UB>() != 1 &&
6773             regionByteSize / numEltPerGRF<Type_UB>() != 2 &&
6774             regionByteSize / numEltPerGRF<Type_UB>() != 4;
6775     }
6776     return true;
6777 
6778 }
6779 
shouldPreloadDst(G4_INST * instContext,G4_BB * curBB)6780 bool GlobalRA::shouldPreloadDst(
6781     G4_INST *         instContext,
6782     G4_BB*            curBB
6783 )
6784 {
6785     // Check for partial and unaligned regions and add pre-load code, if
6786     // necessary.
6787     auto spilledRangeRegion = instContext->getDst();
6788     uint8_t execSize = instContext->getExecSize();
6789 
6790     if (isPartialRegion(spilledRangeRegion, execSize) ||
6791         isUnalignedRegion(spilledRangeRegion, execSize) ||
6792         instContext->isPartialWriteForSpill(!curBB->isAllLaneActive())) {
6793         return true;
6794     }
6795     // No pre-load for whole and aligned region writes
6796     else {
6797         return false;
6798     }
6799 }
6800 
livenessCandidate(const G4_Declare * decl) const6801 bool GlobalRA::livenessCandidate(const G4_Declare* decl) const
6802 {
6803     if (decl->getAliasDeclare())
6804     {
6805         return false;
6806     }
6807 
6808     if ((G4_GRF & decl->getRegFile()))
6809     {
6810         if ((decl->getRegFile() & G4_INPUT) && decl->getRegVar()->isPhyRegAssigned() && !decl->getRegVar()->isGreg())
6811         {
6812             return false;
6813         }
6814         if (decl->getByteSize() == 0)
6815         {
6816             // regrettably, this can happen for arg/retval pre-defined variable
6817             return false;
6818         }
6819         return true;
6820     }
6821     else
6822     {
6823         return false;
6824     }
6825 }
6826 
// Compute an upper bound on the number of GRFs that fail-safe RA must
// reserve for spill/fill temporaries, by taking the per-instruction
// maximum over the whole kernel.
//
// Per instruction kind:
//  - send:  dst + src0 (+ src1 for split send) payload lengths add up,
//           since all operands may need temporaries at once;
//  - dpas:  dst rows plus the sum of all GRF src rows;
//  - other: dst footprint (x2, or x3 when a pre-load is needed) vs. src
//           footprint; indirect operands contribute the rows of every
//           variable their address register may point to.
//
// spillRegSize     - [in/out] running max of GRFs for direct operands
// indrSpillRegSize - [in/out] running max of GRFs for indirectly
//                    addressed variables
void GlobalRA::determineSpillRegSize(unsigned& spillRegSize, unsigned& indrSpillRegSize)
{
    // Iterate over all BBs
    for (auto curBB : kernel.fg)
    {
        // Iterate over all insts
        for (INST_LIST_ITER inst_it = curBB->begin(), iend = curBB->end(); inst_it != iend; ++inst_it)
        {
            unsigned currentSpillRegSize = 0;
            unsigned currentIndrSpillRegSize = 0;

            G4_INST* curInst = (*inst_it);

            // Pseudo kills/lifetime markers and fcall/fret never receive
            // spill/fill code, so they cannot drive the reserve size.
            if (curInst->isPseudoKill() ||
                curInst->isLifeTimeEnd() ||
                curInst->opcode() == G4_pseudo_fcall ||
                curInst->opcode() == G4_pseudo_fret)
            {
                continue;
            }

            if (curInst->isSend())
            {
                G4_SendDesc* msgDesc = curInst->getMsgDesc();

                unsigned dstSpillRegSize = 0;
                dstSpillRegSize = msgDesc->getDstLenRegs();

                unsigned src0FillRegSize = 0;
                src0FillRegSize = msgDesc->getSrc0LenRegs();

                unsigned src1FillRegSize = 0;
                if (curInst->isSplitSend())
                {
                    src1FillRegSize = msgDesc->getSrc1LenRegs();
                }

                if (!kernel.fg.builder->useSends())
                {
                    // One extra GRF when split sends are unavailable —
                    // presumably for the combined payload; TODO confirm.
                    dstSpillRegSize++;
                }

                // All send operands may be live simultaneously: sum, not max.
                currentSpillRegSize = dstSpillRegSize + src0FillRegSize + src1FillRegSize;
            }
            else if (curInst->isDpas())
            {
                unsigned dstSpillRegSize = 0;
                G4_DstRegRegion* dst = curInst->getDst();
                if (dst && dst->getBase()->isRegVar())
                {
                    dstSpillRegSize = dst->getBase()->asRegVar()->getDeclare()->getNumRows();
                }

                unsigned srcFillRegSize = 0;
                for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++)
                {
                    G4_Operand* src = curInst->getSrc(i);

                    if (src &&
                        src->isSrcRegRegion() &&
                        src->asSrcRegRegion()->getBase()->isRegVar())
                    {
                        if (src->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRegFile() == G4_GRF)
                        {
                            unsigned srcSize = src->getBase()->asRegVar()->getDeclare()->getNumRows();
                            //FIXME, currently we only use the max src size.
                            //To save the spill registers, it's better the space can be determined by checking if the variable is really spilled or not.
                            srcFillRegSize += srcSize;
                        }
                    }
                }
                currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
            }
            else
            {
                // Generic ALU instruction: size dst and srcs separately,
                // tracking indirect targets in indrVars to avoid counting
                // the same pointed-to variable twice.
                ORG_REGVAR_VECTOR indrVars;

                unsigned dstSpillRegSize = 0;
                unsigned indrDstSpillRegSize = 0;
                if (G4_Inst_Table[curInst->opcode()].n_dst == 1)
                {
                    G4_DstRegRegion* dst = curInst->getDst();

                    if (dst &&
                        dst->getBase()->isRegVar())
                    {
                        if (dst->getBase()->asRegVar()->getDeclare()->getRegFile() == G4_GRF)
                        {
                            if (dst->isCrossGRFDst())
                            {
                                dstSpillRegSize = 2;
                            }
                            else
                            {
                                dstSpillRegSize = 1;
                            }

                            // x3 when a pre-load of the dst is required
                            // (fill + compute + store), otherwise x2.
                            if (shouldPreloadDst(curInst, curBB))
                            {
                                dstSpillRegSize *= 3;
                            }
                            else
                            {
                                dstSpillRegSize *= 2;
                            }

                            if (!kernel.fg.builder->useSends())
                            {
                                dstSpillRegSize++;
                            }
                        }
                        else if (dst->getRegAccess() == IndirGRF)
                        {
                            // Indirect dst: every variable the address reg
                            // may point to could need spill space.
                            auto pointsToSet = pointsToAnalysis.getAllInPointsTo(dst->getBase()->asRegVar());
                            if (pointsToSet != nullptr)
                            {
                                for (auto pt : *pointsToSet)
                                {
                                    if (pt.var->isRegAllocPartaker() ||
                                       ((builder.getOption(vISA_HybridRAWithSpill) || builder.getOption(vISA_FastCompileRA)) && livenessCandidate(pt.var->getDeclare())))
                                    {
                                        indrVars.push_back(pt.var);
                                        indrDstSpillRegSize += pt.var->getDeclare()->getNumRows();
                                    }
                                }
                            }
                        }
                    }
                }

                unsigned srcFillRegSize = 0;
                unsigned indirSrcFillRegSize = 0;
                // Scan srcs
                for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++)
                {
                    G4_Operand* src = curInst->getSrc(i);

                    if (src &&
                        src->isSrcRegRegion() &&
                        src->asSrcRegRegion()->getBase()->isRegVar())
                    {
                        if (src->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRegFile() == G4_GRF)
                        {
                            if (src->asSrcRegRegion()->crossGRF())
                            {
                                srcFillRegSize += 2;
                            }
                            else
                            {
                                srcFillRegSize += 1;
                            }
                        }
                        else if (src->asSrcRegRegion()->getRegAccess() == IndirGRF)
                        {
                            auto pointsToSet = pointsToAnalysis.getAllInPointsTo(src->asSrcRegRegion()->getBase()->asRegVar());
                            if (pointsToSet != nullptr)
                            {
                                for (auto pt : *pointsToSet)
                                {
                                    if (pt.var->isRegAllocPartaker() ||
                                        ((builder.getOption(vISA_HybridRAWithSpill) || builder.getOption(vISA_FastCompileRA)) && livenessCandidate(pt.var->getDeclare())))
                                    {
                                        // Skip variables already counted via the dst.
                                        if (std::find(indrVars.begin(), indrVars.end(), pt.var) == indrVars.end())
                                        {
                                            indrVars.push_back(pt.var);
                                            indirSrcFillRegSize += pt.var->getDeclare()->getNumRows();
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                // With dst/src overlap avoidance, fills and the spill dst
                // coexist, so sum; otherwise take the larger of the two.
                if (builder.avoidDstSrcOverlap())
                {
                    currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
                }
                else
                {
                    currentSpillRegSize = srcFillRegSize > dstSpillRegSize ? srcFillRegSize : dstSpillRegSize;
                }
                currentIndrSpillRegSize = indrDstSpillRegSize + indirSrcFillRegSize;
            }

            spillRegSize = std::max(spillRegSize, currentSpillRegSize);
            indrSpillRegSize = std::max(indrSpillRegSize, currentIndrSpillRegSize);
        }
    }
}
7017 
7018 
// Top-level graph-coloring driver for one register class.
//
// Phases: (optionally) reserve fail-safe spill GRFs, build live ranges,
// compute interference, degrees and spill costs, pick a coloring order,
// fix up sub-register alignment, then assign colors. For GRFs, coloring
// is first attempted round-robin and retried first-fit on failure; when
// the RP estimate predicts certain spilling, it goes straight to
// first-fit to save compile time.
//
// doBankConflictReduction - enable bank-conflict-aware allocation
// highInternalConflict    - bank-conflict heuristic input
// reserveSpillReg         - fail-safe RA: carve out spill temporaries
// spillRegSize            - [out] reserved GRFs for direct spills
// indrSpillRegSize        - [out] reserved GRFs for indirect spills
// rpe                     - register-pressure estimate (may predict spills)
//
// Returns true if allocation completed without requiring spill code.
bool GraphColor::regAlloc(
    bool doBankConflictReduction,
    bool highInternalConflict,
    bool reserveSpillReg, unsigned& spillRegSize, unsigned& indrSpillRegSize,
    const RPE* rpe)
{

    bool useSplitLLRHeuristic = false;

    if (builder.getOption(vISA_RATrace))
    {
        std::cout << "\t--# variables: " << liveAnalysis.getNumSelectedVar() << "\n";
    }

    // Fail-safe RA: shrink the allocatable GRF pool by the reserve size so
    // spill/fill code always has scratch registers available.
    unsigned reserveSpillSize = 0;
    if (reserveSpillReg)
    {
        gra.determineSpillRegSize(spillRegSize, indrSpillRegSize);
        reserveSpillSize = spillRegSize + indrSpillRegSize;
        MUST_BE_TRUE(reserveSpillSize < kernel.getNumCalleeSaveRegs(), "Invalid reserveSpillSize in fail-safe RA!");
        totalGRFRegCount -= reserveSpillSize;
    }

    // Copy over alignment for vars inserted by RA
    gra.copyMissingAlignment();

    //
    // create an array of live ranges.
    //
    createLiveRanges(reserveSpillSize);
    //
    // set the pre-assigned registers
    //
    for (unsigned i = 0; i < numVar; i++)
    {
        if (lrs[i]->getVar()->getPhyReg())
        {
            lrs[i]->setPhyReg(lrs[i]->getVar()->getPhyReg(), lrs[i]->getVar()->getPhyRegOff());
        }

        G4_Declare* dcl = lrs[i]->getDcl();
        if (!useSplitLLRHeuristic)
        {
            auto dclLR = gra.getLocalLR(dcl);

            // Any split local live range switches spill-cost computation
            // to the split heuristic below.
            if (dclLR != nullptr &&
                dclLR->getSplit())
            {
                useSplitLLRHeuristic = true;
            }
        }

    }

    //
    // compute interference matrix
    //
    intf.init(mem);
    intf.computeInterference();

    TIME_SCOPE(COLORING);
    //
    // compute degree and spill costs for each live range
    //
    if (liveAnalysis.livenessClass(G4_GRF))
    {
        computeDegreeForGRF();
    }
    else
    {
        computeDegreeForARF();
    }
    computeSpillCosts(useSplitLLRHeuristic);

    if (kernel.getOption(vISA_DumpRAIntfGraph))
        intf.dumpInterference();
    //
    // determine coloring order
    //
    determineColorOrdering();

    //
    // Set up the sub-reg alignment from declare information
    //
    for (unsigned i = 0; i < numVar; i++)
    {
        G4_Declare* dcl = lrs[i]->getDcl();

        if (gra.getSubRegAlign(dcl) == Any && !dcl->getIsPartialDcl())
        {
            //
            // multi-row, subreg alignment = 16 words
            //
            if (dcl->getNumRows() > 1)
            {
                gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(), GRFALIGN);
            }
            //
            // single-row
            //
            else if (gra.getSubRegAlign(lrs[i]->getVar()->getDeclare()) == Any)
            {
                //
                // set up Odd word or Even word sub reg alignment
                //
                unsigned nbytes = dcl->getNumElems() * TypeSize(dcl->getElemType());
                unsigned nwords = nbytes / G4_WSIZE + nbytes % G4_WSIZE;
                if (nwords >= 2 && lrs[i]->getRegKind() == G4_GRF)
                {
                    gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(), Even_Word);
                }
            }
        }
    }
    //
    // assign registers for GRFs, GRFs are first attempted to be assigned using round-robin and if it fails
    // then we retry using a first-fit heuristic.
    //
    if (liveAnalysis.livenessClass(G4_GRF))
    {
        bool hasStackCall = kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();

        bool willSpill = ((builder.getOption(vISA_FastCompileRA) || builder.getOption(vISA_HybridRAWithSpill)) && !hasStackCall) ||
            (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
            rpe->getMaxRP() >= kernel.getNumRegTotal() + 24);
        if (willSpill)
        {
            // go straight to first_fit to save compile time since we are definitely spilling
            // we do this for 3D only since with indirect/subroutine the RP pressure can be very unreliable
            // FIXME: due to factors like local split and scalar variables that are not accurately modeled in RP estimate,
            // RA may succeed even when RP is > total #GRF. We should investigate these cases and fix RPE
            assignColors(FIRST_FIT, false, false);
            //assert(requireSpillCode() && "inaccurate GRF pressure estimate");
            return !requireSpillCode();
        }

        if (kernel.getOption(vISA_RoundRobin) && !hasStackCall && !gra.isReRAPass())
        {
            // Round-robin first; on failure undo temporary assignments and
            // retry first-fit, then (unless forced BCR) once more without
            // bank-conflict reduction.
            if (assignColors(ROUND_ROBIN, doBankConflictReduction, highInternalConflict) == false)
            {
                resetTemporaryRegisterAssignments();
                bool success = assignColors(FIRST_FIT, doBankConflictReduction, highInternalConflict);

                if (!success && doBankConflictReduction && isHybrid)
                {
                    return false;
                }

                if (!kernel.getOption(vISA_forceBCR))
                {
                    if (!success && doBankConflictReduction)
                    {
                        resetTemporaryRegisterAssignments();
                        kernel.getOptions()->setOption(vISA_enableBundleCR, false);
                        assignColors(FIRST_FIT, false, false);
                        kernel.getOptions()->setOption(vISA_enableBundleCR, true);
                    }
                }
            }
        }
        else
        {
            bool success = assignColors(FIRST_FIT, true, highInternalConflict);
            if (!success)
            {
                resetTemporaryRegisterAssignments();
                assignColors(FIRST_FIT, false, false);
            }
        }
    }
    else if (liveAnalysis.livenessClass(G4_FLAG))
    {
        if (kernel.getOption(vISA_RoundRobin))
        {
            if (assignColors(ROUND_ROBIN, false, false) == false)
            {
                resetTemporaryRegisterAssignments();
                assignColors(FIRST_FIT, false, false);
            }
        }
        else
        {
            assignColors(FIRST_FIT, false, false);
        }
    }
    else
    {
        // assign registers for ARFs using a first-fit heuristic
        assignColors(FIRST_FIT, false, false);
    }

    return (requireSpillCode() == false);
}
7212 
confirmRegisterAssignments()7213 void GraphColor::confirmRegisterAssignments()
7214 {
7215     for (unsigned i = 0; i < numVar; i++)
7216     {
7217         if (lrs[i]->getPhyReg()) {
7218             if (lrs[i]->getVar()->getPhyReg()) {
7219                 MUST_BE_TRUE((lrs[i]->getVar()->getPhyReg() == lrs[i]->getPhyReg()), ERROR_GRAPHCOLOR);
7220             }
7221             else {
7222                 lrs[i]->getVar()->setPhyReg(lrs[i]->getPhyReg(), lrs[i]->getPhyRegOff());
7223             }
7224         }
7225     }
7226 }
7227 
resetTemporaryRegisterAssignments()7228 void GraphColor::resetTemporaryRegisterAssignments()
7229 {
7230     for (unsigned i = 0; i < numVar; i++)
7231     {
7232         if (lrs[i]->getVar()->getPhyReg() == NULL) {
7233             lrs[i]->resetPhyReg();
7234             lrs[i]->resetAllocHint();
7235             lrs[i]->setSpilled(false);
7236         }
7237     }
7238     spilledLRs.clear();
7239 }
7240 
// Remove redundant address-register (ARF) fill instructions: a fill that
// reloads an address register with a value it demonstrably already holds
// (per the per-BB spill-location signature spAddrRegSig) is erased.
// The signature is reset at each BB boundary, and any write to either a
// spill location or an address register updates/prunes it so stale
// matches cannot survive.
void GraphColor::cleanupRedundantARFFillCode()
{
    for (G4_BB *bb : builder.kernel.fg)
    {
        // Signatures are tracked per-BB only; no cross-block reasoning.
        clearSpillAddrLocSignature();

        for (std::list<G4_INST*>::iterator i = bb->begin(); i != bb->end();)
        {
            G4_INST* inst = (*i);

            //
            // process writes to spill storage (GRF) of addr regs
            //
            G4_DstRegRegion* dst = inst->getDst();

            // Pseudo-A0 dcl writes and pseudo kills do not affect the
            // tracked signatures; skip them.
            if (dst && dst->getBase() &&
                dst->getBase()->isRegVar() &&
                (kernel.fg.isPseudoA0Dcl(dst->getBase()->asRegVar()->getDeclare()) ||
                    inst->isPseudoKill()))
            {
                i++;
                continue;
            }

            if (dst != NULL &&
                dst->getRegAccess() == Direct) {

                if (dst->getBase()->isRegVar() &&
                    dst->getBase()->asRegVar()->isRegVarAddrSpillLoc())
                {
                    // Write to a spill location: invalidate signatures that
                    // referenced the overwritten slots.
                    pruneActiveSpillAddrLocs(dst, inst->getExecSize(), inst->getExecType());
                }
                //
                // process writes to (allocated) addr regs
                //
                else if (dst->getBase()->isRegAllocPartaker())
                {
                    G4_RegVar* addrReg = dst->getBase()->asRegVar();

                    if (gra.isAddrFlagSpillDcl(addrReg->getDeclare()))
                    {
                        // NOTE(review): src0 is cast unchecked — assumes a
                        // fill from an addr/flag spill dcl always has a
                        // SrcRegRegion src0; confirm against fill emission.
                        G4_SrcRegRegion* srcRgn = inst->getSrc(0)->asSrcRegRegion();

                        if (redundantAddrFill(dst, srcRgn, inst->getExecSize())) {
                            // Fill reloads a value the addr reg already
                            // holds: erase it. Advance i before erasing so
                            // the iterator stays valid.
                            std::list<G4_INST*>::iterator j = i++;
                            bb->erase(j);
                            continue;
                        }
                        else {
                            updateActiveSpillAddrLocs(dst, srcRgn, inst->getExecSize());
                        }
                    }
                    else {
                        // Non-fill write clobbers the addr reg: drop any
                        // signature covering it.
                        pruneActiveSpillAddrLocs(dst, inst->getExecSize(), inst->getExecType());
                    }
                }
            }

            i++;
        }
    }
}
7303 
//
// Invalidate (zero out) entries of the active spill-address signature
// (spAddrRegSig) that are clobbered by this destination write, so later fills
// touching those locations/elements are no longer considered redundant.
// Handles two cases: a write to an ADDR spill location, and a write to
// physically-assigned a0 elements.
//
void GraphColor::pruneActiveSpillAddrLocs(G4_DstRegRegion* dstRegion, unsigned exec_size, G4_Type exec_type)
{
    if (dstRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc()) {
        // Case 1: the spill location itself is overwritten. Clear any a0
        // element whose signature points into the written [startId, endId)
        // spill-location id range.
        MUST_BE_TRUE(((exec_type == Type_UW || exec_type == Type_W) && exec_size <= getNumAddrRegisters()) ||
            (exec_size == 1), "Unexpected ADDR spill loc update format!");
        MUST_BE_TRUE(dstRegion->getRegAccess() == Direct, "Unexpected ADDR spill loc");

        G4_RegVarAddrSpillLoc * spillLocReg = static_cast<G4_RegVarAddrSpillLoc*>(dstRegion->getBase());
        unsigned startId = spillLocReg->getLocId() + dstRegion->getSubRegOff();
        unsigned endId = startId + exec_size * dstRegion->getHorzStride();

        // Scan all a0 signature slots; stride matches the dst's horz stride.
        for (unsigned i = 0, horzStride = dstRegion->getHorzStride(); i < getNumAddrRegisters(); i += horzStride)
        {
            if (spAddrRegSig[i] >= startId && spAddrRegSig[i] < endId)
            {
                spAddrRegSig[i] = 0;
            }
        }
    }
    else if (dstRegion->getBase()->asRegVar()->isPhyRegAssigned()) {
        // Case 2: a0 elements are overwritten directly. Clear the signature
        // for the written [startId, endId) a0 subregister range.
        G4_RegVar* addrReg = dstRegion->getBase()->asRegVar();
        MUST_BE_TRUE(addrReg->getPhyReg()->isA0(), "Unknown error in ADDR reg spill code cleanup!");
        unsigned startId = addrReg->getPhyRegOff();
        unsigned endId = startId + exec_size * dstRegion->getHorzStride();
        MUST_BE_TRUE(endId <= getNumAddrRegisters(), "Unknown error in ADDR reg spill code cleanup!");

        for (unsigned i = startId; i < endId; i += dstRegion->getHorzStride())
        {
            spAddrRegSig[i] = 0;
        }
    }
    else {
        // Neither a spill location nor an assigned a0 — unexpected here.
        MUST_BE_TRUE(false, "Unknown error in ADDR reg spill code cleanup!");
    }
}
7339 
//
// Record, for each a0 element written by this fill (tmpDstRegion <- srcRegion),
// the ADDR spill-location id it now holds (spAddrRegSig[a0 elem] = loc id).
// A subsequent fill of the same locations into the same a0 elements can then
// be proven redundant by redundantAddrFill().
//
void GraphColor::updateActiveSpillAddrLocs(G4_DstRegRegion* tmpDstRegion, G4_SrcRegRegion* srcRegion, unsigned exec_size)
{
    // Destination must be an addr/flag spill dcl assigned to a0.
    MUST_BE_TRUE(gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()), "Unknown error in ADDR reg spill code cleanup!");
    G4_RegVar* addrReg = tmpDstRegion->getBase()->asRegVar();
    MUST_BE_TRUE(addrReg->getPhyReg()->isA0(), "Unknown error in ADDR reg spill code cleanup!");
    unsigned startAddrId = addrReg->getPhyRegOff();
    unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
    MUST_BE_TRUE(endAddrId <= getNumAddrRegisters(), "Unknown error in ADDR reg spill code cleanup!");

    // Source must be an ADDR spill location.
    MUST_BE_TRUE(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(), "Unknown error in ADDR reg spill code cleanup!");
    G4_RegVarAddrSpillLoc * spillLocReg = static_cast<G4_RegVarAddrSpillLoc*>(srcRegion->getBase());
    unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();

    // Walk dst a0 elements and src location ids in lockstep, each advancing
    // by its own horizontal stride.
    for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
        i += tmpDstRegion->getHorzStride(), j += srcRegion->getRegion()->horzStride)
    {
        spAddrRegSig[i] = j;
    }
}
7359 
redundantAddrFill(G4_DstRegRegion * tmpDstRegion,G4_SrcRegRegion * srcRegion,unsigned exec_size)7360 bool GraphColor::redundantAddrFill(G4_DstRegRegion* tmpDstRegion, G4_SrcRegRegion* srcRegion, unsigned exec_size)
7361 {
7362     bool match = true;
7363 
7364     MUST_BE_TRUE(gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()), "Unknown error in ADDR reg spill code cleanup!");
7365     G4_RegVar* addrReg = tmpDstRegion->getBase()->asRegVar();
7366     MUST_BE_TRUE(addrReg->getPhyReg()->isA0(), "Unknown error in ADDR reg spill code cleanup!");
7367     unsigned startAddrId = addrReg->getPhyRegOff();
7368     unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
7369     MUST_BE_TRUE(endAddrId <= getNumAddrRegisters(), "Unknown error in ADDR reg spill code cleanup!");
7370 
7371     MUST_BE_TRUE(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(), "Unknown error in ADDR reg spill code cleanup!");
7372     G4_RegVarAddrSpillLoc * spillLocReg = static_cast<G4_RegVarAddrSpillLoc*>(srcRegion->getBase());
7373     unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();
7374 
7375     for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
7376         i += tmpDstRegion->getHorzStride(), j += srcRegion->getRegion()->horzStride)
7377     {
7378         if (spAddrRegSig[i] != j)
7379         {
7380             match = false;
7381             break;
7382         }
7383     }
7384 
7385     return match;
7386 }
7387 
sendBlockSizeCode(unsigned owordSize)7388 unsigned GlobalRA::sendBlockSizeCode(unsigned owordSize)
7389 {
7390     unsigned code;
7391 
7392     switch (owordSize) {
7393     case 1:
7394         code = 0;
7395         break;
7396     case 2:
7397         code = 2;
7398         break;
7399     case 4:
7400         code = 3;
7401         break;
7402     case 8:
7403         code = 4;
7404         break;
7405     case 16:
7406         code = 5;
7407         break;
7408     default:
7409         MUST_BE_TRUE(false, ERROR_REGALLOC);
7410         code = 0;
7411     }
7412 
7413     return code;
7414 }
7415 
// Fields of the legacy oword block read/write send message descriptor, used
// by createMsgDesc() below: stateless surface index, header-present bit,
// read/write message type, and the bit offsets of the type, response length,
// message length, and data-size fields.
#define STATELESS_SURFACE_INDEX            0xFF
#define HEADER_PRESENT                    0x80000
#define SEND_OWORD_READ_TYPE            0
#define SEND_OWORD_WRITE_TYPE            8
#define SEND_MSG_TYPE_BIT_OFFSET        14
#define    SEND_RSP_LENGTH_BIT_OFFSET        20
#define    SEND_MSG_LENGTH_BIT_OFFSET        25
#define SEND_DESC_DATA_SIZE_BIT_OFFSET    8
7424 
createMsgDesc(unsigned owordSize,bool writeType,bool isSplitSend)7425 G4_Imm* GlobalRA::createMsgDesc(unsigned owordSize, bool writeType, bool isSplitSend)
7426 {
7427     // If isSplitSend = true then messageLength = 1 and extMesLength = (owordSize/2) GRFs
7428     unsigned message = STATELESS_SURFACE_INDEX;
7429     message |= HEADER_PRESENT;
7430     if (writeType)
7431     {
7432         unsigned messageType = SEND_OWORD_WRITE_TYPE;
7433         message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
7434         unsigned messageLength = 1;
7435         if (!isSplitSend)
7436         {
7437             messageLength += owordToGRFSize(ROUND(owordSize, numEltPerGRF<Type_UB>()/OWORD_BYTE_SIZE));
7438         }
7439         message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
7440     }
7441     else
7442     {
7443         unsigned messageType = SEND_OWORD_READ_TYPE;
7444         message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
7445         unsigned responseLength = owordToGRFSize(ROUND(owordSize, numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE));
7446         message |= responseLength << SEND_RSP_LENGTH_BIT_OFFSET;
7447         unsigned messageLength = 1;
7448         message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
7449     }
7450     unsigned writeOwordSize = sendBlockSizeCode(owordSize);
7451     message |= writeOwordSize << SEND_DESC_DATA_SIZE_BIT_OFFSET;
7452     return builder.createImm(message, Type_UD);
7453 }
7454 
//
// Insert prolog code at the entry BB (right after the label):
//  - Stack-call functions (unless vISA_skipFDE): emit a spill of the
//    hardwired frame-descriptor GRF through the BE_SP, record it via
//    setFDSpillInst, and handle EU-fusion WA bookkeeping.
//  - Otherwise (kernel): copy r0 into the scratch register so it can serve
//    as the header for the oword block reads/writes used by caller/callee
//    save-restore code.
//
void GlobalRA::stackCallProlog()
{
    // mov (8) r126.0<1>:ud    r0.0<8;8,1>:ud
    // This sets up the header for oword block r/w used for caller/callee-save

    // Kernel should've already setup r0 in r126.
    // Useful data in r126 is expected to be preserved by all functions.
    if (kernel.fg.getIsStackCallFunc())
    {
        if (kernel.getOption(vISA_skipFDE))
            return;

        // emit frame descriptor
        // Payload is a temp hardwired to the FP/SP GRF so the spill stores
        // the live frame-descriptor contents.
        auto payload = builder.createHardwiredDeclare(8, Type_UD, kernel.getFPSPGRF(), 0);
        payload->setName(builder.getNameString(builder.kernel.fg.mem, 24, "FrameDescriptorGRF"));
        auto payloadSrc = builder.createSrcRegRegion(payload, builder.getRegionStride1());
        const unsigned execSize = 8;
        G4_DstRegRegion* postDst = builder.createNullDst(Type_UD);
        G4_INST* store = nullptr;
        if (builder.supportsLSC())
        {
            // LSC spills take an explicit header operand.
            auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
            store = builder.createSpill(postDst, headerOpnd, payloadSrc, G4_ExecSize(execSize), 1, 0, builder.getBESP(), InstOpt_WriteEnable, false);
        }
        else
        {
            store = builder.createSpill(postDst, payloadSrc, G4_ExecSize(execSize), 1, 0, builder.getBESP(), InstOpt_WriteEnable, false);
        }
        builder.setFDSpillInst(store);
        G4_BB* entryBB = builder.kernel.fg.getEntryBB();
        // Insert before the first non-label instruction of the entry BB.
        auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
        entryBB->insertBefore(iter, store);

        if (EUFusionWANeeded())
        {
            // Replace the previously-recorded partial FD save with this one.
            auto oldSaveInst = builder.getPartFDSaveInst();
            builder.setPartFDSaveInst(store);
            entryBB->remove(oldSaveInst);
        }

        addEUFusionWAInsts(store);

        return;
    }

    // Kernel path: scratchReg = r0 (full GRF copy, NoMask).
    auto dstRgn = builder.createDstRegRegion(builder.kernel.fg.scratchRegDcl, 1);
    auto srcRgn = builder.createSrcRegRegion(builder.getBuiltinR0(), builder.getRegionStride1());

    G4_INST* mov = builder.createMov(G4_ExecSize(numEltPerGRF<Type_UD>()), dstRgn, srcRgn, InstOpt_WriteEnable, false);

    G4_BB* entryBB = builder.kernel.fg.getEntryBB();
    auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
    entryBB->insertBefore(iter, mov);
}
7509 
7510 //
7511 // Generate the save code for startReg to startReg+owordSize/2.
7512 //
saveRegs(unsigned startReg,unsigned owordSize,G4_Declare * scratchRegDcl,G4_Declare * framePtr,unsigned frameOwordOffset,G4_BB * bb,INST_LIST_ITER insertIt,std::unordered_set<G4_INST * > & group)7513 void GlobalRA::saveRegs(
7514     unsigned startReg, unsigned owordSize, G4_Declare* scratchRegDcl, G4_Declare* framePtr,
7515     unsigned frameOwordOffset, G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group)
7516 {
7517 
7518     assert(builder.getPlatform() >= GENX_SKL && "stack call only supported on SKL+");
7519 
7520     if (owordSize == 8 || owordSize == 4 || owordSize == 2)
7521     {
7522         // add (1) r126.2<1>:ud    r125.3<0;1,0>:ud    0x2:ud
7523         // sends (8) null<1>:ud    r126.0    r1.0 ...
7524         G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
7525         unsigned messageLength = GlobalRA::owordToGRFSize(owordSize);
7526         G4_Declare* msgDcl = builder.createTempVar(messageLength * GENX_DATAPORT_IO_SZ,
7527             Type_UD, GRFALIGN, StackCallStr);
7528         msgDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
7529         auto sendSrc2 = builder.createSrc(msgDcl->getRegVar(), 0, 0,
7530             builder.getRegionStride1(), Type_UD);
7531         G4_DstRegRegion* dst = builder.createNullDst((execSize > 8) ? Type_UW : Type_UD);
7532         G4_INST* spillIntrinsic = nullptr;
7533         if (builder.supportsLSC())
7534         {
7535             auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
7536             spillIntrinsic = builder.createSpill(dst, headerOpnd, sendSrc2, execSize, messageLength, frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
7537         }
7538         else
7539         spillIntrinsic = builder.createSpill(dst, sendSrc2, execSize, messageLength, frameOwordOffset/2, framePtr, InstOpt_WriteEnable, false);
7540         spillIntrinsic->inheritDIFrom(*insertIt);
7541         bb->insertBefore(insertIt, spillIntrinsic);
7542         group.insert(spillIntrinsic);
7543     }
7544     else if (owordSize > 8)
7545     {
7546         saveRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group);
7547         saveRegs(startReg + GlobalRA::owordToGRFSize(8), owordSize - 8, scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt, group);
7548     }
7549     //
7550     // Split into chunks of sizes 4 and remaining owords.
7551     //
7552     else if (owordSize > 4)
7553     {
7554         saveRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group);
7555         saveRegs(startReg + GlobalRA::owordToGRFSize(4), owordSize - 4, scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt, group);
7556     }
7557     //
7558     // Split into chunks of sizes 2 and remaining owords.
7559     //
7560     else if (owordSize > 2)
7561     {
7562         saveRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group);
7563         saveRegs(startReg + GlobalRA::owordToGRFSize(2), owordSize - 2, scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt, group);
7564     }
7565     else
7566     {
7567         MUST_BE_TRUE(false, ERROR_REGALLOC);
7568     }
7569 }
7570 
7571 //
7572 // Generate the save code for the i/p saveRegs.
7573 //
saveActiveRegs(std::vector<bool> & saveRegs,unsigned startReg,unsigned frameOffset,G4_BB * bb,INST_LIST_ITER insertIt,std::unordered_set<G4_INST * > & group)7574 void GlobalRA::saveActiveRegs(
7575     std::vector<bool>& saveRegs, unsigned startReg, unsigned frameOffset,
7576     G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group)
7577 {
7578     G4_Declare* scratchRegDcl = builder.kernel.fg.scratchRegDcl;
7579     G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
7580 
7581     unsigned frameOwordPos = frameOffset;
7582     unsigned startPos = 0;
7583 
7584     while (startPos < saveRegs.size())
7585     {
7586         for (; startPos < saveRegs.size() && saveRegs[startPos] == false; startPos++);
7587         if (startPos < saveRegs.size() && saveRegs[startPos]) {
7588             unsigned endPos = startPos + 1;
7589             for (; endPos < saveRegs.size() && saveRegs[endPos] == true; endPos++);
7590             unsigned owordSize = (endPos - startPos) * GlobalRA::GRFSizeToOwords(1);
7591             owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1));
7592             this->saveRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr, frameOwordPos, bb, insertIt, group);
7593             frameOwordPos += owordSize;
7594             startPos = endPos;
7595         }
7596     }
7597 }
7598 
getScratchSurface() const7599 G4_SrcRegRegion* GraphColor::getScratchSurface() const
7600 {
7601     if (builder.hasScratchSurface())
7602     {
7603         return builder.createSrcRegRegion(builder.getBuiltinScratchSurface(), builder.getRegionScalar());
7604     }
7605     return nullptr; // use stateless access
7606 }
7607 
7608 //
7609 // Generate the restore code for startReg to startReg+owordSize/2.
7610 //
restoreRegs(unsigned startReg,unsigned owordSize,G4_Declare * scratchRegDcl,G4_Declare * framePtr,unsigned frameOwordOffset,G4_BB * bb,INST_LIST_ITER insertIt,std::unordered_set<G4_INST * > & group,bool caller)7611 void GlobalRA::restoreRegs(
7612     unsigned startReg, unsigned owordSize, G4_Declare* scratchRegDcl, G4_Declare* framePtr,
7613     unsigned frameOwordOffset, G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group, bool caller)
7614 {
7615     //
7616     // Process chunks of size 8, 4, 2 and 1.
7617     //
7618     if (owordSize == 8 || owordSize == 4 || owordSize == 2)
7619     {
7620         G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
7621         unsigned responseLength = GlobalRA::owordToGRFSize(owordSize);
7622         G4_Declare* dstDcl = builder.createTempVar(responseLength * GENX_DATAPORT_IO_SZ,
7623             Type_UD, GRFALIGN, StackCallStr);
7624         if (caller)
7625         {
7626             kernel.callerRestoreDecls.push_back(dstDcl);
7627         }
7628         dstDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
7629         G4_DstRegRegion* dstRgn = builder.createDst(dstDcl->getRegVar(), 0, 0, 1, (execSize > 8) ? Type_UW : Type_UD);
7630         G4_INST* fillIntrinsic = nullptr;
7631         if (builder.supportsLSC())
7632         {
7633             auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
7634             fillIntrinsic = builder.createFill(headerOpnd, dstRgn, execSize, responseLength, frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
7635         }
7636         else
7637         fillIntrinsic = builder.createFill(dstRgn, execSize, responseLength, frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
7638         fillIntrinsic->inheritDIFrom(*insertIt);
7639         bb->insertBefore(insertIt, fillIntrinsic);
7640         group.insert(fillIntrinsic);
7641     }
7642     //
7643     // Split into chunks of sizes 8 and remaining owords.
7644     //
7645     else if (owordSize > 8)
7646     {
7647         restoreRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group, caller);
7648         restoreRegs(startReg + GlobalRA::owordToGRFSize(8), owordSize - 8, scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt, group, caller);
7649     }
7650     //
7651     // Split into chunks of sizes 4 and remaining owords.
7652     //
7653     else if (owordSize > 4)
7654     {
7655         restoreRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group, caller);
7656         restoreRegs(startReg + GlobalRA::owordToGRFSize(4), owordSize - 4, scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt, group, caller);
7657     }
7658     //
7659     // Split into chunks of sizes 2 and remaining owords.
7660     //
7661     else if (owordSize > 2)
7662     {
7663         restoreRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group, caller);
7664         restoreRegs(startReg + GlobalRA::owordToGRFSize(2), owordSize - 2, scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt, group, caller);
7665     }
7666     else
7667     {
7668         MUST_BE_TRUE(false, ERROR_REGALLOC);
7669     }
7670 }
7671 
7672 //
7673 // Generate the restore code for the i/p restoreRegs.
7674 //
restoreActiveRegs(std::vector<bool> & restoreRegs,unsigned startReg,unsigned frameOffset,G4_BB * bb,INST_LIST_ITER insertIt,std::unordered_set<G4_INST * > & group,bool caller)7675 void GlobalRA::restoreActiveRegs(
7676     std::vector<bool>& restoreRegs, unsigned startReg, unsigned frameOffset,
7677     G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group, bool caller)
7678 {
7679     G4_Declare* scratchRegDcl = builder.kernel.fg.scratchRegDcl;
7680     G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
7681 
7682     unsigned frameOwordPos = frameOffset;
7683     unsigned startPos = 0;
7684 
7685     while (startPos < restoreRegs.size())
7686     {
7687         for (; startPos < restoreRegs.size() && restoreRegs[startPos] == false; startPos++);
7688         if (startPos < restoreRegs.size() && restoreRegs[startPos]) {
7689             unsigned endPos = startPos + 1;
7690             for (; endPos < restoreRegs.size() && restoreRegs[endPos] == true; endPos++);
7691             unsigned owordSize = (endPos - startPos) * GlobalRA::GRFSizeToOwords(1);
7692             owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1));
7693             this->restoreRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr, frameOwordPos, bb, insertIt, group, caller);
7694             frameOwordPos += owordSize;
7695             startPos = endPos;
7696         }
7697     }
7698 }
7699 
7700 //
7701 // Optimize the reg footprint so as to reduce the number of "send" instructions required for
7702 // save/restore, at the cost of a little additional save/restore memory (if any). Since we
7703 // are using oword read/write for save/restore, we can only read/write only in units of 1, 2
7704 // or 4 regs per "send" instruction.
7705 //
OptimizeActiveRegsFootprint(std::vector<bool> & saveRegs)7706 void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool>& saveRegs)
7707 {
7708     unsigned startPos = 0;
7709     while (startPos < saveRegs.size())
7710     {
7711         for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos);
7712         if (startPos == saveRegs.size())
7713         {
7714             break;
7715         }
7716         if (startPos + 4 <= saveRegs.size())
7717         {
7718             if (saveRegs[startPos] & saveRegs[startPos + 2] & !saveRegs[startPos + 3])
7719             {
7720                 saveRegs[startPos + 1] = saveRegs[startPos + 3] = true;
7721             }
7722             else if (saveRegs[startPos] & saveRegs[startPos + 3])
7723             {
7724                 if (startPos + 4 < saveRegs.size())
7725                 {
7726                     if (!saveRegs[startPos + 4])
7727                     {
7728                         saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
7729                     }
7730                 }
7731                 else
7732                 {
7733                     saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
7734                 }
7735             }
7736         }
7737         unsigned winBound = std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
7738         for (; startPos < winBound && saveRegs[startPos]; ++startPos);
7739     }
7740 }
7741 
OptimizeActiveRegsFootprint(std::vector<bool> & saveRegs,std::vector<bool> & retRegs)7742 void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool>& saveRegs, std::vector<bool>& retRegs)
7743 {
7744     unsigned startPos = 0;
7745     while (startPos < saveRegs.size())
7746     {
7747         for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos);
7748         if (startPos == saveRegs.size())
7749         {
7750             break;
7751         }
7752         if (startPos + 4 <= saveRegs.size())
7753         {
7754             if (saveRegs[startPos] & saveRegs[startPos + 2])
7755             {
7756                 if (!saveRegs[startPos + 1] & !retRegs[startPos + 1])
7757                 {
7758                     saveRegs[startPos + 1] = true;
7759                 }
7760                 if (!saveRegs[startPos + 3] & !retRegs[startPos + 3])
7761                 {
7762                     saveRegs[startPos + 3] = true;
7763                 }
7764             }
7765             else if (saveRegs[startPos] & saveRegs[startPos + 3])
7766             {
7767                 if (startPos + 4 < saveRegs.size())
7768                 {
7769                     if (!saveRegs[startPos + 4])
7770                     {
7771                         if (!saveRegs[startPos + 1] & !retRegs[startPos + 1])
7772                         {
7773                             saveRegs[startPos + 1] = true;
7774                         }
7775                         if (!saveRegs[startPos + 2] & !retRegs[startPos + 2])
7776                         {
7777                             saveRegs[startPos + 2] = true;
7778                         }
7779                     }
7780                 }
7781                 else
7782                 {
7783                     if (!saveRegs[startPos + 1] & !retRegs[startPos + 1])
7784                     {
7785                         saveRegs[startPos + 1] = true;
7786                     }
7787                     if (!saveRegs[startPos + 2] & !retRegs[startPos + 2])
7788                     {
7789                         saveRegs[startPos + 2] = true;
7790                     }
7791                 }
7792             }
7793         }
7794         unsigned winBound = std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
7795         for (; startPos < winBound && saveRegs[startPos]; ++startPos);
7796     }
7797 }
7798 
//
// For each BB ending in an fcall, compute which GRFs must be caller-saved at
// that call site. A live range is caller-save if it interferes with the
// call's pseudo VCA node, is not a pseudo VCE dcl, and is not a pre-defined
// argument. GRFs holding the pre-defined return value are recorded separately
// in retRegsMap. The per-call GRF ranges are clamped to [1, callerSaveNumGRF).
//
void GraphColor::getCallerSaveRegisters()
{
    unsigned callerSaveNumGRF = builder.kernel.getCallerSaveLastGRF() + 1;

    for (BB_LIST_ITER it = builder.kernel.fg.begin(); it != builder.kernel.fg.end(); ++it)
    {
        if ((*it)->isEndWithFCall())
        {
            //
            // Determine the caller-save registers per call site.
            //
            gra.callerSaveRegsMap[(*it)].resize(callerSaveNumGRF, false);
            gra.retRegsMap[(*it)].resize(callerSaveNumGRF, false);
            unsigned callerSaveRegCount = 0;
            G4_INST* callInst = (*it)->back();
            // Each fcall has its own pseudo VCA dcl; variables interfering
            // with it are live across this particular call.
            unsigned pseudoVCAId = builder.kernel.fg.fcallToPseudoDclMap[callInst->asCFInst()].VCA->getRegVar()->getId();
            ASSERT_USER((*it)->Succs.size() == 1, "fcall basic block cannot have more than 1 successor");

            for (unsigned i = 0; i < numVar; i++)
            {
                if (i != pseudoVCAId &&
                    kernel.fg.isPseudoVCEDcl(lrs[i]->getDcl()) != true &&
                    intf.interfereBetween(pseudoVCAId, i) == true)
                {
                    if (!builder.isPreDefArg(lrs[i]->getDcl()))
                    {
                        // NOTE: Spilled live ranges should not be caller-save.
                        MUST_BE_TRUE(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
                        unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
                        unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
                        // Clamp [startReg, endReg) into [1, callerSaveNumGRF).
                        startReg = (startReg < callerSaveNumGRF) ? startReg : callerSaveNumGRF;
                        startReg = (startReg > 0) ? startReg : 1;
                        endReg = (endReg < callerSaveNumGRF) ? endReg : callerSaveNumGRF;
                        endReg = (endReg > 0) ? endReg : 1;
                        for (unsigned j = startReg; j < endReg; j++)
                        {
                            if (builder.isPreDefRet(lrs[i]->getDcl()))
                            {
                                // Return-value GRFs are tracked separately and
                                // not counted as caller-save.
                                if (gra.retRegsMap[(*it)][j] == false)
                                {
                                    gra.retRegsMap[(*it)][j] = true;
                                }
                            }
                            else
                            {
                                if (gra.callerSaveRegsMap[(*it)][j] == false)
                                {
                                    gra.callerSaveRegsMap[(*it)][j] = true;
                                    callerSaveRegCount++;
                                }
                            }
                        }
                    }
                }
            }

            gra.callerSaveRegCountMap[(*it)] = callerSaveRegCount;

            if (builder.kernel.getOption(vISA_OptReport))
            {
                std::ofstream optreport;
                getOptReportStream(optreport, builder.kernel.getOptions());
                optreport << "Caller save size: " << callerSaveRegCount * getGRFSize() <<
                    " bytes for fcall at cisa id " <<
                    (*it)->back()->getCISAOff() << std::endl;
                closeOptReportStream(optreport);
            }
        }
    }
}
7869 
7870 //
7871 // Add caller save/restore code before/after each stack call.
7872 //
//
// For each fcall BB: optimize the caller-save footprint, emit the save code
// before the call (replacing the caller-save placeholder instruction) and the
// restore code in the successor BB (replacing the caller-restore
// placeholder), then grow the frame size by the worst-case caller-save area.
//
void GlobalRA::addCallerSaveRestoreCode()
{
    uint32_t maxCallerSaveSize = 0;

    for (G4_BB* bb : builder.kernel.fg)
    {
        if (bb->isEndWithFCall())
        {
            //
            // Determine the caller-save registers per call site.
            //
            G4_INST* callInst = bb->back();
            G4_BB* afterFCallBB = bb->Succs.front();

            OptimizeActiveRegsFootprint(callerSaveRegsMap[bb], retRegsMap[bb]);

            unsigned callerSaveRegsWritten = 0;
            for (bool csr : callerSaveRegsMap[bb])
                callerSaveRegsWritten += (csr ? 1 : 0);

            // Walk back from the fcall (last inst) over any pseudo-kills to
            // find the caller-save placeholder instruction.
            INST_LIST_ITER insertSaveIt = bb->end();
            --insertSaveIt, --insertSaveIt;
            while ((*insertSaveIt)->isPseudoKill())
            {
                --insertSaveIt;
            }
            MUST_BE_TRUE((*insertSaveIt)->isCallerSave(), ERROR_REGALLOC);
            // rmIt marks the placeholder for removal; insertSaveIt is then
            // advanced to the position just after it (wrapping via begin()
            // when the placeholder is the first instruction of the BB).
            INST_LIST_ITER rmIt = insertSaveIt;
            if (insertSaveIt == bb->begin())
            {
                insertSaveIt = bb->end();
            }

            if (insertSaveIt != bb->end())
            {
                ++insertSaveIt;
            }
            else
            {
                insertSaveIt = bb->begin();
            }
            if (callerSaveRegCountMap[bb] > 0)
            {
                // Snapshot the BB so debug info can identify the newly added
                // save instructions afterwards.
                if (builder.kernel.getOption(vISA_GenerateDebugInfo))
                {
                    builder.kernel.getKernelDebugInfo()->clearOldInstList();
                    builder.kernel.getKernelDebugInfo()->setOldInstList(bb);
                }

                saveActiveRegs(callerSaveRegsMap[bb], 0, builder.kernel.fg.callerSaveAreaOffset,
                    bb, insertSaveIt, callerSaveInsts[callInst]);

                if (builder.kernel.getOption(vISA_GenerateDebugInfo))
                {
                    auto deltaInstList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(bb);
                    for (auto jt : deltaInstList)
                    {
                        builder.kernel.getKernelDebugInfo()->addCallerSaveInst(bb, jt);
                    }
                }
            }
            // Remove the caller-save placeholder.
            bb->erase(rmIt);
            // Find the caller-restore placeholder in the post-call BB and
            // insert the restore code before it.
            INST_LIST_ITER insertRestIt = afterFCallBB->begin();
            for (; !(*insertRestIt)->isCallerRestore(); ++insertRestIt);
            if (callerSaveRegCountMap[bb] > 0)
            {
                if (builder.kernel.getOption(vISA_GenerateDebugInfo))
                {
                    builder.kernel.getKernelDebugInfo()->clearOldInstList();
                    builder.kernel.getKernelDebugInfo()->setOldInstList(afterFCallBB);
                }

                restoreActiveRegs(callerSaveRegsMap[bb], 0, builder.kernel.fg.callerSaveAreaOffset,
                    afterFCallBB, insertRestIt, callerRestoreInsts[callInst], true);

                if (builder.kernel.getOption(vISA_GenerateDebugInfo))
                {
                    auto deltaInsts = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(afterFCallBB);
                    for (auto jt : deltaInsts)
                    {
                        builder.kernel.getKernelDebugInfo()->addCallerRestoreInst(bb, jt);
                    }
                }
            }
            // Remove the caller-restore placeholder.
            afterFCallBB->erase(insertRestIt);

            maxCallerSaveSize = std::max(maxCallerSaveSize, callerSaveRegsWritten * getGRFSize());
        }
    }

    // Frame size: caller-save area offset (in owords, *16 -> bytes) plus the
    // largest caller-save area, rounded up to 64 bytes, expressed in owords.
    auto byteOffset = builder.kernel.fg.callerSaveAreaOffset * 16 + maxCallerSaveSize;
    builder.kernel.fg.frameSizeInOWord = ROUND(byteOffset, 64) / 16;

    builder.instList.clear();
}
7968 
getCalleeSaveRegisters()7969 void GraphColor::getCalleeSaveRegisters()
7970 {
7971     unsigned callerSaveNumGRF = builder.kernel.getCallerSaveLastGRF() + 1;
7972     unsigned numCalleeSaveRegs = builder.kernel.getNumCalleeSaveRegs();
7973 
7974     // Determine the callee-save registers.
7975 
7976     gra.calleeSaveRegs.resize(numCalleeSaveRegs, false);
7977     gra.calleeSaveRegCount = 0;
7978 
7979     unsigned pseudoVCEId = builder.kernel.fg.pseudoVCEDcl->getRegVar()->getId();
7980     unsigned stackCallStartReg = builder.kernel.getStackCallStartReg();
7981     for (unsigned i = 0; i < numVar; i++)
7982     {
7983         if (pseudoVCEId != i && intf.interfereBetween(pseudoVCEId, i))
7984         {
7985             if (lrs[i]->getPhyReg())
7986             {
7987                 MUST_BE_TRUE(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
7988                 unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
7989                 unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
7990                 startReg = (startReg >= callerSaveNumGRF) ? startReg : callerSaveNumGRF;
7991                 startReg = (startReg < stackCallStartReg) ? startReg : stackCallStartReg;
7992                 endReg = (endReg >= callerSaveNumGRF) ? endReg : callerSaveNumGRF;
7993                 endReg = (endReg < stackCallStartReg) ? endReg : stackCallStartReg;
7994                 for (unsigned j = startReg; j < endReg; j++)
7995                 {
7996                     if (gra.calleeSaveRegs[j - callerSaveNumGRF] == false)
7997                     {
7998                         gra.calleeSaveRegs[j - callerSaveNumGRF] = true;
7999                         gra.calleeSaveRegCount++;
8000                     }
8001                 }
8002             }
8003         }
8004     }
8005 }
8006 
8007 //
8008 // Add callee save/restore code at stack call function entry/exit.
8009 //
void GlobalRA::addCalleeSaveRestoreCode()
{
    unsigned callerSaveNumGRF = builder.kernel.getCallerSaveLastGRF() + 1;

    // Trim the callee-save footprint, then count how many GRFs still need saving.
    OptimizeActiveRegsFootprint(calleeSaveRegs);
    unsigned calleeSaveRegsWritten = 0;
    for (bool b : calleeSaveRegs)
        calleeSaveRegsWritten += (b ? 1 : 0);

    // Find the pseudo callee-save placeholder in the entry BB (scanning backward
    // from the end); real save code is inserted at its position and the
    // placeholder itself is erased below.
    INST_LIST_ITER insertSaveIt = builder.kernel.fg.getEntryBB()->end();
    for (--insertSaveIt; !(*insertSaveIt)->isCalleeSave(); --insertSaveIt);
    if (calleeSaveRegCount > 0)
    {
        if (builder.kernel.getOption(vISA_GenerateDebugInfo))
        {
            // Store old inst list so we can separate callee save
            // instructions that get inserted.
            builder.kernel.getKernelDebugInfo()->clearOldInstList();
            builder.kernel.getKernelDebugInfo()->setOldInstList
            (builder.kernel.fg.getEntryBB());
        }
        saveActiveRegs(calleeSaveRegs, callerSaveNumGRF, builder.kernel.fg.calleeSaveAreaOffset,
            builder.kernel.fg.getEntryBB(), insertSaveIt, calleeSaveInsts);

        if (builder.kernel.getOption(vISA_GenerateDebugInfo))
        {
            // Delta of oldInstList and current instList are all
            // callee save instructions.
            auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions
            (builder.kernel.fg.getEntryBB());
            for (auto inst : instList)
            {
                builder.kernel.getKernelDebugInfo()->addCalleeSaveInst(inst);
            }
        }
    }
    builder.kernel.fg.getEntryBB()->erase(insertSaveIt);
    // Find the pseudo callee-restore placeholder in the unique return BB.
    INST_LIST_ITER insertRestIt = builder.kernel.fg.getUniqueReturnBlock()->end();
    for (--insertRestIt; !(*insertRestIt)->isCalleeRestore(); --insertRestIt);
    // Keep the placeholder position for erasure; restore code is inserted one
    // past it (post-increment).
    INST_LIST_ITER eraseIt = insertRestIt++;
    if (calleeSaveRegCount > 0)
    {
        if (builder.kernel.getOption(vISA_GenerateDebugInfo))
        {
            // Store old inst list so we can separate callee save
            // instructions that get inserted.
            builder.kernel.getKernelDebugInfo()->clearOldInstList();
            builder.kernel.getKernelDebugInfo()->setOldInstList
            (builder.kernel.fg.getUniqueReturnBlock());
        }

        restoreActiveRegs(calleeSaveRegs, callerSaveNumGRF, builder.kernel.fg.calleeSaveAreaOffset,
            builder.kernel.fg.getUniqueReturnBlock(), insertRestIt, calleeRestoreInsts, false);

        if (builder.kernel.getOption(vISA_GenerateDebugInfo))
        {
            // Delta of oldInstList and current instList are the callee
            // restore instructions just inserted.
            auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions
            (builder.kernel.fg.getUniqueReturnBlock());
            for (auto inst : instList)
            {
                builder.kernel.getKernelDebugInfo()->addCalleeRestoreInst(inst);
            }
        }
    }
    builder.kernel.fg.getUniqueReturnBlock()->erase(eraseIt);

    builder.instList.clear();

    // caller-save starts after callee-save and is 64-byte aligned
    auto byteOffset = builder.kernel.fg.calleeSaveAreaOffset * 16 + calleeSaveRegsWritten * getGRFSize();
    builder.kernel.fg.callerSaveAreaOffset = ROUND(byteOffset, 64) / 16;
    if (builder.kernel.getOption(vISA_OptReport))
    {
        std::ofstream optreport;
        getOptReportStream(optreport, builder.kernel.getOptions());
        optreport << "Callee save size: " << calleeSaveRegCount * getGRFSize() <<
            " bytes" << std::endl;
        closeOptReportStream(optreport);
    }
}
8090 
//
// Add code to set up the stack frame at the kernel (main) entry.
//
addGenxMainStackSetupCode()8094 void GlobalRA::addGenxMainStackSetupCode()
8095 {
8096     uint32_t fpInitVal = (uint32_t)kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
8097     // FIXME: a potential failure here is that frameSizeInOword is already the offset based on
8098     // GlobalSratchOffset, which is the value of fpInitVal. So below we generate code to do
8099     // SP = fpInitVal + frameSize, which does not make sense. It is correct now since when there's stack call,
8100     // IGC will not use scratch, so fpInitVal will be 0.
8101     unsigned frameSize = builder.kernel.fg.frameSizeInOWord;
8102     uint16_t factor = 1;
8103     if (useLscForSpillFill)
8104         factor = 16;
8105     G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
8106     G4_Declare* stackPtr = builder.kernel.fg.stackPtrDcl;
8107 
8108     auto entryBB = builder.kernel.fg.getEntryBB();
8109     auto insertIt = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
8110     //
8111     // FP = spillMemOffset
8112     //
8113     {
8114         G4_DstRegRegion* dst = builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
8115         G4_Imm * src = builder.createImm(fpInitVal, Type_UD);
8116         G4_INST* fpInst = builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8117         insertIt = entryBB->insertBefore(insertIt, fpInst);
8118 
8119         setBEFPSetupInst(fpInst);
8120 
8121         if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8122         {
8123             builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(fpInst);
8124             builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
8125         }
8126     }
8127     //
8128     // SP = FP + FrameSize (overflow-area offset + overflow-area size)
8129     //
8130     {
8131         G4_DstRegRegion* dst = builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
8132         G4_Imm * src = builder.createImm(fpInitVal + frameSize*factor, Type_UD);
8133         G4_INST* spIncInst = builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8134         entryBB->insertBefore(++insertIt, spIncInst);
8135     }
8136 
8137     if (builder.kernel.getOption(vISA_OptReport))
8138     {
8139         std::ofstream optreport;
8140         getOptReportStream(optreport, builder.kernel.getOptions());
8141         optreport << "Total frame size: " << frameSize * 16 << " bytes" << std::endl;
8142         closeOptReportStream(optreport);
8143     }
8144 }
8145 
8146 //
8147 // Add code to setup the stack frame in callee.
8148 //
void GlobalRA::addCalleeStackSetupCode()
{
    int frameSize = (int)builder.kernel.fg.frameSizeInOWord;
    uint16_t factor = 1;
    // convert framesize to bytes from oword for LSC
    if (useLscForSpillFill)
        factor = 16;
    G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
    G4_Declare* stackPtr = builder.kernel.fg.stackPtrDcl;

    MUST_BE_TRUE(frameSize > 0, "frame size cannot be 0");

    //
    // BE_FP = BE_SP
    // BE_SP += FrameSize
    //
    {
        G4_DstRegRegion* dst = builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
        G4_DstRegRegion* fp_dst = builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
        const RegionDesc* rDesc = builder.getRegionScalar();
        G4_Operand* src0 = builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
        G4_Operand* sp_src = builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
        G4_Imm * src1 = builder.createImm(frameSize*factor, Type_UD);
        auto createBEFP = builder.createMov(g4::SIMD1, fp_dst, sp_src, InstOpt_WriteEnable, false);
        createBEFP->addComment("vISA_FP = vISA_SP");
        auto addInst = builder.createBinOp(G4_add, g4::SIMD1,
            dst, src0, src1, InstOpt_WriteEnable, false);
        addInst->addComment("vISA_SP += vISA_frameSize");
        G4_BB* entryBB = builder.kernel.fg.getEntryBB();
        // Anchor on the BE_FP save mov emitted by addStoreRestoreToReturn();
        // the FP/SP setup goes right after it.
        auto insertIt = std::find(entryBB->begin(), entryBB->end(), getSaveBE_FPInst());
        MUST_BE_TRUE(insertIt != entryBB->end(), "Can't find BE_FP store inst");

        setBEFPSetupInst(createBEFP);

        if (builder.kernel.getOption(vISA_GenerateDebugInfo))
        {
            builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(createBEFP);
            builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
        }

        addEUFusionWAInsts(createBEFP);
        addEUFusionWAInsts(addInst);

        // Insert after the BE_FP save: first FP = SP, then SP += frameSize.
        insertIt++;
        entryBB->insertBefore(insertIt, createBEFP);
        entryBB->insertBefore(insertIt, addInst);
    }

    // Stack is destroyed in function addStoreRestoreToReturn() where part FDE is restored before fret.
    // This is an optimization as 1 SIMD4 instruction restores ret %ip, ret EM, caller's BE_FP, BE_SP.

    builder.instList.clear();

    if (builder.kernel.getOption(vISA_OptReport))
    {
        std::ofstream optreport;
        getOptReportStream(optreport, builder.kernel.getOptions());
        optreport << std::endl << "Total frame size: "
            << frameSize * 16 << " bytes" << std::endl;
        closeOptReportStream(optreport);
    }
}
8211 
8212 //
8213 // Add A0 save/restore code for stack calls.
8214 //
addA0SaveRestoreCode()8215 void GraphColor::addA0SaveRestoreCode()
8216 {
8217     uint8_t numA0Elements = (uint8_t)getNumAddrRegisters();
8218 
8219     int count = 0;
8220     for (auto bb : builder.kernel.fg)
8221     {
8222         if (bb->isEndWithFCall())
8223         {
8224             G4_BB* succ = bb->Succs.front();
8225             auto fcallInst = bb->back()->asCFInst();
8226             G4_RegVar* assocPseudoA0 = bb->getParent().fcallToPseudoDclMap[fcallInst].A0->getRegVar();
8227 
8228             if (!assocPseudoA0->getPhyReg())
8229             {
8230                 // Insert save/restore code because the pseudo node did not get an allocation
8231                 const char* name = builder.getNameString(builder.mem, 20, "SA0_%d", count++);
8232                 G4_Declare* savedDcl = builder.createDeclareNoLookup(name, G4_GRF, numA0Elements, 1, Type_UW);
8233 
8234                 {
8235                     //
8236                     // (W) mov (16) TMP_GRF<1>:uw a0.0<16;16,1>:uw
8237                     //
8238                     G4_DstRegRegion* dst = builder.createDst(savedDcl->getRegVar(), 0, 0, 1, Type_UW);
8239                     const RegionDesc* rDesc = builder.getRegionStride1();
8240                     G4_Operand* src = builder.createSrc(regPool.getAddrReg(), 0, 0, rDesc, Type_UW);
8241                     G4_INST* saveInst = builder.createMov(
8242                         G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
8243                     INST_LIST_ITER insertIt = std::prev(bb->end());
8244                     bb->insertBefore(insertIt, saveInst);
8245                 }
8246 
8247                 {
8248                     //
8249                     // (W) mov (16) a0.0<1>:uw TMP_GRF<16;16,1>:uw
8250                     //
8251                     G4_DstRegRegion* dst = builder.createDst(regPool.getAddrReg(), 0, 0, 1, Type_UW);
8252                     const RegionDesc* rDesc = builder.getRegionStride1();
8253                     G4_Operand* src = builder.createSrc(savedDcl->getRegVar(), 0, 0, rDesc, Type_UW);
8254                     G4_INST* restoreInst = builder.createMov(
8255                         G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
8256                     auto insertIt = std::find_if(succ->begin(), succ->end(), [](G4_INST* inst) { return !inst->isLabel(); });
8257                     succ->insertBefore(insertIt, restoreInst);
8258                 }
8259             }
8260         }
8261     }
8262 
8263     builder.instList.clear();
8264 }
8265 
8266 //
8267 // Add Flag save/restore code for stack calls.
8268 //
addFlagSaveRestoreCode()8269 void GraphColor::addFlagSaveRestoreCode()
8270 {
8271     int count = 0;
8272     int num32BitFlags = builder.getNumFlagRegisters() / 2;
8273 
8274     // each 32-bit flag gets a declare
8275     // ToDo: should we use flag ARF directly here?
8276     std::vector<G4_Declare*> tmpFlags;
8277     for (int i = 0; i < num32BitFlags; ++i)
8278     {
8279         G4_Declare* tmpFlag = builder.createTempFlag(2);
8280         tmpFlag->getRegVar()->setPhyReg(regPool.getFlagAreg(i), 0);
8281         tmpFlags.push_back(tmpFlag);
8282     }
8283 
8284     for (auto bb : builder.kernel.fg)
8285     {
8286         if (bb->isEndWithFCall())
8287         {
8288             G4_BB* succ = bb->Succs.front();
8289             auto fcallInst = bb->back()->asCFInst();
8290             G4_RegVar* assocPseudoFlag = bb->getParent().fcallToPseudoDclMap[fcallInst].Flag->getRegVar();
8291 
8292             if (!assocPseudoFlag->getPhyReg())
8293             {
8294                 // Insert save/restore code because the pseudo node did not get an allocation
8295                 const char* name = builder.getNameString(builder.mem, 32, "SFLAG_%d", count++);
8296                 G4_Declare* savedDcl1 = builder.createDeclareNoLookup(name, G4_GRF, num32BitFlags, 1, Type_UD);
8297                 {
8298                     //
8299                     // (W) mov (1) TMP_GRF.0<1>:ud f0.0:ud
8300                     // (W) mov (1) TMP_GRF.1<1>:ud f1.0:ud
8301                     //
8302                     auto createFlagSaveInst = [&](int index)
8303                     {
8304                         auto flagDcl = tmpFlags[index];
8305                         G4_DstRegRegion* dst = builder.createDst(savedDcl1->getRegVar(), 0, index, 1, Type_UD);
8306                         G4_Operand* src = builder.createSrc(flagDcl->getRegVar(), 0, 0,
8307                             builder.getRegionScalar(), Type_UD);
8308                         return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8309                     };
8310 
8311                     auto iter = std::prev(bb->end());
8312                     for (int i = 0; i < num32BitFlags; ++i)
8313                     {
8314                         auto saveInst = createFlagSaveInst(i);
8315                         bb->insertBefore(iter, saveInst);
8316                     }
8317                 }
8318 
8319                 {
8320                     //
8321                     // mov (1) f0.0:ud TMP_GRF.0<0;1,0>:ud
8322                     // mov (1) f1.0:ud TMP_GRF.1<0;1,0>:ud
8323                     //
8324                     auto createRestoreFlagInst = [&](int index)
8325                     {
8326                         auto flagDcl = tmpFlags[index];
8327                         G4_DstRegRegion* dst = builder.createDst(flagDcl->getRegVar(), 0, 0, 1, Type_UD);
8328                         const RegionDesc* rDesc = builder.getRegionScalar();
8329                         G4_Operand* src = builder.createSrc(savedDcl1->getRegVar(), 0, index, rDesc, Type_UD);
8330                         return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8331                     };
8332                     auto insertIt = std::find_if(succ->begin(), succ->end(), [](G4_INST* inst) { return !inst->isLabel(); });
8333                     for (int i = 0; i < num32BitFlags; ++i)
8334                     {
8335                         auto restoreInst = createRestoreFlagInst(i);
8336                         succ->insertBefore(insertIt, restoreInst);
8337                     }
8338                 }
8339             }
8340         }
8341     }
8342 
8343     builder.instList.clear();
8344 }
8345 
getSaveRestoreRegister()8346 void GraphColor::getSaveRestoreRegister()
8347 {
8348     if (!builder.getIsKernel())
8349     {
8350         getCalleeSaveRegisters();
8351     }
8352     getCallerSaveRegisters();
8353 }
//
// Add GRF caller/callee save/restore code for stack calls.
// localSpillAreaOwordSize specifies the starting offset of the caller/callee-save area in this frame.
// It is 64-byte aligned.
//
addSaveRestoreCode(unsigned localSpillAreaOwordSize)8359 void GlobalRA::addSaveRestoreCode(unsigned localSpillAreaOwordSize)
8360 {
8361     auto gtpin = builder.kernel.getGTPinData();
8362     if (gtpin &&
8363         gtpin->isFirstRAPass())
8364     {
8365         gtpin->markInsts();
8366     }
8367 
8368     if (builder.getIsKernel())
8369     {
8370         builder.kernel.fg.callerSaveAreaOffset = localSpillAreaOwordSize;
8371     }
8372     else
8373     {
8374         builder.kernel.fg.calleeSaveAreaOffset = localSpillAreaOwordSize;
8375         addCalleeSaveRestoreCode();
8376     }
8377     addCallerSaveRestoreCode();
8378     if (builder.getIsKernel())
8379     {
8380         addGenxMainStackSetupCode();
8381     }
8382     else
8383     {
8384         addCalleeStackSetupCode();
8385     }
8386     stackCallProlog();
8387     builder.instList.clear();
8388 }
8389 
8390 //
8391 // If the graph has stack calls, then add the caller-save pseudo code immediately before and
8392 // after the stack call. The pseudo code is either converted to actual save/restore code or
8393 // is eliminated at the end of coloringRegAlloc().
8394 //
void GlobalRA::addCallerSavePseudoCode()
{
    unsigned retID = 0;

    for (G4_BB* bb : builder.kernel.fg)
    {
        if (bb->isEndWithFCall())
        {
            // GRF caller save/restore
            auto fcallInst = bb->back()->asCFInst();
            G4_Declare* pseudoVCADcl = bb->getParent().fcallToPseudoDclMap[fcallInst].VCA;
            G4_DstRegRegion* dst = builder.createDst(pseudoVCADcl->getRegVar(), 0, 0, 1, Type_UD);
            G4_INST* saveInst = builder.createInternalIntrinsicInst(
                nullptr, Intrinsic::CallerSave, g4::SIMD1, dst, nullptr, nullptr, nullptr, InstOpt_WriteEnable);
            saveInst->inheritDIFrom(fcallInst);
            // Place the pseudo save immediately before the fcall (last inst of bb).
            INST_LIST_ITER callBBIt = bb->end();
            bb->insertBefore(--callBBIt, saveInst);

            G4_FCALL* fcall = builder.getFcallInfo(bb->back());
            MUST_BE_TRUE(fcall != NULL, "fcall info not found");
            uint16_t retSize = fcall->getRetSize();
            if (retSize > 0)
            {
                // Reserve a named hardwired declare covering the call's
                // return-value GRFs and remember it for this call site.
                const char* name = builder.getNameString(builder.mem, 32, "FCALL_RETVAL_%d", retID++);
                auto retDcl = builder.createHardwiredDeclare(numEltPerGRF<Type_UD>() * retSize, Type_UD, IR_Builder::ArgRet_Stackcall::Ret, 0);
                retDcl->setName(name);
                fcallRetMap.emplace(pseudoVCADcl, retDcl);
            }

            ASSERT_USER(bb->Succs.size() == 1, "fcall basic block cannot have more than 1 successor node");

            // Place the pseudo restore in the fcall's return block, after any
            // leading labels.
            G4_BB* retBB = bb->Succs.front();
            const RegionDesc* rd = builder.getRegionScalar();
            G4_Operand* src = builder.createSrc(pseudoVCADcl->getRegVar(), 0, 0, rd, Type_UD);
            INST_LIST_ITER retBBIt = retBB->begin();
            for (; retBBIt != retBB->end() && (*retBBIt)->isLabel(); ++retBBIt);
            G4_INST* restoreInst =
                builder.createInternalIntrinsicInst(
                    nullptr, Intrinsic::CallerRestore, g4::SIMD1, nullptr, src, nullptr, nullptr, InstOpt_WriteEnable);
            restoreInst->inheritDIFrom(fcallInst);
            retBB->insertBefore(retBBIt, restoreInst);
        }
    }
    builder.instList.clear();
}
8440 
8441 //
8442 // If the graph has stack calls, then add the callee-save pseudo code at the entry/exit blocks
8443 // of the function. The pseudo code is either converted to actual save/restore code or is
8444 // eliminated at the end of coloringRegAlloc().
8445 //
addCalleeSavePseudoCode()8446 void GlobalRA::addCalleeSavePseudoCode()
8447 {
8448     G4_Declare* pseudoVCEDcl = builder.kernel.fg.pseudoVCEDcl;
8449 
8450     G4_DstRegRegion* dst = builder.createDst(pseudoVCEDcl->getRegVar(), 0, 0, 1, Type_UD);
8451     auto saveInst = builder.createInternalIntrinsicInst(
8452         nullptr, Intrinsic::CalleeSave, g4::SIMD1, dst, nullptr, nullptr, nullptr, InstOpt_WriteEnable);
8453     INST_LIST_ITER insertIt = builder.kernel.fg.getEntryBB()->begin();
8454     for (; insertIt != builder.kernel.fg.getEntryBB()->end() && (*insertIt)->isLabel();
8455         ++insertIt)
8456     {   /*  void */
8457     };
8458     builder.kernel.fg.getEntryBB()->insertBefore(insertIt, saveInst);
8459 
8460     G4_BB* exitBB = builder.kernel.fg.getUniqueReturnBlock();
8461     const RegionDesc* rDesc = builder.getRegionScalar();
8462     G4_Operand* src = builder.createSrc(pseudoVCEDcl->getRegVar(), 0, 0, rDesc, Type_UD);
8463     G4_INST* restoreInst =
8464         builder.createInternalIntrinsicInst(
8465             nullptr, Intrinsic::CalleeRestore, g4::SIMD1, nullptr, src, nullptr, nullptr, InstOpt_WriteEnable);
8466     INST_LIST_ITER exitBBIt = exitBB->end();
8467     --exitBBIt;
8468     MUST_BE_TRUE((*exitBBIt)->isFReturn(), ERROR_REGALLOC);
8469     exitBB->insertBefore(exitBBIt, restoreInst);
8470     builder.instList.clear();
8471 }
8472 
8473 //
8474 // Insert store r125.[0-4] at entry and restore before return.
8475 // Dst of store will be a hardwired temp at upper end of caller save area.
8476 // This method emits:
8477 // (W) mov (4) SR_BEStack<1>:ud    r125.0<4;4,1>:ud <-- in prolog
8478 // (W) mov (4) r125.0<1>:ud        SR_BEStack<4;4,1>:ud <-- in epilog
void GlobalRA::addStoreRestoreToReturn()
{

    // The save temp is hardwired to the upper 4 dwords of the last
    // caller-save GRF.
    unsigned regNum = builder.kernel.getCallerSaveLastGRF();
    unsigned subRegNum = numEltPerGRF<Type_UD>() - 4;
    oldFPDcl = builder.createHardwiredDeclare(4, Type_UD, regNum, subRegNum);
    oldFPDcl->setName(builder.getNameString(builder.kernel.fg.mem, 24, "CallerSaveRetIp_BE_FP"));

    G4_DstRegRegion* oldFPDst = builder.createDst(oldFPDcl->getRegVar(), 0, 0, 1, Type_UD);
    const RegionDesc* rd = builder.getRegionStride1();
    G4_Operand* oldFPSrc = builder.createSrc(oldFPDcl->getRegVar(), 0, 0, rd, Type_UD);

    // SR_BEStack is hardwired to the Ret_IP subregisters of the FP/SP GRF.
    auto SRDecl = builder.createHardwiredDeclare(4, Type_UD, builder.kernel.getFPSPGRF(), IR_Builder::SubRegs_Stackcall::Ret_IP);
    SRDecl->setName(builder.getNameString(builder.kernel.fg.mem, 24, "SR_BEStack"));
    G4_DstRegRegion* FPdst = builder.createDst(SRDecl->getRegVar(), 0, 0, 1, Type_UD);
    rd = builder.getRegionStride1();
    G4_Operand* FPsrc = builder.createSrc(SRDecl->getRegVar(), 0, 0, rd, Type_UD);

    // Prolog copy: temp <- SR_BEStack (SIMD4, one mov covers all 4 dwords).
    saveBE_FPInst = builder.createMov(g4::SIMD4, oldFPDst, FPsrc, InstOpt_WriteEnable, false);
    saveBE_FPInst->addComment("save vISA SP/FP to temp");
    builder.setPartFDSaveInst(saveBE_FPInst);

    // Insert the save right after the entry label(s).
    auto entryBB = builder.kernel.fg.getEntryBB();
    auto insertIt = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
    entryBB->insertBefore(insertIt, saveBE_FPInst);

    auto fretBB = builder.kernel.fg.getUniqueReturnBlock();
    auto iter = std::prev(fretBB->end());
    assert((*iter)->isFReturn() && "fret BB must end with fret");

    if (!EUFusionWANeeded())
    {
        // Epilog copy: SR_BEStack <- temp, placed just before the fret.
        restoreBE_FPInst = builder.createMov(g4::SIMD4, FPdst, oldFPSrc, InstOpt_WriteEnable, false);
        fretBB->insertBefore(iter, restoreBE_FPInst);
    }
    else
    {
        // EU fusion WA: instead of restoring from the temp, reload the frame
        // descriptor GRF (8 dwords of the FP/SP GRF) from memory at BE_FP.
        auto dstDcl = builder.createHardwiredDeclare(8, Type_UD, kernel.getFPSPGRF(), 0);
        dstDcl->setName(builder.getNameString(builder.kernel.fg.mem, 24, "FrameDescriptorGRF"));
        auto dstData = builder.createDstRegRegion(dstDcl, 1);
        const unsigned execSize = 8;
        G4_INST* load = nullptr;
        if (builder.supportsLSC())
        {
            // LSC fills need an explicit spill/fill header operand.
            auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
            load = builder.createFill(headerOpnd, dstData, G4_ExecSize(execSize), 1, 0, builder.getBEFP(), InstOpt_WriteEnable, false);
        }
        else
        {
            load = builder.createFill(dstData, G4_ExecSize(execSize), 1, 0, builder.getBEFP(), InstOpt_WriteEnable, false);
        }
        fretBB->insertBefore(iter, load);
        addEUFusionWAInsts(load);
        restoreBE_FPInst = load;
    }

    restoreBE_FPInst->addComment("restore vISA SP/FP from temp");

    // Debug info tracks both ends of the save/restore pair.
    if (builder.kernel.getOption(vISA_GenerateDebugInfo))
    {
        builder.kernel.getKernelDebugInfo()->setCallerBEFPRestoreInst(restoreBE_FPInst);
        builder.kernel.getKernelDebugInfo()->setCallerSPRestoreInst(restoreBE_FPInst);
        builder.kernel.getKernelDebugInfo()->setCallerBEFPSaveInst(saveBE_FPInst);
    }

    // Mark the new instructions for gtpin on the first RA pass.
    auto gtpin = builder.kernel.getGTPinData();
    if (gtpin &&
        gtpin->isFirstRAPass())
    {
        gtpin->markInst(saveBE_FPInst);
        gtpin->markInst(restoreBE_FPInst);
    }
}
8553 
reportUndefinedUses(LivenessAnalysis & liveAnalysis,G4_BB * bb,G4_INST * inst,G4_Declare * referencedDcl,std::set<G4_Declare * > & defs,std::ofstream & optreport,Gen4_Operand_Number opndNum)8554 void GlobalRA::reportUndefinedUses(
8555     LivenessAnalysis& liveAnalysis, G4_BB* bb, G4_INST* inst, G4_Declare* referencedDcl,
8556     std::set<G4_Declare*>& defs, std::ofstream& optreport, Gen4_Operand_Number opndNum)
8557 {
8558     // Get topmost dcl
8559     while (referencedDcl->getAliasDeclare() != NULL)
8560     {
8561         referencedDcl = referencedDcl->getAliasDeclare();
8562     }
8563 
8564     if (referencedDcl->getAddressed() == true)
8565     {
8566         // Dont run analysis for addressed opnds.
8567         // Specifically, we dont analyze following,
8568         //
8569         // A0 = &V1
8570         // r[A0] = 0 <-- V1 indirectly defined
8571         // ... = V1 <-- Use-before-def warning for V1 skipped due to indirect def
8572         //
8573 
8574         return;
8575     }
8576 
8577     if (referencedDcl->getRegVar()->isRegAllocPartaker())
8578     {
8579         const char* opndName = "";
8580 
8581         if (opndNum == Opnd_pred)
8582         {
8583             opndName = "predicate";
8584         }
8585         else if (opndNum == Opnd_src0)
8586         {
8587             opndName = "src0";
8588         }
8589         else if (opndNum == Opnd_src1)
8590         {
8591             opndName = "src1";
8592         }
8593         else if (opndNum == Opnd_src2)
8594         {
8595             opndName = "src2";
8596         }
8597 
8598         unsigned id = referencedDcl->getRegVar()->getId();
8599         if (liveAnalysis.def_in[bb->getId()].isSet(id) == false &&
8600             defs.find(referencedDcl) == defs.end())
8601         {
8602             // Def not found for use so report it
8603             optreport << "Def not found for use " << referencedDcl->getName() <<
8604                 " (" << opndName << ") at CISA offset " << inst->getCISAOff() << ", src line " <<
8605                 inst->getLineNo() << ":" << std::endl;
8606             inst->emit(optreport);
8607             optreport << std::endl << std::endl;
8608         }
8609     }
8610 }
8611 
updateDefSet(std::set<G4_Declare * > & defs,G4_Declare * referencedDcl)8612 void GlobalRA::updateDefSet(std::set<G4_Declare*>& defs, G4_Declare* referencedDcl)
8613 {
8614     // Get topmost dcl
8615     while (referencedDcl->getAliasDeclare() != NULL)
8616     {
8617         referencedDcl = referencedDcl->getAliasDeclare();
8618     }
8619 
8620     defs.insert(referencedDcl);
8621 }
8622 
detectUndefinedUses(LivenessAnalysis & liveAnalysis,G4_Kernel & kernel)8623 void GlobalRA::detectUndefinedUses(LivenessAnalysis& liveAnalysis, G4_Kernel& kernel)
8624 {
8625     // This function iterates over each inst and checks whether there is
8626     // a reaching def for each src operand. If not, it reports it to
8627     // opt report.
8628     std::ofstream optreport;
8629     getOptReportStream(optreport, kernel.getOptions());
8630 
8631     optreport << std::endl;
8632     if (liveAnalysis.livenessClass(G4_FLAG))
8633     {
8634         optreport << "=== Uses with reaching def - Flags ===" << std::endl;
8635     }
8636     else if (liveAnalysis.livenessClass(G4_ADDRESS))
8637     {
8638         optreport << "=== Uses with reaching def - Address ===" << std::endl;
8639     }
8640     else
8641     {
8642         optreport << "=== Uses with reaching def - GRF ===" << std::endl;
8643     }
8644     if (kernel.getOption(vISA_LocalRA))
8645     {
8646         optreport << "(Use -nolocalra switch for accurate results of uses without reaching defs)" << std::endl;
8647     }
8648 
8649     for (G4_BB* bb : kernel.fg)
8650     {
8651         std::set<G4_Declare*> defs;
8652         std::set<G4_Declare*>::iterator defs_it;
8653         G4_Declare* referencedDcl = NULL;
8654 
8655         for (G4_INST* inst : *bb)
8656         {
8657             // Src/predicate opnds are uses
8658             if (inst->getPredicate() &&
8659                 inst->getPredicate()->getBase() &&
8660                 inst->getPredicate()->getBase()->isRegVar() &&
8661                 inst->getPredicate()->getBase()->isRegAllocPartaker())
8662             {
8663                 referencedDcl = inst->getPredicate()->asPredicate()->getBase()->asRegVar()->getDeclare();
8664                 reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs, optreport, Opnd_pred);
8665             }
8666 
8667             for (unsigned i = 0; i < G4_MAX_SRCS; i++)
8668             {
8669                 G4_Operand* opnd = inst->getSrc(i);
8670 
8671                 if (opnd &&
8672                     opnd->isAddrExp() == false &&
8673                     opnd->getBase() &&
8674                     opnd->getBase()->isRegVar() &&
8675                     opnd->getBase()->isRegAllocPartaker())
8676                 {
8677                     referencedDcl = opnd->getBase()->asRegVar()->getDeclare();
8678                     reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs, optreport, (Gen4_Operand_Number)(i + Opnd_src0));
8679                 }
8680             }
8681 
8682             // Dst/cond modifier opnds are defs
8683             if (inst->getCondModBase() &&
8684                 inst->getCondMod()->getBase()->isRegVar() &&
8685                 inst->getCondMod()->getBase()->isRegAllocPartaker())
8686             {
8687                 referencedDcl = inst->getCondMod()->asCondMod()->getBase()->asRegVar()->getDeclare();
8688                 updateDefSet(defs, referencedDcl);
8689             }
8690 
8691             if (inst->getDst() &&
8692                 inst->getDst()->getBase() &&
8693                 inst->getDst()->getBase()->isRegVar() &&
8694                 inst->getDst()->getBase()->isRegAllocPartaker())
8695             {
8696                 referencedDcl = inst->getDst()->getBase()->asRegVar()->getDeclare();
8697                 updateDefSet(defs, referencedDcl);
8698             }
8699         }
8700     }
8701 
8702     optreport << std::endl << std::endl;
8703 
8704     closeOptReportStream(optreport);
8705 }
8706 
detectNeverDefinedUses()8707 void GlobalRA::detectNeverDefinedUses()
8708 {
8709     // Detect variables that are used but never defined in entire CFG.
8710     // This does not use liveness information.
8711     // Hold all decls from symbol table as key.
8712     // Boolean mapped value determines whether the dcl is
8713     // defined in kernel or not.
8714     std::map<G4_Declare*, bool> vars;
8715     std::map<G4_Declare*, bool>::iterator map_it;
8716 
8717     for (auto bb : kernel.fg)
8718     {
8719         for (G4_INST* inst : *bb)
8720         {
8721             G4_Declare* referencedDcl = nullptr;
8722 
8723             if (inst->getDst() &&
8724                 inst->getDst()->getBase() &&
8725                 inst->getDst()->getBase()->isRegVar())
8726             {
8727                 referencedDcl = inst->getDst()->getBaseRegVarRootDeclare();
8728 
8729                 // Always insert top-most dcl
8730                 map_it = vars.find(referencedDcl);
8731                 if (map_it == vars.end())
8732                 {
8733                     vars.emplace(referencedDcl, true);
8734                 }
8735                 else
8736                 {
8737                     map_it->second = true;
8738                 }
8739             }
8740 
8741             if (inst->getCondModBase() &&
8742                 inst->getCondMod()->getBase()->isRegVar())
8743             {
8744                 referencedDcl = inst->getCondMod()->getBaseRegVarRootDeclare();
8745 
8746                 map_it = vars.find(referencedDcl);
8747                 if (map_it == vars.end())
8748                 {
8749                     vars.emplace(referencedDcl, true);
8750                 }
8751                 else
8752                 {
8753                     map_it->second = true;
8754                 }
8755             }
8756 
8757             if (inst->getPredicate() &&
8758                 inst->getPredicate()->getBase() &&
8759                 inst->getPredicate()->getBase()->isRegVar())
8760             {
8761                 referencedDcl = inst->getPredicate()->getBaseRegVarRootDeclare();
8762 
8763                 // Check whether dcl was already added to list.
8764                 // If not, add it with flag set to false to indicate
8765                 // that a use was found but a def hasnt been seen yet.
8766                 map_it = vars.find(referencedDcl);
8767                 if (map_it == vars.end())
8768                 {
8769                     vars.emplace(referencedDcl, false);
8770                 }
8771             }
8772 
8773             for (unsigned i = 0; i < G4_MAX_SRCS; i++)
8774             {
8775                 G4_Operand* opnd = inst->getSrc(i);
8776 
8777                 if (opnd &&
8778                     opnd->getBase() &&
8779                     opnd->getBase()->isRegVar())
8780                 {
8781                     referencedDcl = opnd->getBaseRegVarRootDeclare();
8782 
8783                     map_it = vars.find(referencedDcl);
8784                     if (map_it == vars.end())
8785                     {
8786                         vars.emplace(referencedDcl, false);
8787                     }
8788                 }
8789             }
8790         }
8791     }
8792 
8793     std::ofstream optreport;
8794     getOptReportStream(optreport, kernel.getOptions());
8795     optreport << std::endl << "=== Variables used but never defined ===" << std::endl << std::endl;
8796 
8797     for (auto dcl : kernel.Declares)
8798     {
8799         while (dcl->getAliasDeclare() != NULL)
8800         {
8801             dcl = dcl->getAliasDeclare();
8802         }
8803 
8804         map_it = vars.find(dcl);
8805         if (map_it != vars.end())
8806         {
8807             if (map_it->second == false &&
8808                 dcl->getRegFile() != G4_INPUT &&
8809                 dcl->getAddressed() == false)
8810             {
8811                 // No def found for this non-input variable in
8812                 // entire CFG so report it.
8813                 optreport << dcl->getName();
8814                 if (dcl->getRegFile() == G4_GRF)
8815                 {
8816                     optreport << " (General)";
8817                 }
8818                 else if (dcl->getRegFile() == G4_ADDRESS)
8819                 {
8820                     optreport << " (Address)";
8821                 }
8822                 else if (dcl->getRegFile() == G4_FLAG)
8823                 {
8824                     optreport << " (Flag)";
8825                 }
8826 
8827                 optreport << std::endl;
8828             }
8829         }
8830     }
8831 
8832     optreport << std::endl << std::endl;
8833 
8834     closeOptReportStream(optreport);
8835 }
8836 
emitVarLiveIntervals()8837 void GlobalRA::emitVarLiveIntervals()
8838 {
8839     for (auto dcl : kernel.Declares)
8840     {
8841         std::vector<std::pair<uint32_t, uint32_t>> liveIntervals;
8842         LiveIntervalInfo* lr = kernel.getKernelDebugInfo()->getLiveIntervalInfo(dcl, false);
8843 
8844         if (lr != NULL)
8845         {
8846             lr->getLiveIntervals(liveIntervals);
8847 
8848             if (liveIntervals.size() > 0)
8849             {
8850                 DEBUG_VERBOSE(dcl->getName() << " - ");
8851             }
8852 
8853             for (auto&& i : liveIntervals)
8854             {
8855                 std::cerr << "(" << i.first << ", " << i.second << ")\n";
8856             }
8857         }
8858     }
8859 }
8860 
//
//  Check the overlap of two sources' ranges and do range splitting
//  Such as, range1: 0~63, range2: 32~95  --> 0~31,32~63,64~95
//       or, range1: 0~63, range2: 32~63  --> 0~31,32~63
//
//  Contract: src1 and src2 must not be identical ranges. On overlap,
//  src1 is rewritten in place to the leftmost sub-range and src2 to the
//  rightmost; the return value is the middle sub-range when one exists
//  (no shared boundary), otherwise nullptr. Returns NULL without touching
//  either range when they do not overlap. Any newly allocated range is
//  pushed on toDelete so the caller can free it later.
//
VarRange* VarSplit::splitVarRange(VarRange *src1,
    VarRange *src2,
    std::stack<VarRange*> *toDelete)
{
    VarRange * new_var_range = nullptr;

    ASSERT_USER(!(src1->leftBound == src2->leftBound && src1->rightBound == src2->rightBound), "Same ranges can not be spiltted");

    if (src1->leftBound > src2->rightBound ||
        src1->rightBound < src2->leftBound)  //No overlap
    {
        return NULL;
    }

    // Sort the two left bounds (left1 <= right1) and the two right bounds
    // (left2 <= right2); the split points are right1 and left2.
    unsigned left1 = std::min(src1->leftBound, src2->leftBound);  //left
    unsigned right1 = std::max(src1->leftBound, src2->leftBound);

    unsigned left2 = std::min(src1->rightBound, src2->rightBound); //right
    unsigned right2 = std::max(src1->rightBound, src2->rightBound);

    if (left1 == right1) //Same left bound: two pieces [left1, left2], [left2+1, right2]
    {
        src1->leftBound = left1;
        src1->rightBound = left2;

        src2->leftBound = left2 + 1;
        src2->rightBound = right2;
    }
    else if (left2 == right2)  //Same right bound: two pieces [left1, right1-1], [right1, right2]
    {
        src1->leftBound = left1;
        src1->rightBound = right1 - 1;
        src2->leftBound = right1;
        src2->rightBound = right2;
    }
    else  //No same boundary: three pieces, middle one newly allocated
    {
        src1->leftBound = left1;           //Left one: in list already
        src1->rightBound = right1 - 1;

        src2->leftBound = left2 + 1;       //Right one: keep in list
        src2->rightBound = right2;

        new_var_range = new VarRange;
        new_var_range->leftBound = right1; //Middle one: need add one range object
        new_var_range->rightBound = left2;
        toDelete->push(new_var_range);     //Ownership tracked by caller's stack
    }

    return new_var_range;
}
8917 
//
// Scan the range list and insert the new range (the span covered by opnd)
// into it, splitting existing ranges wherever they overlap, so the list
// remains ordered from low to high. (Note: "Spliting" spelling is kept —
// the name is part of the class interface.)
//
void VarSplit::rangeListSpliting(VAR_RANGE_LIST *rangeList, G4_Operand *opnd, std::stack<VarRange*> *toDelete)
{
    // Build the candidate range from the operand's byte bounds; ownership
    // of every allocation is tracked on toDelete for later cleanup.
    VarRange *range = new VarRange;
    range->leftBound = opnd->getLeftBound();
    range->rightBound = opnd->getRightBound();
    toDelete->push(range);

    VAR_RANGE_LIST_ITER it = rangeList->begin();

    //The ranges in the list are ordered from low to high
    while (it != rangeList->end())
    {
        if ((*it)->leftBound == range->leftBound &&
            ((*it)->rightBound == range->rightBound))
        {
            //Same range exists in the list already
            return;
        }

        if ((*it)->leftBound > range->rightBound)
        {
            //The range item in the list is to the right of the current
            //range; insert the current range before this position. Since
            //the whole range is inserted first, all ranges should be
            //contiguous (verified by the assert).
            ASSERT_USER((*it)->leftBound - range->rightBound == 1, "none continous spliting happened\n");
            rangeList->insert(it, range);
            return;
        }

        //Overlap happened, do splitting.
        //(*it) is updated to the left range,
        //"range" is updated to the right range.
        //If "newRange" is not NULL, it's the middle range.
        VarRange *newRange = splitVarRange((*it), range, toDelete);

        //Insert the middle one (if any) right after the left piece; the
        //loop then continues from it, carrying the remaining right piece
        //in "range".
        it++;
        if (newRange)
        {
            it = rangeList->insert(it, newRange);
        }
    }

    rangeList->push_back(range);  //Insert the remaining rightmost piece
    return;
}
8968 
getHeightWidth(G4_Type type,unsigned numberElements,unsigned short & dclWidth,unsigned short & dclHeight,int & totalByteSize)8969 void VarSplit::getHeightWidth(G4_Type type, unsigned numberElements, unsigned short &dclWidth, unsigned short &dclHeight, int &totalByteSize)
8970 {
8971     dclWidth = 1, dclHeight = 1;
8972     totalByteSize = numberElements * TypeSize(type);
8973     if (totalByteSize <= (int)numEltPerGRF<Type_UB>())
8974     {
8975         dclWidth = (uint16_t)numberElements;
8976     }
8977     else {
8978         // here we assume that the start point of the var is the beginning of a GRF?
8979         // so subregister must be 0?
8980         dclWidth = numEltPerGRF<Type_UB>() / TypeSize(type);
8981         dclHeight = totalByteSize / numEltPerGRF<Type_UB>();
8982         if (totalByteSize % numEltPerGRF<Type_UB>() != 0) {
8983             dclHeight++;
8984         }
8985     }
8986 }
8987 
8988 
createSubDcls(G4_Kernel & kernel,G4_Declare * oldDcl,std::vector<G4_Declare * > & splitDclList)8989 void VarSplit::createSubDcls(G4_Kernel& kernel, G4_Declare* oldDcl, std::vector<G4_Declare*> &splitDclList)
8990 {
8991     if (oldDcl->getByteSize() <= numEltPerGRF<Type_UB>() || oldDcl->getByteSize() % numEltPerGRF<Type_UB>())
8992     {
8993         return;
8994     }
8995 
8996     int splitVarSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
8997     for (unsigned i = 0, bSizePerGRFSize = (oldDcl->getByteSize() / numEltPerGRF<Type_UB>()); i < bSizePerGRFSize; i += splitVarSize)
8998     {
8999         G4_Declare* splitDcl = NULL;
9000         unsigned leftBound = i * numEltPerGRF<Type_UB>();
9001         unsigned rightBound = (i + splitVarSize) * numEltPerGRF<Type_UB>() - 1;
9002         unsigned short dclWidth = 0;
9003         unsigned short dclHeight = 0;
9004         int dclTotalSize = 0;
9005 
9006         getHeightWidth(oldDcl->getElemType(), (rightBound - leftBound + 1) / oldDcl->getElemSize(), dclWidth, dclHeight, dclTotalSize);
9007         const char* splitDclName = kernel.fg.builder->getNameString(kernel.fg.builder->mem, 16, "split_%d_%s", i, oldDcl->getName());
9008         splitDcl = kernel.fg.builder->createDeclareNoLookup(splitDclName, G4_GRF, dclWidth, dclHeight, oldDcl->getElemType());
9009         gra.setSubOffset(splitDcl, leftBound);
9010         splitDcl->copyAlign(oldDcl);
9011         gra.copyAlignment(splitDcl, oldDcl);
9012         unsigned nElementSize = (rightBound - leftBound + 1) / oldDcl->getElemSize();
9013         if ((rightBound - leftBound + 1) % oldDcl->getElemSize())
9014         {
9015             nElementSize++;
9016         }
9017         splitDcl->setTotalElems(nElementSize);
9018         splitDclList.push_back(splitDcl);
9019     }
9020 
9021     return;
9022 }
9023 
insertMovesToTemp(IR_Builder & builder,G4_Declare * oldDcl,G4_Operand * dstOpnd,G4_BB * bb,INST_LIST_ITER instIter,std::vector<G4_Declare * > & splitDclList)9024 void VarSplit::insertMovesToTemp(
9025     IR_Builder& builder, G4_Declare* oldDcl, G4_Operand *dstOpnd, G4_BB* bb,
9026     INST_LIST_ITER instIter, std::vector<G4_Declare*> &splitDclList)
9027 {
9028     G4_INST *inst = (*instIter);
9029     INST_LIST_ITER iter = instIter;
9030     iter++;
9031 
9032     for (size_t i = 0, size = splitDclList.size(); i < size; i++)
9033     {
9034         G4_Declare * subDcl = splitDclList[i];
9035         unsigned leftBound = gra.getSubOffset(subDcl);
9036         unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
9037 
9038         if (!(dstOpnd->getRightBound() < leftBound || rightBound < dstOpnd->getLeftBound()))
9039         {
9040             unsigned maskFlag = (inst->getOption() & 0xFFF010C);
9041             G4_DstRegRegion* dst = builder.createDstRegRegion(subDcl, 1);
9042             auto src = builder.createSrc(oldDcl->getRegVar(),
9043                 (gra.getSubOffset(subDcl)) / numEltPerGRF<Type_UB>(), 0, builder.getRegionStride1(), oldDcl->getElemType());
9044             G4_INST* splitInst = builder.createMov(G4_ExecSize(subDcl->getTotalElems()), dst, src, maskFlag, false);
9045             bb->insertBefore(iter, splitInst);
9046         }
9047     }
9048 
9049     return;
9050 }
9051 
//
// Rewrite a read of (previously split) oldDcl so it reads from the
// sub-declares instead. If the source read spans exactly one split chunk
// (splitSize GRFs), the source operand is redirected in place to the
// single overlapping sub-declare. Otherwise the overlapping chunks are
// first copied into a fresh contiguous temp declare (movs inserted before
// the instruction) and the source is redirected to that temp.
// "index" is only used to make the temp's name unique; "pos" is the source
// operand position within the instruction.
//
void VarSplit::insertMovesFromTemp(G4_Kernel& kernel, G4_Declare* oldDcl, int index, G4_Operand *srcOpnd, int pos, G4_BB* bb, INST_LIST_ITER instIter, std::vector<G4_Declare*> &splitDclList)
{
    G4_INST *inst = (*instIter);

    // Size of the source read in GRFs, rounded up.
    int sizeInGRF = (srcOpnd->getRightBound() - srcOpnd->getLeftBound() + numEltPerGRF<Type_UB>() - 1) /
        numEltPerGRF<Type_UB>();
    // Split granularity: 1 GRF for SIMD8, 2 otherwise (must match createSubDcls).
    int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
    if (sizeInGRF != splitSize)
    {
        // Read does not match one chunk: gather all overlapping chunks
        // into a new contiguous temp declare sized like the read.
        unsigned short dclWidth = 0;
        unsigned short dclHeight = 0;
        int dclTotalSize = 0;
        G4_SrcRegRegion* oldSrc = srcOpnd->asSrcRegRegion();
        getHeightWidth(oldSrc->getType(), (srcOpnd->getRightBound() - srcOpnd->getLeftBound() + 1) / oldSrc->getElemSize(), dclWidth, dclHeight, dclTotalSize);
        const char* newDclName = kernel.fg.builder->getNameString(kernel.fg.builder->mem, 16, "copy_%d_%s", index, oldDcl->getName());
        G4_Declare * newDcl = kernel.fg.builder->createDeclareNoLookup(newDclName, G4_GRF, dclWidth, dclHeight, oldSrc->getType());
        newDcl->copyAlign(oldDcl);
        gra.copyAlignment(newDcl, oldDcl);

        // Running byte offset into the temp as chunks are appended.
        unsigned newLeftBound = 0;

        for (size_t i = 0, size = splitDclList.size(); i < size; i++)
        {
            G4_Declare * subDcl = splitDclList[i];
            unsigned leftBound = gra.getSubOffset(subDcl);
            unsigned rightBound = leftBound + subDcl->getByteSize() - 1;

            // Copy only chunks that overlap the source read.
            if (!(srcOpnd->getRightBound() < leftBound || rightBound < srcOpnd->getLeftBound()))
            {

                G4_DstRegRegion* dst = kernel.fg.builder->createDst(
                    newDcl->getRegVar(),
                    newLeftBound / numEltPerGRF<Type_UB>(),
                    0,
                    1,
                    oldSrc->getType());
                newLeftBound += subDcl->getByteSize();
                G4_SrcRegRegion* src = kernel.fg.builder->createSrc(
                    subDcl->getRegVar(),
                    0,
                    0,
                    kernel.fg.builder->getRegionStride1(),
                    oldSrc->getType());
                // NoMask copy inserted before the reading instruction.
                G4_INST* movInst = kernel.fg.builder->createMov(
                    G4_ExecSize(subDcl->getTotalElems()), dst, src, InstOpt_WriteEnable, false);
                bb->insertBefore(instIter, movInst);
            }
        }
        // Redirect the source operand to the temp, preserving modifier,
        // subregister offset, and region.
        auto newSrc = kernel.fg.builder->createSrcRegRegion(oldSrc->getModifier(), Direct, newDcl->getRegVar(),
            0, oldSrc->getSubRegOff(), oldSrc->getRegion(), newDcl->getElemType());
        inst->setSrc(newSrc, pos);
    }
    else
    {
        // Read matches one chunk exactly: redirect to the first (and only)
        // overlapping sub-declare — no copies needed.
        for (size_t i = 0, size = splitDclList.size(); i < size; i++)
        {
            G4_Declare * subDcl = splitDclList[i];
            unsigned leftBound = gra.getSubOffset(subDcl);
            unsigned rightBound = leftBound + subDcl->getByteSize() - 1;

            if (!(srcOpnd->getRightBound() < leftBound || rightBound < srcOpnd->getLeftBound()))
            {
                G4_SrcRegRegion* oldSrc = srcOpnd->asSrcRegRegion();
                G4_SrcRegRegion* newSrc = kernel.fg.builder->createSrcRegRegion(
                    oldSrc->getModifier(),
                    Direct,
                    subDcl->getRegVar(),
                    0,
                    oldSrc->getSubRegOff(),
                    oldSrc->getRegion(),
                    oldSrc->getType());
                inst->setSrc(newSrc, pos);
                break;
            }
        }
    }

    return;
}
9131 
canDoGlobalSplit(IR_Builder & builder,G4_Kernel & kernel,uint32_t sendSpillRefCount)9132 bool VarSplit::canDoGlobalSplit(IR_Builder& builder, G4_Kernel &kernel, uint32_t sendSpillRefCount)
9133 {
9134     if (!builder.getOption(vISA_GlobalSendVarSplit))
9135     {
9136         return false;
9137     }
9138 
9139     if (!builder.getOption(vISA_Debug) &&               //Not work in debug mode
9140         kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&   //Only works for 3D/OCL/OGL
9141         sendSpillRefCount)
9142     {
9143         return true;
9144     }
9145 
9146     return false;
9147 }
9148 
//
// Global send-variable splitting, in three phases:
//   1) Collect candidate sends whose destination fully defines a
//      multi-GRF variable.
//   2) Collect the partial (sub-range) direct source reads of those
//      candidates.
//   3) Prune unprofitable candidates, then split the survivors into
//      sub-declares (createSubDcls), copying out of the send destination
//      (insertMovesToTemp) and rewriting the reads (insertMovesFromTemp).
//
void VarSplit::globalSplit(IR_Builder& builder, G4_Kernel &kernel)
{
    // Per-candidate operand record: (BB, operand, src position, linear
    // instruction index, instruction iterator).
    typedef std::list<std::tuple<G4_BB*, G4_Operand*, int, unsigned, INST_LIST_ITER>> SPLIT_OPERANDS;
    typedef std::list<std::tuple<G4_BB*, G4_Operand*, int, unsigned, INST_LIST_ITER>>::iterator SPLIT_OPERANDS_ITER;
    typedef std::map<G4_RegVar*, SPLIT_OPERANDS> SPLIT_DECL_OPERANDS;
    typedef std::map<G4_RegVar*, SPLIT_OPERANDS>::iterator SPLIT_DECL_OPERANDS_ITER;

    SPLIT_DECL_OPERANDS splitDcls;
    unsigned instIndex = 0;
    // Split granularity: 1 GRF for SIMD8, 2 otherwise.
    int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
    // Phase 1: find splittable send destinations.
    for (auto bb : kernel.fg)
    {
        for (INST_LIST_ITER it = bb->begin(), iend = bb->end(); it != iend; ++it, ++instIndex)
        {
            G4_INST* inst = (*it);
            G4_DstRegRegion* dst = inst->getDst();

            if (inst->isLifeTimeEnd() || inst->isPseudoKill())
            {
                continue;
            }

            //
            // process send destination operand
            //
            if (inst->isSend() &&
                inst->getMsgDesc()->getDstLenRegs() > (size_t)splitSize &&
                inst->asSendInst()->isDirectSplittableSend())
            {
                G4_DstRegRegion* dstrgn = dst;
                G4_Declare* topdcl = GetTopDclFromRegRegion(dstrgn);

                // Candidate only if the send writes the whole variable
                // (a full def) and the variable spans more than one GRF.
                if (topdcl &&
                    dstrgn->getRegAccess() == Direct &&
                    !topdcl->getAddressed() &&
                    topdcl->getRegFile() != G4_INPUT &&
                    (dstrgn->getRightBound() - dstrgn->getLeftBound() + 1) == topdcl->getByteSize() &&
                    (dstrgn->getRightBound() - dstrgn->getLeftBound()) > numEltPerGRF<Type_UB>())
                {
                    //The tuple<G4_BB*, G4_Operand*, int pos, unsigned instIndex, INST_LIST_ITER>,
                    //these info are tuning and split operand/instruction generation
                    splitDcls[topdcl->getRegVar()].push_front(make_tuple(bb, dst, 0, instIndex, it));
                }
            }
        }
    }

    // Phase 2: record every partial, direct source read of a candidate.
    instIndex = 0;
    for (auto bb : kernel.fg)
    {
        for (INST_LIST_ITER it = bb->begin(), end = bb->end(); it != end; ++it, ++instIndex)
        {

            G4_INST* inst = (*it);

            if (inst->isLifeTimeEnd() || inst->isPseudoKill())
            {
                continue;
            }

            //
            // process each source operand
            //
            for (unsigned j = 0; j < G4_MAX_SRCS; j++)
            {
                G4_Operand* src = inst->getSrc(j);

                if (src == NULL)
                {
                    continue;
                }

                if (src->isSrcRegRegion())
                {
                    G4_Declare* topdcl = GetTopDclFromRegRegion(src);

                    // Only reads strictly smaller than the whole variable
                    // are of interest (a full read would not benefit).
                    if (topdcl &&
                        topdcl->getRegFile() != G4_INPUT &&
                        !topdcl->getAddressed() &&
                        splitDcls.find(topdcl->getRegVar()) != splitDcls.end() &&
                        ((src->asSrcRegRegion()->getRightBound() - src->asSrcRegRegion()->getLeftBound() + 1) < topdcl->getByteSize()) &&
                        src->asSrcRegRegion()->getRegAccess() == Direct)  //We don't split the indirect access
                    {
                        splitDcls[topdcl->getRegVar()].push_back(make_tuple(bb, src, j, instIndex, it));
                    }
                }
            }
        }
    }

    // Phase 3a: prune candidates. The successor iterator is saved before
    // erase() since erasing invalidates the current iterator.
    for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin();
        it != splitDcls.end();)
    {
        unsigned srcIndex = 0xFFFFFFFF;
        unsigned dstIndex = 0;
        SPLIT_DECL_OPERANDS_ITER succIt = it;
        succIt++;
        G4_Declare * topDcl = it->first->getDeclare();
        // Too small to be worth splitting (at most 2 GRFs).
        if (topDcl->getByteSize() <= numEltPerGRF<Type_UB>() * 2u)
        {
            splitDcls.erase(it);
            it = succIt;
            continue;
        }

        // Compute the earliest source-read index and latest def index.
        bool hasSrcOpearnd = false;
        for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end(); vt++)
        {
            G4_BB *bb = nullptr;
            G4_Operand *opnd = nullptr;
            INST_LIST_ITER instIter;
            int pos = 0;
            unsigned iIndex = 0;

            std::tie(bb, opnd, pos, iIndex, instIter) = (*vt);

            if (opnd == nullptr)
            {
                continue;
            }

            if (opnd->isDstRegRegion())
            {
                dstIndex = std::max(dstIndex, iIndex);
            }

            if (opnd->isSrcRegRegion())
            {
                srcIndex = std::min(srcIndex, iIndex);
                hasSrcOpearnd = true;
            }
        }

        // Drop candidates with no partial read at all, or whose last def
        // and first read are too close together (heuristic: fewer
        // instructions apart than the number of recorded operands + 1).
        if (!hasSrcOpearnd || (dstIndex > srcIndex &&
            dstIndex - srcIndex < it->second.size() + 1))
        {
            splitDcls.erase(it);
            it = succIt;
            continue;
        }

        it++;
    }

    // Phase 3b: perform the splits on the surviving candidates.
    for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin();
        it != splitDcls.end();
        it++)
    {
        G4_Declare * topDcl = it->first->getDeclare();
        std::vector<G4_Declare*> splitDclList;
        splitDclList.clear();

        createSubDcls(kernel, topDcl, splitDclList);
        // srcIndex only uniquifies the names of the copy temps created by
        // insertMovesFromTemp.
        int srcIndex = 0;
        for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end(); vt++)
        {
            G4_BB *bb = nullptr;
            G4_Operand *opnd = nullptr;
            INST_LIST_ITER instIter;
            int pos = 0;
            unsigned instIndex = 0;
            std::tie(bb, opnd, pos, instIndex, instIter) = (*vt);

            if (opnd == nullptr)
            {
                continue;
            }

            if (opnd->isDstRegRegion())
            {
                // Copy the send result out into the sub-declares.
                insertMovesToTemp(builder, topDcl, opnd, bb, instIter, splitDclList);
            }

            if (opnd->isSrcRegRegion())
            {
                // Redirect the partial read to the sub-declares.
                insertMovesFromTemp(kernel, topDcl, srcIndex, opnd, pos, bb, instIter, splitDclList);
            }

            srcIndex++;
        }
    }

    return;
}
9333 
localSplit(IR_Builder & builder,G4_BB * bb)9334 void VarSplit::localSplit(IR_Builder& builder,
9335     G4_BB* bb)
9336 {
9337     class CmpRegVarId
9338     {
9339     public:
9340         bool operator()(G4_RegVar* first, G4_RegVar* second) const
9341         {
9342             return first->getDeclare()->getDeclId() < second->getDeclare()->getDeclId();
9343         }
9344     };
9345     std::map<G4_RegVar*, std::vector<std::pair<G4_Operand*, INST_LIST_ITER>>, CmpRegVarId> localRanges;
9346     std::map<G4_RegVar*, std::vector<std::pair<G4_Operand*, INST_LIST_ITER>>, CmpRegVarId>::iterator localRangesIt;
9347     std::map<G4_RegVar*, VarRangeListPackage, CmpRegVarId> varRanges;
9348     std::map<G4_RegVar*, VarRangeListPackage, CmpRegVarId>::iterator varRangesIt;
9349     std::stack<VarRange*> toDelete;
9350 
9351     //
9352     // Iterate instruction in BB from back to front
9353     //
9354     for (INST_LIST::reverse_iterator rit = bb->rbegin(), rend = bb->rend(); rit != rend; ++rit)
9355     {
9356         G4_INST* i = (*rit);
9357         G4_DstRegRegion* dst = i->getDst();
9358 
9359         if (i->isLifeTimeEnd() || i->isPseudoKill())
9360         {
9361             continue;
9362         }
9363 
9364         //
9365         // process destination operand
9366         //
9367         if (dst != NULL)
9368         {
9369             G4_DstRegRegion* dstrgn = dst;
9370 
9371             //It's RA candidate
9372             G4_Declare* topdcl = GetTopDclFromRegRegion(dstrgn);
9373 
9374             LocalLiveRange* topdclLR = nullptr;
9375             //Local only
9376             if ((topdcl &&
9377                 (topdclLR = gra.getLocalLR(topdcl)) &&
9378                 topdcl->getIsRefInSendDcl() &&
9379                 topdclLR->isLiveRangeLocal()) &&
9380                 topdcl->getRegFile() == G4_GRF)
9381             {
9382                 varRangesIt = varRanges.find(topdcl->getRegVar());
9383                 INST_LIST_ITER iterToInsert = rit.base();
9384                 iterToInsert--; //Point to the iterator of current instruction
9385                 if (varRangesIt == varRanges.end())
9386                 {
9387                     VarRange* new_range = new VarRange;
9388                     new_range->leftBound = 0;
9389                     new_range->rightBound = topdcl->getByteSize() - 1;
9390                     toDelete.push(new_range);
9391                     varRanges[topdcl->getRegVar()].list.push_back(new_range);
9392                 }
9393                 else
9394                 {
9395                     rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), dstrgn, &toDelete);
9396                 }
9397 
9398                 localRanges[topdcl->getRegVar()].emplace_back(dst, iterToInsert);  // Ordered from back to front.
9399             }
9400         }
9401 
9402         //
9403         // process each source operand
9404         //
9405         for (unsigned j = 0; j < G4_MAX_SRCS; j++)
9406         {
9407             G4_Operand* src = i->getSrc(j);
9408 
9409             if (src == NULL)
9410             {
9411                 continue;
9412             }
9413 
9414             //Local only
9415             if (src->isSrcRegRegion())
9416             {
9417                 G4_Declare* topdcl = GetTopDclFromRegRegion(src);
9418                 LocalLiveRange* topdclLR = nullptr;
9419 
9420                 if (topdcl &&
9421                     (topdclLR = gra.getLocalLR(topdcl)) &&
9422                     topdcl->getIsRefInSendDcl() &&
9423                     topdclLR->isLiveRangeLocal() &&
9424                     topdcl->getRegFile() == G4_GRF)
9425                 {
9426                     G4_VarBase* base = (topdcl != NULL ? topdcl->getRegVar() : src->asSrcRegRegion()->getBase());
9427 
9428                     INST_LIST_ITER iterToInsert = rit.base();
9429                     iterToInsert--;
9430 
9431                     varRangesIt = varRanges.find(base->asRegVar());
9432                     if (varRangesIt == varRanges.end())
9433                     {
9434                         VarRange* new_range = new VarRange;
9435                         new_range->leftBound = 0;
9436                         new_range->rightBound = topdcl->getByteSize() - 1;
9437                         toDelete.push(new_range);
9438                         varRanges[topdcl->getRegVar()].list.push_back(new_range);
9439                     }
9440 
9441                     rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), src, &toDelete);
9442 
9443                     localRanges[topdcl->getRegVar()].emplace_back(src, iterToInsert);  // Ordered from back to front.
9444                 }
9445             }
9446         }
9447     }
9448 
9449     //Clean the variables with no partial usage, or whose partial live range is too short
9450     std::map<G4_RegVar*, VarRangeListPackage>::iterator it = varRanges.begin();
9451     while (it != varRanges.end())
9452     {
9453         std::map<G4_RegVar*, VarRangeListPackage>::iterator succ_it = it;
9454         succ_it++;
9455 
9456         //No partial
9457         if (it->second.list.size() <= 1)
9458         {
9459             varRanges.erase(it);
9460             it = succ_it;
9461             continue;
9462         }
9463 
9464         //If total GRF size divides partial number is less than 16 bytes (half GRF), remove it
9465         if (((*it->second.list.rbegin())->rightBound - (*it->second.list.begin())->leftBound) / it->second.list.size() < numEltPerGRF<Type_UW>() * 2 / 2)
9466         {
9467             varRanges.erase(it);
9468             it = succ_it;
9469             continue;
9470         }
9471 
9472         G4_Declare * topDcl = it->first->getDeclare();
9473         bool aligned = true;
9474         for (const VarRange *vr : it->second.list)
9475         {
9476             unsigned leftBound = vr->leftBound;
9477             unsigned rightBound = vr->rightBound;
9478             int elementSize = topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
9479             unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
9480 
9481             if (!elemsNum)
9482             {
9483                 aligned = false;
9484                 break;
9485             }
9486 
9487             //TODO: we can merge several unaligned sub declares into one aligned.  Such as [0-1], [2-63]  --> [0-63]
9488             if (leftBound % numEltPerGRF<Type_UW>() || (rightBound + 1) % numEltPerGRF<Type_UW>())
9489             {
9490                 aligned = false;
9491                 break;
9492             }
9493         }
9494 
9495         if (!aligned)
9496         {
9497             varRanges.erase(it);
9498             it = succ_it;
9499             continue;
9500         }
9501 
9502 
9503         it = succ_it;
9504     }
9505 
9506     int splitid = 0;
9507     for (std::map<G4_RegVar*, VarRangeListPackage>::iterator it = varRanges.begin();
9508         it != varRanges.end();
9509         it++)
9510     {
9511         G4_Declare * topDcl = it->first->getDeclare();
9512         const char * dclName = topDcl->getName();
9513 
9514         topDcl->setIsSplittedDcl(true);
9515 
9516         // Vertical split: variable split
9517         unsigned splitVarNum = 0;
9518         unsigned pre_rightBound = 0;
9519         for (VAR_RANGE_LIST_ITER vt = it->second.list.begin(); vt != it->second.list.end(); vt++)
9520         {
9521             unsigned leftBound = (*vt)->leftBound;
9522             unsigned rightBound = (*vt)->rightBound;
9523             int elementSize = topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
9524             unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
9525 
9526             if (!elemsNum)
9527             {
9528                 assert(0);
9529                 pre_rightBound = rightBound;
9530                 continue;
9531             }
9532 
9533             if (leftBound && pre_rightBound + 1 != leftBound)
9534             {
9535                 assert(0);
9536             }
9537             pre_rightBound = rightBound;
9538 
9539             std::stringstream nameStrm;
9540             nameStrm << dclName << "_" << splitid << "_" << leftBound << "_" << rightBound << std::ends;
9541             int nameLen = unsigned(nameStrm.str().length()) + 1;
9542             const char* name = builder.getNameString(builder.mem, nameLen, "%s_%d_%d_%d", dclName, splitid, leftBound, rightBound);
9543 
9544             unsigned short dclWidth = 0;
9545             unsigned short dclHeight = 0;
9546             int dclTotalSize = 0;
9547 
9548             getHeightWidth(topDcl->getElemType(), (rightBound - leftBound + 1) / topDcl->getElemSize(), dclWidth, dclHeight, dclTotalSize);
9549             G4_Declare* partialDcl = builder.createDeclareNoLookup(name, G4_GRF, dclWidth, dclHeight, topDcl->getElemType());
9550             gra.setSubOffset(partialDcl, leftBound);
9551             partialDcl->setIsPartialDcl(true);
9552             gra.setSplittedDeclare(partialDcl, topDcl);
9553             unsigned nElementSize = (rightBound - leftBound + 1) / topDcl->getElemSize();
9554             if ((rightBound - leftBound + 1) % topDcl->getElemSize())
9555             {
9556                 nElementSize++;
9557             }
9558             partialDcl->setTotalElems(nElementSize);
9559             gra.addSubDcl(topDcl, partialDcl);
9560             splitVarNum++;
9561 #ifdef DEBUG_VERBOSE_ON
9562             std::cout << "==> Sub Declare: " << splitid << "::" << name << std::endl;
9563 #endif
9564             splitid++;
9565         }
9566         if (splitVarNum)
9567         {
9568             gra.setSplitVarNum(topDcl, splitVarNum);
9569         }
9570     }
9571 
9572     while (toDelete.size() > 0)
9573     {
9574         delete toDelete.top();
9575         toDelete.pop();
9576     }
9577 
9578     return;
9579 }
9580 
addrRegAlloc()9581 void GlobalRA::addrRegAlloc()
9582 {
9583     uint32_t addrSpillId = 0;
9584     unsigned maxRAIterations = 10;
9585     unsigned iterationNo = 0;
9586 
9587     while (iterationNo < maxRAIterations)
9588     {
9589         if (builder.getOption(vISA_RATrace))
9590         {
9591             std::cout << "--address RA iteration " << iterationNo << "\n";
9592         }
9593         //
9594         // choose reg vars whose reg file kind is ARF
9595         //
9596         LivenessAnalysis liveAnalysis(*this, G4_ADDRESS);
9597         liveAnalysis.computeLiveness();
9598 
9599         //
9600         // if no reg var needs to reg allocated, then skip reg allocation
9601         //
9602         if (liveAnalysis.getNumSelectedVar() > 0)
9603         {
9604             GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), false, false);
9605             unsigned spillRegSize = 0;
9606             unsigned indrSpillRegSize = 0;
9607             if (!coloring.regAlloc(false, false, false, spillRegSize, indrSpillRegSize, nullptr))
9608             {
9609                 SpillManager spillARF(*this, coloring.getSpilledLiveRanges(), addrSpillId);
9610                 spillARF.insertSpillCode();
9611                 addrSpillId = spillARF.getNextTempDclId();
9612 
9613                 //
9614                 // if new addr temps are created, we need to do RA again so that newly created temps
9615                 // can get registers. If there are no more newly created temps, we then commit reg assignments
9616                 //
9617                 if (spillARF.isAnyNewTempCreated() == false)
9618                 {
9619                     coloring.confirmRegisterAssignments();
9620                     coloring.cleanupRedundantARFFillCode();
9621                     if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9622                     {
9623                         coloring.addA0SaveRestoreCode();
9624                     }
9625                     break; // no more new addr temps; done with ARF allocation
9626                 }
9627             }
9628             else  // successfully allocate register without spilling
9629             {
9630                 coloring.confirmRegisterAssignments();
9631                 coloring.cleanupRedundantARFFillCode();
9632                 if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9633                 {
9634                     coloring.addA0SaveRestoreCode();
9635                 }
9636                 if (builder.getOption(vISA_OptReport))
9637                 {
9638                     detectUndefinedUses(liveAnalysis, kernel);
9639                 }
9640 
9641                 break; // done with ARF allocation
9642             }
9643         }
9644         else {
9645             break; // no ARF allocation needed
9646         }
9647         kernel.dumpToFile("after.Address_RA." + std::to_string(iterationNo));
9648         iterationNo++;
9649 
9650 
9651     }
9652 
9653     MUST_BE_TRUE(iterationNo < maxRAIterations, "Address RA has failed.");
9654 }
9655 
flagRegAlloc()9656 void GlobalRA::flagRegAlloc()
9657 {
9658     uint32_t flagSpillId = 0;
9659     unsigned maxRAIterations = 10;
9660     uint32_t iterationNo = 0;
9661     bool spillingFlag = false;
9662 
9663     while (iterationNo < maxRAIterations)
9664     {
9665         if (builder.getOption(vISA_RATrace))
9666         {
9667             std::cout << "--flag RA iteration " << iterationNo << "\n";
9668         }
9669 
9670         //
9671         // choose reg vars whose reg file kind is FLAG
9672         //
9673         LivenessAnalysis liveAnalysis(*this, G4_FLAG);
9674         liveAnalysis.computeLiveness();
9675 
9676         //
9677         // if no reg var needs to reg allocated, then skip reg allocation
9678         //
9679         if (liveAnalysis.getNumSelectedVar() > 0)
9680         {
9681             GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), false, false);
9682             unsigned spillRegSize = 0;
9683             unsigned indrSpillRegSize = 0;
9684             if (!coloring.regAlloc(false, false, false, spillRegSize, indrSpillRegSize, nullptr))
9685             {
9686                 SpillManager spillFlag(*this, coloring.getSpilledLiveRanges(), flagSpillId);
9687                 spillFlag.insertSpillCode();
9688 #ifdef DEBUG_VERBOSE_ON
9689                 printf("FLAG Spill inst count: %d\n", spillFlag.getNumFlagSpillStore());
9690                 printf("FLAG Fill inst count: %d\n", spillFlag.getNumFlagSpillLoad());
9691                 printf("*************************\n");
9692 #endif
9693                 flagSpillId = spillFlag.getNextTempDclId();
9694 
9695                 spillingFlag = true;
9696                 if (spillFlag.isAnyNewTempCreated() == false)
9697                 {
9698                     coloring.confirmRegisterAssignments();
9699 
9700                     if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9701                     {
9702                         coloring.addFlagSaveRestoreCode();
9703                     }
9704                     break;
9705                 }
9706                 builder.getJitInfo()->numFlagSpillStore = spillFlag.getNumFlagSpillStore();
9707                 builder.getJitInfo()->numFlagSpillLoad = spillFlag.getNumFlagSpillLoad();
9708             }
9709             else  // successfully allocate register without spilling
9710             {
9711                 coloring.confirmRegisterAssignments();
9712                 if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9713                 {
9714                     coloring.addFlagSaveRestoreCode();
9715                 }
9716 
9717                 if (spillingFlag && builder.getOption(vISA_FlagSpillCodeCleanup))
9718                 {
9719                     CLEAN_NUM_PROFILE clean_num_profile;
9720 
9721                     FlagSpillCleanup f(*this);
9722                     f.spillFillCodeCleanFlag(builder, kernel, &clean_num_profile);
9723 
9724 #ifdef DEBUG_VERBOSE_ON1
9725                     for (int i = 0; i < 3; i++)
9726                     {
9727                         printf("Profiler %d Spill clean: %d\n", i, clean_num_profile.spill_clean_num[i]);
9728                         printf("Profiler %d Fill clean: %d\n", i, clean_num_profile.fill_clean_num[i]);
9729                         clean_num += clean_num_profile.spill_clean_num[i];
9730                         clean_num += clean_num_profile.fill_clean_num[i];
9731                     }
9732                     printf("**Flag clean num: %d\n", clean_num);
9733 #endif
9734                 }
9735 
9736                 if (builder.getOption(vISA_OptReport))
9737                 {
9738                     detectUndefinedUses(liveAnalysis, kernel);
9739                 }
9740 
9741                 break; // done with FLAG allocation
9742             }
9743         }
9744         else {
9745             break; // no FLAG allocation needed
9746         }
9747         kernel.dumpToFile("after.Flag_RA." + std::to_string(iterationNo));
9748         iterationNo++;
9749     }
9750 
9751     MUST_BE_TRUE(iterationNo < maxRAIterations, "Flag RA has failed.");
9752 }
9753 
assignRegForAliasDcl()9754 void GlobalRA::assignRegForAliasDcl()
9755 {
9756     //
9757     // assign Reg for Alias DCL
9758     //
9759     for (G4_Declare *dcl : kernel.Declares)
9760     {
9761         G4_RegVar * AliasRegVar;
9762         G4_RegVar * CurrentRegVar;
9763         unsigned tempoffset;
9764 
9765         if (dcl->getAliasDeclare() != NULL)
9766         {
9767             AliasRegVar = dcl->getAliasDeclare()->getRegVar();
9768             CurrentRegVar = dcl->getRegVar();
9769             tempoffset = AliasRegVar->getPhyRegOff()*AliasRegVar->getDeclare()->getElemSize() + dcl->getAliasOffset();
9770             if (AliasRegVar->getPhyReg() != NULL)
9771             {
9772                 //
9773                 // alias register assignment for A0
9774                 //
9775                 if (CurrentRegVar->getDeclare()->useGRF())
9776                 {
9777                     // if the tempoffset is one grf
9778                     if (tempoffset < numEltPerGRF<Type_UW>() * 2u)
9779                     {
9780                         CurrentRegVar->setPhyReg(AliasRegVar->getPhyReg(), tempoffset / CurrentRegVar->getDeclare()->getElemSize());
9781                     }
9782                     // tempoffset covers several GRFs
9783                     else
9784                     {
9785                         unsigned addtionalrow = tempoffset / (numEltPerGRF<Type_UW>() * 2);
9786                         unsigned actualoffset = tempoffset % (numEltPerGRF<Type_UW>() * 2);
9787                         bool valid = false;
9788                         unsigned orignalrow = AliasRegVar->ExRegNum(valid);
9789                         MUST_BE_TRUE(valid == true, ERROR_REGALLOC);
9790                         CurrentRegVar->setPhyReg(regPool.getGreg(orignalrow + addtionalrow), actualoffset / CurrentRegVar->getDeclare()->getElemSize());
9791                     }
9792                 }
9793                 else if (CurrentRegVar->getDeclare()->getRegFile() == G4_ADDRESS)
9794                 {
9795                     MUST_BE_TRUE(tempoffset < getNumAddrRegisters() * 2,
9796                         ERROR_REGALLOC);    // Must hold tempoffset in one A0 reg
9797                     CurrentRegVar->setPhyReg(AliasRegVar->getPhyReg(), tempoffset / CurrentRegVar->getDeclare()->getElemSize());
9798                 }
9799                 else
9800                 {
9801                     MUST_BE_TRUE(false, ERROR_REGALLOC);
9802                 }
9803             }
9804             else {
9805                 // Propagate addr taken spill/fill to aliases
9806                 CurrentRegVar->getDeclare()->setAddrTakenSpillFill(AliasRegVar->getDeclare()->getAddrTakenSpillFill());
9807 
9808                 if (dcl->isSpilled() == false)
9809                     dcl->setSpillFlag();
9810             }
9811         }
9812     }
9813 
9814     return;
9815 }
9816 
removeSplitDecl()9817 void GlobalRA::removeSplitDecl()
9818 {
9819     for (auto dcl : kernel.Declares)
9820     {
9821         if (!getSubDclList(dcl).empty())
9822         {
9823             clearSubDcl(dcl);
9824             dcl->setIsSplittedDcl(false);
9825         }
9826     }
9827 
9828     kernel.Declares.erase(std::remove_if(kernel.Declares.begin(), kernel.Declares.end(),
9829         [](G4_Declare* dcl) { return dcl->getIsPartialDcl(); }), kernel.Declares.end());
9830 }
9831 
9832 // FIXME: doBankConflictReduction and highInternalConflict are computed by local RA
9833 //        they should be moved to some common code
hybridRA(bool doBankConflictReduction,bool highInternalConflict,LocalRA & lra)9834 bool GlobalRA::hybridRA(bool doBankConflictReduction, bool highInternalConflict, LocalRA& lra)
9835 {
9836     if (builder.getOption(vISA_RATrace))
9837     {
9838         std::cout << "--hybrid RA--\n";
9839     }
9840     uint32_t numOrigDcl = (uint32_t) kernel.Declares.size();
9841     insertPhyRegDecls();
9842 
9843     LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
9844     liveAnalysis.computeLiveness();
9845 
9846     if (liveAnalysis.getNumSelectedVar() > 0)
9847     {
9848         RPE rpe(*this, &liveAnalysis);
9849         rpe.run();
9850 
9851         bool spillLikely = kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
9852             rpe.getMaxRP() >= kernel.getNumRegTotal() - 16;
9853         if (spillLikely)
9854         {
9855             if (builder.getOption(vISA_RATrace))
9856             {
9857                 std::cout << "\t--skip hybrid RA due to high pressure: " << rpe.getMaxRP() << "\n";
9858             }
9859             kernel.Declares.resize(numOrigDcl);
9860             lra.undoLocalRAAssignments(false);
9861             return false;
9862         }
9863 
9864         GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), true, false);
9865 
9866         unsigned spillRegSize = 0;
9867         unsigned indrSpillRegSize = 0;
9868         bool isColoringGood =
9869             coloring.regAlloc(doBankConflictReduction, highInternalConflict, false, spillRegSize, indrSpillRegSize, &rpe);
9870         if (!isColoringGood)
9871         {
9872             if (!kernel.getOption(vISA_Debug))
9873             {
9874                 // Why?? Keep LRA results when -debug is passed
9875                 kernel.Declares.resize(numOrigDcl);
9876                 lra.undoLocalRAAssignments(false);
9877             }
9878             // Restore alignment in case LRA modified it
9879             copyAlignment();
9880             return false;
9881         }
9882         coloring.confirmRegisterAssignments();
9883 
9884         if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
9885         {
9886             coloring.getSaveRestoreRegister();
9887             addSaveRestoreCode(0);
9888         }
9889 
9890         if (verifyAugmentation)
9891         {
9892             assignRegForAliasDcl();
9893             computePhyReg();
9894             verifyAugmentation->verify();
9895         }
9896     }
9897 
9898     kernel.setRAType(doBankConflictReduction ? RA_Type::HYBRID_BC_RA : RA_Type::HYBRID_RA);
9899     return true;
9900 }
9901 
canDoHRA(G4_Kernel & kernel)9902 bool canDoHRA(G4_Kernel& kernel)
9903 {
9904     bool ret = true;
9905 
9906     if (kernel.getVarSplitPass()->splitOccured())
9907     {
9908         ret = false;
9909     }
9910 
9911     return ret;
9912 }
9913 
9914 //
9915 // graph coloring entry point.  returns nonzero if RA fails
9916 //
coloringRegAlloc()9917 int GlobalRA::coloringRegAlloc()
9918 {
9919     if (kernel.getOption(vISA_OptReport))
9920     {
9921         std::ofstream optreport;
9922         getOptReportStream(optreport, builder.getOptions());
9923         optreport << std::endl << "=== Register Allocation ===" << std::endl;
9924         if (builder.getIsKernel() == false)
9925         {
9926             optreport << "Function: " << kernel.getName() << std::endl;
9927         }
9928         else
9929         {
9930             optreport << "Kernel: " << kernel.getName() << std::endl;
9931         }
9932         closeOptReportStream(optreport);
9933 
9934         detectNeverDefinedUses();
9935     }
9936 
9937     bool hasStackCall = kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();
9938 
9939     // this needs to be called before addr/flag RA since it changes their alignment as well
9940     fixAlignment();
9941 
9942     {
9943         TIME_SCOPE(ADDR_FLAG_RA);
9944 
9945         addrRegAlloc();
9946 
9947         flagRegAlloc();
9948     }
9949 
9950     // LSC messages are used when:
9951     // a. Stack call is used on PVC+,
9952     // b. Spill size exceeds what can be represented using hword msg on PVC+
9953     if (builder.supportsLSC()) {
9954         useLscForSpillFill = true;
9955         useLscForNonStackCallSpillFill =
9956             builder.getOption(vISA_lscNonStackSpill) != 0;
9957     }
9958 
9959     if (builder.hasFusedEUWA() && !builder.getIsPayload())
9960     {
9961         if (G4_BB* entryBB = (*kernel.fg.begin()))
9962         {
9963             INST_LIST_ITER inst_it = entryBB->begin();
9964             const INST_LIST_ITER inst_ie = entryBB->end();
9965             while (inst_it != inst_ie && (*inst_it)->isLabel())
9966             {
9967                 inst_it++;
9968             }
9969             G4_INST* euWAInst = builder.createEUWASpill(false);
9970             entryBB->insertBefore(inst_it, euWAInst);
9971         }
9972     }
9973 
9974     //
9975     // If the graph has stack calls, then add the caller-save/callee-save pseudo
9976     // declares and code. This currently must be done after flag/addr RA due to
9977     // the assumption about the location of the pseudo save/restore instructions
9978     //
9979     if (hasStackCall)
9980     {
9981         addCallerSavePseudoCode();
9982 
9983         // Only GENX sub-graphs require callee-save code.
9984 
9985         if (builder.getIsKernel() == false)
9986         {
9987             addCalleeSavePseudoCode();
9988             addStoreRestoreToReturn();
9989         }
9990 
9991         // bind builtinR0 to the reserved stack call ABI GRF so that caller and
9992         // callee can agree on which GRF to use for r0
9993         builder.getBuiltinR0()->getRegVar()->setPhyReg(
9994             builder.phyregpool.getGreg(kernel.getThreadHeaderGRF()), 0);
9995     }
9996 
9997     if (kernel.getOption(vISA_SpillAnalysis))
9998     {
9999         spillAnalysis = std::make_unique<SpillAnalysis>();
10000     }
10001 
10002     if (!isReRAPass())
10003     {
10004         //Global linear scan RA
10005         if (builder.getOption(vISA_LinearScan))
10006         {
10007             copyMissingAlignment();
10008             BankConflictPass bc(*this, false);
10009             LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
10010             liveAnalysis.computeLiveness();
10011 
10012             TIME_SCOPE(LINEARSCAN_RA);
10013             LinearScanRA lra(bc, *this, liveAnalysis);
10014             int success = lra.doLinearScanRA();
10015             if (success == VISA_SUCCESS)
10016             {
10017                 // TODO: Get correct spillSize from LinearScanRA
10018                 unsigned spillSize = 0;
10019                 expandSpillFillIntrinsics(spillSize);
10020                 assignRegForAliasDcl();
10021                 computePhyReg();
10022                 if (builder.getOption(vISA_verifyLinearScan))
10023                 {
10024                     resetGlobalRAStates();
10025                     markGraphBlockLocalVars();
10026                     LivenessAnalysis live(*this, G4_GRF | G4_INPUT, false, true);
10027                     live.computeLiveness();
10028                     GraphColor coloring(live, kernel.getNumRegTotal(), false, false);
10029                     vISA::Mem_Manager mem(GRAPH_COLOR_MEM_SIZE);
10030                     coloring.createLiveRanges(0);
10031                     LiveRange** lrs = coloring.getLRs();
10032                     Interference intf(&live, lrs, live.getNumSelectedVar(), live.getNumSplitStartID(), live.getNumSplitVar(), *this);
10033                     intf.init(mem);
10034                     intf.computeInterference();
10035 
10036                     if(kernel.getOption(vISA_DumpRAIntfGraph))
10037                         intf.dumpInterference();
10038                     intf.linearScanVerify();
10039                 }
10040                 return VISA_SUCCESS;
10041             }
10042 
10043             if (success == VISA_SPILL)
10044             {
10045                 return VISA_SPILL;
10046             }
10047         }
10048         else if (builder.getOption(vISA_LocalRA) && !hasStackCall)
10049         {
10050             copyMissingAlignment();
10051             BankConflictPass bc(*this, false);
10052             LocalRA lra(bc, *this);
10053             bool success = lra.localRA();
10054             if (!success && !builder.getOption(vISA_HybridRAWithSpill))
10055             {
10056                 if (canDoHRA(kernel))
10057                 {
10058                     success = hybridRA(lra.doHybridBCR(), lra.hasHighInternalBC(), lra);
10059                 }
10060                 else
10061                 {
10062                     if (builder.getOption(vISA_RATrace))
10063                     {
10064                         std::cout << "\t--skip HRA due to var split. undo LRA results." << "\n";
10065                     }
10066                     lra.undoLocalRAAssignments(false);
10067                 }
10068             }
10069             if (success)
10070             {
10071                 // either local or hybrid RA succeeds
10072                 assignRegForAliasDcl();
10073                 computePhyReg();
10074                 return VISA_SUCCESS;
10075             }
10076             if (builder.getOption(vISA_HybridRAWithSpill))
10077             {
10078                 insertPhyRegDecls();
10079             }
10080         }
10081     }
10082 
10083     startTimer(TimerID::GRF_GLOBAL_RA);
10084     const unsigned maxRAIterations = 10;
10085     unsigned iterationNo = 0;
10086 
10087     int globalScratchOffset = kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
10088     bool useScratchMsgForSpill = !hasStackCall && (globalScratchOffset < (int)(SCRATCH_MSG_LIMIT * 0.6)
10089         // useScratchMsgForSpill is true for
10090         // * scratch msg
10091         // * LSC msg
10092         // Spill insertion module decides whether to expand a fill/spill to scratch or LSC
10093         // depending on spill offset. oword is supported for PVC but it is not emitted in
10094         // favor of LSC.
10095         || builder.supportsLSC());
10096     bool enableSpillSpaceCompression = builder.getOption(vISA_SpillSpaceCompression);
10097 
10098     uint32_t nextSpillOffset = 0;
10099     uint32_t scratchOffset = 0;
10100 
10101     if (kernel.fg.getIsStackCallFunc())
10102     {
10103         // Allocate space to store Frame Descriptor
10104         nextSpillOffset += 32;
10105         scratchOffset += 32;
10106     }
10107 
10108     uint32_t GRFSpillFillCount = 0;
10109     uint32_t sendAssociatedGRFSpillFillCount = 0;
10110     unsigned fastCompileIter = 1;
10111     bool fastCompile =
10112         (builder.getOption(vISA_FastCompileRA) || builder.getOption(vISA_HybridRAWithSpill)) &&
10113         !hasStackCall;
10114 
10115     if (fastCompile)
10116     {
10117         fastCompileIter = 0;
10118     }
10119 
10120     unsigned failSafeRAIteration = (builder.getOption(vISA_FastSpill) || fastCompile) ? fastCompileIter : FAIL_SAFE_RA_LIMIT;
10121     if (failSafeRAIteration == 0)
10122     {
10123         builder.getSpillFillHeader();
10124         builder.getOldA0Dot2Temp();
10125         if (builder.hasScratchSurface())
10126         {
10127             builder.initScratchSurfaceOffset();
10128         }
10129         //BuiltinR0 may be spilled which is not allowed.
10130         //FIXME: BuiltinR0 spill cost has been set to MAX already,
10131         //keep spilling means there is some issue in cost model
10132         builder.getBuiltinR0()->setLiveOut();
10133         builder.getBuiltinR0()->getRegVar()->setPhyReg(
10134             builder.phyregpool.getGreg(0), 0);
10135     }
10136     bool rematDone = false, alignedScalarSplitDone = false;
10137     bool reserveSpillReg = false;
10138     VarSplit splitPass(*this);
10139 
10140     while (iterationNo < maxRAIterations)
10141     {
10142         if (builder.getOption(vISA_RATrace))
10143         {
10144             std::cout << "--GRF RA iteration " << iterationNo << "--" << kernel.getName() << "\n";
10145         }
10146         setIterNo(iterationNo);
10147 
10148         if (!builder.getOption(vISA_HybridRAWithSpill))
10149         {
10150             resetGlobalRAStates();
10151         }
10152 
10153         if (builder.getOption(vISA_clearScratchWritesBeforeEOT) &&
10154             (globalScratchOffset + nextSpillOffset) > 0)
10155         {
10156             // we need to set r0 be live out for this WA
10157             builder.getBuiltinR0()->setLiveOut();
10158         }
10159 
10160         //Identify the local variables to speedup following analysis
10161         if (!builder.getOption(vISA_HybridRAWithSpill))
10162         {
10163             markGraphBlockLocalVars();
10164         }
10165 
10166         if (kernel.getOption(vISA_SpillAnalysis))
10167         {
10168             spillAnalysis->Clear();
10169         }
10170 
10171         //Do variable splitting in each iteration
10172         if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA))
10173         {
10174             if (builder.getOption(vISA_RATrace))
10175             {
10176                 std::cout << "\t--split local send--\n";
10177             }
10178             for (auto bb : kernel.fg)
10179             {
10180                 if (bb->isSendInBB())
10181                 {
10182                     splitPass.localSplit(builder, bb);
10183                 }
10184             }
10185         }
10186 
10187         bool doBankConflictReduction = false;
10188         bool highInternalConflict = false;  // this is set by setupBankConflictsForKernel
10189 
10190         if (builder.getOption(vISA_LocalBankConflictReduction) &&
10191             builder.hasBankCollision())
10192         {
10193             bool reduceBCInRR = false;
10194             bool reduceBCInTAandFF = false;
10195             BankConflictPass bc(*this, true);
10196 
10197             reduceBCInRR = bc.setupBankConflictsForKernel(true, reduceBCInTAandFF, SECOND_HALF_BANK_START_GRF * 2, highInternalConflict);
10198             doBankConflictReduction = reduceBCInRR && reduceBCInTAandFF;
10199         }
10200 
10201         bool allowAddrTaken = builder.getOption(vISA_FastSpill) || fastCompile ||
10202             !kernel.getHasAddrTaken();
10203         if (builder.getOption(vISA_FailSafeRA) &&
10204             kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
10205             !hasStackCall &&
10206             ((iterationNo == maxRAIterations - 1) ||
10207              (allowAddrTaken &&
10208               iterationNo == failSafeRAIteration)))
10209         {
10210             if (builder.getOption(vISA_RATrace))
10211             {
10212                 std::cout << "\t--enable failSafe RA\n";
10213             }
10214             reserveSpillReg = true;
10215         }
10216 
10217         LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
10218         liveAnalysis.computeLiveness();
10219         if (builder.getOption(vISA_dumpLiveness))
10220         {
10221             liveAnalysis.dump();
10222         }
10223 
10224 #ifdef DEBUG_VERBOSE_ON
10225         emitFGWithLiveness(liveAnalysis);
10226 #endif
10227         //
10228         // if no reg var needs to reg allocated, then skip reg allocation
10229         //
10230         if (liveAnalysis.getNumSelectedVar() > 0)
10231         {
10232             // force spill should be done only for the 1st iteration
10233             bool forceSpill = iterationNo > 0 ? false : builder.getOption(vISA_ForceSpills);
10234             RPE rpe(*this, &liveAnalysis);
10235             if (!fastCompile)
10236             {
10237                 rpe.run();
10238             }
10239             GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), false, forceSpill);
10240 
10241             if (builder.getOption(vISA_dumpRPE) && iterationNo == 0 && !rematDone)
10242             {
10243                 // dump pressure the first time we enter global RA
10244                 coloring.dumpRegisterPressure();
10245             }
10246 
10247             unsigned spillRegSize = 0;
10248             unsigned indrSpillRegSize = 0;
10249             bool isColoringGood =
10250                 coloring.regAlloc(doBankConflictReduction, highInternalConflict, reserveSpillReg, spillRegSize, indrSpillRegSize, &rpe);
10251             if (!isColoringGood)
10252             {
10253                 if (isReRAPass())
10254                 {
10255                     // Dont modify program if reRA pass spills
10256                     return VISA_SPILL;
10257                 }
10258 
10259                 bool runRemat = kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM
10260                     ? true :  kernel.getSimdSize() < numEltPerGRF<Type_UB>();
10261                 // -noremat takes precedence over -forceremat
10262                 bool rematOn = !kernel.getOption(vISA_Debug) &&
10263                     !kernel.getOption(vISA_NoRemat) &&
10264                     !kernel.getOption(vISA_FastSpill) &&
10265                     !fastCompile &&
10266                     (kernel.getOption(vISA_ForceRemat) || runRemat);
10267                 bool rerunGRA = false;
10268                 bool globalSplitChange = false;
10269 
10270                 if (!rematDone &&
10271                     rematOn)
10272                 {
10273                     if (builder.getOption(vISA_RATrace))
10274                     {
10275                         std::cout << "\t--rematerialize\n";
10276                     }
10277                     Rematerialization remat(kernel, liveAnalysis, coloring, rpe, *this);
10278                     remat.run();
10279                     rematDone = true;
10280 
10281                     // Re-run GRA loop only if remat caused changes to IR
10282                     rerunGRA |= remat.getChangesMade();
10283                 }
10284 
10285                 if (kernel.getOption(vISA_SplitGRFAlignedScalar) &&
10286                     !fastCompile &&
10287                     !kernel.getOption(vISA_FastSpill) &&
10288                     !alignedScalarSplitDone)
10289                 {
10290                     SplitAlignedScalars split(*this, coloring);
10291                     split.run();
10292                     alignedScalarSplitDone = true;
10293 
10294                     // Re-run GRA loop if changes were made to IR
10295                     rerunGRA |= split.getChangesMade();
10296                 }
10297 
10298                 //Calculate the spill caused by send to decide if global splitting is required or not
10299                 for (auto spilled : coloring.getSpilledLiveRanges())
10300                 {
10301                     auto spillDcl = spilled->getDcl();
10302                     if (spillDcl->getIsRefInSendDcl() && spillDcl->getNumRows() > 1)
10303                     {
10304                         sendAssociatedGRFSpillFillCount += spilled->getRefCount();
10305                     }
10306                 }
10307 
10308                 int instNum = 0;
10309                 for (auto bb : kernel.fg)
10310                 {
10311                     instNum += (int)bb->size();
10312                 }
10313 
10314                 if (iterationNo == 0 &&                             //Only works when first iteration of Global RA failed.
10315                     !splitPass.didGlobalSplit &&                      //Do only one time.
10316                     splitPass.canDoGlobalSplit(builder, kernel, sendAssociatedGRFSpillFillCount))
10317                 {
10318                     if (builder.getOption(vISA_RATrace))
10319                     {
10320                         std::cout << "\t--global send split\n";
10321                     }
10322                     splitPass.globalSplit(builder, kernel);
10323                     splitPass.didGlobalSplit = true;
10324                     globalSplitChange = true;
10325                 }
10326 
10327                 if (iterationNo == 0 &&
10328                     (rerunGRA || globalSplitChange || kernel.getOption(vISA_forceBCR)))
10329                 {
10330                     if (kernel.getOption(vISA_forceBCR))
10331                     {
10332                         kernel.getOptions()->setOption(vISA_forceBCR, false);
10333                     }
10334 
10335                     continue;
10336                 }
10337 
10338                 if (iterationNo == 0 && !fastCompile &&
10339                     kernel.getOption(vISA_DoSplitOnSpill))
10340                 {
10341                     LoopVarSplit loopSplit(kernel, &coloring, &rpe);
10342                     kernel.fg.getLoops().computePreheaders();
10343                     loopSplit.run();
10344                 }
10345 
10346                 //Calculate the spill caused by send to decide if global splitting is required or not
10347                 for (auto spilled : coloring.getSpilledLiveRanges())
10348                 {
10349                     GRFSpillFillCount += spilled->getRefCount();
10350                 }
10351 
10352                 if (builder.getOption(vISA_OptReport) && iterationNo == 0)
10353                 {
10354                     // Dump out interference graph information of spill candidates
10355                     reportSpillInfo(liveAnalysis, coloring);
10356                 }
10357 
10358                 // vISA_AbortOnSpillThreshold is defined as [0..200]
10359                 // where 0 means abort on any spill and 200 means never abort
10360                 auto underSpillThreshold = [this](int numSpill, int asmCount)
10361                 {
10362                     int threshold = std::min(builder.getOptions()->getuInt32Option(vISA_AbortOnSpillThreshold), 200u);
10363                     return (numSpill * 200) < (threshold * asmCount);
10364                 };
10365 
10366                 bool isUnderThreshold = underSpillThreshold(GRFSpillFillCount, instNum);
10367                 if (isUnderThreshold)
10368                 {
10369                     if (auto jitInfo = builder.getJitInfo())
10370                     {
10371                         jitInfo->avoidRetry = true;
10372                     }
10373                 }
10374 
10375                 if (builder.getOption(vISA_AbortOnSpill) && !isUnderThreshold)
10376                 {
10377                     // update jit metadata information
10378                     if (auto jitInfo = builder.getJitInfo())
10379                     {
10380                         jitInfo->isSpill = true;
10381                         jitInfo->spillMemUsed = 0;
10382                         jitInfo->numAsmCount = instNum;
10383                         jitInfo->numGRFSpillFill = GRFSpillFillCount;
10384                     }
10385 
10386                     // Early exit when -abortonspill is passed, instead of
10387                     // spending time inserting spill code and then aborting.
10388                     stopTimer(TimerID::GRF_GLOBAL_RA);
10389                     return VISA_SPILL;
10390                 }
10391 
10392                 if (iterationNo == 0 &&
10393                     enableSpillSpaceCompression &&
10394                     kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
10395                     !hasStackCall)
10396                 {
10397                     unsigned spillSize = 0;
10398                     const LIVERANGE_LIST& spilledLRs = coloring.getSpilledLiveRanges();
10399                     for (auto lr : spilledLRs)
10400                     {
10401                         spillSize += lr->getDcl()->getByteSize();
10402                     }
10403                     if ((int)(spillSize * 1.5) < (SCRATCH_MSG_LIMIT - globalScratchOffset))
10404                     {
10405                         enableSpillSpaceCompression = false;
10406                     }
10407                 }
10408 
10409                 startTimer(TimerID::SPILL);
10410                 SpillManagerGRF spillGRF(*this,
10411                     nextSpillOffset,
10412                     liveAnalysis.getNumSelectedVar(),
10413                     &liveAnalysis,
10414                     coloring.getLiveRanges(),
10415                     coloring.getIntf(),
10416                     &coloring.getSpilledLiveRanges(),
10417                     iterationNo++,
10418                     reserveSpillReg,
10419                     spillRegSize,
10420                     indrSpillRegSize,
10421                     enableSpillSpaceCompression,
10422                     useScratchMsgForSpill,
10423                     builder.avoidDstSrcOverlap());
10424 
10425                 if (kernel.getOption(vISA_SpillAnalysis))
10426                 {
10427                     spillAnalysis->Do(&liveAnalysis, &coloring, &spillGRF);
10428                 }
10429 
10430                 bool success = spillGRF.insertSpillFillCode(&kernel, pointsToAnalysis);
10431                 nextSpillOffset = spillGRF.getNextOffset();
10432 
10433                 if (builder.hasScratchSurface() && !hasStackCall &&
10434                     (nextSpillOffset + globalScratchOffset) > SCRATCH_MSG_LIMIT)
10435                 {
10436                     // create temp variable to store old a0.2 - this is marked as live-in and live-out.
10437                     // because the variable is emitted only post RA to preserve old value of a0.2.
10438                     kernel.fg.builder->getOldA0Dot2Temp();
10439                 } else if (useLscForNonStackCallSpillFill) {
10440                     kernel.fg.builder->getOldA0Dot2Temp();
10441                 }
10442 
10443                 if (builder.getOption(vISA_RATrace))
10444                 {
10445                     auto&& spills = coloring.getSpilledLiveRanges();
10446                     std::cout << "\t--# variables spilled: " << spills.size() << "\n";
10447                     if (spills.size() < 100)
10448                     {
10449                         std::cout << "\t--spilled variables: ";
10450                         for (auto&& lr : spills)
10451                         {
10452                             std::cout << lr->getDcl()->getName() << "  ";
10453                         }
10454                         std::cout << "\n";
10455                     }
10456                     std::cout << "\t--current spill size: " << nextSpillOffset << "\n";
10457                 }
10458 
10459                 if (!success)
10460                 {
10461                     iterationNo = maxRAIterations;
10462                     break;
10463                 }
10464 
10465                 kernel.dumpToFile("after.Spill_GRF." + std::to_string(iterationNo));
10466                 scratchOffset = std::max(scratchOffset, spillGRF.getNextScratchOffset());
10467 
10468                 bool disableSpillCoalecse = builder.getOption(vISA_DisableSpillCoalescing) ||
10469                     builder.getOption(vISA_FastSpill) || fastCompile || builder.getOption(vISA_Debug) ||
10470                     // spill cleanup is not support when we use oword msg for spill/fill for non-stack calls.
10471                     (!useScratchMsgForSpill && !hasStackCall);
10472 
10473                 if (!reserveSpillReg && !disableSpillCoalecse && builder.useSends())
10474                 {
10475                     CoalesceSpillFills c(kernel, liveAnalysis, coloring, spillGRF, iterationNo, rpe, *this);
10476                     c.run();
10477                 }
10478 
10479                 if (iterationNo == FAIL_SAFE_RA_LIMIT)
10480                 {
10481                     if (coloring.getSpilledLiveRanges().size() < 2)
10482                     {
10483                         // give regular RA one more try as we are close to success
10484                         failSafeRAIteration++;
10485                     }
10486                 }
10487                 stopTimer(TimerID::SPILL);
10488             }
10489             // RA successfully allocates regs
10490             if (isColoringGood == true || reserveSpillReg)
10491             {
10492                 coloring.confirmRegisterAssignments();
10493 
10494                 if (hasStackCall)
10495                 {
10496                     // spill/fill intrinsics expect offset in HWord, so round up to 64 byte but maintain it in OWord unit
10497                     // ToDo: we really need to change everything to byte for everyone's sanity..
10498                     unsigned localSpillAreaOwordSize = ROUND(scratchOffset, 64) / 16;
10499                     coloring.getSaveRestoreRegister();
10500                     addSaveRestoreCode(localSpillAreaOwordSize);
10501                 }
10502 
10503                 if (kernel.getOption(vISA_DumpRegChart))
10504                 {
10505                     assignRegForAliasDcl();
10506                     computePhyReg();
10507                     // invoke before expanding spill/fill since
10508                     // it modifies IR
10509                     regChart->dumpRegChart(std::cerr);
10510                 }
10511 
10512                 expandSpillFillIntrinsics(nextSpillOffset);
10513 
10514                 if (builder.getOption(vISA_OptReport))
10515                 {
10516                     detectUndefinedUses(liveAnalysis, kernel);
10517                 }
10518 
10519                 if (nextSpillOffset)
10520                 {
10521                     switch (kernel.getRAType())
10522                     {
10523                     case RA_Type::GRAPH_COLORING_RR_BC_RA:
10524                         kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_BC_RA);
10525                         break;
10526                     case RA_Type::GRAPH_COLORING_FF_BC_RA:
10527                         kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_BC_RA);
10528                         break;
10529                     case RA_Type::GRAPH_COLORING_RR_RA:
10530                         kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_RA);
10531                         break;
10532                     case RA_Type::GRAPH_COLORING_FF_RA:
10533                         kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_RA);
10534                         break;
10535                     default:
10536                         assert(0);
10537                         break;
10538                     }
10539                 }
10540 
10541                 if (verifyAugmentation)
10542                 {
10543                     assignRegForAliasDcl();
10544                     computePhyReg();
10545                     verifyAugmentation->verify();
10546                 }
10547                 break; // done
10548             }
10549         }
10550         else
10551         {
10552             break;
10553         }
10554     }
10555     assignRegForAliasDcl();
10556     computePhyReg();
10557 
10558     stopTimer(TimerID::GRF_GLOBAL_RA);
10559     //
10560     // Report failure to allocate due to excessive register pressure.
10561     //
10562     if (!reserveSpillReg && (iterationNo == maxRAIterations))
10563     {
10564         std::stringstream spilledVars;
10565         for (auto dcl : kernel.Declares)
10566         {
10567             if (dcl->isSpilled() && dcl->getRegFile() == G4_GRF)
10568             {
10569                 spilledVars << dcl->getName() << "\t";
10570             }
10571         }
10572 
10573         MUST_BE_TRUE(false,
10574             "ERROR: " << kernel.getNumRegTotal() - builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum)
10575             << " GRF registers are NOT enough to compile kernel " << kernel.getName() << "!"
10576             << " The maximum register pressure in the kernel is higher"
10577             << " than the available physical registers in hardware (even"
10578             << " with spill code)."
10579             << " Please consider rewriting the kernel."
10580             << " Compiling with the symbolic register option and inspecting the"
10581             << " spilled registers may help in determining the region of high pressure.\n"
10582             << "The spilling virtual registers are as follows: "
10583             << spilledVars.str());
10584 
10585         return VISA_SPILL;
10586     }
10587 
10588     // this includes vISA's scratch space use only and does not include whatever IGC may use for private memory
10589     uint32_t spillMemUsed = ROUND(nextSpillOffset, numEltPerGRF<Type_UB>());
10590 
10591     if (spillMemUsed &&
10592         !(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()))
10593     {
10594         builder.criticalMsgStream() << "Spill memory used = " << spillMemUsed << " bytes for kernel " <<
10595             kernel.getName() << "\n Compiling kernel with spill code may degrade performance." <<
10596             " Please consider rewriting the kernel to use less registers.\n";
10597     }
10598 
10599     // update jit metadata information for spill
10600     if (auto jitInfo = builder.getJitInfo())
10601     {
10602         jitInfo->isSpill = spillMemUsed > 0;
10603         jitInfo->hasStackcalls = kernel.fg.getHasStackCalls();
10604 
10605         if (jitInfo->hasStackcalls && builder.getIsKernel()) {
10606             // jitInfo->spillMemUsed is the entire visa stack size. Consider the caller/callee
10607             // save size if having caller/callee save
10608             // globalScratchOffset in unit of byte, others in Oword
10609             //
10610             //                               vISA stack
10611             //  globalScratchOffset     -> ---------------------
10612             //  FIXME: should be 0-based   |  spill            |
10613             //                             |                   |
10614             //  calleeSaveAreaOffset    -> ---------------------
10615             //                             |  callee save      |
10616             //  callerSaveAreaOffset    -> ---------------------
10617             //                             |  caller save      |
10618             //  paramOverflowAreaOffset -> ---------------------
10619 
10620             // Since it is difficult to predict amount of space needed to store stack, we
10621             // reserve 64k. Reserving PTSS is ideal, but it can lead to OOM on machines
10622             // with large number of threads.
10623             unsigned int scratchAllocation = 1024 * kernel.getOptions()->getuInt32Option(vISA_ScratchAllocForStackInKB);
10624             jitInfo->spillMemUsed = scratchAllocation;
10625             jitInfo->isSpill = true;
10626 
10627             // reserve spillMemUsed #bytes at upper end
10628             kernel.getGTPinData()->setScratchNextFree(scratchAllocation - kernel.getGTPinData()->getNumBytesScratchUse());
10629         }
10630         else {
10631             // stack call functions shouldnt report any scratch usage as it is
10632             // kernel's responsibility to account for stack usage of entire call
10633             // tree.
10634             if (!kernel.fg.getIsStackCallFunc())
10635             {
10636                 jitInfo->spillMemUsed = spillMemUsed;
10637                 kernel.getGTPinData()->setScratchNextFree(spillMemUsed);
10638             }
10639         }
10640         jitInfo->numGRFSpillFill = GRFSpillFillCount;
10641     }
10642 
10643     if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA))
10644     {
10645         removeSplitDecl();
10646     }
10647 
10648     return VISA_SUCCESS;
10649 }
10650 
10651 /********************************************************************************************************************************************/
10652 /********************************************************Spill Code Clean up ****************************************************************/
10653 /********************************************************************************************************************************************/
10654 
// Helper macros for spill-code cleanup.
// Scratch memory ranges are inclusive byte offsets [leftOff, rightOff];
// GRF/flag ranges are inclusive linearized byte addresses
// [linearizedStart, linearizedEnd].

// True when the scratch ranges of accesses x and y overlap anywhere.
#define SPILL_MEMORY_OVERLAP(x, y) \
    (!(x->leftOff > y->rightOff || y->leftOff > x->rightOff))

// True when overwrite_memory's scratch range fully covers target_memory's.
#define SPILL_MEMORY_OVERWRITE(target_memory, overwrite_memory) \
    (target_memory->leftOff >= overwrite_memory->leftOff && overwrite_memory->rightOff >= target_memory->rightOff)

// True for a mov copying between a flag register and a GRF, in either
// direction (both dst and src0 must exist and have top declares).
#define IS_FLAG_MOVE(inst)  (\
    inst->opcode() == G4_mov &&  \
    (inst->getDst() && inst->getSrc(0)) && \
    (inst->getDst()->getTopDcl() && inst->getSrc(0)->getTopDcl()) && \
    ((inst->getDst()->getTopDcl()->getRegFile() == G4_FLAG && inst->getSrc(0)->getTopDcl()->getRegFile() == G4_GRF) || \
    (inst->getDst()->getTopDcl()->getRegFile() == G4_GRF && inst->getSrc(0)->getTopDcl()->getRegFile() == G4_FLAG)))

// Candidate for removal: a spill whose fill is not marked in use.
#define IS_SPILL_KILL_CANDIDATE(preScratchAccess) \
    (preScratchAccess->isSpill && !preScratchAccess->fillInUse) \

// Candidate for reuse: neither the register nor the scratch slot of the
// previous access has been (partially) overwritten since.
#define IS_USE_KILL_CANDIDATE(preScratchAccess) \
    (!(preScratchAccess->regKilled || preScratchAccess->regPartialKilled || preScratchAccess->scratchDefined)) \

// [s1, e1] overlaps access sa's GRF range.
#define IS_GRF_RANGE_OVERLAP(s1, e1, sa) \
    (e1 >= sa->linearizedStart && sa->linearizedEnd >= s1)

// [s1, e1] overlaps access sa's scratch range.
#define IS_SCRATCH_RANGE_OVERLAP(s1, e1, sa) \
    (!(e1 < sa->leftOff || sa->rightOff < s1))

// Two scratch ranges can be merged when they overlap or are adjacent
// (gap of at most 1 byte in either direction).
#define IS_MERGEABLE_SCRATCH_RANGES(r1, r2) \
    (!(((int)r1.leftOff - (int)r2.rightOff)> 1 || ((int)r2.leftOff - (int)r1.rightOff) > 1))

// Same adjacency/overlap test for two GRF ranges.
#define IS_MERGEABLE_GRF_RANGES(r1, r2) \
    (!(((int)r1.linearizedStart - (int)r2.linearizedEnd) > 1 || ((int)r2.linearizedStart - (int)r1.linearizedEnd) > 1))

// [s1, e1] fully covers access sa's GRF range.
#define IS_GRF_RANGE_OVERWRITE(sa, s1, e1) \
    (s1 <= sa->linearizedStart && sa->linearizedEnd <= e1)

// [s1, e1] fully covers access sa's scratch range.
#define IS_SCRATCH_RANGE_OVERWRITE(sa, s1, e1) \
    (s1 <= sa->leftOff && sa->rightOff <= e1)

// [s1, e1] overlaps access sa's flag range.
#define IS_FLAG_RANGE_OVERLAP(s1, e1, sa) \
    (!(e1 < sa->linearizedStart || sa->linearizedEnd < s1))

// [s, e] fully covers access t's flag range.
#define IS_FLAG_RANGE_OVERWRITE(t, s, e) \
    ((s <= t->linearizedStart && t->linearizedEnd <= e))
10697 
FlagLineraizedStartAndEnd(G4_Declare * topdcl,unsigned & linearizedStart,unsigned & linearizedEnd)10698 void  FlagSpillCleanup::FlagLineraizedStartAndEnd(G4_Declare*  topdcl,
10699     unsigned& linearizedStart,
10700     unsigned& linearizedEnd)
10701 {
10702     const G4_Areg* areg = topdcl->getRegVar()->getPhyReg()->asAreg();
10703     linearizedStart = areg->getFlagNum() * 4;
10704     linearizedStart += topdcl->getRegVar()->getPhyRegOff() * topdcl->getElemSize();
10705     linearizedEnd = linearizedStart + topdcl->getByteSize();
10706     return;
10707 }
10708 
10709 /*
10710  * Reuse previous register
10711  */
replaceWithPreDcl(IR_Builder & builder,SCRATCH_ACCESS * scratchAccess,SCRATCH_ACCESS * preScratchAccess)10712 bool FlagSpillCleanup::replaceWithPreDcl(
10713     IR_Builder&     builder,
10714     SCRATCH_ACCESS* scratchAccess,
10715     SCRATCH_ACCESS* preScratchAccess)
10716 {
10717     int preRegOff = 0;
10718     int payloadHeaderSize = 0;
10719     G4_Operand *reuseOpnd = NULL;
10720     G4_INST *preInst = *preScratchAccess->inst_it;
10721 
10722     //Get reuse operand
10723     if (preScratchAccess->isSpill)
10724     {
10725         reuseOpnd = preInst->getSrc(0);
10726         preRegOff = reuseOpnd->asSrcRegRegion()->getSubRegOff();
10727         reuseOpnd = preInst->getSrc(0);
10728     }
10729     else
10730     {
10731         reuseOpnd = preInst->getDst();
10732         preRegOff = reuseOpnd->asDstRegRegion()->getSubRegOff();//For flag register, only subRegOff
10733     }
10734     G4_Declare *dcl = reuseOpnd->getBase()->asRegVar()->getDeclare();
10735 
10736     if (builder.WaDisableSendSrcDstOverlap())
10737     {
10738         for (auto &renameOpnd : scratchAccess->renameOperandVec)
10739         {
10740             if (renameOpnd.second < -1) //Flag
10741             {
10742                 break;
10743             }
10744 
10745             G4_INST *inst = renameOpnd.first;
10746 
10747             if (renameOpnd.second >= 0)
10748             {
10749                 if (inst->isSend() && !inst->getDst()->isNullReg())
10750                 {
10751                     G4_DstRegRegion* dst = inst->getDst();
10752                     bool noOverlap = dst->getLinearizedEnd() < preScratchAccess->linearizedStart ||
10753                         preScratchAccess->linearizedEnd < dst->getLinearizedStart();
10754                     if (!noOverlap)
10755                     {
10756                         return false;
10757                     }
10758                 }
10759             }
10760         }
10761     }
10762 
10763     //Replace the declare for all operands assciated with this scratch fill.
10764     for (auto &renameOpnd : scratchAccess->renameOperandVec)
10765     {
10766         G4_INST *inst = renameOpnd.first;
10767 
10768         if (renameOpnd.second == -3) //Flag modifier
10769         {
10770             G4_CondMod* mod = inst->getCondMod();
10771             int regOff = preRegOff;
10772             G4_CondMod* mod_Opnd = builder.createCondMod(mod->getMod(),
10773                 dcl->getRegVar(),
10774                 (unsigned short)regOff);
10775 
10776             inst->setCondMod(mod_Opnd);
10777 
10778         }
10779         else if (renameOpnd.second == -2) //Flag predicate
10780         {
10781             G4_Predicate* predicate = inst->getPredicate();
10782             int regOff = preRegOff;
10783             G4_Predicate * pred_Opnd = builder.createPredicate(predicate->getState(),
10784                 dcl->getRegVar(),
10785                 (unsigned short)regOff,
10786                 predicate->getControl());
10787 
10788             inst->setPredicate(pred_Opnd);
10789         }
10790         else if (renameOpnd.second == -1)  //GRF dst
10791         {
10792             G4_DstRegRegion *orgDstRegion = inst->getDst();
10793             int regOff = preRegOff + (scratchAccess->leftOff - preScratchAccess->leftOff) / numEltPerGRF<Type_UB>() + payloadHeaderSize / numEltPerGRF<Type_UB>();
10794             G4_DstRegRegion * dstOpnd = builder.createDst(
10795                 dcl->getRegVar(),
10796                 (short)regOff,
10797                 orgDstRegion->getSubRegOff(),
10798                 orgDstRegion->getHorzStride(), orgDstRegion->getType());
10799             inst->setDest(dstOpnd);
10800         }
10801         else //GRF src
10802         {
10803             G4_Operand *opnd = inst->getSrc(renameOpnd.second);
10804             G4_SrcRegRegion *orgSrcRegion = opnd->asSrcRegRegion();
10805 
10806             int regOff = preRegOff + (scratchAccess->leftOff - preScratchAccess->leftOff) / numEltPerGRF<Type_UB>() + payloadHeaderSize / numEltPerGRF<Type_UB>();
10807             G4_Operand * srcOpnd = builder.createSrcRegRegion(orgSrcRegion->getModifier(),
10808                 orgSrcRegion->getRegAccess(),
10809                 dcl->getRegVar(),
10810                 (short)regOff,
10811                 orgSrcRegion->getSubRegOff(),
10812                 orgSrcRegion->getRegion(),
10813                 orgSrcRegion->getType());
10814 
10815             inst->setSrc(srcOpnd, renameOpnd.second);
10816         }
10817     }
10818 
10819     return true;
10820 }
10821 
10822 /*
10823  *  1) The reuse target register in pre scratch access may be partial killed,
10824  *  2) and the corresponding scracth memory range is overlap with the memory of current scratch access.
10825  *  In both cases, the current fill can not be removed
10826  */
scratchKilledByPartial(SCRATCH_ACCESS * scratchAccess,SCRATCH_ACCESS * preScratchAccess)10827 bool FlagSpillCleanup::scratchKilledByPartial(
10828     SCRATCH_ACCESS* scratchAccess,
10829     SCRATCH_ACCESS* preScratchAccess)
10830 {
10831     bool killed = false;
10832 
10833     for (auto &range : preScratchAccess->killedScratchRange)
10834     {
10835         if (!(scratchAccess->leftOff > range.rightOff ||
10836             range.leftOff > scratchAccess->rightOff))
10837         {
10838             killed = true;
10839         }
10840     }
10841 
10842     for (auto &range : preScratchAccess->killedRegRange)
10843     {
10844         //Map the register kill to scratch kill
10845         unsigned leftOff = preScratchAccess->leftOff + (range.linearizedStart - preScratchAccess->linearizedStart);
10846         unsigned rightOff = preScratchAccess->leftOff + (range.linearizedEnd - preScratchAccess->linearizedStart);
10847 
10848         if (!(scratchAccess->leftOff > rightOff ||
10849             leftOff > scratchAccess->rightOff))
10850         {
10851             killed = true;
10852         }
10853     }
10854 
10855     return killed;
10856 }
10857 
10858 /*
10859  *  Record all killed GRF ranges.
10860  *  do merging of ranges when possible.
10861  */
addKilledGRFRanges(unsigned linearizedStart,unsigned linearizedEnd,SCRATCH_ACCESS * scratchAccess,G4_Predicate * predicate)10862 bool FlagSpillCleanup::addKilledGRFRanges(
10863     unsigned    linearizedStart,
10864     unsigned    linearizedEnd,
10865     SCRATCH_ACCESS* scratchAccess,
10866     G4_Predicate*   predicate)
10867 {
10868     REG_RANGE range;
10869     range.linearizedStart = std::max(scratchAccess->linearizedStart, linearizedStart);
10870     range.linearizedEnd = std::min(scratchAccess->linearizedEnd, linearizedEnd);
10871     range.predicate = predicate ? true : false;
10872 
10873     if (scratchAccess->killedRegRange.size() == 0)
10874     {
10875         scratchAccess->killedRegRange.push_back(range);
10876     }
10877     else
10878     {
10879         bool merged = false;
10880         REG_RANGE_VEC_ITER range_iter = scratchAccess->killedRegRange.begin();
10881         REG_RANGE_VEC_ITER range_iter_next;
10882         REG_RANGE *merged_range = NULL;
10883         while (range_iter != scratchAccess->killedRegRange.end())
10884         {
10885             REG_RANGE &killedRange = *(range_iter);
10886             range_iter_next = range_iter;
10887             range_iter_next++;
10888 
10889             if (killedRange.predicate) //With predicate, the range can not be merged with others
10890             {
10891                 range_iter = range_iter_next;
10892                 continue;
10893             }
10894 
10895             if (!merged && IS_MERGEABLE_GRF_RANGES(range, killedRange))
10896             {
10897                 killedRange.linearizedStart = std::min(killedRange.linearizedStart, range.linearizedStart);
10898                 killedRange.linearizedEnd = std::max(killedRange.linearizedEnd, range.linearizedEnd);
10899                 merged = true;
10900                 merged_range = &killedRange;
10901             }
10902             else if (merged)
10903             {
10904                 if (IS_MERGEABLE_GRF_RANGES((*merged_range), killedRange))
10905                 {
10906                     merged_range->linearizedStart = std::min(killedRange.linearizedStart, merged_range->linearizedStart);
10907                     merged_range->linearizedEnd = std::max(killedRange.linearizedEnd, merged_range->linearizedEnd);
10908                 }
10909             }
10910             if (IS_GRF_RANGE_OVERWRITE(scratchAccess, killedRange.linearizedStart, killedRange.linearizedEnd))
10911             {
10912                 scratchAccess->regKilled = true;
10913                 return true;
10914             }
10915             range_iter = range_iter_next;
10916         }
10917         if (!merged)
10918         {
10919             scratchAccess->killedRegRange.push_back(range);
10920         }
10921     }
10922 
10923     return false;
10924 }
10925 
10926 /*
10927  * Check if the register in previous scratch access is fully killed by current register define
10928  */
regFullyKilled(SCRATCH_ACCESS * scratchAccess,unsigned linearizedStart,unsigned linearizedEnd,unsigned short maskFlag)10929 bool FlagSpillCleanup::regFullyKilled(
10930     SCRATCH_ACCESS* scratchAccess,
10931     unsigned        linearizedStart,
10932     unsigned        linearizedEnd,
10933     unsigned short      maskFlag)
10934 {
10935 
10936     if (IS_FLAG_RANGE_OVERWRITE(scratchAccess, linearizedStart, linearizedEnd))
10937     {
10938         if (maskFlag & InstOpt_WriteEnable)  // No mask == all range killed
10939         {
10940             return true;
10941         }
10942 
10943         if (linearizedStart == scratchAccess->linearizedStart &&
10944             linearizedEnd == scratchAccess->linearizedEnd &&
10945             scratchAccess->maskFlag == maskFlag)
10946         {
10947             return true;
10948         }
10949     }
10950 
10951     return false;
10952 }
10953 
/*
 *  Check whether only part of the scratch register is killed, with the killed
 *  region fully contained in the access range (no overlap beyond it).
 *  This determines whether, if the associated fill is removed, the defined
 *  register can be replaced with the reuse register.
 */
inRangePartialKilled(SCRATCH_ACCESS * scratchAccess,unsigned linearizedStart,unsigned linearizedEnd,unsigned short maskFlag)10958 bool FlagSpillCleanup::inRangePartialKilled(
10959     SCRATCH_ACCESS* scratchAccess,
10960     unsigned    linearizedStart,
10961     unsigned    linearizedEnd,
10962     unsigned short  maskFlag)
10963 {
10964     if ((scratchAccess->linearizedStart <= linearizedStart &&
10965         scratchAccess->linearizedEnd >= linearizedEnd))
10966     {
10967         if (maskFlag & InstOpt_WriteEnable)
10968         {
10969             return true;
10970         }
10971 
10972         if (scratchAccess->linearizedStart == linearizedStart &&
10973             scratchAccess->linearizedEnd == linearizedEnd &&
10974             scratchAccess->maskFlag == maskFlag)
10975         {
10976             return true;
10977         }
10978     }
10979 
10980     return false;
10981 }
10982 
10983 /*
10984  * Register kill analysis
10985  */
regDefineAnalysis(SCRATCH_ACCESS * scratchAccess,unsigned linearizedStart,unsigned linearizedEnd,unsigned short maskFlag,G4_Predicate * predicate)10986 bool FlagSpillCleanup::regDefineAnalysis(
10987     SCRATCH_ACCESS* scratchAccess,
10988     unsigned       linearizedStart,
10989     unsigned       linearizedEnd,
10990     unsigned short     maskFlag,
10991     G4_Predicate*      predicate)
10992 {
10993     if (regFullyKilled(scratchAccess, linearizedStart, linearizedEnd, maskFlag))
10994     {
10995         return true;
10996     }
10997     else if (!scratchAccess->regKilled)
10998     {
10999         // Handle partial overlap
11000         // What about the mask?
11001         if (addKilledGRFRanges(linearizedStart, linearizedEnd, scratchAccess, predicate))
11002         {
11003             //The register range is killed by accumulated partial range kills
11004             return true;
11005         }
11006         scratchAccess->regPartialKilled = true;
11007     }
11008 
11009     return false;
11010 }
11011 
// Process a flag-register define: walk every tracked scratch access, decide
// whether this define (fully or partially) kills the previously filled or
// spilled flag register, and record whether a fill of the same declare can
// still be removed and its uses renamed.
void FlagSpillCleanup::regDefineFlag(
    SCRATCH_PTR_LIST* scratchTraceList,
    G4_INST*          inst,
    G4_Operand*       opnd)
{
    //Get the linearized address in GRF register file
    unsigned linearizedStart = 0;
    unsigned linearizedEnd = 0;
    G4_Predicate* predicate = inst->getPredicate();
    G4_Declare*  topdcl = opnd->getTopDcl();

    FlagLineraizedStartAndEnd(opnd->getTopDcl(), linearizedStart, linearizedEnd);

    //Impact on previous scratch access
    SCRATCH_PTR_LIST_ITER it = scratchTraceList->begin();
    SCRATCH_PTR_LIST_ITER itEnd = scratchTraceList->end();

    // If this define is the most recently recorded scratch access itself
    // (the spill/fill move being analyzed), exclude that last entry from the
    // walk: def->use cannot happen within a single instruction.
    if (it != itEnd &&
        inst == *(scratchTraceList->back()->inst_it))
    {
        itEnd--;
    }

    while (it != itEnd)
    {
        // Save the successor first; `it` may be erased from the list below.
        SCRATCH_PTR_LIST_ITER kt = it;
        kt++;

        SCRATCH_ACCESS * scratchAccess = *it;

        //Not instruction itself, def->use can not happen in single instruction.
        if (scratchAccess->regKilled)
        {
            it = kt;
            continue;
        }

        // Checked if the registers used in the previous scratch accesses (both spill and fill) are killed (redefined).
        if (linearizedEnd &&
            IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, scratchAccess))
        {
            //E mask
            unsigned maskFlag = (inst->getOption() & 0xFFF010C);

            if (regDefineAnalysis(scratchAccess, linearizedStart, linearizedEnd, (unsigned short)maskFlag, predicate))
            {
                //Fully killed
                scratchAccess->regKilled = true;
                if (scratchAccess->evicted)  //Not in use
                {
                    scratchTraceList->erase(it); //The previous one is not candidate for future use
                }
            }

            // For prefill and associated define and spill instructions
            // 1. Same dcl is used
            // 2. If the prefill register is fulled killed,
            //     a. The prefill instruction can be removed.
            //     b. But the define and instruction's registers are kept and will not reuse previous one.
            // 3. If the prefill register is partial killed, and the killed register region is part of prefill region.
            //     a. The prefill instruction can be removed.
            //     b. and the register in define and spill instruction can reuse previous one.
            // 4. Otherwise, the (pre)fill instruction can not be removed, and no reuse will happen.
            // 5. For pure fill, it's no killed by same declare
            G4_Declare *preDcl = scratchAccess->flagOpnd->getTopDcl();

            if (topdcl == preDcl)
            {
                if (inRangePartialKilled(scratchAccess, linearizedStart, linearizedEnd, (unsigned short)maskFlag))
                {
                    // Record this define for later renaming; -1 marks the dst
                    // operand (source uses record their src index instead).
                    scratchAccess->renameOperandVec.emplace_back(inst, -1);
                    scratchAccess->inRangePartialKilled = true;
                }
                else
                {
                    scratchAccess->removeable = false;
                }
            }
        }

        it = kt;
    }
}
11095 
/*
 *  Analyze the use of a register to determine whether the scratchAccess
 *  can be removed.
 */
// Returns true when this use of the register range prevents removing the fill
// associated with scratchAccess; false when the fill remains a removal/reuse
// candidate. As a side effect, compacts the prePreScratchAccess back-trace
// chain so later queries stop earlier.
bool FlagSpillCleanup::regUseAnalysis(
    SCRATCH_ACCESS* scratchAccess,
    unsigned    linearizedStart,
    unsigned    linearizedEnd)
{
    //GRF in previous fill is used as part of current reg,
    //In this case, the fill can not be removed since the reuse can not happen.
    //Caller gauranteed the overlap of the registers
    if (linearizedEnd > scratchAccess->linearizedEnd ||
        linearizedStart < scratchAccess->linearizedStart)
    {
        return true;
    }

    //Can not be removed when the previous scratch access is killed or partial killed
    //before the use of current scratch access register
    SCRATCH_ACCESS * preScratchAccess = scratchAccess->preScratchAccess;
    if (preScratchAccess &&
        (preScratchAccess->regKilled ||
         scratchKilledByPartial(scratchAccess, preScratchAccess)))
    {
        return true;
    }

    //Back trace to update the reachable scratch accesses
    // Walk the preScratchAccess chain from scratchAccess backward; the first
    // killed (or partially killed) access found becomes the new
    // prePreScratchAccess boundary.
    if (scratchAccess->prePreScratchAccess)
    {
        SCRATCH_ACCESS * prePreScratchAccess = preScratchAccess;
        preScratchAccess = scratchAccess;

        do {
            if ((prePreScratchAccess->regKilled ||
                scratchKilledByPartial(scratchAccess, prePreScratchAccess)))
            {
                scratchAccess->prePreScratchAccess = preScratchAccess;
                break;
            }
            // Step one link further back along the chain.
            preScratchAccess = prePreScratchAccess;
            prePreScratchAccess = preScratchAccess->preScratchAccess;
        } while (prePreScratchAccess && preScratchAccess != scratchAccess->prePreScratchAccess);
    }

    return false;
}
11145 
regUseFlag(SCRATCH_PTR_LIST * scratchTraceList,G4_INST * inst,G4_Operand * opnd,int opndIndex)11146 void FlagSpillCleanup::regUseFlag(
11147     SCRATCH_PTR_LIST*  scratchTraceList,
11148     G4_INST*           inst,
11149     G4_Operand*        opnd,
11150     int                opndIndex)
11151 {
11152     //Get the linearized address in GRF register file
11153     unsigned linearizedStart = 0;
11154     unsigned linearizedEnd = 0;
11155     G4_Declare *topdcl = NULL;
11156 
11157     topdcl = opnd->getTopDcl();
11158     FlagLineraizedStartAndEnd(opnd->getTopDcl(), linearizedStart, linearizedEnd);
11159 
11160     //Impact on previous scratch access
11161     for (SCRATCH_ACCESS * scratchAccess : *scratchTraceList)
11162     {
11163         if (linearizedEnd &&
11164             IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, scratchAccess))
11165         {
11166             //Not handle indirect GRF
11167             if (inst->isEOT() ||
11168                 inst->isPseudoUse())
11169             {
11170                 scratchAccess->removeable = false;
11171                 continue;
11172             }
11173 
11174             if (scratchAccess->flagOpnd->getTopDcl() == topdcl)  //Same declare
11175             {
11176                 if (regUseAnalysis(scratchAccess, linearizedStart, linearizedEnd))
11177                 {
11178                     //The filled register is in use
11179                     scratchAccess->removeable = false;
11180                 }
11181                 else if (scratchAccess->inRangePartialKilled || !scratchAccess->regKilled)
11182                 {
11183                     //can reuse previous register
11184                     scratchAccess->renameOperandVec.emplace_back(inst, opndIndex);
11185                 }
11186             }
11187         }
11188     }
11189 }
11190 
regUseScratch(SCRATCH_PTR_LIST * scratchTraceList,G4_INST * inst,G4_Operand * opnd,Gen4_Operand_Number opndNum)11191 void FlagSpillCleanup::regUseScratch(
11192     SCRATCH_PTR_LIST*  scratchTraceList,
11193     G4_INST*           inst,
11194     G4_Operand*        opnd,
11195     Gen4_Operand_Number opndNum)
11196 {
11197     const G4_Declare *topdcl = opnd->getTopDcl();
11198 
11199     //Impact on previous scratch access
11200     for (SCRATCH_ACCESS *scratchAccess : *scratchTraceList)
11201     {
11202         if (topdcl == scratchAccess->scratchDcl)
11203         {
11204             if (opndNum == Opnd_dst)
11205             {
11206                 scratchAccess->scratchDefined = true;
11207             }
11208             else
11209             {
11210                 scratchAccess->removeable = false;
11211             }
11212         }
11213     }
11214 }
11215 
initializeScratchAccess(SCRATCH_ACCESS * scratchAccess,INST_LIST_ITER inst_it)11216 void FlagSpillCleanup::initializeScratchAccess(
11217     SCRATCH_ACCESS *scratchAccess, INST_LIST_ITER inst_it)
11218 {
11219 #ifdef _DEBUG
11220     scratchAccess->regNum = -1;
11221 #endif
11222     scratchAccess->scratchDcl = NULL;
11223     scratchAccess->flagOpnd = NULL;
11224 
11225     scratchAccess->linearizedStart = 0;
11226     scratchAccess->linearizedEnd = 0;
11227     scratchAccess->leftOff = 0;
11228     scratchAccess->rightOff = 0;
11229     scratchAccess->useCount = 0;
11230 
11231     scratchAccess->isSpill = false;
11232     scratchAccess->isBlockLocal = false;
11233     scratchAccess->directKill = false;
11234 
11235     scratchAccess->regKilled = false;
11236     scratchAccess->regPartialKilled = false;
11237     scratchAccess->regOverKilled = false;
11238     scratchAccess->inRangePartialKilled = false;
11239     scratchAccess->regInUse = false;
11240 
11241     scratchAccess->fillInUse = false;
11242     scratchAccess->removeable = true;
11243     scratchAccess->instKilled = false;
11244     scratchAccess->evicted = false;
11245     scratchAccess->scratchDefined = false;
11246 
11247     scratchAccess->preScratchAccess = NULL;
11248     scratchAccess->prePreScratchAccess = NULL;
11249     scratchAccess->preFillAccess = NULL;
11250 
11251     scratchAccess->inst_it = inst_it;
11252     G4_INST *inst = *inst_it;
11253     scratchAccess->maskFlag = (inst->getOption() & 0xFFF010C);
11254 
11255     return;
11256 }
11257 
initializeFlagScratchAccess(SCRATCH_PTR_VEC * scratchAccessList,SCRATCH_ACCESS * & scratchAccess,INST_LIST_ITER inst_it)11258 bool FlagSpillCleanup::initializeFlagScratchAccess(
11259     SCRATCH_PTR_VEC* scratchAccessList,
11260     SCRATCH_ACCESS*   &scratchAccess,
11261     INST_LIST_ITER    inst_it)
11262 {
11263     G4_INST* inst = (*inst_it);
11264 
11265     G4_DstRegRegion* dst = inst->getDst();
11266     G4_Operand* src = inst->getSrc(0);
11267     G4_Declare* topDcl_1 = dst->getTopDcl();
11268     G4_Declare* topDcl_2 = src->getTopDcl();
11269 
11270     //Create the spill/fill description
11271     if (topDcl_1->getRegFile() == G4_FLAG && topDcl_2->getRegFile() == G4_GRF)
11272     {
11273         if (src->asSrcRegRegion()->getBase()->isRegVar() &&
11274             src->asSrcRegRegion()->getBase()->asRegVar()->isRegVarAddrSpillLoc())
11275         {
11276             scratchAccess = new SCRATCH_ACCESS;
11277             scratchAccessList->push_back(scratchAccess);
11278             initializeScratchAccess(scratchAccess, inst_it);
11279             //Fill
11280 #ifdef _DEBUG
11281             scratchAccess->regNum = topDcl_1->getRegVar()->getPhyReg()->asAreg()->getArchRegType();
11282 #endif
11283             scratchAccess->scratchDcl = topDcl_2;  //Spill location
11284 
11285             if (gra.isBlockLocal(topDcl_2))
11286             {
11287                 scratchAccess->isBlockLocal = true;
11288             }
11289             FlagLineraizedStartAndEnd(topDcl_1, scratchAccess->linearizedStart, scratchAccess->linearizedEnd);
11290             scratchAccess->flagOpnd = dst;
11291             if (inst->getPredicate())
11292             {
11293                 scratchAccess->removeable = false; //Partil spill/fill cannot be removed
11294                 scratchAccess->instKilled = true; //Not really killed, mark so that the instruction depends on current one will not be removed.
11295             }
11296 
11297             return true;
11298         }
11299     }
11300     else
11301     {   //Spill
11302         if (dst->getBase()->isRegVar() &&
11303             dst->getBase()->asRegVar()->isRegVarAddrSpillLoc())
11304         {
11305             scratchAccess = new SCRATCH_ACCESS;
11306             scratchAccessList->push_back(scratchAccess);
11307             initializeScratchAccess(scratchAccess, inst_it);
11308 #ifdef _DEBUG
11309             scratchAccess->regNum = topDcl_2->getRegVar()->getPhyReg()->asAreg()->getArchRegType();
11310 #endif
11311             scratchAccess->scratchDcl = topDcl_1;
11312 
11313             if (gra.isBlockLocal(topDcl_1))
11314             {
11315                 scratchAccess->isBlockLocal = true;
11316             }
11317 
11318             scratchAccess->isSpill = true;
11319             FlagLineraizedStartAndEnd(topDcl_2, scratchAccess->linearizedStart, scratchAccess->linearizedEnd);
11320             scratchAccess->flagOpnd = src;
11321             if (inst->getPredicate())
11322             {
11323                 scratchAccess->removeable = false; //Partil spill/fill cannot be removed
11324                 scratchAccess->instKilled = true; //Not really killed, mark so that the instruction depends on current one will not be removed.
11325             }
11326 
11327             return true;
11328         }
11329     }
11330 
11331     return false;
11332 }
11333 
freeScratchAccess(SCRATCH_PTR_VEC * scratchAccessList)11334 void FlagSpillCleanup::freeScratchAccess(SCRATCH_PTR_VEC *scratchAccessList)
11335 {
11336     for (SCRATCH_ACCESS *scratchAccess : *scratchAccessList)
11337     {
11338         delete scratchAccess;
11339     }
11340 
11341     scratchAccessList->clear();
11342 
11343     return;
11344 }
11345 
11346 //Check the flag define instruction.
void FlagSpillCleanup::flagDefine(
    SCRATCH_PTR_LIST& scratchTraceList,
    G4_INST*          inst)
{
    G4_DstRegRegion* dst = inst->getDst();

    if (dst)
    {
        G4_Declare* topdcl = NULL;
        topdcl = GetTopDclFromRegRegion(dst);

        if (topdcl && topdcl->getRegFile() == G4_FLAG)
        {
            //Flag register define
            regDefineFlag(&scratchTraceList, inst, dst);
        }
    }

    G4_CondMod* mod = inst->getCondMod();
    if (!mod)
    {
        return;
    }

    // ConMod, handled as register define
    unsigned maskFlag = (inst->getOption() & 0xFFF010C);

    unsigned linearizedStart = 0;
    unsigned linearizedEnd = 0;

    G4_VarBase *flagReg = mod->getBase();
    if (!flagReg)
    {
        // Condition modifier without a flag base register: nothing to track.
        return;
    }

    G4_Declare* topdcl = flagReg->asRegVar()->getDeclare();
    FlagLineraizedStartAndEnd(topdcl, linearizedStart, linearizedEnd);

    SCRATCH_PTR_LIST_ITER it = scratchTraceList.begin();
    SCRATCH_PTR_LIST_ITER itEnd = scratchTraceList.end();
    while (it != itEnd)
    {
        // Save the successor first; `it` may be erased from the list below.
        SCRATCH_PTR_LIST_ITER kt = it;
        kt++;

        SCRATCH_ACCESS *preScratchAccess = *it;
        if (IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, preScratchAccess))
        {
            G4_Declare *preDcl = preScratchAccess->flagOpnd->getTopDcl();

            // Condition-modifier writes carry no predicate, hence NULL here.
            if (regDefineAnalysis(preScratchAccess, linearizedStart, linearizedEnd, (unsigned short)maskFlag, NULL))
            {
                preScratchAccess->regKilled = true;
                if (preScratchAccess->evicted)  //Not in use
                {
                    scratchTraceList.erase(it); //The previous one is not candidate for reuse
                }
            }
            if (topdcl == preDcl)
            {
                if (preScratchAccess->inRangePartialKilled)
                {
                    // Record for later renaming; -3 marks a condition-modifier
                    // operand (cf. -1 for dst defines, -2 for predicates).
                    preScratchAccess->renameOperandVec.emplace_back(inst, -3);
                }
                else
                {
                    preScratchAccess->removeable = false;
                }
            }
        }
        it = kt;
    }

    return;
}
11423 
scratchUse(SCRATCH_PTR_LIST & scratchTraceList,G4_INST * inst)11424 void FlagSpillCleanup::scratchUse(SCRATCH_PTR_LIST& scratchTraceList, G4_INST* inst)
11425 {
11426     G4_DstRegRegion* dst = inst->getDst();
11427 
11428     if (dst)
11429     {
11430         G4_Declare* topdcl = NULL;
11431         topdcl = GetTopDclFromRegRegion(dst);
11432 
11433         if (topdcl && topdcl->getRegFile() == G4_GRF)
11434         {
11435             //Flag scratch variable is redefined
11436             regUseScratch(&scratchTraceList, inst, dst, Opnd_dst);
11437         }
11438     }
11439 
11440     for (unsigned i = 0; i < G4_MAX_SRCS; i++)
11441     {
11442         G4_Operand* src = inst->getSrc(i);
11443 
11444         if (src && src->isSrcRegRegion())
11445         {
11446             G4_Declare* topdcl = NULL;
11447 
11448             if (inst->getSrc(i)->asSrcRegRegion()->getBase()->isRegVar())
11449             {
11450                 topdcl = GetTopDclFromRegRegion(src);
11451             }
11452 
11453             if (!topdcl || (topdcl->getRegFile() == G4_FLAG))
11454             {
11455                 continue;
11456             }
11457 
11458             regUseScratch(&scratchTraceList, inst, src, Opnd_src0);
11459         }
11460     }
11461 }
11462 
flagUse(SCRATCH_PTR_LIST & scratchTraceList,G4_INST * inst)11463 void FlagSpillCleanup::flagUse(SCRATCH_PTR_LIST& scratchTraceList, G4_INST* inst)
11464 {
11465     for (unsigned i = 0; i < G4_MAX_SRCS; i++)
11466     {
11467         G4_Operand* src = inst->getSrc(i);
11468 
11469         if (src && src->isSrcRegRegion())
11470         {
11471             G4_Declare* topdcl = NULL;
11472 
11473             if (inst->getSrc(i)->asSrcRegRegion()->getBase()->isRegVar())
11474             {
11475                 topdcl = GetTopDclFromRegRegion(src);
11476             }
11477 
11478             if (!topdcl || (topdcl->getRegFile() != G4_FLAG))
11479             {
11480                 continue;
11481             }
11482 
11483             regUseFlag(&scratchTraceList, inst, src, i);
11484         }
11485     }
11486 
11487     //Flag register is used as predicate
11488     G4_Predicate* predicate = inst->getPredicate();
11489     if (!predicate)
11490     {
11491         return;
11492     }
11493 
11494     G4_VarBase *flagReg = predicate->getBase();
11495     if (!flagReg)
11496     {
11497         return;
11498     }
11499 
11500     G4_Declare* topdcl = flagReg->asRegVar()->getDeclare();
11501     unsigned linearizedStart = 0;
11502     unsigned linearizedEnd = 0;
11503     FlagLineraizedStartAndEnd(topdcl, linearizedStart, linearizedEnd);
11504 
11505     for (SCRATCH_ACCESS * preScratchAccess : scratchTraceList)
11506     {
11507         if (IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, preScratchAccess))
11508         {
11509             G4_Declare *preDcl = preScratchAccess->flagOpnd->getTopDcl();
11510             //Use should have same top declare
11511             if (preDcl == topdcl)
11512             {
11513                 if (regUseAnalysis(preScratchAccess, linearizedStart, linearizedEnd))
11514                 {
11515                     preScratchAccess->removeable = false;
11516                 }
11517                 else if (preScratchAccess->inRangePartialKilled || !preScratchAccess->regKilled)
11518                 {
11519                     //can reuse previous register
11520                     preScratchAccess->renameOperandVec.emplace_back(inst, -2);
11521                 }
11522             }
11523         }
11524     }
11525 
11526     return;
11527 }
11528 
// Match a new flag spill/fill (scratchAccess) against earlier accesses of the
// same scratch location in scratchTraceList: dead earlier spills are erased
// from the block, superseded spills are evicted, and fills become removal
// candidates. Returns true when the new access is a direct kill (exact-range
// redundant fill), letting the caller skip define analysis for it.
bool FlagSpillCleanup::flagScratchDefineUse(
    G4_BB* bb,
    SCRATCH_PTR_LIST*  scratchTraceList,
    SCRATCH_PTR_VEC*  candidateList,
    SCRATCH_ACCESS*    scratchAccess,
    CLEAN_NUM_PROFILE* clean_num_profile)
{
    SCRATCH_PTR_LIST_ITER it = scratchTraceList->begin();
    SCRATCH_PTR_LIST_ITER itEnd = scratchTraceList->end();

    while (it != itEnd)
    {
        // Save the successor first; `it` may be erased from the list below.
        SCRATCH_PTR_LIST_ITER kt = it;
        kt++;

        SCRATCH_ACCESS * preScratchAccess = *it;

        //Evicted
        if (preScratchAccess->evicted)
        {
            it = kt;
            continue;
        }

        //Same scratch declare
        if (preScratchAccess->scratchDcl == scratchAccess->scratchDcl) //Same scratch location
        {
            if (scratchAccess->isSpill)  //Current is spill
            {
                if (IS_SPILL_KILL_CANDIDATE(preScratchAccess))  //previoius is spill as well and previous spill is not used
                {
                    //kill the previous spill
                    bb->erase(preScratchAccess->inst_it);
                    preScratchAccess->instKilled = true;
                    clean_num_profile->spill_clean_num[0]++;
                    scratchTraceList->erase(it); //The previous one is not candidate for reuse
                    it = kt;

                    continue;
                }

                // The new spill supersedes the earlier access for this
                // location; evict it from the trace list.
                preScratchAccess->evicted = true;
                scratchTraceList->erase(it); //The previous one is not a good candidate for reuse any more
            }
            else  //Current is fill
            {
                preScratchAccess->fillInUse = true;
                preScratchAccess->useCount++;

                if (IS_USE_KILL_CANDIDATE(preScratchAccess))   //Is not used before
                {
                    scratchAccess->preScratchAccess = preScratchAccess;   //set previous scrach location define
                    candidateList->push_back(scratchAccess);  //Add to candidate list
                    if (IS_FLAG_RANGE_OVERWRITE(scratchAccess, preScratchAccess->linearizedStart, preScratchAccess->linearizedEnd))
                    {
                        //Exactly same GRF, it's useless fill, since prevous fill or spill not been killed
                        scratchAccess->directKill = true;
                        scratchTraceList->push_back(scratchAccess);
                        return true;
                    }
                }
            }
        }
        it = kt;
    }

    // Record the new access for subsequent define/use analysis.
    scratchTraceList->push_back(scratchAccess);

    return false;
}
11599 
flagSpillFillClean(G4_BB * bb,INST_LIST_ITER inst_it,SCRATCH_PTR_VEC & scratchAccessList,SCRATCH_PTR_LIST & scratchTraceList,SCRATCH_PTR_VEC & candidateList,CLEAN_NUM_PROFILE * clean_num_profile)11600 void FlagSpillCleanup::flagSpillFillClean(
11601     G4_BB* bb,
11602     INST_LIST_ITER     inst_it,
11603     SCRATCH_PTR_VEC&  scratchAccessList,
11604     SCRATCH_PTR_LIST&  scratchTraceList,
11605     SCRATCH_PTR_VEC&  candidateList,
11606     CLEAN_NUM_PROFILE* clean_num_profile)
11607 {
11608     G4_INST* inst = (*inst_it);
11609     if (inst->isPseudoKill())
11610     {
11611         return;
11612     }
11613 
11614     bool noDefineAnalysis = false;
11615 
11616     //Check if there is flag use
11617     flagUse(scratchTraceList, inst);
11618 
11619     //Check if it's spill/fill of the flag
11620     if (IS_FLAG_MOVE(inst))
11621     {
11622         SCRATCH_ACCESS *scratchAccess = NULL;
11623 
11624         if (initializeFlagScratchAccess(&scratchAccessList, scratchAccess, inst_it))
11625         {
11626             //Build the trace list and the candidate list
11627             //Trace list includes all spill/fill
11628             //Candidate includues ??
11629             //Checking if the spill/fill can be removed at the same time by comparing previous one.
11630             noDefineAnalysis = flagScratchDefineUse(bb, &scratchTraceList, &candidateList, scratchAccess, clean_num_profile);
11631         }
11632     }
11633     else
11634     {
11635         scratchUse(scratchTraceList, inst);
11636     }
11637 
11638     //Check if there is flag define
11639     if (!noDefineAnalysis)
11640     {
11641         flagDefine(scratchTraceList, inst);
11642     }
11643 
11644     return;
11645 }
11646 
11647 #ifdef _DEBUG
11648 #define FILL_DEBUG_THRESHOLD 0xffffffff
11649 #define SPILL_DEBUG_THRESHOLD 0xffffffff //25
11650 #endif
11651 
// Remove redundant fill instructions from the candidate list: a direct-kill,
// removable fill is deleted from the block once replaceWithPreDcl succeeds in
// renaming its uses to the register of an earlier, still-live access.
void FlagSpillCleanup::regFillClean(
    IR_Builder&        builder,
    G4_BB*             bb,
    SCRATCH_PTR_VEC&  candidateList,
    CLEAN_NUM_PROFILE* clean_num_profile)
{
    for (SCRATCH_ACCESS * scratchAccess : candidateList)
    {
        SCRATCH_ACCESS* preScratchAccess = scratchAccess->preScratchAccess;

        // Since the reuse happens from front to end.
        // If the pre scratchAccess is killed, current candidate can not reuse previous register any more
        if (!scratchAccess->instKilled &&
            (scratchAccess->removeable && scratchAccess->directKill))
        {
            if (scratchAccess->prePreScratchAccess)
            {
                // Walk back over already-removed accesses toward
                // prePreScratchAccess to find the earliest access whose
                // register is still available for reuse.
                while (preScratchAccess &&
                    preScratchAccess->preScratchAccess &&
                    preScratchAccess != scratchAccess->prePreScratchAccess)
                {
                    //If possible, propagate to previous scratchAccess
                    if (preScratchAccess->preFillAccess)
                    {
                        //to jump over prefill.
                        if (preScratchAccess->isSpill &&
                            preScratchAccess->preFillAccess &&
                            preScratchAccess->preFillAccess->instKilled &&
                            preScratchAccess->preScratchAccess)
                        {
                            preScratchAccess = preScratchAccess->preScratchAccess;
                        }
                        else
                        {
                            break;
                        }
                    }
                    else
                    {
                        // Stop at the first access that was not removed.
                        if (!preScratchAccess->instKilled)
                        {
                            break;
                        }
                        preScratchAccess = preScratchAccess->preScratchAccess;
                    }
                }

                if (preScratchAccess)
                {
                    // A spill whose prefill was removed cannot donate its
                    // register; the empty branch intentionally skips it.
                    if (preScratchAccess->isSpill &&
                        preScratchAccess->preFillAccess &&
                        preScratchAccess->preFillAccess->instKilled)
                    {
                    }
                    else if (!preScratchAccess->instKilled)
                    {
                        if (replaceWithPreDcl(builder, scratchAccess, preScratchAccess))
                        {
                            bb->erase(scratchAccess->inst_it);
                            scratchAccess->instKilled = true;
                            scratchAccess->preScratchAccess->useCount--;
                            clean_num_profile->fill_clean_num[0]++;
                        }
                    }
                }
            }
            else
            {
                // Simple case: reuse the register of the immediately
                // preceding access when it is still live.
                if (preScratchAccess && !preScratchAccess->instKilled)
                {
                    if (replaceWithPreDcl(builder, scratchAccess, preScratchAccess))
                    {
                        bb->erase(scratchAccess->inst_it);
                        scratchAccess->instKilled = true;
                        scratchAccess->preScratchAccess->useCount--;
                        clean_num_profile->fill_clean_num[0]++;
                    }
                }
            }
        }
#ifdef _DEBUG
        // Debug aid: cap the number of fill removals per run.
        if (clean_num_profile->fill_clean_num[0] > FILL_DEBUG_THRESHOLD)
            return;
#endif
    }

    return;
}
11740 
regSpillClean(IR_Builder & builder,G4_BB * bb,SCRATCH_PTR_VEC & candidateList,CLEAN_NUM_PROFILE * clean_num_profile)11741 void FlagSpillCleanup::regSpillClean(
11742     IR_Builder&        builder,
11743     G4_BB*             bb,
11744     SCRATCH_PTR_VEC&  candidateList,
11745     CLEAN_NUM_PROFILE* clean_num_profile)
11746 {
11747     for (SCRATCH_ACCESS * scratchAccess : candidateList)
11748     {
11749         if (scratchAccess->instKilled)
11750         {
11751             continue;
11752         }
11753         if (!scratchAccess->instKilled &&
11754             scratchAccess->isSpill &&
11755             scratchAccess->removeable &&
11756             scratchAccess->evicted &&
11757             scratchAccess->useCount == 0)
11758         {
11759             bb->erase(scratchAccess->inst_it);
11760             scratchAccess->instKilled = true;
11761             clean_num_profile->spill_clean_num[0]++;
11762 #ifdef _DEBUG
11763             if (clean_num_profile->spill_clean_num[0] > SPILL_DEBUG_THRESHOLD)
11764             {
11765                 return;
11766             }
11767 #endif
11768         }
11769     }
11770 
11771     return;
11772 }
11773 
11774 
11775 // Replace Scratch Block Read/Write message with OWord Block Read/Write message
11776 // For spill code clean up, clean target may exist in all WAW, RAR, RAW, WAR.
spillFillCodeCleanFlag(IR_Builder & builder,G4_Kernel & kernel,CLEAN_NUM_PROFILE * clean_num_profile)11777 void FlagSpillCleanup::spillFillCodeCleanFlag(
11778     IR_Builder&        builder,
11779     G4_Kernel&         kernel,
11780     CLEAN_NUM_PROFILE* clean_num_profile)
11781 {
11782     SCRATCH_PTR_VEC scratchAccessList;
11783     SCRATCH_PTR_LIST scratchTraceList;
11784     SCRATCH_PTR_VEC candidateList;
11785     FlowGraph& fg = kernel.fg;
11786 
11787     int candidate_size = 0;
11788     for (auto bb : fg)
11789     {
11790         INST_LIST_ITER inst_it = bb->begin();
11791 
11792         scratchTraceList.clear();
11793         candidateList.clear();
11794         freeScratchAccess(&scratchAccessList);
11795 
11796         //Top down scan within BB
11797         while (inst_it != bb->end())
11798         {
11799             INST_LIST_ITER inst_it_next = inst_it;
11800             inst_it_next++;
11801 
11802             flagSpillFillClean(bb, inst_it, scratchAccessList, scratchTraceList, candidateList, clean_num_profile);
11803 
11804             inst_it = inst_it_next;
11805         }
11806 
11807 #ifdef _DEBUG
11808         candidate_size += (int)candidateList.size();
11809 #endif
11810         //Clean the fills.
11811         regFillClean(builder, bb, candidateList, clean_num_profile);
11812 
11813 #ifdef _DEBUG
11814         if (clean_num_profile->fill_clean_num[0] > FILL_DEBUG_THRESHOLD)
11815             return;
11816 #endif
11817         //Clean the spills
11818         regSpillClean(builder, bb, scratchAccessList, clean_num_profile);
11819 
11820 #ifdef _DEBUG
11821         if (clean_num_profile->spill_clean_num[0] > SPILL_DEBUG_THRESHOLD)
11822         {
11823             return;
11824         }
11825 #endif
11826     }
11827 
11828     freeScratchAccess(&scratchAccessList);
11829     scratchTraceList.clear();
11830     candidateList.clear();
11831 
11832 #ifdef DEBUG_VERBOSE_ON
11833     printf("Candidate size: %d\n", candidate_size);
11834 #endif
11835 
11836     return;
11837 }
11838 
11839 // Insert declarations with pre-assigned registers in kernel
11840 // this is needed for HRA, and the fake declares will be removed at the end of HRA
insertPhyRegDecls()11841 void GlobalRA::insertPhyRegDecls()
11842 {
11843     int numGRF = kernel.getNumRegTotal();
11844     std::vector<bool> grfUsed(numGRF, false);
11845     GRFDclsForHRA.resize(numGRF);
11846 
11847     for (auto curBB : kernel.fg)
11848     {
11849         if (auto summary = kernel.fg.getBBLRASummary(curBB))
11850         {
11851             for (int i = 0; i < numGRF; i++)
11852             {
11853                 if (summary->isGRFBusy(i))
11854                 {
11855                     grfUsed[i] = true;
11856                 }
11857             }
11858         }
11859     }
11860 
11861     // Insert declarations for each GRF that is used
11862     unsigned numGRFsUsed = 0;
11863     for (int i = 0; i < numGRF; i++)
11864     {
11865         if (grfUsed[i] == true)
11866         {
11867             const char* dclName = builder.getNameString(builder.mem, 10, "r%d", i);
11868             G4_Declare* phyRegDcl = builder.createDeclareNoLookup(
11869                 dclName, G4_GRF, numEltPerGRF<Type_UD>(), 1, Type_D, Regular, NULL, NULL);
11870             G4_Greg* phyReg = builder.phyregpool.getGreg(i);
11871             phyRegDcl->getRegVar()->setPhyReg(phyReg, 0);
11872             GRFDclsForHRA[i] = phyRegDcl;
11873             numGRFsUsed++;
11874         }
11875     }
11876 
11877     if (builder.getOption(vISA_OptReport))
11878     {
11879         std::ofstream optreport;
11880         getOptReportStream(optreport, builder.getOptions());
11881         optreport << "Local RA used " << numGRFsUsed << " GRFs\n";
11882     }
11883 }
11884 
11885 // compute physical register info and adjust foot print
11886 // find indexed GRFs and construct a foot print for them
11887 // set live operand in each instruction
computePhyReg()11888 void GlobalRA::computePhyReg()
11889 {
11890     auto& fg = kernel.fg;
11891     for (auto bb : fg)
11892     {
11893         for (auto inst : *bb)
11894         {
11895             if (inst->isPseudoKill() ||
11896                 inst->isLifeTimeEnd() ||
11897                 inst->isPseudoUse())
11898             {
11899                 continue;
11900             }
11901 
11902             if (inst->getDst() &&
11903                 !(inst->hasNULLDst()))
11904             {
11905                 G4_DstRegRegion *currDstRegion = inst->getDst();
11906                 if (currDstRegion->getBase()->isRegVar() &&
11907                     currDstRegion->getBase()->asRegVar()->getDeclare()->getGRFBaseOffset() == 0)
11908                 {
11909                     // Need to compute linearized offset only once per dcl
11910                     currDstRegion->computePReg();
11911                 }
11912             }
11913 
11914             for (unsigned j = 0, size = inst->getNumSrc(); j < size; j++)
11915             {
11916                 G4_Operand *curr_src = inst->getSrc(j);
11917                 if (!curr_src || curr_src->isImm() ||
11918                     (inst->opcode() == G4_math && j == 1 && curr_src->isNullReg()) ||
11919                     curr_src->isLabel())
11920                 {
11921                     continue;
11922                 }
11923 
11924                 if (curr_src->isSrcRegRegion() &&
11925                     curr_src->asSrcRegRegion()->getBase() &&
11926                     curr_src->asSrcRegRegion()->getBase()->isRegVar() &&
11927                     curr_src->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getGRFBaseOffset() == 0)
11928                 {
11929                     curr_src->asSrcRegRegion()->computePReg();
11930                 }
11931             }
11932         }
11933     }
11934 }
11935 
dumpRegisterPressure()11936 void GraphColor::dumpRegisterPressure()
11937 {
11938     RPE rpe(gra, &liveAnalysis);
11939     uint32_t max = 0;
11940     std::vector<G4_INST*> maxInst;
11941     rpe.run();
11942 
11943     for (auto bb : builder.kernel.fg)
11944     {
11945         std::cerr << "BB " << bb->getId() << ": (Pred: ";
11946         for (auto pred : bb->Preds)
11947         {
11948             std::cerr << pred->getId() << ",";
11949         }
11950         std::cerr << " Succ: ";
11951         for (auto succ : bb->Succs)
11952         {
11953             std::cerr << succ->getId() << ",";
11954         }
11955         std::cerr << ")\n";
11956         for (auto inst : *bb)
11957         {
11958             uint32_t pressure = rpe.getRegisterPressure(inst);
11959             if (pressure > max)
11960             {
11961                 max = pressure;
11962                 maxInst.clear();
11963                 maxInst.push_back(inst);
11964             }
11965             else if (pressure == max)
11966             {
11967                 maxInst.push_back(inst);
11968             }
11969 
11970             std::cerr << "[" << pressure << "] ";
11971             inst->dump();
11972         }
11973     }
11974     std::cerr << "max pressure: " << max << ", " << maxInst.size() << " inst(s)\n";
11975     for (auto inst : maxInst)
11976     {
11977         inst->dump();
11978     }
11979 }
11980 
fixAlignment()11981 void GlobalRA::fixAlignment()
11982 {
11983     // Copy over alignment from G4_RegVar to GlobalRA instance
11984     // Rest of RA shouldnt have to read/modify alignment of G4_RegVar
11985     copyAlignment();
11986 
11987     if (kernel.getSimdSize() == g4::SIMD32)
11988     {
11989         // we have to force all flags to be 32-bit aligned even if they are < 32-bit,
11990         // due to potential emask usage.
11991         // ToDo: may be better to simply allocate them as 32-bit?
11992         for (auto dcl : kernel.Declares)
11993         {
11994             if (dcl->getRegFile() & G4_FLAG)
11995             {
11996                 setSubRegAlign(dcl, G4_SubReg_Align::Even_Word);
11997             }
11998         }
11999     }
12000 
12001     if (builder.getPlatform() == GENX_BDW)
12002     {
12003         // BDW requires even_word alignment for scalar HF variables
12004         for (auto dcl : kernel.Declares)
12005         {
12006             if (dcl->getElemType() == Type_HF && dcl->getSubRegAlign() == Any)
12007             {
12008                 setSubRegAlign(dcl, Even_Word);
12009             }
12010         }
12011     }
12012 
12013     // ToDo: remove these as it should be done by HWConformity
12014     for (auto BB : kernel.fg)
12015     {
12016         for (auto inst : *BB)
12017         {
12018             G4_DstRegRegion* dst = inst->getDst();
12019             if (dst && dst->getTopDcl())
12020             {
12021                 G4_RegVar* var = dst->getBase()->asRegVar();
12022                 if (inst->isSend() && dst->getRegAccess() == Direct)
12023                 {
12024                     if (!var->isPhyRegAssigned())
12025                     {
12026                         setSubRegAlign(dst->getTopDcl(), GRFALIGN);
12027                     }
12028                 }
12029 
12030                 if (!var->isPhyRegAssigned() && var->getDeclare()->getNumRows() <= 1
12031                     && dst->getRegAccess() == Direct && var->getDeclare()->getSubRegAlign() == Any)
12032                 {
12033                     if (inst->isAccSrcInst())
12034                     {
12035                         setSubRegAlign(dst->getTopDcl(), var->getDeclare()->getRegFile() != G4_ADDRESS ? GRFALIGN : Eight_Word);
12036                     }
12037                 }
12038             }
12039         }
12040     }
12041 }
12042 
verifyAlign(G4_Declare * dcl)12043 void VerifyAugmentation::verifyAlign(G4_Declare* dcl)
12044 {
12045     // Verify that dcl with Default32Bit align mask are 2GRF aligned
12046     auto it = masks.find(dcl);
12047     if (it == masks.end())
12048         return;
12049 
12050     if (dcl->getByteSize() >= numEltPerGRF<Type_UD>() * TypeSize(Type_UD) &&
12051         dcl->getByteSize() <= 2 * numEltPerGRF<Type_UD>() * TypeSize(Type_UD) &&
12052         kernel->getSimdSize() > numEltPerGRF<Type_UD>())
12053     {
12054         auto assignment = dcl->getRegVar()->getPhyReg();
12055         if (assignment && assignment->isGreg())
12056         {
12057             auto phyRegNum = assignment->asGreg()->getRegNum();
12058             auto augMask = std::get<1>((*it).second);
12059             if (phyRegNum % 2 != 0 &&
12060                 augMask == AugmentationMasks::Default32Bit)
12061             {
12062                 printf("Dcl %s is Default32Bit but assignment is not Even aligned\n", dcl->getName());
12063             }
12064         }
12065     }
12066 }
12067 
dump(const char * dclName)12068 void VerifyAugmentation::dump(const char* dclName)
12069 {
12070     std::string dclStr = dclName;
12071     for (auto& m : masks)
12072     {
12073         std::string first = m.first->getName();
12074         if (first == dclStr)
12075         {
12076             printf("%s, %d, %s\n", dclName, m.first->getRegVar()->getId(), getStr(std::get<1>(m.second)));
12077         }
12078     }
12079 }
12080 
// Debug helper: assigns each BB a nesting-path label built from "T<id>:" /
// "F<id>:" segments describing the if/else branches taken to reach it.
// Only meaningful for kernels whose control flow is just if/else/endif
// (see the #if 0 caller in verify()).
void VerifyAugmentation::labelBBs()
{
    std::string prev = "X:";    // label of the current scope; "X:" is the root
    unsigned id = 0;            // unique id per G4_if encountered
    for (auto bb : kernel->fg)
    {
        // First visit: inherit the current scope label; otherwise resume the
        // label previously assigned to this BB.
        if (bbLabels.find(bb) == bbLabels.end())
            bbLabels[bb] = prev;
        else
            prev = bbLabels[bb];

        if (bb->back()->opcode() == G4_opcode::G4_if)
        {
            // Assumes Succs.front() is the "then" target and Succs.back() the
            // fall-through -- TODO confirm against FlowGraph edge ordering.
            auto TBB = bb->Succs.front();
            auto FBB = bb->Succs.back();

            // Does the fall-through block contain the endif (i.e. no else)?
            bool hasEndif = false;
            for (auto inst : *FBB)
            {
                if (inst->opcode() == G4_opcode::G4_endif)
                {
                    hasEndif = true;
                    break;
                }
            }

            bbLabels[TBB] = prev + "T" + std::to_string(id) + ":";

            if (!hasEndif)
            {
                // else
                bbLabels[FBB] = prev + "F" + std::to_string(id) + ":";
            }
            else
            {
                // endif block
                bbLabels[FBB] = prev;
            }

            // Continue scanning inside the "then" branch.
            prev = prev + "T" + std::to_string(id) + ":";

            id++;
        }
        else if (bb->back()->opcode() == G4_opcode::G4_else)
        {
            // Pop the innermost "T<id>:" segment from the current label to
            // recover the parent scope's label for the join block.
            auto succBB = bb->Succs.front();
            auto lbl = prev;
            lbl.pop_back();
            while (lbl.back() != ':')
            {
                lbl.pop_back();
            }

            bbLabels[succBB] = lbl;
        }
        else if (bb->back()->opcode() == G4_opcode::G4_endif)
        {
            // Intentionally empty: the endif block's label was already set
            // when its G4_if was processed.
        }
    }

#if 1
    // Dump the computed label of every BB.
    for (auto bb : kernel->fg)
    {
        printf("BB%d -> %s\n", bb->getId(), bbLabels[bb].data());
    }
#endif
}
12149 
getGRFBaseOffset(const G4_Declare * dcl)12150 unsigned getGRFBaseOffset(const G4_Declare* dcl)
12151 {
12152     unsigned regNum = dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
12153     unsigned regOff = dcl->getRegVar()->getPhyRegOff();
12154     auto type = dcl->getElemType();
12155     return (regNum * numEltPerGRF<Type_UB>()) + (regOff * TypeSize(type));
12156 }
12157 
// Returns whether dcl1 and dcl2 should be considered interfering for
// verification purposes. Variables participating in global RA ("partakers")
// are checked via the interference graph; LRA-assigned variables are checked
// through the fake "r<N>" live ranges or by physical-range overlap.
bool VerifyAugmentation::interfereBetween(G4_Declare* dcl1, G4_Declare* dcl2)
{
    bool interferes = true;
    unsigned v1 = dcl1->getRegVar()->getId();
    unsigned v2 = dcl2->getRegVar()->getId();
    bool v1Partaker = dcl1->getRegVar()->isRegAllocPartaker();
    bool v2Partaker = dcl2->getRegVar()->isRegAllocPartaker();

    if (v1Partaker && v2Partaker)
    {
        // Both are in the interference graph: query it directly.
        // NOTE(review): this local deliberately shadows the outer
        // 'interferes'; the branch returns before the outer one is used.
        auto interferes = intf->interfereBetween(v1, v2);
        if (!interferes)
        {
            // For partial dcls, also consult the corresponding split
            // declare(s) before concluding there is no interference.
            if (dcl1->getIsPartialDcl())
            {
                interferes |= intf->interfereBetween(gra->getSplittedDeclare(dcl1)->getRegVar()->getId(), v2);
                if (dcl2->getIsPartialDcl())
                {
                    interferes |= intf->interfereBetween(v1,
                        gra->getSplittedDeclare(dcl2)->getRegVar()->getId());
                    interferes |= intf->interfereBetween(gra->getSplittedDeclare(dcl1)->getRegVar()->getId(),
                        gra->getSplittedDeclare(dcl2)->getRegVar()->getId());
                }
            }
            else if (dcl2->getIsPartialDcl())
            {
                interferes |= intf->interfereBetween(v1, gra->getSplittedDeclare(dcl2)->getRegVar()->getId());
            }
        }
        return interferes;
    }
    else if (!v1Partaker && v2Partaker)
    {
        // v1 is assigned by LRA
        unsigned startGRF = dcl1->getRegVar()->getPhyReg()->asGreg()->getRegNum();
        unsigned numGRFs = dcl1->getNumRows();

        // Locate the fake "r<grf>" live range for each GRF that v1 occupies
        // and require each of them to interfere with v2; any miss means the
        // pair is not fully marked as interfering.
        for (unsigned grf = startGRF; grf != (startGRF + numGRFs); grf++)
        {
            for (unsigned var = 0; var != numVars; var++)
            {
                if (lrs[var] &&
                    lrs[var]->getPhyReg() == kernel->fg.builder->phyregpool.getGreg(grf) &&
                    std::string(lrs[var]->getVar()->getName()) == "r" + std::to_string(grf))
                {
                    if (!intf->interfereBetween(var, v2))
                    {
                        interferes = false;
                    }
                }
            }
        }
    }
    else if (v1Partaker && !v2Partaker)
    {
        // Symmetric case: swap so the LRA-assigned dcl comes first.
        return interfereBetween(dcl2, dcl1);
    }
    else if (!v1Partaker && !v2Partaker)
    {
        // both assigned by LRA
        if (dcl1->getRegFile() == G4_RegFileKind::G4_GRF && dcl2->getRegFile() == G4_RegFileKind::G4_GRF)
        {
            auto lr1 = gra->getLocalLR(dcl1);
            auto lr2 = gra->getLocalLR(dcl2);

            if (lr1->getAssigned() && lr2->getAssigned())
            {
                // Two LRA assignments interfere unless their physical byte
                // ranges overlap (overlap => they share storage => report
                // "no interference" so the caller flags it).
                auto preg1Start = getGRFBaseOffset(dcl1);
                auto preg2Start = getGRFBaseOffset(dcl2);
                auto preg1End = preg1Start + dcl1->getByteSize();
                auto preg2End = preg2Start + dcl2->getByteSize();

                if (preg2Start >= preg1Start && preg2Start < preg1End)
                {
                    return false;
                }
                else if (preg1Start >= preg2Start && preg1Start < preg2End)
                {
                    return false;
                }
            }
        }

        interferes = true;
    }

    return interferes;
}
12246 
// Top-level augmentation verification: performs a lexical interval sweep over
// sortedLiveRanges, and for each pair of simultaneously-live dcls with
// different augmentation masks, reports pairs that are not marked as
// interfering (and, worse, pairs with overlapping physical assignments).
// Output goes to stderr; nothing is mutated.
void VerifyAugmentation::verify()
{
    std::cerr << "Start verification for kernel: " << kernel->getOptions()->getOptionCstr(VISA_AsmFileName) << std::endl;

    // First dump split dcls and their sub-declares with their masks.
    for (auto dcl : kernel->Declares)
    {
        if (dcl->getIsSplittedDcl())
        {
            auto& tup = masks[dcl];
            std::cerr << dcl->getName() << "(" << getStr(std::get<1>(tup)) << ") is split" << std::endl;
            for (const G4_Declare *subDcl : gra->getSubDclList(dcl))
            {
                auto& tupSub = masks[subDcl];
                std::cerr << "\t" << subDcl->getName() << " (" << getStr(std::get<1>(tupSub)) << ")" << std::endl;
            }
        }
    }

    std::cerr << std::endl << std::endl << std::endl;

    // True when the two dcls' physical byte ranges overlap (GRF file only).
    auto overlapDcl = [](G4_Declare* dcl1, G4_Declare* dcl2)
    {
        if (dcl1->getRegFile() == G4_RegFileKind::G4_GRF && dcl2->getRegFile() == G4_RegFileKind::G4_GRF)
        {
            auto preg1Start = getGRFBaseOffset(dcl1);
            auto preg2Start = getGRFBaseOffset(dcl2);
            auto preg1End = preg1Start + dcl1->getByteSize();
            auto preg2End = preg2Start + dcl2->getByteSize();

            if (preg2Start >= preg1Start && preg2Start < preg1End)
            {
                return true;
            }
            else if (preg1Start >= preg2Start && preg1Start < preg2End)
            {
                return true;
            }
        }
        return false;
    };

    // Interval sweep: 'active' holds dcls whose live range overlaps the
    // current dcl's start (sortedLiveRanges is assumed sorted by start).
    std::list<G4_Declare*> active;
    for (auto dcl : sortedLiveRanges)
    {
        // masks tuple layout: <1> = augmentation mask,
        // <2>/<3> = first/last instruction of the live range.
        auto& tup = masks[dcl];
        unsigned startIdx = std::get<2>(tup)->getLexicalId();
        auto dclMask = std::get<1>(tup);

        auto getMaskStr = [](AugmentationMasks m)
        {
            std::string str = "Undetermined";
            if (m == AugmentationMasks::Default16Bit)
                str = "Default16Bit";
            else if (m == AugmentationMasks::Default32Bit)
                str = "Default32Bit";
            else if (m == AugmentationMasks::Default64Bit)
                str = "Default64Bit";
            else if (m == AugmentationMasks::NonDefault)
                str = "NonDefault";
            else if (m == AugmentationMasks::DefaultPredicateMask)
                str = "DefaultPredicateMask";
            str.append("\n");

            return str;
        };

        std::cerr << dcl->getName() << " - " << getMaskStr(dclMask);

        verifyAlign(dcl);

        // Expire active dcls whose live range ends at/before this start.
        for (auto it = active.begin(); it != active.end();)
        {
            auto activeDcl = (*it);
            auto& tupActive = masks[activeDcl];
            if (startIdx >= std::get<3>(tupActive)->getLexicalId())
            {
                it = active.erase(it);
                continue;
            }
            it++;
        }

        // Check the current dcl against every still-active dcl with a
        // different augmentation mask.
        for (auto activeDcl : active)
        {
            auto& tupActive = masks[activeDcl];
            auto aDclMask = std::get<1>(tupActive);

            if (dclMask != aDclMask)
            {
                bool interfere = interfereBetween(activeDcl, dcl);

                // Partial dcls are skipped (after computing 'interfere').
                if (activeDcl->getIsPartialDcl() || dcl->getIsPartialDcl())
                    continue;

                if (!interfere)
                {
                    std::cerr << dcl->getRegVar()->getName() << "(" << getStr(dclMask) << ") and " << activeDcl->getRegVar()->getName() << "(" <<
                        getStr(aDclMask) << ") are overlapping with incompatible emask but not masked as interfering" << std::endl;
                }

                if (overlapDcl(activeDcl, dcl))
                {
                    if (!interfere)
                    {
                        std::cerr << dcl->getRegVar()->getName() << "(" << getStr(dclMask) << ") and " << activeDcl->getName() << "(" <<
                            getStr(aDclMask) << ") use overlapping physical assignments but not marked as interfering" << std::endl;
                    }
                }
            }
        }

        active.push_back(dcl);
    }

    // NOTE(review): "kenel" typo below is runtime output text; left as-is.
    std::cerr << "End verification for kenel: " << kernel->getOptions()->getOptionCstr(VISA_AsmFileName) << std::endl << std::endl << std::endl;

    return;

#if 0
    // Following is useful for debugging when test has only if-else-endif constructs
    labelBBs();
    populateBBLexId();
    std::string msg;
    for (auto dcl : sortedLiveRanges)
    {
        auto lr = DclLRMap[dcl];
        if (lr->getPhyReg() && isClobbered(lr, msg))
        {
            printf("%s clobbered:\n\t%s\n\n", dcl->getName(), msg.data());
        }
    }
#endif
}
12380 
populateBBLexId()12381 void VerifyAugmentation::populateBBLexId()
12382 {
12383     for (auto bb : kernel->fg)
12384     {
12385         if (bb->size() > 0)
12386             BBLexId.push_back(std::make_tuple(bb, bb->front()->getLexicalId(), bb->back()->getLexicalId()));
12387     }
12388 }
12389 
// Debug helper (used from the #if 0 block in verify()): for each use of lr,
// collect defs within lr's lexical range whose physical byte ranges overlap
// the use's bytes, prune defs from the complementary if/else branch and defs
// shadowed by a later def in the same BB, then print the left-over writers
// that belong to a different declare. Always returns false; the printed
// report is the real output. 'msg' is cleared but otherwise unused here.
bool VerifyAugmentation::isClobbered(LiveRange* lr, std::string& msg)
{
    msg.clear();

    auto& tup = masks[lr->getDcl()];

    // Lexical interval [startLexId, endLexId] of the live range.
    auto startLexId = std::get<2>(tup)->getLexicalId();
    auto endLexId = std::get<3>(tup)->getLexicalId();

    std::vector<std::pair<G4_INST*, G4_BB*>> insts;
    std::vector<std::tuple<INST_LIST_ITER, G4_BB*>> defs;
    std::vector<std::tuple<INST_LIST_ITER, G4_BB*>> uses;

    // Gather all instructions, defs, and uses of lr inside its interval.
    for (auto bb : kernel->fg)
    {
        if (bb->size() == 0)
            continue;

        // Skip BBs lying entirely after or entirely before the interval.
        if (bb->back()->getLexicalId() > endLexId && bb->front()->getLexicalId() > endLexId)
            continue;

        if (bb->back()->getLexicalId() < startLexId && bb->front()->getLexicalId() < startLexId)
            continue;

        // lr is active in current bb
        for (auto instIt = bb->begin(), end = bb->end(); instIt != end; instIt++)
        {
            auto inst = (*instIt);
            if (inst->isPseudoKill())
                continue;

            if (inst->getLexicalId() > startLexId && inst->getLexicalId() <= endLexId)
            {
                insts.push_back(std::make_pair(inst, bb));
                auto dst = inst->getDst();
                if (dst &&
                    dst->isDstRegRegion())
                {
                    auto topdcl = dst->asDstRegRegion()->getTopDcl();
                    if (topdcl == lr->getDcl())
                        defs.push_back(std::make_tuple(instIt, bb));
                }

                for (unsigned i = 0; i != G4_MAX_SRCS; i++)
                {
                    auto src = inst->getSrc(i);
                    if (src && src->isSrcRegRegion())
                    {
                        auto topdcl = src->asSrcRegRegion()->getTopDcl();
                        if (topdcl == lr->getDcl())
                            uses.push_back(std::make_tuple(instIt, bb));
                    }
                }
            }
        }
    }

    for (auto& use : uses)
    {
        auto& useStr = bbLabels[std::get<1>(use)];
        auto inst = *std::get<0>(use);
        MUST_BE_TRUE(useStr.size() > 0, "empty string found");
        std::list<std::tuple<G4_INST*, G4_BB*>> rd;

        // Collect earlier instructions whose dst physical byte range
        // overlaps this use's physical byte range ("reaching defs").
        for (unsigned i = 0; i != G4_MAX_SRCS; i++)
        {
            auto src = inst->getSrc(i);
            if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getTopDcl() == lr->getDcl())
            {
                // [lb, rb] = physical byte range read by this source.
                unsigned lb = 0, rb = 0;
                lb = lr->getPhyReg()->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() + (lr->getPhyRegOff()*lr->getDcl()->getElemSize());
                lb += src->getLeftBound();
                rb = lb + src->getRightBound() - src->getLeftBound();

                for (auto& otherInsts : insts)
                {
                    // insts is in program order; stop at the use itself.
                    if (otherInsts.first->getLexicalId() > inst->getLexicalId())
                        break;

                    auto oiDst = otherInsts.first->getDst();
                    auto oiBB = otherInsts.second;
                    if (oiDst && oiDst->isDstRegRegion() && oiDst->getTopDcl())
                    {
                        unsigned oilb = 0, oirb = 0;
                        auto oiLR = DclLRMap[oiDst->getTopDcl()];
                        // NOTE(review): only (oiLR && !phyReg) is skipped; a
                        // null oiLR would be dereferenced below -- confirm
                        // DclLRMap always has an entry for these dcls.
                        if (oiLR && !oiLR->getPhyReg())
                            continue;

                        oilb = oiLR->getPhyReg()->asGreg()->getRegNum()*numEltPerGRF<Type_UB>() +
                            (oiLR->getPhyRegOff()*oiLR->getDcl()->getElemSize());
                        oilb += oiDst->getLeftBound();
                        oirb = oilb + oiDst->getRightBound() - oiDst->getLeftBound();

                        if (oilb <= (unsigned)rb && oirb >= (unsigned)lb)
                        {
                            rd.push_back(std::make_tuple(otherInsts.first, oiBB));
                        }
                    }
                }
            }
        }

        // True when 'cur' lies in the branch complementary to 'other': same
        // label prefix, with 'T' vs 'F' at the divergence point (labels come
        // from labelBBs()).
        auto isComplementary = [](std::string& cur, std::string& other)
        {
            if (cur.size() < other.size())
                return false;

            if (cur.substr(0, other.size() - 1) == other.substr(0, other.size() - 1))
            {
                char lastAlphabet = cur.at(other.size() - 1);
                if (lastAlphabet == 'T' && other.back() == 'F')
                    return true;
                if (lastAlphabet == 'F' && other.back() == 'T')
                    return true;
            }

            return false;
        };

        // True when both instructions run under the same execution mask
        // option and offset.
        auto isSameEM = [](G4_INST* inst1, G4_INST* inst2)
        {
            if (inst1->getMaskOption() == inst2->getMaskOption() &&
                inst1->getMaskOffset() == inst2->getMaskOffset())
                return true;
            return false;
        };

        if (rd.size() > 0)
        {
            printf("Current use str = %s for inst:\t", useStr.data());
            inst->emit(std::cerr);
            printf("\t$%d\n", inst->getCISAOff());
        }
        // process all reaching defs
        for (auto rid = rd.begin(); rid != rd.end();)
        {
            auto& reachingDef = (*rid);

            auto& str = bbLabels[std::get<1>(reachingDef)];

            // skip rd if it is from complementary branch
            if (isComplementary(str, useStr) && isSameEM(inst, std::get<0>(reachingDef)))
            {
#if 0
                printf("\tFollowing in complementary branch %s, removed:\t", str.data());
                std::get<0>(reachingDef)->emit(std::cerr);
                printf("\t$%d\n", std::get<0>(reachingDef)->getCISAOff());
#endif
                rid = rd.erase(rid);
                continue;
            }
            rid++;
        }

        // keep rd that appears last in its BB
        for (auto rid = rd.begin(); rid != rd.end();)
        {
            auto ridBB = std::get<1>(*rid);
            for (auto rid1 = rd.begin(); rid1 != rd.end();)
            {
                if (*rid == *rid1)
                {
                    rid1++;
                    continue;
                }

                // Drop rid1 if rid is a later def in the same BB.
                auto rid1BB = std::get<1>(*rid1);
                if (ridBB == rid1BB &&
                    std::get<0>(*rid)->getLexicalId() > std::get<0>(*rid1)->getLexicalId())
                {
#if 0
                    printf("\tErasing inst at $%d due to later def at $%d\n", std::get<0>(*rid1)->getLexicalId(),
                        std::get<0>(*rid)->getLexicalId());
#endif
                    rid1 = rd.erase(rid1);
                    continue;
                }
                rid1++;
            }

            if (rid != rd.end())
                rid++;
        }

        if (rd.size() > 0)
        {
            bool printed = false;
            // display left overs in rd from different dcl
            for (auto& reachingDef : rd)
            {
                if (std::get<0>(reachingDef)->getDst()->getTopDcl() == lr->getDcl()->getRootDeclare())
                    continue;

                if (inst->getCISAOff() == std::get<0>(reachingDef)->getCISAOff())
                    continue;

                if (!printed)
                {
                    printf("\tLeft-over rd:\n");
                    printed = true;
                }
                printf("\t");
                std::get<0>(reachingDef)->emit(std::cerr);
                printf("\t$%d\n", std::get<0>(reachingDef)->getCISAOff());
            }
        }
    }

    return false;
}
12600 
loadAugData(std::vector<G4_Declare * > & s,LiveRange * const * l,unsigned n,const Interference * i,GlobalRA & g)12601 void VerifyAugmentation::loadAugData(std::vector<G4_Declare*>& s, LiveRange* const * l, unsigned n, const Interference* i, GlobalRA& g)
12602 {
12603     reset();
12604     sortedLiveRanges = s;
12605     gra = &g;
12606     kernel = &gra->kernel;
12607     lrs = l;
12608     numVars = n;
12609     intf = i;
12610 
12611     for (unsigned i = 0; i != numVars; i++)
12612     {
12613         DclLRMap[lrs[i]->getDcl()] = lrs[i];
12614     }
12615     for (auto dcl : kernel->Declares)
12616     {
12617         if (dcl->getRegFile() == G4_RegFileKind::G4_GRF ||
12618             dcl->getRegFile() == G4_RegFileKind::G4_INPUT)
12619         {
12620             LiveRange* lr = nullptr;
12621             auto it = DclLRMap.find(dcl);
12622             if (it != DclLRMap.end())
12623             {
12624                 lr = it->second;
12625             }
12626             auto start = gra->getStartInterval(dcl);
12627             auto end = gra->getEndInterval(dcl);
12628             masks[dcl] = std::make_tuple(lr, gra->getAugmentationMask(dcl), start, end);
12629         }
12630     }
12631 }
12632 
12633 //
12634 // DFS to check if there is any conflict in subroutine return location
12635 //
// DFS over the CFG checking whether the return-address location of any
// nested call conflicts with a location already in use by a caller on the
// current call stack (usedLoc[0..stackTop-1]). Returns true on conflict.
bool GlobalRA::isSubRetLocConflict(G4_BB *bb, std::vector<unsigned> &usedLoc, unsigned stackTop)
{
    auto& fg = kernel.fg;
    // Visit each BB at most once per traversal so the DFS terminates on
    // cyclic CFGs.
    if (bb->isAlreadyTraversed(fg.getTraversalNum()))
        return false;
    bb->markTraversed(fg.getTraversalNum());

    G4_INST* lastInst = bb->size() == 0 ? NULL : bb->back();
    if (lastInst && lastInst->isReturn())
    {
        if (lastInst->getPredicate() == NULL)
            return false;
        else
        {
            // Predicated return: execution may fall through, so continue
            // scanning along the fall-through path.
            return isSubRetLocConflict(bb->fallThroughBB(), usedLoc, stackTop);
        }
    }
    else if (lastInst && lastInst->isCall())     // need to traverse to next level
    {
        unsigned curSubRetLoc = getSubRetLoc(bb);
        //
        // check conflict firstly: a location already used by a caller on the
        // active call stack cannot be reused by this nested call
        //
        for (unsigned i = 0; i<stackTop; i++)
            if (usedLoc[i] == curSubRetLoc)
                return true;
        //
        // then traverse all the subroutines and return BB
        //
        usedLoc[stackTop] = curSubRetLoc;
        unsigned afterCallId = bb->BBAfterCall()->getId();

        // call can have 1 or 2 successors
        // If it has 1 then it is sub-entry block, if it has 2
        // then call has to be predicated. In case of predication,
        // 1st successor is physically following BB, 2nd is
        // sub-entry.
        if (lastInst->getPredicate())
        {
            MUST_BE_TRUE(bb->Succs.size() == 2, "Expecting 2 successor BBs for predicated call");
            // NOTE(review): this descends into the sub-entry with 'stackTop'
            // unchanged, while the top-level caller (assignLocForReturnAddr)
            // uses stackTop + 1 when entering a subroutine -- confirm whether
            // the increment was intentionally omitted here.
            if (isSubRetLocConflict(bb->Succs.back(), usedLoc, stackTop))
                return true;
        }

        // NOTE(review): this condition is tautological -- afterCallId was
        // captured from the same BB a few lines above, so the comparison
        // always holds. Presumably it once guarded against BBAfterCall
        // changing during recursion; confirm before simplifying.
        if (bb->BBAfterCall()->getId() == afterCallId)
        {
            if (isSubRetLocConflict(bb->BBAfterCall(), usedLoc, stackTop))
                return true;
        }
    }
    else
    {
        // Ordinary BB: conflict if any successor path conflicts.
        for (G4_BB *succ : bb->Succs)
            if (isSubRetLocConflict(succ, usedLoc, stackTop))
                return true;
    }

    return false;
}
12695 
12696 //
12697 // The routine traverses all BBs that can be reached from the entry of a subroutine (not
12698 // traversing into nested subroutine calls). Mark retLoc[bb] = entryId (to associate bb
12699 // with the subroutine entry. When two subroutines share code, we return the location of the
12700 // subroutine that was previously traversed so that the two routines can then use
12701 // the same location to save their return addresses.
12702 //
// See the comment block above: associates every BB reachable from a
// subroutine entry with that entry's return-address location. retLoc[] is
// a union-find style forest -- retLoc[loc] == loc marks a root -- used to
// merge subroutines that share code so they save their return addresses
// in the same location.
unsigned GlobalRA::determineReturnAddrLoc(unsigned entryId, unsigned* retLoc, G4_BB* bb)
{
    auto& fg = kernel.fg;
    // Already visited in this traversal: return the recorded location.
    if (bb->isAlreadyTraversed(fg.getTraversalNum()))
        return retLoc[bb->getId()];
    bb->markTraversed(fg.getTraversalNum());

    if (retLoc[bb->getId()] != UNDEFINED_VAL)
        // BB already belongs to a previously traversed subroutine -- this is
        // the shared-code case; report that subroutine's location.
        return retLoc[bb->getId()];
    else
    {
        retLoc[bb->getId()] = entryId;
        G4_INST* lastInst = bb->size() == 0 ? NULL : bb->back();

        if (lastInst && lastInst->isReturn())
        {
            if (lastInst->getPredicate() == NULL)
                return entryId;
            else
                // predicated return may fall through; keep walking
                return determineReturnAddrLoc(entryId, retLoc, bb->fallThroughBB());
        }
        else if (lastInst && lastInst->isCall()) // skip nested subroutine calls
        {
            return determineReturnAddrLoc(entryId, retLoc, bb->BBAfterCall());
        }
        unsigned sharedId = entryId;
        for (G4_BB *succ : bb->Succs)
        {
            unsigned loc = determineReturnAddrLoc(entryId, retLoc, succ);
            if (loc != entryId)
            {
                while (retLoc[loc] != loc)  // find the root of subroutine loc
                    loc = retLoc[loc];      // follow the link to reach the root
                if (sharedId == entryId)
                {
                    sharedId = loc;
                }
                else if (sharedId != loc)
                {
                    //
                    // The current subroutine shares code with two other subroutines; we
                    // force all three of them to use the same location by linking them
                    // together.
                    //
                    retLoc[loc] = sharedId;
                }
            }
        }
        return sharedId;
    }
}
12754 
// Assign a return-address save location (SubRetLoc) to every subroutine
// entry and caller BB. Subroutines that share code, or that are possible
// targets of the same indirect call, are merged so they use the same
// location; retLoc[] acts as a union-find forest where retLoc[loc] == loc
// marks a root. Finally, verify no location conflicts exist along any call
// chain (cycle detection) and insert the actual save/restore code.
void GlobalRA::assignLocForReturnAddr()
{
    auto& fg = kernel.fg;
    unsigned* retLoc = (unsigned*)builder.mem.alloc(fg.getNumBB() * sizeof(unsigned));
    //
    // a data structure for doing a quick map[id] ---> block
    //
    G4_BB**  BBs = (G4_BB**)builder.mem.alloc(fg.getNumBB() * sizeof(G4_BB*));
    for (G4_BB *bb : fg)
    {
        unsigned i = bb->getId();
        retLoc[i] = UNDEFINED_VAL;
        BBs[i] = bb;                                                     // BBs are sorted by ID
    }

    //
    // Firstly, keep the original algorithm unchanged to mark the retLoc
    //
    std::vector<G4_BB *> caller;                                          // just to accelerate the algorithm later

    for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++)
    {
        G4_BB* bb = BBs[i];
        if (bb->isEndWithCall() == false)
        {
            continue;
        }

#ifdef _DEBUG
        G4_INST *last = bb->empty() ? NULL : bb->back();
        MUST_BE_TRUE(last, ERROR_FLOWGRAPH);
#endif

        caller.push_back(bb);                   // record the callers, just to accelerate the algorithm

        G4_BB* subEntry = bb->getCalleeInfo()->getInitBB();
        if (retLoc[subEntry->getId()] != UNDEFINED_VAL) // a loc has been assigned to the subroutine
        {
            // Need to setSubRetLoc if subEntry is part of another subRoutine because,
            // in the final phase, we use SubRetLoc != UNDEFINED_VAL to indicate
            // a block is an entry of a subroutine.
            setSubRetLoc(subEntry, retLoc[subEntry->getId()]);
        }
        else
        {
            fg.prepareTraversal();
            unsigned loc = determineReturnAddrLoc(subEntry->getId(), retLoc, subEntry);
            if (loc != subEntry->getId())
            {
                retLoc[subEntry->getId()] = loc;
            }
            setSubRetLoc(subEntry, loc);
            //
            // We do not merge indirect call here, because it will create additional (bb->getSubRetLoc() != bb->getId())  cases that kill the share code detection
            //
        }

        // retBB is the exit basic block of callee, ie the block with return statement at end
        G4_BB* retBB = bb->getCalleeInfo()->getExitBB();

        if (retLoc[retBB->getId()] == UNDEFINED_VAL)
        {
            // retBB block was unreachable so retLoc element corresponding to that block was
            // left undefined
            retLoc[retBB->getId()] = getSubRetLoc(subEntry);
        }
    }
#ifdef DEBUG_VERBOSE_ON
    DEBUG_MSG(std::endl << "Before merge indirect call: " << std::endl);
    for (unsigned i = 0; i < fg.getNumBB(); i++)
        if (retLoc[i] == UNDEFINED_VAL) {
            DEBUG_MSG("BB" << i << ": X   ");
        }
        else {
            DEBUG_MSG("BB" << i << ": " << retLoc[i] << "   ");
        }
        DEBUG_MSG(std::endl);
#endif

        //
        // this final phase is needed. Consider the following scenario.  Sub2 shared code with both
        // Sub1 and Sub3. All three must use the same location to save return addresses. If we traverse
        // Sub1 then Sub3, retLoc[Sub1] and retLoc[Sub3] all point to their own roots.  As we traverse
        // Sub2, code sharing is detected, we need to this phase to make sure that Sub1 and Sub3 use the
        // same location.
        //
        // NOTE(review): the loop below computes 'loc' (the root of bb's
        // location chain) but never uses the result -- the loop body is
        // effectively dead code; presumably it was meant to update
        // retLoc/SubRetLoc. Confirm intent before removing or fixing.
        for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++)
        {
            G4_BB* bb = BBs[i];
            if (getSubRetLoc(bb) != UNDEFINED_VAL)
            {
                if (getSubRetLoc(bb) != bb->getId())
                {
                    unsigned loc = bb->getId();
                    while (retLoc[loc] != loc)  // not root
                        loc = retLoc[loc];  // follow the link to reach the root
                }
            }
        }

        //
        // Merge the retLoc in indirect call cases
        //
        for (G4_BB *bb : caller)
        {
            G4_INST *last = bb->empty() ? NULL : bb->back();
            MUST_BE_TRUE(last, ERROR_FLOWGRAPH);

            unsigned fallThroughId = bb->fallThroughBB() == NULL ? UNDEFINED_VAL : bb->fallThroughBB()->getId();
            // Indirect call: more successors than a direct (possibly
            // predicated) call would have.
            if ((last && last->getPredicate() == NULL && bb->Succs.size() > 1) || (last && last->getPredicate() != NULL && bb->Succs.size() > 2))
            {
                //
                // merge all subroutines to the last one, it is a trick to conduct the conditional call by using last one instead of first one
                //
                unsigned masterEntryId = bb->Succs.back()->getId();
                //
                // find the root of the master subroutine
                //
                unsigned masterRetLoc = masterEntryId;
                while (retLoc[masterRetLoc] != masterRetLoc)
                    masterRetLoc = retLoc[masterRetLoc];
                //
                // check other subroutines in one vertex
                //
                for (G4_BB *subBB : bb->Succs)
                {
                    if (subBB->getId() != masterEntryId && subBB->getId() != fallThroughId)
                    {
                        //
                        // find the root of the current subroutine
                        //
                        unsigned loc = subBB->getId();
                        while (retLoc[loc] != loc)
                            loc = retLoc[loc];
                        //
                        // Merge: let all the items in retLoc with value loc pointing to masterRetLoc
                        // Suppose indirect call X calls subroutine A and B, indirect call Y calls B and C, and indirect call Z calls C and D.
                        // Before merge, the A~D will be assigned different return location. Suppose we process the callers in order X-->Z-->Y in the merge,
                        // if we just modified the return locations of one indirect call, we will fail to merge the return locations of A~D.
                        //
                        if (loc != masterRetLoc)
                        {
                            for (unsigned i = 0; i < fg.getNumBB(); i++)
                                if (retLoc[i] == loc)
                                    retLoc[i] = masterRetLoc;
                        }
                    }
                }
            }
        }

#ifdef DEBUG_VERBOSE_ON
        DEBUG_MSG(std::endl << "After merge indirect call: " << std::endl);
        for (unsigned i = 0; i < fg.getNumBB(); i++)
            if (retLoc[i] == UNDEFINED_VAL) {
                DEBUG_MSG("BB" << i << ": X   ");
            }
            else {
                DEBUG_MSG("BB" << i << ": " << retLoc[i] << "   ");
            }
            DEBUG_MSG(std::endl << std::endl);
#endif

            //
            //  Assign ret loc for subroutines firstly, and then check if it is wrong (due to circle in call graph).
            //
            for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++)
            {
                //
                // reset the return BB's retLoc: path-compress every entry to
                // its root so each BB maps directly to its final location
                //
                unsigned loc = i;
                if (retLoc[i] != UNDEFINED_VAL)
                {
                    while (retLoc[loc] != loc)
                        loc = retLoc[loc];
                    retLoc[i] = loc;
                    setSubRetLoc(BBs[i], retLoc[loc]);
                }
            }

            for (G4_BB *bb : caller)
            {
                //
                // set caller BB's retLoc
                //
#ifdef _DEBUG
                G4_INST *last = bb->empty() ? NULL : bb->back();
                MUST_BE_TRUE(last, ERROR_FLOWGRAPH);
#endif
                G4_BB *subBB = bb->getCalleeInfo()->getInitBB();
                //
                // 1: Must use retLoc here, because some subBB is also the caller of another subroutine, so the entry loc in BB may be changed in this step
                // 2: In some cases, the caller BB is also the entry BB. At this time, the associated entry BB ID will be overwritten. However, it will not impact the
                // conflict detection and return location assignment, since we only check the return BB and/or caller BB in these two modules.
                //
                setSubRetLoc(bb, retLoc[subBB->getId()]);
            }

#ifdef _DEBUG
            for (unsigned i = 0; i < fg.getNumBB(); i++)
            {
                G4_BB* bb = BBs[i];
                if (getSubRetLoc(bb) != UNDEFINED_VAL)
                {
                    if (!bb->empty() && bb->front()->isLabel())
                    {
                        DEBUG_VERBOSE(((G4_Label*)bb->front()->getSrc(0))->getLabel()
                            << " assigned location " << getSubRetLoc(bb) << std::endl);
                    }
                }
            }
#endif

            //
            // detect the conflict (circle) at last
            //
            std::vector<unsigned> usedLoc(fg.getNumBB());
            unsigned stackTop = 0;
            for (G4_BB *bb : caller)
            {
                //
                // Must re-start the traversal from each caller, otherwise will lose some circle cases like TestRA_Call_1_1_3B, D, F, G, H
                //
                fg.prepareTraversal();

                usedLoc[stackTop] = getSubRetLoc(bb);

                G4_BB* subEntry = bb->Succs.back();

                if (isSubRetLocConflict(subEntry, usedLoc, stackTop + 1))
                {
                    MUST_BE_TRUE(false,
                        "ERROR: Fail to assign call-return variables due to cycle in call graph!");
                }
            }

            // finally materialize the save/restore instructions
            insertCallReturnVar();
}
12994 
insertCallReturnVar()12995 void GlobalRA::insertCallReturnVar()
12996 {
12997     for (auto bb : kernel.fg)
12998     {
12999         G4_INST *last = bb->empty() ? NULL : bb->back();
13000         if (last)
13001         {
13002             if (last->isCall())
13003             {
13004                 insertSaveAddr(bb);
13005             }
13006             else
13007             {
13008                 if (last->isReturn())
13009                 {
13010                     // G4_BB_EXIT_TYPE is just a dummy BB, and the return will be the last
13011                     // inst in each of its predecessors
13012                     insertRestoreAddr(bb);
13013                 }
13014             }
13015         }
13016     }
13017 }
13018 
insertSaveAddr(G4_BB * bb)13019 void GlobalRA::insertSaveAddr(G4_BB* bb)
13020 {
13021     MUST_BE_TRUE(bb != NULL, ERROR_INTERNAL_ARGUMENT);
13022     MUST_BE_TRUE(getSubRetLoc(bb) != UNDEFINED_VAL,
13023         ERROR_FLOWGRAPH); // must have a assigned loc
13024 
13025 
13026     G4_INST *last = bb->back();
13027     MUST_BE_TRUE1(last->isCall(), last->getLineNo(),
13028         ERROR_FLOWGRAPH);
13029     if (last->getDst() == NULL)
13030     {
13031         unsigned loc = getSubRetLoc(bb);
13032         G4_Declare* dcl = getRetDecl(loc);
13033 
13034         last->setDest(builder.createDst(dcl->getRegVar(), 0, 0, 1, Type_UD)); // RET__loc12<1>:ud
13035 
13036         last->setExecSize(g4::SIMD2);
13037     }
13038 }
13039 
insertRestoreAddr(G4_BB * bb)13040 void GlobalRA::insertRestoreAddr(G4_BB* bb)
13041 {
13042     MUST_BE_TRUE(bb != NULL, ERROR_INTERNAL_ARGUMENT);
13043 
13044     G4_INST *last = bb->back();
13045     MUST_BE_TRUE1(last->isReturn(), last->getLineNo(),
13046         ERROR_FLOWGRAPH);
13047     if (last->getSrc(0) == NULL)
13048     {
13049         unsigned loc = getSubRetLoc(bb);
13050         G4_Declare* dcl = getRetDecl(loc);
13051 
13052         G4_SrcRegRegion* new_src = builder.createSrc(
13053             dcl->getRegVar(),
13054             0,
13055             0,
13056             builder.createRegionDesc(0, 2, 1),
13057             Type_UD);
13058 
13059         last->setSrc(new_src, 0);
13060         last->setDest(builder.createNullDst(Type_UD));
13061 
13062         last->setExecSize(g4::SIMD2);
13063     }
13064 }
13065 
13066 // This function returns the weight of interference edge lr1--lr2,
13067 // which is used for computing the degree of lr1.
13068 //
13069 // When there is no alignment restriction, we should use the normal weight,
13070 // which is lr1_nreg + lr2_nreg - 1.
13071 //
// Otherwise, we need to take into account additional space that may be
13073 // required because of the alignment restriction. For example,
13074 // if lr1 has even alignment and lr2 has no alignment restriction,
13075 // we need to consider the following cases that would require the
13076 // maximal available GRF space for successful allocation:
13077 // 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
13078 //    the total space required would be (lr1_nreg + lr2_nreg + 1)
13079 // 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
13080 //    the total space required would be (lr1_nreg + lr2_nreg)
13081 // 3) lr1's size is even, lr2's size is odd and lr2's start position is odd,
13082 //    the total space required would be (lr1_nreg + lr2_nreg)
13083 // 4) lr1's size is even, lr2's size is even and lr2's start position is odd,
13084 //    the total space required would be (lr1_nreg + lr2_nreg + 1)
13085 // The above logic can be simplified to the following formula:
13086 //    lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2)
13087 //
13088 // If both lr1 and lr2 have even alignment restriction,
13089 // we need to consider the following cases that would require the
13090 // maximal available GRF space for successful allocation:
13091 // 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
13092 //    the total space required would be (lr1_nreg + lr2_nreg + 1)
13093 // 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
13094 //    the total space required would be (lr1_nreg + lr2_nreg)
13095 // 3) lr1's size is even, lr2's size is odd and lr2's start position is even,
13096 //    the total space required would be (lr1_nreg + lr2_nreg)
13097 // 4) lr1's size is even, lr2's size is even and lr2's start position is even,
13098 //    the total space required would be (lr1_nreg + lr2_nreg - 1)
13099 // The above logic can be simplified to the following formula:
13100 //    lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2)
13101 //
edgeWeightGRF(const LiveRange * lr1,const LiveRange * lr2)13102 unsigned GraphColor::edgeWeightGRF(const LiveRange* lr1, const LiveRange* lr2)
13103 {
13104     bool lr1EvenAlign = gra.isEvenAligned(lr1->getDcl());
13105     bool lr2EvenAlign = gra.isEvenAligned(lr2->getDcl());
13106     unsigned lr1_nreg = lr1->getNumRegNeeded();
13107     unsigned lr2_nreg = lr2->getNumRegNeeded();
13108 
13109     if (!lr1EvenAlign)
13110     {
13111         return lr1_nreg + lr2_nreg - 1;
13112     }
13113     else if (!lr2EvenAlign)
13114     {
13115         unsigned sum = lr1_nreg + lr2_nreg;
13116         return sum + 1 - ((sum) % 2);
13117     }
13118     else if (lr2EvenAlign)
13119     {
13120         return lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2);
13121     }
13122     else
13123     {
13124         assert(false && "should be unreachable");
13125         return 0;
13126     }
13127 }
13128 
// Edge weight for ARF (flag/address) live ranges. Mirrors edgeWeightGRF,
// but using subregister alignments (Even_Word / Four_Word / Eight_Word)
// instead of even-GRF alignment; see the comment above edgeWeightGRF for
// how the worst-case padding terms are derived.
unsigned GraphColor::edgeWeightARF(const LiveRange* lr1, const LiveRange* lr2)
{
    if (lr1->getRegKind() == G4_FLAG)
    {
        G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
        G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
        unsigned lr1_nreg = lr1->getNumRegNeeded();
        unsigned lr2_nreg = lr2->getNumRegNeeded();

        if (lr1_align == Any)
        {
            // no restriction: normal weight
            return  lr1_nreg + lr2_nreg - 1;
        }
        else if (lr1_align == Even_Word && lr2_align == Any)
        {
            // worst case costs one extra word when the combined size is odd
            return lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2);
        }
        else if (lr1_align == Even_Word && lr2_align == Even_Word)
        {
            if (lr1_nreg % 2 == 0 && lr2_nreg % 2 == 0)
            {
                // both sizes even and both even-aligned: they pack tightly
                return lr1_nreg + lr2_nreg - 2;
            }
            else
            {
                // round each odd size up to the next even boundary
                return lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2);
            }
        }
        else
        {
            MUST_BE_TRUE(false, "Found unsupported subRegAlignment in flag register allocation!");
            return 0;
        }
    }
    else if (lr1->getRegKind() == G4_ADDRESS)
    {
        G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
        G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
        unsigned lr1_nreg = lr1->getNumRegNeeded();
        unsigned lr2_nreg = lr2->getNumRegNeeded();

        if (lr1_align == Any)
        {
            // no restriction: normal weight
            return  lr1_nreg + lr2_nreg - 1;
        }
        else if (lr1_align == Four_Word && lr2_align == Any)
        {
            // pad the combined size up to the next 4-word boundary
            return lr1_nreg + lr2_nreg + 3 - (lr1_nreg + lr2_nreg) % 4;
        }
        else if (lr1_align == Four_Word && lr2_align == Four_Word)
        {
            // pad each size individually up to a 4-word boundary
            return lr1_nreg + lr2_nreg - 1 + (4 - lr1_nreg % 4) % 4 + (4 - lr2_nreg % 4) % 4;
        }
        else if (lr1_align == Eight_Word && lr2_align == Any)
        {
            // pad the combined size up to the next 8-word boundary
            return lr1_nreg + lr2_nreg + 7 - (lr1_nreg + lr2_nreg) % 8;
        }
        else if (lr1_align == Eight_Word && lr2_align == Four_Word)
        {
            // lr2's 4-word padding fits inside lr1's 8-word padding when the
            // latter is at least 4 words
            if (((8 - lr1_nreg % 8) % 8) >= 4)
                return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 - 4;
            return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
                (4 - lr2_nreg % 4) % 4;
        }
        else if (lr1_align == Eight_Word && lr2_align == Eight_Word)
        {
            // pad each size individually up to an 8-word boundary
            return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
                (8 - lr2_nreg % 8) % 8;
        }
        else
        {
            MUST_BE_TRUE(false, "Found unsupported subRegAlignment in address register allocation!");
            return 0;
        }
    }
    MUST_BE_TRUE(false, "Found unsupported ARF reg type in register allocation!");
    return 0;
}
13207 
void GlobalRA::fixSrc0IndirFcall()
{
    // Indirect calls look like:
    // mov (1|NM) V10    0x123456:ud
    // fcall (1) dst     V10 <-- V10 which is src0 contains %ip to jump to
    //
    // In this function, we want to set V10 to r125.0 which is same as dst of fcall
    // as per ABI. This way, when inserting save/restore code around fcall, no
    // special checks are needed to handle V10.
    //
    // But this works only if V10 is a local. If it is not a local we create a mov
    // that copies V10 in to a new temp variable. And then we map this temp
    // variable to r125.0. Hopefully V10 being global would be a rare occurrence.
    for (auto bb : kernel.fg)
    {
        if (bb->isEndWithFCall())
        {
            auto fcall = bb->back()->asCFInst();
            // only src0 that is a register region needs fixing
            if (!fcall->getSrc(0) ||
                !fcall->getSrc(0)->isSrcRegRegion())
                continue;

            auto src0Rgn = fcall->getSrc(0)->asSrcRegRegion();
            auto src0Dcl = src0Rgn->getBase()->asRegVar()->getDeclare();
            auto src0TopDcl = src0Rgn->getTopDcl();

            if (src0Dcl != src0TopDcl ||
                !isBlockLocal(src0TopDcl) ||
                src0TopDcl->getNumElems() > 1)
            {
                // src0 is an alias, not block-local, or wider than 1 element:
                // create a copy into a temp hardwired at the Ret_IP subregister
                auto tmpDcl = kernel.fg.builder->createHardwiredDeclare(1, src0Rgn->getType(), kernel.getFPSPGRF(),
                    IR_Builder::SubRegs_Stackcall::Ret_IP);
                auto dst = kernel.fg.builder->createDst(tmpDcl->getRegVar(), src0Rgn->getType());
                auto src = kernel.fg.builder->duplicateOperand(src0Rgn);
                auto copy = kernel.fg.builder->createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
                // insert the copy right before the fcall in this BB
                auto iter = std::find_if(bb->begin(), bb->end(), [](G4_INST* inst) { return inst->isFCall(); });
                bb->insertBefore(iter, copy);
                auto newSrc = kernel.fg.builder->createSrc(tmpDcl->getRegVar(), 0, 0, kernel.fg.builder->getRegionScalar(),
                    src0Rgn->getType());
                fcall->setSrc(newSrc, 0);
            }
            else
            {
                // block-local scalar src0: simply pin it to the same physical
                // register as the fcall's dst (per ABI)
                src0TopDcl->getRegVar()->setPhyReg(fcall->getDst()->getBase()->asRegVar()->getPhyReg(),
                    fcall->getDst()->getBase()->asRegVar()->getPhyRegOff());
            }
        }
    }
}
13258 
dump(const char * s,LiveRange ** lrs,unsigned size)13259 bool dump(const char* s, LiveRange** lrs, unsigned size)
13260 {
13261     // Utility function to dump lr from name.
13262     // Returns true if lr name found.
13263     std::string name = s;
13264     for (unsigned i = 0; i != size; i++)
13265     {
13266         auto lr = lrs[i];
13267         if (lr && name.compare(lr->getVar()->getName()) == 0)
13268         {
13269             lr->dump();
13270             return true;
13271         }
13272     }
13273     return false;
13274 }
13275 
dump(const char * s,const G4_Kernel * kernel)13276 bool dump(const char* s, const G4_Kernel* kernel)
13277 {
13278     // Utility function to dump dcl for given variable name.
13279     // Returns true if variable found.
13280     std::string name = s;
13281     for (auto dcl : kernel->Declares)
13282     {
13283         if (name.compare(dcl->getName()) == 0)
13284         {
13285             dcl->dump();
13286             return true;
13287         }
13288     }
13289     return false;
13290 }
13291 
dumpIntf(const char * s) const13292 bool Interference::dumpIntf(const char* s) const
13293 {
13294     // Utility function to dump intf for given variable based on name.
13295     // Returns true if variable found.
13296     std::cout << "\n\n **** Interference Table ****\n";
13297     for (unsigned i = 0; i < maxId; i++)
13298     {
13299         std::string name = lrs[i]->getVar()->getName();
13300         if (name.compare(s) == 0)
13301         {
13302             std::cout << "(" << i << ") ";
13303             lrs[i]->dump();
13304             std::cout << "\n";
13305             for (unsigned j = 0; j < maxId; j++)
13306             {
13307                 if (interfereBetween(i, j))
13308                 {
13309                     std::cout << "\t";
13310                     lrs[j]->getVar()->emit(std::cout);
13311                 }
13312             }
13313             std::cout << "\n";
13314             return true;
13315         }
13316     }
13317     return false;
13318 }
13319 
setAllocHint(unsigned h)13320 void LiveRange::setAllocHint(unsigned h)
13321 {
13322     if ((h + dcl->getNumRows()) <= gra.kernel.getNumRegTotal())
13323         allocHint = h;
13324 }
13325 
13326 // sortedIntervals comes from augmentation.
13327 // This can be invoked either post RA where phy regs are assigned to dcls,
13328 // or after assignColors with lrs and numLRs passed which makes this function
13329 // use temp allocations from lrs. Doesnt handle sub-routines yet.
dumpRegChart(std::ostream & os,LiveRange ** lrs,unsigned numLRs)13330 void RegChartDump::dumpRegChart(std::ostream& os, LiveRange** lrs, unsigned numLRs)
13331 {
13332     constexpr unsigned N = 128;
13333     std::unordered_map<G4_INST*, std::bitset<N>> busyGRFPerInst;
13334     bool dumpHex = false;
13335 
13336     auto getPhyReg = [&](const G4_Declare* dcl)
13337     {
13338         auto preg = dcl->getRegVar()->getPhyReg();
13339         if (preg)
13340             return preg;
13341 
13342         for (unsigned i = 0; i != numLRs; i++)
13343         {
13344             const LiveRange* lr = lrs[i];
13345             if (lr->getDcl() == dcl)
13346             {
13347                 preg = lr->getPhyReg();
13348                 break;
13349             }
13350         }
13351 
13352         return preg;
13353     };
13354 
13355     for (auto dcl : sortedLiveIntervals)
13356     {
13357         if (dcl->getRegFile() != G4_RegFileKind::G4_GRF &&
13358             dcl->getRegFile() != G4_RegFileKind::G4_INPUT)
13359             continue;
13360 
13361         auto phyReg = getPhyReg(dcl);
13362         if (!phyReg)
13363             continue;
13364 
13365         if (!phyReg->isGreg())
13366             continue;
13367 
13368         auto GRFStart = phyReg->asGreg()->getRegNum();
13369         auto numRows = dcl->getNumRows();
13370 
13371         auto startInst = startEnd[dcl].first;
13372         auto endInst = startEnd[dcl].second;
13373 
13374         bool start = (dcl->getRegFile() == G4_RegFileKind::G4_INPUT);
13375         bool done = false;
13376         for (auto bb : gra.kernel.fg)
13377         {
13378             for (auto inst : *bb)
13379             {
13380                 if (inst == startInst)
13381                 {
13382                     start = true;
13383                     continue;
13384                 }
13385 
13386                 if (!start)
13387                     continue;
13388 
13389                 for (unsigned i = GRFStart; i != (GRFStart + numRows); i++)
13390                 {
13391                     busyGRFPerInst[inst].set(i, true);
13392                 }
13393 
13394                 if (inst == endInst ||
13395                     endInst == startInst)
13396                 {
13397                     done = true;
13398                     break;
13399                 }
13400             }
13401 
13402             if (done)
13403                 break;
13404         }
13405     }
13406 
13407     // Now emit instructions with GRFs
13408     for (auto bb : gra.kernel.fg)
13409     {
13410         for (auto inst : *bb)
13411         {
13412             constexpr unsigned maxInstLen = 80;
13413             auto item = busyGRFPerInst[inst];
13414             std::stringstream ss;
13415             inst->emit(ss);
13416             auto len = ss.str().length();
13417 
13418             if (len <= maxInstLen)
13419             {
13420                 os << ss.str();
13421                 for (unsigned i = 0; i != maxInstLen - ss.str().length(); i++)
13422                     os << " ";
13423             }
13424             else
13425             {
13426                 auto tmpStr = ss.str();
13427                 auto limitedStr = tmpStr.substr(0, maxInstLen);
13428                 os << std::string(limitedStr);
13429             }
13430 
13431             os << "        ";
13432 
13433             if (!dumpHex)
13434             {
13435                 // dump GRFs | - busy, * - free
13436                 for (unsigned i = 0; i != N; i++)
13437                 {
13438                     // emit in groups of 10 GRFs
13439                     if (i > 0 && (i % 10) == 0)
13440                         os << "  ";
13441 
13442                     if (item[i] == true)
13443                         os << "|"; // busy
13444                     else
13445                         os << "*"; // free
13446                 }
13447             }
13448             else
13449             {
13450                 for (unsigned i = 0; i != N; i+=sizeof(unsigned short)*8)
13451                 {
13452                     unsigned short busyGRFs = 0;
13453                     for (unsigned j = 0; j != sizeof(unsigned short)*8; j++)
13454                     {
13455                         auto offset = i + j;
13456                         if (offset < N)
13457                         {
13458                             if (item[offset])
13459                                 busyGRFs |= (1 << j);
13460                         }
13461                     }
13462                     printf("r%d:%4x      ", i, busyGRFs);
13463                 }
13464             }
13465             os << std::endl;
13466         }
13467         os << std::endl;
13468     }
13469 }
13470 
recordLiveIntervals(const std::vector<G4_Declare * > & dcls)13471 void RegChartDump::recordLiveIntervals(const std::vector<G4_Declare*>& dcls)
13472 {
13473     sortedLiveIntervals = dcls;
13474     for (auto dcl : dcls)
13475     {
13476         auto start = gra.getStartInterval(dcl);
13477         auto end = gra.getEndInterval(dcl);
13478         startEnd.insert(std::make_pair(dcl, std::make_pair(start, end)));
13479     }
13480 }
13481