/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "BuildIR.h"
#include "DebugInfo.h"
#include "FlowGraph.h"
#include "GraphColor.h"
#include "LocalRA.h"
#include "LinearScanRA.h"
#include "Optimizer.h"
#include "SCCAnalysis.h"
#include "SpillCleanup.h"
#include "SpillCode.h"
#include "Rematerialization.h"
#include "RPE.h"
#include "Timer.h"

#include <algorithm>
#include <cmath> // sqrt
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include "SplitAlignedScalars.h"

using namespace vISA;

#define GRAPH_COLOR_MEM_SIZE (16 * 1024)
#define SCRATCH_MSG_LIMIT (128 * 1024)
#define FAIL_SAFE_RA_LIMIT 3

const RAVarInfo GlobalRA::defaultValues;
const char GlobalRA::StackCallStr[] = "StackCall";

static const unsigned IN_LOOP_REFERENCE_COUNT_FACTOR = 4;

#define BANK_CONFLICT_HEURISTIC_INST 0.04
#define BANK_CONFLICT_HEURISTIC_REF_COUNT 0.25
#define BANK_CONFLICT_HEURISTIC_LOOP_ITERATION 5
#define BANK_CONFLICT_SEND_INST_CYCLE 60 //Some sends take 200 cycles, some 400; we choose the smaller value
#define BANK_CONFLICT_SIMD8_OVERHEAD_CYCLE 1
#define BANK_CONFLICT_SIMD16_OVERHEAD_CYCLE 2
#define INTERNAL_CONFLICT_RATIO_HEURISTIC 0.25

#define NOMASK_BYTE 0x80

Interference::Interference(const LivenessAnalysis* l, LiveRange** const & lr, unsigned n, unsigned ns, unsigned nm,
    GlobalRA& g) : gra(g), kernel(g.kernel), lrs(lr),
    builder(*g.kernel.fg.builder), maxId(n), splitStartId(ns), splitNum(nm),
    liveAnalysis(l), rowSize(maxId / BITS_DWORD + 1)
{
}

inline bool Interference::varSplitCheckBeforeIntf(unsigned v1, unsigned v2) const
{
    const LiveRange * l1 = lrs[v1];
    const LiveRange * l2 = lrs[v2];

    if (!l1->getIsPartialDcl() &&
        !l2->getIsPartialDcl())
    {
        return false;
    }

    //Don't mark interference between two split declares
    if (l1->getIsPartialDcl() &&
        l2->getIsPartialDcl())
    {
        return true;
    }

    unsigned p1 = v1;
    unsigned p2 = v2;
    //Don't mark interference between child and parent declares
    if (l1->getIsPartialDcl())
    {
        p1 = l1->getParentLRID();
    }

    if (l2->getIsPartialDcl())
    {
        p2 = l2->getParentLRID();
    }

    if (p1 == p2)
    {
        return true;
    }

    return false;
}

BankConflict BankConflictPass::setupBankAccordingToSiblingOperand(BankConflict assignedBank, unsigned offset, bool oneGRFBank)
{
    BankConflict tgtBank;

    MUST_BE_TRUE(assignedBank != BANK_CONFLICT_NONE, "sibling bank is not assigned");

    //Set according to sibling
    tgtBank = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN || assignedBank == BANK_CONFLICT_FIRST_HALF_ODD) ?
        (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_SECOND_HALF_EVEN) :
        (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN);

    //Adjust according to the offset
    if (oneGRFBank)
    {
        if (offset % 2)
        {
            if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
                tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
            {
                tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
            }
            else
            {
                tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_EVEN;
            }
        }
    }
    else
    {
        if (offset % 4 >= 2)
        {
            if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
                tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
            {
                tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
            }
            else
            {
                tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_EVEN;
            }
        }
    }

    return tgtBank;
}

void refNumBasedSort(const unsigned *refNum, unsigned *index)
{
    if (refNum[2] > refNum[1])
    {
        index[0] = 2;
        index[1] = 1;
    }
    else
    {
        index[0] = 1;
        index[1] = 2;
    }

    index[2] = 0;

    return;
}

bool BankConflictPass::hasInternalConflict3Srcs(BankConflict *srcBC)
{
    if (((srcBC[0] == BANK_CONFLICT_SECOND_HALF_EVEN ||
        srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
        (srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
            srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
        (srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
            srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
        ((srcBC[0] == BANK_CONFLICT_SECOND_HALF_ODD ||
            srcBC[0] == BANK_CONFLICT_FIRST_HALF_ODD) &&
            (srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
                srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
            (srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
                srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD)))
    {
        return true;
    }
    if ((srcBC[0] < BANK_CONFLICT_SECOND_HALF_EVEN &&
        srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
        srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
        (srcBC[0] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
            srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
            srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN))
    {
        return true;
    }

    return false;
}

void BankConflictPass::setupEvenOddBankConflictsForDecls(G4_Declare * dcl_1, G4_Declare * dcl_2,
    unsigned offset1, unsigned offset2,
    BankConflict &srcBC1, BankConflict &srcBC2)
{
    ASSERT_USER(srcBC1 == BANK_CONFLICT_NONE, "Wrong Bank initial value");
    ASSERT_USER(srcBC2 == BANK_CONFLICT_NONE, "Wrong Bank initial value");

    unsigned refNum1 = gra.getNumRefs(dcl_1);
    unsigned refNum2 = gra.getNumRefs(dcl_2);

    BankConflict bank1 = BANK_CONFLICT_NONE;
    BankConflict bank2 = BANK_CONFLICT_NONE;

    bank1 = (refNum1 >= refNum2) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_ODD;
    bank2 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;

    srcBC1 = bank1;
    srcBC2 = bank2;

    //Adjust only for the single bank allocation
    if ((offset1 + offset2) % 2)
    {
        if (refNum1 >= refNum2)
        {
            bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        }
        else
        {
            bank1 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        }
    }

    gra.setBankConflict(dcl_1, bank1);
    gra.setBankConflict(dcl_2, bank2);

    return;
}


//
// The inst opcode is G4_mad. This function sets up a simple state machine to prevent a conflict
// between src1 and src2 of the mad inst. The GRF file is divided into banks as follows:
//    bank-block A = 0, 2, 4, 6, ..., 62
//    bank-block B = 1, 3, 5, 7, ..., 63
//    bank-block C = 64, 66, 68, ..., 126
//    bank-block D = 65, 67, 69, ..., 127
//
// For ternary ops, if src1 and src2 go to the same bank then there will be an access collision.
// Unary and binary ops have no collision, no matter which registers they use, because the
// second and third src operands are read in the same clock cycle, which is different from the
// cycle in which the src0 operand is read. This is true up to pre-SKL.
//
// Bank Conflict Heuristics:
//    1. Try to balance the used registers across the two banks for the potentially conflicting registers.
//    2. The reference number is used to decide which register is assigned first.
//    3. When a conflict is detected, the bank can be updated according to the reference count.
//
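// Illustrative example (register numbers are hypothetical): for
//     mad (8) dst, src0, src1, src2
// placing src1 in r10 and src2 in r12 puts both reads in bank-block A,
// which collides; nudging src2 to an odd GRF such as r13 (bank-block B)
// lets both operands be read in the same cycle without a conflict.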
void BankConflictPass::setupBankConflictsOneGRFOld(G4_INST* inst, int &bank1RegNum, int &bank2RegNum, float GRFRatio, unsigned &internalConflict)
{
    BankConflict srcBC[3];
    unsigned regNum[3];
    unsigned refNum[3];
    unsigned offset[3];
    G4_Declare * dcls[3];
    G4_Declare * opndDcls[3];
    int bank_num = 0;

    for (int i = 0; i < 3; i++)
    {
        dcls[i] = nullptr;
        opndDcls[i] = nullptr;

        G4_Operand* src = inst->getSrc(i);
        if (!src || !src->isSrcRegRegion() || src->isAccReg())
        {
            // bank conflict not possible
            return;
        }

        dcls[i] = GetTopDclFromRegRegion(src);
        opndDcls[i] = src->getBase()->asRegVar()->getDeclare();

        regNum[i] = dcls[i]->getNumRows();
        refNum[i] = gra.getNumRefs(dcls[i]);
        offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
        srcBC[i] = gra.getBankConflict(dcls[i]);

        if (src->getBase()->asRegVar()->isPhyRegAssigned())
        {
            unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
            if ((reg + offset[i]) < SECOND_HALF_BANK_START_GRF)
            {
                srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
            else
            {
                srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_SECOND_HALF_EVEN;
            }
            if (reg < SECOND_HALF_BANK_START_GRF)
            {
                bank1RegNum += regNum[i];
            }
            else
            {
                bank2RegNum += regNum[i];
            }
            gra.setBankConflict(dcls[i], srcBC[i]);
        }
        else if (srcBC[i] != BANK_CONFLICT_NONE)
        {
            if (offset[i] % 2)
            {
                //Derive the operand's bank from the declare's bank
                if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN ||
                    srcBC[i] == BANK_CONFLICT_FIRST_HALF_ODD)
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                }
                else
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_SECOND_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_SECOND_HALF_EVEN;
                }
            }
        }

        if (i > 0)
        {
            bank_num += srcBC[i];
        }
    }

    //In case src1 and src2 share the same declare, i.e. use the same register
    if (bank_num == 0 &&
        dcls[1] == dcls[2])
    {
        BankConflict bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum) ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;

        gra.setBankConflict(dcls[1], bank1);
        srcBC[1] = bank1;
        srcBC[2] = bank1;
        bank_num += bank1 * 2;
        if (bank1 < BANK_CONFLICT_SECOND_HALF_EVEN)
        {
            bank1RegNum += regNum[1];
        }
        else
        {
            bank2RegNum += regNum[1];
        }
    }

    //No bank assigned to src1 and src2:
    //assign the two declares to different bundles/banks.
    if (bank_num == 0)
    {
        BankConflict bank1 = BANK_CONFLICT_NONE;
        BankConflict bank2 = BANK_CONFLICT_NONE;
        bool bank1First = false;
        if (GRFRatio == 1.0)
        {
            //For global RA: Try to reduce the size of bank 2
            if ((float)refNum[1] / regNum[1] >= (float)refNum[2] / regNum[2])
            {
                bank1 = BANK_CONFLICT_SECOND_HALF_EVEN;
                bank2 = BANK_CONFLICT_FIRST_HALF_ODD;
                bank1First = true;
            }
            else
            {
                bank2 = BANK_CONFLICT_SECOND_HALF_EVEN;
                bank1 = BANK_CONFLICT_FIRST_HALF_ODD;
            }
        }
        else
        {
            //For local RA: Try to balance the two banks
            if (refNum[1] >= refNum[2])
            {
                bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum) ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;
                bank2 = (bank1 == BANK_CONFLICT_SECOND_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
                bank1First = true;
            }
            else
            {
                bank2 = (bank1RegNum * GRFRatio) > bank2RegNum ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;
                bank1 = (bank2 == BANK_CONFLICT_SECOND_HALF_EVEN) ? BANK_CONFLICT_FIRST_HALF_ODD : BANK_CONFLICT_SECOND_HALF_ODD;
            }
        }

        //Adjust only for the single bank allocation
        if ((offset[1] + offset[2]) % 2)
        {
            if (bank1First)
            {
                bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_ODD) ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_EVEN;
            }
            else
            {
                bank1 = (bank1 == BANK_CONFLICT_SECOND_HALF_ODD) ? BANK_CONFLICT_SECOND_HALF_EVEN : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
        }

        if (bank1 >= BANK_CONFLICT_SECOND_HALF_EVEN)
        {
            bank2RegNum += regNum[1];
            bank1RegNum += regNum[2];
        }
        else
        {
            bank1RegNum += regNum[1];
            bank2RegNum += regNum[2];
        }

        gra.setBankConflict(dcls[1], bank1);
        gra.setBankConflict(dcls[2], bank2);
    }
    else
    {
        if (srcBC[1] == BANK_CONFLICT_NONE || srcBC[2] == BANK_CONFLICT_NONE)
        {
            //One source operand has been assigned a bank already
            if (srcBC[2] == BANK_CONFLICT_NONE)
            {
                srcBC[2] = setupBankAccordingToSiblingOperand(srcBC[1], offset[2], true);
                gra.setBankConflict(dcls[2], srcBC[2]);

                if (srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN)
                    bank1RegNum += regNum[2];
                else
                    bank2RegNum += regNum[2];
            }
            else
            {
                srcBC[1] = setupBankAccordingToSiblingOperand(srcBC[2], offset[1], true);
                gra.setBankConflict(dcls[1], srcBC[1]);
                if (srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN)
                    bank1RegNum += regNum[1];
                else
                    bank2RegNum += regNum[1];
            }
        }
        else if (dcls[1] != dcls[2])
        {
            if (((srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
                srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
                (srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
                    srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
                ((srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
                    srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
                    (srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
                        srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD)))
            {
                internalConflict++;
            }
            if ((srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
                srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
                (srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
                    srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN))
            {
                internalConflict++;
            }
        }
    }

#ifdef DEBUG_VERBOSE_ON
    for (int i = 0; i < 3; i++)
    {
        if (opndDcls[i])
        {
            printf("%s, %s\n", opndDcls[i]->getName(), dcls[i]->getBankConflict() > 2 ?
                (dcls[i]->getBankConflict() == BANK_CONFLICT_SECOND_HALF_EVEN ? "HIGH_EVEN" : "HIGH_ODD") :
                dcls[i]->getBankConflict() > 0 ?
                (dcls[i]->getBankConflict() == BANK_CONFLICT_FIRST_HALF_EVEN ? "LOW_EVEN" : "LOW_ODD") : "NONE");
        }
    }
    printf("Bank1 number: %d; Bank2 number: %d\n", bank1RegNum, bank2RegNum);
#endif

    return;
}

void BankConflictPass::getBanks(G4_INST* inst, BankConflict *srcBC, G4_Declare **dcls, G4_Declare **opndDcls, unsigned *offset)
{
    for (int i = 0; i < 3; i++)
    {
        dcls[i] = nullptr;
        opndDcls[i] = nullptr;
        srcBC[i] = BANK_CONFLICT_NONE;

        G4_Operand* src = inst->getSrc(i);
        if (!src || !src->isSrcRegRegion() || src->isAccReg())
        {
            return;
        }

        dcls[i] = GetTopDclFromRegRegion(src);
        if (!dcls[i])
        {
            continue;
        }
        opndDcls[i] = src->getBase()->asRegVar()->getDeclare();

        offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
        srcBC[i] = gra.getBankConflict(dcls[i]);

        if (src->getBase()->asRegVar()->isPhyRegAssigned())
        {
            unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
            srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        }
        else if (srcBC[i] != BANK_CONFLICT_NONE)
        {
            if (offset[i] % 2)
            {
                if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                {
                    srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                }
                else
                {
                    srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                }
            }
        }
    }

    return;
}

void BankConflictPass::getPrevBanks(G4_INST* inst, BankConflict *srcBC, G4_Declare **dcls, G4_Declare **opndDcls, unsigned *offset)
{
    int execSize[G4_MAX_SRCS];

    for (int i = 1; i < 3; i++)
    {
        dcls[i] = nullptr;
        opndDcls[i] = nullptr;
        srcBC[i] = BANK_CONFLICT_NONE;

        G4_Operand* src = inst->getSrc(i);
        if (!src || !src->isSrcRegRegion())
        {
            return;
        }
        dcls[i] = GetTopDclFromRegRegion(src);
        if (dcls[i]->getRegFile() != G4_GRF)
        {
            return;
        }
        execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;

        opndDcls[i] = src->getBase()->asRegVar()->getDeclare();

        offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
        srcBC[i] = gra.getBankConflict(dcls[i]);

        if (src->getBase()->asRegVar()->isPhyRegAssigned())
        {
            unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
            srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        }
        else if (srcBC[i] != BANK_CONFLICT_NONE)
        {
            if (offset[i] % 2)
            {
                if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                {
                    srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                }
                else
                {
                    srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                }
            }
        }
        if (execSize[i] > 32)
        {
            srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        }
    }

    return;
}



void BankConflictPass::setupBankForSrc0(G4_INST* inst, G4_INST* prevInst)
{
    BankConflict srcBC[3];
    G4_Declare * dcls[3];
    G4_Declare * opndDcls[3];
    unsigned offset[3];

    BankConflict prevSrcBC[3];
    G4_Declare * prevDcls[3];
    G4_Declare * prevOpndDcls[3];
    unsigned prevOffset[3];

    if (prevInst->isSend() ||
        prevInst->isMath())
    {
        return;
    }

    getBanks(inst, srcBC, dcls, opndDcls, offset);
    getPrevBanks(prevInst, prevSrcBC, prevDcls, prevOpndDcls, prevOffset);

    if (dcls[0] != nullptr &&
        srcBC[0] == BANK_CONFLICT_NONE &&
        prevSrcBC[1] != BANK_CONFLICT_NONE &&
        prevSrcBC[2] != BANK_CONFLICT_NONE)
    {
        if (prevSrcBC[1] == prevSrcBC[2])
        {
            if (prevSrcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN)
            {
                srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_ODD;
            }
            else
            {
                srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            }

            gra.setBankConflict(dcls[0], srcBC[0]);
        }
    }

    return;
}

void BankConflictPass::setupBankConflictsforTwoGRFs(G4_INST* inst)
{
    BankConflict srcBC[3];
    unsigned refNum[3];
    unsigned offset[3];
    G4_Declare * dcls[3];
    G4_Declare * opndDcls[3];
    int bank_num = 0;
    int execSize[3];

    for (int i = 0; i < 3; i++)
    {
        dcls[i] = nullptr;
        opndDcls[i] = nullptr;
        execSize[i] = 0;

        G4_Operand* src = inst->getSrc(i);
        if (!src || !src->isSrcRegRegion() || src->isAccReg())
        {
            // bank conflict not possible
            return;
        }
        execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;

        dcls[i] = GetTopDclFromRegRegion(src);
        opndDcls[i] = src->getBase()->asRegVar()->getDeclare();

        refNum[i] = gra.getNumRefs(dcls[i]);
        offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
        srcBC[i] = gra.getBankConflict(dcls[i]);

        if (src->getBase()->asRegVar()->isPhyRegAssigned())
        {
            unsigned reg = src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
            srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            gra.setBankConflict(dcls[i], srcBC[i]);
        }
        else if (srcBC[i] != BANK_CONFLICT_NONE)
        {
            if (offset[i] % 2)
            {
                if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                {
                    srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                }
                else
                {
                    srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                }
            }
        }
        if (i != 0)
        {
            bank_num += srcBC[i];
        }
    }

    int simd8SrcNum = 0;
    for (int i = 0; i < 3; i++)
    {
        if (execSize[i] <= 32)
        {
            simd8SrcNum++;
        }
    }

    //In case src0, src1, and src2 all use the same declare, i.e. the same register
    if ((dcls[0] == dcls[1]) && (dcls[1] == dcls[2]))
    {
        return;
    }

    //No bank assigned to the src operands:
    //assign the two declares to different bundles/banks.
    if (simd8SrcNum <= 1) //All simd16, do even align
    {
        for (int i = 0; i < 3; i++)
        {
            if (execSize[i] > 32)
            {
                srcBC[i] = offset[i] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                gra.setBankConflict(dcls[i], srcBC[i]);
            }
        }
    }
    else if (bank_num == 0)
    {
        unsigned index[3];

        refNumBasedSort(refNum, index);

        if (dcls[index[0]] != dcls[index[1]])
        {
            setupEvenOddBankConflictsForDecls(dcls[index[0]], dcls[index[1]],
                offset[index[0]], offset[index[1]],
                srcBC[index[0]], srcBC[index[1]]);
        }
    }
    else
    {
        if (srcBC[1] != BANK_CONFLICT_NONE)
        {
            srcBC[2] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            if (offset[2] % 2)
            {
                srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
            gra.setBankConflict(dcls[2], srcBC[2]);
        }
        else
        {
            srcBC[1] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            if (offset[1] % 2)
            {
                srcBC[1] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
            gra.setBankConflict(dcls[1], srcBC[1]);
        }
    }

#ifdef DEBUG_VERBOSE_ON
    for (int i = 0; i < 3; i++)
    {
        if (opndDcls[i])
        {
            printf("%s, %s\n", opndDcls[i]->getName(), dcls[i]->getBankConflict() > 2 ?
                (dcls[i]->getBankConflict() == BANK_CONFLICT_SECOND_HALF_EVEN ? "HIGH_EVEN" : "HIGH_ODD") :
                dcls[i]->getBankConflict() > 0 ?
                (dcls[i]->getBankConflict() == BANK_CONFLICT_FIRST_HALF_EVEN ? "LOW_EVEN" : "LOW_ODD") : "NONE");
        }
    }
#endif

    return;
}

bool BankConflictPass::isOddOffset(unsigned offset) const
{
    if (gra.kernel.fg.builder->oneGRFBankDivision())
    {
        return (offset % 2);
    }
    else
    {
        return ((offset % 4) / 2);
    }
}
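
// For illustration: with one-GRF bank division, GRF offsets 0,1,2,3 land in
// banks E,O,E,O; with two-GRF bundles the pattern is E,E,O,O, so any offset
// with (offset % 4) >= 2 counts as "odd" here.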

void BankConflictPass::setupBankConflictsforDPAS(G4_INST* inst)
{
    BankConflict srcBC[3];
    unsigned refNum[3];
    unsigned offset[3];
    G4_Declare * dcls[3];
    G4_Declare * opndDcls[3];
    int bank_num = 0;

    if (!inst->isDpas())
    {
        return;
    }


    for (int i = 0; i < 3; i += 1)
    {
        opndDcls[i] = nullptr;

        G4_Operand* src = inst->getSrc(i);

        dcls[i] = GetTopDclFromRegRegion(src);
        if (dcls[i])
        {
            opndDcls[i] = src->getBase()->asRegVar()->getDeclare();

            refNum[i] = gra.getNumRefs(dcls[i]);
            offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
            srcBC[i] = gra.getBankConflict(dcls[i]);

            if (srcBC[i] != BANK_CONFLICT_NONE)
            {
                if (isOddOffset(offset[i]))
                {
                    if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                    {
                        srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                    }
                    else
                    {
                        srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                    }
                }
                if (i != 1)
                {
                    bank_num++;
                }
            }
        }
    }
    if (dcls[0] && dcls[1])
    {
        gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
        gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
    }
    if (dcls[1] && dcls[2])
    {
        gra.addBundleConflictDcl(dcls[2], dcls[1], offset[2] - offset[1]);
        gra.addBundleConflictDcl(dcls[1], dcls[2], offset[1] - offset[2]);
    }
#if 0
    if (gra.kernel.getOption(vISA_forceBCR) && dcls[0] && dcls[2])
    {
        gra.addBundleConflictDcl(dcls[2], dcls[0], offset[2] - offset[0]);
        gra.addBundleConflictDcl(dcls[0], dcls[2], offset[0] - offset[2]);
    }
#endif

    //In case src0 and src2 use the same declare, i.e. the same register
    if (dcls[0] == dcls[2] ||
        !dcls[0] || !dcls[2])
    {
        return;
    }

    if (bank_num == 0)
    {
        srcBC[0] = refNum[0] > refNum[2] ? BANK_CONFLICT_FIRST_HALF_EVEN : BANK_CONFLICT_SECOND_HALF_ODD;
        srcBC[2] = refNum[0] > refNum[2] ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        if (isOddOffset(offset[0]))
        {
            srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        }
        if (isOddOffset(offset[2]))
        {
            srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
        }
        gra.setBankConflict(dcls[0], srcBC[0]);
        gra.setBankConflict(dcls[2], srcBC[2]);

    }
    else if (bank_num == 1)
    {
        if (srcBC[0] != BANK_CONFLICT_NONE)
        {
            srcBC[2] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            if (isOddOffset(offset[2]))
            {
                srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
            gra.setBankConflict(dcls[2], srcBC[2]);
        }
        else
        {
            srcBC[0] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            if (offset[0] % 2)
            {
                srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
            }
            gra.setBankConflict(dcls[0], srcBC[0]);
        }
    }

#ifdef DEBUG_VERBOSE_ON
    for (int i = 0; i < 3; i += 2)
    {
        if (opndDcls[i])
        {
            printf("%s, ", opndDcls[i]->getName());

            if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN)
            {
                printf("%s\n", "EVEN");
            }
            else if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_SECOND_HALF_ODD)
            {
                printf("%s\n", "ODD");
            }
            else
            {
                printf("%s\n", "NONE");
            }
        }
    }
#endif

    return;
}

void BankConflictPass::setupBankConflictsforMad(G4_INST* inst)
{
    BankConflict srcBC[3];
    unsigned offset[3];
    G4_Declare * dcls[3];
    G4_Declare * opndDcls[3];
    BankConflict assignedBank = BANK_CONFLICT_NONE; //Flip for next

    for (int i = 0; i < 3; i += 1)
    {
        dcls[i] = nullptr;
        opndDcls[i] = nullptr;

        G4_Operand* src = inst->getSrc(i);
        if (!src || !src->isSrcRegRegion() || src->isAccReg())
        {
            // bank conflict not possible
            continue;
        }

        dcls[i] = GetTopDclFromRegRegion(src);
        opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
        offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) / numEltPerGRF<Type_UB>();
        srcBC[i] = gra.getBankConflict(dcls[i]);

        if (srcBC[i] != BANK_CONFLICT_NONE)
        {
            if (isOddOffset(offset[i]))
            {
                if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                {
                    srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                }
                else
                {
                    srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                }
            }
            if (assignedBank != BANK_CONFLICT_SECOND_HALF_EVEN)
            {
                if (assignedBank == BANK_CONFLICT_NONE)
                {
                    assignedBank = srcBC[i];
                }
                else if (assignedBank != srcBC[i])
                {
                    assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN; //BANK_CONFLICT_SECOND_HALF_EVEN is used to indicate that all banks are assigned
                }
            }
        }
    }

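    // Two passes over the sources (assumed intent): k == 0 considers only
    // local live ranges, k == 1 the remaining global ones, so locally
    // allocated operands get first pick of a bank.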
    for (int k = 0; k < 2; k++)
    {
        for (int i = 2; i != -1; i--)
        {
            if (!dcls[i])
            {
                continue;
            }

            LocalLiveRange* lr = gra.getLocalLR(dcls[i]);
            if (!lr ||
                (k == 0 && !lr->isLiveRangeLocal()))
            {
                continue;
            }

            if (k == 1 && lr->isLiveRangeLocal())
            {
                continue;
            }

            if (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN)
            {
                continue;
            }

            srcBC[i] = gra.getBankConflict(dcls[i]);
            if (srcBC[i] != BANK_CONFLICT_NONE)
            {
                if (isOddOffset(offset[i]))
                {
                    if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
                    {
                        srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
                    }
                    else
                    {
                        srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                    }
                }

                if (assignedBank == BANK_CONFLICT_NONE)
                {
                    assignedBank = srcBC[i];
                }
                else if (srcBC[i] != assignedBank)
                {
                    assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
                }

                continue;
            }

            if (assignedBank == BANK_CONFLICT_NONE)
            {
                srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
                assignedBank = srcBC[i];
                if (isOddOffset(offset[i]))
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                }
                gra.setBankConflict(dcls[i], srcBC[i]);
            }
            else
            {
                srcBC[i] = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                if (isOddOffset(offset[i]))
                {
                    srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) ? BANK_CONFLICT_SECOND_HALF_ODD : BANK_CONFLICT_FIRST_HALF_EVEN;
                }
                gra.setBankConflict(dcls[i], srcBC[i]);
                assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
            }
        }
    }

#ifdef DEBUG_VERBOSE_ON
    printf("$%d:\n", inst->getCISAOff());
    for (int i = 0; i < 3; i++)
    {
        if (dcls[i])
        {
            printf("%s, ", dcls[i]->getName());

            if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN)
            {
                printf("%s\n", "EVEN");
            }
            else if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_SECOND_HALF_ODD)
            {
                printf("%s\n", "ODD");
            }
            else
            {
                printf("%s\n", "NONE");
            }
        }
    }
    printf("\n");
#endif

    return;
}

void BankConflictPass::setupBankConflictsForBB(
    G4_BB* bb,
    unsigned &threeSourceInstNum,
    unsigned &sendInstNum,
    unsigned numRegLRA,
    unsigned &internalConflict)
{
    int bank1RegNum = 0;
    int bank2RegNum = 0;
    float GRFRatio = 0;
    G4_INST* prevInst = nullptr;

    if (numRegLRA)
    {
        GRFRatio = ((float)(numRegLRA - SECOND_HALF_BANK_START_GRF)) / SECOND_HALF_BANK_START_GRF;
    }

    for (auto i = bb->rbegin(), rend = bb->rend();
        i != rend;
        i++)
    {
        G4_INST* inst = (*i);
        if (inst->getNumSrc() == 3 && !inst->isSend())
        {
            threeSourceInstNum++;
            setupBankConflictsOneGRFOld(inst, bank1RegNum, bank2RegNum, GRFRatio, internalConflict);
        }
        if (inst->isSend() && !inst->isEOT())
        {
            //Why does only a data port read cause the issue?
            if (inst->getMsgDesc()->isRead())
            {
                sendInstNum++;
            }
        }
    }

    if ((float)threeSourceInstNum / bb->size() > 0.1)
    {
        if (!gra.kernel.fg.builder->lowHighBundle() && gra.kernel.fg.builder->hasEarlyGRFRead())
        {
            for (G4_INST* inst : *bb)
            {
                if (prevInst && inst->getNumSrc() == 3 && !inst->isSend())
                {
                    setupBankForSrc0(inst, prevInst);
                }
                prevInst = inst;
            }
        }
    }
}

void BankConflictPass::setupBankConflictsForBBTGL(
    G4_BB* bb,
    unsigned& threeSourceInstNum,
    unsigned& sendInstNum,
    unsigned numRegLRA,
    unsigned& internalConflict)
{
    float GRFRatio = 0;
    G4_INST* prevInst = nullptr;

    if (numRegLRA)
    {
        GRFRatio = ((float)(numRegLRA - SECOND_HALF_BANK_START_GRF)) / SECOND_HALF_BANK_START_GRF;
    }

    for (auto i = bb->rbegin(), rend = bb->rend();
        i != rend;
        i++)
    {
        G4_INST* inst = (*i);
        if (inst->isSend() || inst->isCFInst() || inst->isLabel() || inst->isOptBarrier())
        {
            if (inst->isSend() && !inst->isEOT())
            {
                // Why does only a data port read cause the issue?
                if (inst->getMsgDesc()->isRead())
                {
                    sendInstNum++;
                }
            }
            continue;
        }
        if (inst->getNumSrc() == 3)
        {
            threeSourceInstNum++;
            if (inst->isDpas())
            {
                hasDpasInst = true;
                setupBankConflictsforDPAS(inst);
            }
            else
            {
                setupBankConflictsforMad(inst);
            }
        }
        else if (gra.kernel.getOption(vISA_forceBCR) && !forGlobal && inst->getNumSrc() == 2)
        {
            threeSourceInstNum++;
            setupBankConflictsforMad(inst);
        }
    }

    if ((float)threeSourceInstNum / bb->size() > 0.1)
    {
        if (!gra.kernel.fg.builder->lowHighBundle() && gra.kernel.fg.builder->hasEarlyGRFRead())
        {
            for (G4_INST* inst : *bb)
            {
                if (prevInst && inst->getNumSrc() == 3 && !inst->isSend())
                {
                    setupBankForSrc0(inst, prevInst);
                }
                prevInst = inst;
            }
        }
    }
}

//Used for BB sorting according to loop nest level and BB size.
bool compareBBLoopLevel(G4_BB* bb1, G4_BB* bb2)
{
    if (bb1->getNestLevel() > bb2->getNestLevel())
    {
        return true;
    }
    else if (bb1->getNestLevel() == bb2->getNestLevel())
    {
        return bb1->size() > bb2->size();
    }

    return false;
}

/*
 * output:
 *    threeSourceCandidate: set when there are enough three-source instructions.
 *    return value: whether bank conflict reduction should be applied for round-robin RA.
 */
bool BankConflictPass::setupBankConflictsForKernel(bool doLocalRR, bool &threeSourceCandidate, unsigned numRegLRA, bool &highInternalConflict)
{
    unsigned threeSourceInstNumInKernel = 0;
    unsigned internalConflict = 0;
    unsigned instNumInKernel = 0;
    unsigned sendInstNumInKernel = 0;

    std::vector<G4_BB *> orderedBBs(gra.kernel.fg.cbegin(), gra.kernel.fg.cend());
    std::sort(orderedBBs.begin(), orderedBBs.end(), compareBBLoopLevel);

    for (auto bb : orderedBBs)
    {
        unsigned instNum = 0;
        unsigned sendInstNum = 0;
        unsigned threeSourceInstNum = 0;
        unsigned conflicts = 0;

        unsigned loopNestLevel = 0;

        if (gra.kernel.fg.builder->lowHighBundle())
        {
            setupBankConflictsForBB(bb, threeSourceInstNum, sendInstNum, numRegLRA, conflicts);
        }
        else
        {
            setupBankConflictsForBBTGL(bb, threeSourceInstNum, sendInstNum, numRegLRA, conflicts);
        }

        loopNestLevel = bb->getNestLevel() + 1;

        if (threeSourceInstNum)
        {
            instNum = (uint32_t)bb->size() * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
            threeSourceInstNum = threeSourceInstNum * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
            sendInstNum = sendInstNum * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
            conflicts = conflicts * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
            internalConflict += conflicts;
            threeSourceInstNumInKernel += threeSourceInstNum;
            instNumInKernel += instNum;
            sendInstNumInKernel += sendInstNum;
        }
    }

    if (!threeSourceInstNumInKernel ||
        (float)threeSourceInstNumInKernel / instNumInKernel < BANK_CONFLICT_HEURISTIC_INST)
    {
        return false;
    }

    highInternalConflict = ((float)internalConflict / threeSourceInstNumInKernel) > INTERNAL_CONFLICT_RATIO_HEURISTIC;

    //Bank conflict reduction is done only when there are enough three-source instructions.
    threeSourceCandidate = true;

    if (doLocalRR && sendInstNumInKernel)
    {
        if (!hasDpasInst && (sendInstNumInKernel > threeSourceInstNumInKernel))
        {
            return false;
        }
    }

    return true;
}

bool GlobalRA::areAllDefsNoMask(G4_Declare* dcl)
{
    bool retval = true;
    auto& maskUsed = getMask(dcl);
    if (maskUsed.size() > 0 &&
        getAugmentationMask(dcl) != AugmentationMasks::NonDefault)
    {
        auto byteSize = dcl->getByteSize();
        for (unsigned i = 0; i < byteSize; i++)
        {
            if (maskUsed[i] != NOMASK_BYTE)
            {
                retval = false;
                break;
            }
        }
    }
    else
    {
        if (getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
            retval = true;
        else
            retval = false;
    }
    return retval;
}

BankAlign GlobalRA::getBankAlign(const G4_Declare* dcl) const
{
    const IR_Builder* builder = kernel.fg.builder;
    switch (getBankConflict(dcl))
    {
    case BANK_CONFLICT_FIRST_HALF_EVEN:
    case BANK_CONFLICT_SECOND_HALF_EVEN:
        return builder->oneGRFBankDivision() ? BankAlign::Even : BankAlign::Even2GRF;
    case BANK_CONFLICT_FIRST_HALF_ODD:
    case BANK_CONFLICT_SECOND_HALF_ODD:
        return builder->oneGRFBankDivision() ? BankAlign::Odd : BankAlign::Odd2GRF;
    default:
        return BankAlign::Either;
    }
}

void GlobalRA::emitFGWithLiveness(const LivenessAnalysis& liveAnalysis) const
{
#ifdef DEBUG_VERBOSE_ON
    for (G4_BB* bb : kernel.fg)
    {
        DEBUG_VERBOSE(std::endl << "-----------------------------------------------------------------");
        DEBUG_VERBOSE(std::endl << "BB" << bb->getId() << ":");
        DEBUG_VERBOSE(std::endl << "Preds: ");
        for (const G4_BB* pred : bb->Preds)
        {
            DEBUG_VERBOSE("BB" << pred->getId() << ", ");
        }

        DEBUG_VERBOSE(std::endl << "Succs: ");
        for (const G4_BB* succ : bb->Succs)
        {
            DEBUG_VERBOSE("BB" << succ->getId() << ", ");
        }

        if (kernel.getOption(vISA_LocalRA))
        {
            if (auto summary = kernel.fg.getBBLRASummary(bb))
            {
                DEBUG_VERBOSE(std::endl << "Local RA: ");
                {
                    for (unsigned i = 0; i < kernel.getNumRegTotal(); i++)
                    {
                        if (summary->isGRFBusy(i))
                        {
                            DEBUG_VERBOSE("r" << i << ", ");
                        }
                    }
                }
            }
        }

        DEBUG_VERBOSE(std::endl << "Gen: ");
        for (const G4_Declare * dcl : kernel.Declares)
        {
            if (dcl->getAliasDeclare() != NULL)
                continue;

            if (dcl->getRegVar()->isRegAllocPartaker())
            {
                if (liveAnalysis.use_gen[bb->getId()].isSet(dcl->getRegVar()->getId()))
                {
                    DEBUG_VERBOSE(dcl->getName() << ", ");
                }
            }
        }

        DEBUG_VERBOSE(std::endl << "Kill: ");
        for (const G4_Declare * dcl : kernel.Declares)
        {
            if (dcl->getAliasDeclare() != NULL)
                continue;

            if (dcl->getRegVar()->isRegAllocPartaker())
            {
                if (liveAnalysis.use_kill[bb->getId()].isSet(dcl->getRegVar()->getId()))
                {
                    DEBUG_VERBOSE(dcl->getName() << ", ");
                }
            }
        }

        DEBUG_VERBOSE(std::endl << "Live-in: ");
        for (const G4_Declare * dcl : kernel.Declares)
        {
            if (dcl->getAliasDeclare() != NULL)
                continue;

            if (dcl->getRegVar()->isRegAllocPartaker())
            {
                if (liveAnalysis.isLiveAtEntry(bb, dcl->getRegVar()->getId()))
                {
                    DEBUG_VERBOSE(dcl->getName() << ", ");
                }
            }
        }

        DEBUG_VERBOSE(std::endl << "Live-out: ");
        for (const G4_Declare * dcl : kernel.Declares)
        {
            if (dcl->getAliasDeclare() != NULL)
                continue;

            if (dcl->getRegVar()->isRegAllocPartaker())
            {
                if (liveAnalysis.isLiveAtExit(bb, dcl->getRegVar()->getId()))
                {
                    DEBUG_VERBOSE(dcl->getName() << ", ");
                }
            }
        }

        DEBUG_VERBOSE(std::endl);

        bb->emit(COUT_ERROR);
    }
#endif
}

void GlobalRA::reportSpillInfo(const LivenessAnalysis& liveness, const GraphColor& coloring) const
{
    // Emit the interference graph of each spill candidate,
    // and if a spill candidate is a local range, emit its
    // start and end line numbers in the file
    std::ofstream optreport;
    getOptReportStream(optreport, coloring.getOptions());
    LiveRange** lrs = coloring.getLiveRanges();

    for (const vISA::LiveRange* slr : coloring.getSpilledLiveRanges())
    {
        if (slr->getRegKind() == G4_GRF) {
            const G4_RegVar* spillVar = slr->getVar();
            optreport << "Spill candidate " << spillVar->getName() << " intf:";
            optreport << "\t(" << spillVar->getDeclare()->getTotalElems() << "):" <<
                TypeSymbol(spillVar->getDeclare()->getElemType()) << std::endl;

            if (getLocalLR(spillVar->getDeclare()) != NULL)
            {
                if (getLocalLR(spillVar->getDeclare())->isLiveRangeLocal())
                {
                    int start, end;
                    unsigned dummy;
                    start = getLocalLR(spillVar->getDeclare())->getFirstRef(dummy)->getLineNo();
                    end = getLocalLR(spillVar->getDeclare())->getLastRef(dummy)->getLineNo();

                    optreport << "(Liverange is local starting at line #" << start <<
                        " and ending at line #" << end << ")" << std::endl;
                }
            }

            const Interference* intf = coloring.getIntf();
            unsigned spillVarId = slr->getVar()->getId();

            for (int i = 0; i < (int)liveness.getNumSelectedVar(); i++)
            {
                if (intf->interfereBetween(spillVarId, i))
                {
                    const G4_RegVar* intfRangeVar = lrs[i]->getVar();

                    optreport << "\t" << intfRangeVar->getName() << "(" <<
                        intfRangeVar->getDeclare()->getTotalElems() << "):" <<
                        TypeSymbol(intfRangeVar->getDeclare()->getElemType());

                    if (lrs[i]->getPhyReg() == NULL)
                    {
                        optreport << " --- spilled";
                    }

                    optreport << ", " << std::endl;
                }
            }

            optreport << std::endl << std::endl;
        }
    }

    closeOptReportStream(optreport);
}


LiveRange::LiveRange(G4_RegVar* v, GlobalRA& g) : var(v), dcl(v->getDeclare()), regKind(dcl->getRegFile()), gra(g)
{
    isCandidate = true;

    if (getRegKind() == G4_ADDRESS)
        numRegNeeded = v->getDeclare()->getNumElems() * v->getDeclare()->getElemSize() / G4_WSIZE;
    else if (getRegKind() == G4_FLAG)
    {
        // the number of elements is in words
        numRegNeeded = v->getDeclare()->getNumElems();
    }
    else
    {
        // number of GRFs
        numRegNeeded = v->getDeclare()->getNumRows();
    }
}

void LiveRange::checkForInfiniteSpillCost(G4_BB* bb, std::list<G4_INST*>::reverse_iterator& it)
{
    // The G4_INST at *it defines the live range object (this ptr).
    // If the next instruction of the iterator uses the same live range then
    // it may be a potential infinite spill cost candidate.
    // To confirm, the following requirements should be fulfilled:
    // a. this live range is not a global
    // b. this live range is defined/used in these 2 instructions only
    //
    // The idea is that for ranges marked with infinite spill cost,
    // coloring will attempt to put them on top of the stack so they
    // have a higher chance of getting a color. If a range that should
    // have infinite spill cost is not marked as such, the only
    // downside is extra compile time spent in inserting spill code
    // and then punting out when the later spilled code would cause
    // even more spills.
    //
    // The assumption is that the current live range is a current register
    // allocation candidate.
    //
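    // Illustrative shape of the pattern (operands are hypothetical): a local
    // range with exactly two references, a def immediately followed by its
    // only use,
    //     mov (8) TMP(0,0)<1>:d   V10(0,0)<8;8,1>:d
    //     add (8) V11(0,0)<1>:d   TMP(0,0)<8;8,1>:d   V12(0,0)<8;8,1>:d
    // spilling TMP cannot shorten its live range, so it is given infinite
    // spill cost.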
    G4_INST* curInst = (*it);

    // Skip the check if curInst is a pseudoKill.
    // Otherwise, it may invalidate a previously marked infinite
    // spill cost candidate, e.g.,
    // pseudo_kill (1) P1(0,0)[1]:uw [Align1]
    // mov (1) P1(0,0)[1]:uw TV1(8,0)[0;1,0]:uw [Align1, NoMask]
    // (+P1.0) sel (16) V65(0,0)[1]:f TV0(0,0)[0;1,0]:f 0:f [Align1, H1]
    if (curInst->isPseudoKill())
    {
        return;
    }

    // Check whether the dst variable is a global
    if (gra.isBlockLocal(this->getDcl()) == false)
    {
        isCandidate = false;
        isInfiniteCost = false;

        return;
    }

    G4_DstRegRegion* dst = curInst->getDst();
    // If the current instruction's dst is an indirect write then return
    if (dst &&
        dst->getRegAccess() == IndirGRF &&
        dst->getBase()->asRegVar()->getId() == this->getVar()->getId())
    {
        return;
    }

    // isCandidate is set to true only for the first definition ever seen.
    // If more than 1 def is found this gets set to false.
    const std::list<G4_INST*>::reverse_iterator rbegin = bb->rbegin();
    if (this->isCandidate == true && it != rbegin)
    {
        G4_INST* nextInst = NULL;
        if (this->getRefCount() != 2 ||
            (this->getRegKind() == G4_GRF && this->getDcl()->getAddressed() == true))
        {
            // If a live range has > 2 refs then it
            // cannot be a candidate.
            // Also an address-taken GRF is not a candidate.
            // This represents an early exit.
            isCandidate = false;
            isInfiniteCost = false;

            return;
        }

        // Skip all pseudo kills
        std::list<G4_INST*>::reverse_iterator next = it;
        while (true)
        {
            if (next == rbegin)
            {
                isCandidate = isInfiniteCost = false;
                return;
            }
            --next;

            // If this is not a pseudo-kill instruction, we have found
            // the desired next instruction. Otherwise, continue.
            nextInst = *next;
            if (!(nextInst->isPseudoKill()))
                break;
        }

        // Check whether this live range is used in nextInst
        for (unsigned i = 0; i < G4_MAX_SRCS; i++)
        {
            G4_Operand* src = nextInst->getSrc(i);

            if (src &&
                src->isSrcRegRegion() &&
                src->getBase()->isRegAllocPartaker())
            {
                // src can be Direct/Indirect
                G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();

                if (srcRgn->getRegAccess() == Direct &&
                    srcRgn->getBase()->isRegVar() &&
                    srcRgn->getBase()->asRegVar()->getId() == this->getVar()->getId())
                {
                    // Def-use found back-to-back
                    isInfiniteCost = true;
                    // Identify no more candidates
                    isCandidate = false;
                }
                else if (this->getRegKind() == G4_ADDRESS &&
                    srcRgn->getRegAccess() == IndirGRF &&
                    srcRgn->getBase()->isRegVar() &&
                    srcRgn->getBase()->asRegVar()->getId() == this->getVar()->getId())
                {
                    // Def-use found back-to-back
                    isInfiniteCost = true;
                    // Identify no more candidates
                    isCandidate = false;
                }
            }
        }

        G4_DstRegRegion* nextDst = nextInst->getDst();
        if (isCandidate == true &&
            this->getRegKind() == G4_ADDRESS &&
            nextDst &&
            nextDst->getRegAccess() == IndirGRF &&
            nextDst->getBase()->isRegVar() &&
            nextDst->getBase()->asRegVar()->isRegAllocPartaker() &&
            nextDst->getBase()->asRegVar()->getId() == this->getVar()->getId())
        {
            // Pattern found:
            // A0=
            // r[A0]=
            isInfiniteCost = true;
            // Identify no more candidates
            isCandidate = false;
        }

        if (isCandidate == true &&
            this->getRegKind() == G4_FLAG &&
            nextInst->getPredicate() &&
            nextInst->getPredicate()->getBase() &&
            nextInst->getPredicate()->getBase()->isRegVar() &&
            nextInst->getPredicate()->getBase()->asRegVar()->isRegAllocPartaker() &&
            nextInst->getPredicate()->getBase()->asRegVar()->getId() == this->getVar()->getId())
        {
            // Pattern found:
            // P0 = or cmp.P0 = <-- P0 defined
            // (P0) ... <-- P0 used as predicate
            isInfiniteCost = true;
            // Identify no more candidates
            isCandidate = false;
        }

#ifdef DEBUG_VERBOSE_ON
        if (isInfiniteCost == true)
        {
            DEBUG_VERBOSE("Marking " << this->getDcl()->getName() <<
                " as having infinite spill cost due to back-to-back def-use" << std::endl);
        }
#endif

        // Once a def is seen, stop looking for more defs
        isCandidate = false;
    }
    else
    {
#ifdef DEBUG_VERBOSE_ON
        if (isInfiniteCost == true)
        {
            DEBUG_VERBOSE("Unmarking " << this->getDcl()->getName() <<
                " as having infinite spill cost" << std::endl);
        }
#endif
        isCandidate = false;
        isInfiniteCost = false;
    }
}

//
// return true if live ranges v1 and v2 interfere
//
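// Layout note (a sketch of the dense case): only the upper triangle is
// stored, so the ids are swapped to make v1 <= v2; bit v2 of row v1 then
// lives at dword column (v2 / BITS_DWORD), bit (v2 % BITS_DWORD).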
bool Interference::interfereBetween(unsigned v1, unsigned v2) const
{
    if (v1 > v2)
    {
        std::swap(v1, v2);
    }

    if (useDenseMatrix())
    {
        unsigned col = v2 / BITS_DWORD;
        return matrix[v1 * rowSize + col] & (1 << (v2 % BITS_DWORD));
    }
    else
    {
        auto&& set = sparseMatrix[v1];
        return set.find(v2) != set.end();
    }
}

//
// init the live vector with all live ranges that are live at the exit.
// Also set the next sequential use of any live range that is live across
// to INT_MAX, to indicate that the live range does not have exclusive
// sequential uses and hence is not a candidate for being marked with an
// infinite spill cost.
//
void Interference::buildInterferenceAtBBExit(const G4_BB* bb, BitSet& live)
{

    // live must be empty at this point
    live = liveAnalysis->use_out[bb->getId()];
    live &= liveAnalysis->def_out[bb->getId()];
}
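
// Note on buildInterferenceAtBBExit (as read from the code above): intersecting
// use_out with def_out keeps only the ranges that are both defined and used
// across the BB exit, i.e. the ranges genuinely live out of the block.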

//
// Filter out partial or split declares in batch interference.
//
inline void Interference::filterSplitDclares(unsigned startIdx, unsigned endIdx, unsigned n, unsigned col, unsigned &elt, bool is_partial)
{

    if (is_partial) //Don't mark interference with the parent
    {
        unsigned rowSplited = n / BITS_DWORD;
        if (rowSplited == col)
        {
            elt &= ~(1 << (n % BITS_DWORD));
        }
    }

    //If current is a split dcl, don't mark interference with any of its child nodes.
    //If current is a partial dcl, don't mark interference with any other child nodes.
    if (col >= startIdx / BITS_DWORD && col < (endIdx / BITS_DWORD + 1))
    {
        unsigned selt = 0;
        unsigned start_id = col * BITS_DWORD > startIdx ? 0 : startIdx % BITS_DWORD;
        unsigned end_id = (col + 1) * BITS_DWORD > endIdx ? endIdx % BITS_DWORD : BITS_DWORD;

        for (unsigned i = start_id; i < end_id; i++)
        {
            selt |= 1 << i;
        }
        elt &= ~selt;
    }

    return;
}
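
// Worked example (made-up ids): with BITS_DWORD = 32, child ids 33..39
// (startIdx = 33, endIdx = 40) and col = 1 give start_id = 1 and end_id = 8,
// so selt covers bits 1..7 and those children are cleared from elt.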

//
// set interference for all live ranges that are currently live.
// For partial declares, the following rules are applied:
// a. the current partial declare does not interfere with any other partial declare
// b. the current parent declare does not interfere with its child declares (can a child declare interfere with the parent declare?)
// c. the current partial declare does not interfere with hybrid declares added by local RA; the reason is simple: those declares have already been assigned registers.
//
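// Traversal sketch (numbers are hypothetical): for i = 70 with BITS_DWORD = 32,
// live ids 0..63 are handled column-wise via safeSetInterference(curPos, i),
// ids 64..95 go through the transition dword with the split-variable checks,
// and ids 96 and above are recorded row-wise via setBlockInterferencesOneWay.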
void Interference::buildInterferenceWithLive(const BitSet& live, unsigned i)
{
    const LiveRange* lr = lrs[i];
    bool is_partial = lr->getIsPartialDcl();
    bool is_splitted = lr->getIsSplittedDcl();
    unsigned n = 0;

    // For a non-partial variable, mark interference with all variables
    unsigned numDwords = maxId / BITS_DWORD;
    unsigned numBits = maxId % BITS_DWORD;

    if (numBits)
    {
        numDwords++;
    }

    unsigned start_idx = 0;
    unsigned end_idx = 0;
    if (is_splitted) //If current is a split dcl, don't mark interference with any of its child nodes.
    {
        start_idx = lr->getDcl()->getSplitVarStartID();
        end_idx = start_idx + gra.getSplitVarNum(lr->getDcl());
    }

    if (is_partial) //If current is a partial dcl, don't mark interference with any other partial dcl or with its parent dcl.
    {
        n = gra.getSplittedDeclare(lr->getDcl())->getRegVar()->getId();
        start_idx = splitStartId;
        end_idx = splitStartId + splitNum;
    }

    unsigned colEnd = i / BITS_DWORD;

    // Set column bits in intf graph
    for (unsigned k = 0; k < colEnd; k++)
    {
        unsigned elt = live.getElt(k);

        if (elt != 0)
        {
            if (is_partial || is_splitted)
            {
                filterSplitDclares(start_idx, end_idx, n, k, elt, is_partial);
            }

            for (unsigned j = 0; j < BITS_DWORD; j++)
            {
                if (elt & (1 << j))
                {
                    unsigned curPos = j + (k * BITS_DWORD);
                    safeSetInterference(curPos, i);
                }
            }
        }
    }

    // Set the dword at the transition point from column to row
    unsigned elt = live.getElt(colEnd);
    //checkAndSetIntf guarantees the partial and split cases are handled
    if (elt != 0)
    {
        for (unsigned j = 0; j < BITS_DWORD; j++)
        {
            if (elt & (1 << j))
            {
                unsigned curPos = j + (colEnd * BITS_DWORD);
                if (!varSplitCheckBeforeIntf(i, curPos))
                {
                    checkAndSetIntf(i, curPos);
                }
            }
        }
    }

    colEnd++;
    // Set row intf graph
    for (unsigned k = colEnd; k < numDwords; k++)
    {
        unsigned elt = live.getElt(k);

        if (is_partial || is_splitted)
        {
            filterSplitDclares(start_idx, end_idx, n, k, elt, is_partial);
        }

        if (elt != 0)
        {
            setBlockInterferencesOneWay(i, k, elt);
        }
    }
}

void Interference::buildInterferenceWithSubDcl(unsigned lr_id, G4_Operand *opnd, BitSet& live, bool setLive, bool setIntf)
{

    const G4_Declare *dcl = lrs[lr_id]->getDcl();
    for (const G4_Declare *subDcl : gra.getSubDclList(dcl))
    {
        unsigned leftBound = gra.getSubOffset(subDcl);
        unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
        if (!(opnd->getRightBound() < leftBound || rightBound < opnd->getLeftBound()))
        {
            int subID = subDcl->getRegVar()->getId();

            if (setIntf)
            {
                buildInterferenceWithLive(live, subID);
            }
            if (setLive)
            {
                live.set(subID, true);
            }
        }
    }

    return;
}

void Interference::buildInterferenceWithAllSubDcl(unsigned v1, unsigned v2)
{
    const G4_Declare * d1 = lrs[v1]->getDcl();
    const G4_Declare * d2 = lrs[v2]->getDcl();

    if (d1->getIsSplittedDcl() && !d2->getIsPartialDcl())
    {
        for (const G4_Declare *subDcl : gra.getSubDclList(d1))
        {
            int subID = subDcl->getRegVar()->getId();
            checkAndSetIntf(v2, subID);
        }
    }

    if (d2->getIsSplittedDcl() && !d1->getIsPartialDcl())
    {
        for (const G4_Declare *subDcl : gra.getSubDclList(d2))
        {
            int subID = subDcl->getRegVar()->getId();
            checkAndSetIntf(v1, subID);
        }
    }

    return;
}
//
// Bias the live ranges in "live" to be assigned the callee-save registers, as they
// are live through a stack call. Exclude file-scope variables, as they are always
// saved/restored before/after the call and are better assigned to the caller-save space.
//
void Interference::addCalleeSaveBias(const BitSet& live)
{
    for (unsigned i = 0; i < maxId; i++)
    {
        if (live.isSet(i))
        {
            lrs[i]->setCallerSaveBias(false);
            lrs[i]->setCalleeSaveBias(true);
        }
    }
}
1913
buildInterferenceAmongLiveOuts()1914 void Interference::buildInterferenceAmongLiveOuts()
1915 {
1916 // Mark interference between dcls marked as Output.
1917 //
1918     // Interference computation marks interference for a
1919     // variable only when a definition for that variable is
1920     // seen, not otherwise.
1921     //
1922     // This method is useful when definitions of such
1923     // "Output" variables are emitted to the program post-RA.
1924     //
1925     // It is safe to mark interference between all "Output"
1926     // dcls even when their definitions are present in the program.
1927 
1928     // First gather all Output dcls in a vector to avoid an O(N^2)
1929     // lookup. The number of Output dcls should be small.
1930 std::vector<G4_Declare*> OutputDcls;
1931 for (auto dcl : kernel.Declares)
1932 {
1933 if (!dcl->getRegVar()->isRegAllocPartaker() ||
1934 !dcl->isOutput())
1935 continue;
1936
1937 OutputDcls.push_back(dcl);
1938 }
1939
1940 for (auto dcl1 : OutputDcls)
1941 {
1942         // dcl1 is an RA partaker and is marked as Output
1943 for (auto dcl2 : OutputDcls)
1944 {
1945 if (dcl1 == dcl2)
1946 continue;
1947
1948 checkAndSetIntf(dcl1->getRegVar()->getId(), dcl2->getRegVar()->getId());
1949 }
1950 }
1951 }
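// A worked illustration: with three Output dcls A, B and C, the nested loop
// above adds the edges (A,B), (A,C) and (B,C); the reverse visits such as
// (B,A) set the same edge again, which is harmless.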
1952
1953 void Interference::buildInterferenceAmongLiveIns()
1954 {
1955 //
1956     // Build interference between all live-ins. If live-ins are only
1957     // read then their interference will have been skipped in an earlier phase.
1958     // For example, args and globals are both live-in, and both may have
1959     // only uses in the function and no def.
1960 //
1961 const G4_BB* entryBB = kernel.fg.getEntryBB();
1962
1963
1964 for (unsigned i = 0; i < liveAnalysis->getNumSelectedGlobalVar(); i++)
1965 {
1966 if (liveAnalysis->isLiveAtEntry(entryBB, i))
1967 {
1968             //Marking references cannot guarantee that all the variables are local; update here
1969 if (lrs[i]->getDcl()->getIsSplittedDcl())
1970 {
1971 lrs[i]->getDcl()->setIsSplittedDcl(false);
1972 lrs[i]->setIsSplittedDcl(false);
1973 }
1974
1975 for (unsigned j = i + 1; j < liveAnalysis->getNumSelectedGlobalVar(); j++)
1976 {
1977 if (liveAnalysis->isLiveAtEntry(entryBB, j))
1978 {
1979 if (lrs[i]->getDcl()->getRegFile() == G4_INPUT &&
1980 lrs[i]->getVar()->getPhyReg() != NULL &&
1981 lrs[j]->getDcl()->getRegFile() == G4_INPUT &&
1982 lrs[j]->getVar()->getPhyReg() != NULL)
1983 {
1984 continue;
1985 }
1986 else
1987 {
1988 if (!varSplitCheckBeforeIntf(i, j))
1989 {
1990 checkAndSetIntf(i, j);
1991 }
1992 }
1993 }
1994 }
1995 }
1996 }
1997 }
1998
1999 void Interference::markInterferenceForSend(G4_BB* bb,
2000 G4_INST* inst,
2001 G4_DstRegRegion* dst)
2002 {
2003 bool isDstRegAllocPartaker = false;
2004 bool isDstLocallyAssigned = false;
2005 unsigned dstId = 0;
2006 int dstPreg = 0, dstNumRows = 0;
2007
2008 if (dst->getBase()->isRegVar())
2009 {
2010 if (dst->getBase()->isRegAllocPartaker())
2011 {
2012 G4_DstRegRegion* dstRgn = dst;
2013 isDstRegAllocPartaker = true;
2014 dstId = ((G4_RegVar*)dstRgn->getBase())->getId();
2015 }
2016 else if (kernel.getOption(vISA_LocalRA))
2017 {
2018 LocalLiveRange* localLR = NULL;
2019 G4_Declare* topdcl = GetTopDclFromRegRegion(dst);
2020
2021 if (topdcl)
2022 localLR = gra.getLocalLR(topdcl);
2023
2024 if (localLR && localLR->getAssigned())
2025 {
2026 int sreg;
2027 G4_VarBase* preg = localLR->getPhyReg(sreg);
2028
2029 MUST_BE_TRUE(preg->isGreg(), "Register in dst was not GRF");
2030
2031 isDstLocallyAssigned = true;
2032 dstPreg = preg->asGreg()->getRegNum();
2033 dstNumRows = localLR->getTopDcl()->getNumRows();
2034 }
2035 }
2036
2037 if (isDstRegAllocPartaker || isDstLocallyAssigned)
2038 {
2039 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
2040 {
2041 G4_Operand* src = inst->getSrc(j);
2042 if (src != NULL &&
2043 src->isSrcRegRegion() &&
2044 src->asSrcRegRegion()->getBase()->isRegVar())
2045 {
2046 if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
2047 {
2048 unsigned srcId = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
2049
2050 if (isDstRegAllocPartaker)
2051 {
2052 if (!varSplitCheckBeforeIntf(dstId, srcId))
2053 {
2054 checkAndSetIntf(dstId, srcId);
2055 buildInterferenceWithAllSubDcl(dstId, srcId);
2056 }
2057 }
2058 else
2059 {
2060 for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++)
2061 {
2062 int k = getGRFDclForHRA(j)->getRegVar()->getId();
2063 if (!varSplitCheckBeforeIntf(k, srcId))
2064 {
2065 checkAndSetIntf(k, srcId);
2066 buildInterferenceWithAllSubDcl(k, srcId);
2067 }
2068 }
2069 }
2070 }
2071 else if (kernel.getOption(vISA_LocalRA) && isDstRegAllocPartaker)
2072 {
2073 LocalLiveRange* localLR = nullptr;
2074 const G4_Declare* topdcl = GetTopDclFromRegRegion(src);
2075
2076 if (topdcl)
2077 localLR = gra.getLocalLR(topdcl);
2078
2079 if (localLR && localLR->getAssigned())
2080 {
2081 int sreg;
2082 G4_VarBase* preg = localLR->getPhyReg(sreg);
2083 int numrows = localLR->getTopDcl()->getNumRows();
2084
2085 MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");
2086
2087 int reg = preg->asGreg()->getRegNum();
2088
2089 for (int j = reg, sum = reg + numrows; j < sum; j++)
2090 {
2091 int k = getGRFDclForHRA(j)->getRegVar()->getId();
2092 if (!varSplitCheckBeforeIntf(dstId, k))
2093 {
2094 checkAndSetIntf(dstId, k);
2095 buildInterferenceWithAllSubDcl(dstId, k);
2096 }
2097 }
2098 }
2099 }
2100 }
2101 }
2102 }
2103 }
2104 }
2105
2106 void Interference::markInterferenceToAvoidDstSrcOverlap(G4_BB* bb,
2107 G4_INST* inst)
2108 {
2109 bool isDstRegAllocPartaker = false;
2110 bool isDstLocallyAssigned = false;
2111 unsigned dstId = 0;
2112 int dstPreg = 0, dstNumRows = 0;
2113 bool dstOpndNumRows = false;
2114
2115 G4_DstRegRegion* dst = inst->getDst();
2116 if (dst->getBase()->isRegVar() && (dst->getTopDcl()->getRegFile() == G4_GRF))
2117 {
2118 G4_Declare* dstDcl = dst->getTopDcl();
2119 int dstOffset = dst->getLeftBound() / numEltPerGRF<Type_UB>();
2120 bool isDstEvenAlign = gra.isEvenAligned(dstDcl);
2121
2122 if (dst->getBase()->isRegAllocPartaker())
2123 {
2124 isDstRegAllocPartaker = true;
2125 dstId = ((G4_RegVar*)dst->getBase())->getId();
2126 dstOpndNumRows = dst->getLinearizedEnd() - dst->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
2127 }
2128 else if (kernel.getOption(vISA_LocalRA))
2129 {
2130 LocalLiveRange* localLR = NULL;
2131 G4_Declare* topdcl = GetTopDclFromRegRegion(dst);
2132
2133 if (topdcl)
2134 localLR = gra.getLocalLR(topdcl);
2135 if (localLR && localLR->getAssigned())
2136 {
2137 int sreg;
2138 G4_VarBase* preg = localLR->getPhyReg(sreg);
2139
2140 MUST_BE_TRUE(preg->isGreg(), "Register in dst was not GRF");
2141
2142 isDstLocallyAssigned = true;
2143 dstPreg = preg->asGreg()->getRegNum();
2144 dstNumRows = localLR->getTopDcl()->getNumRows();
2145 dstOpndNumRows = dst->getLinearizedEnd() - dst->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
2146 isDstEvenAlign = (dstPreg % 2 == 0);
2147 }
2148 }
2149
2150 if (isDstRegAllocPartaker || isDstLocallyAssigned)
2151 {
2152 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
2153 {
2154 if (inst->isDpas() && j != 1)
2155 continue;
2156 G4_Operand* src = inst->getSrc(j);
2157 if (src != NULL &&
2158 src->isSrcRegRegion() &&
2159 src->asSrcRegRegion()->getBase()->isRegVar() )
2160 {
2161 G4_SrcRegRegion* srcRgn = src->asSrcRegRegion();
2162 G4_Declare* srcDcl = src->getTopDcl();
2163 if (srcRgn->getRegAccess() == Direct &&
2164 (src->getTopDcl()->getRegFile() == G4_GRF || src->getTopDcl()->getRegFile() == G4_INPUT))
2165 {
2166 int srcOffset = src->getLeftBound() / numEltPerGRF<Type_UB>();
2167 bool srcOpndNumRows = srcRgn->getLinearizedEnd() - srcRgn->getLinearizedStart() + 1 > numEltPerGRF<Type_UB>();
2168
2169 int srcReg = 0;
2170 bool isSrcEvenAlign = gra.isEvenAligned(srcDcl);
2171 if (!src->asSrcRegRegion()->getBase()->isRegAllocPartaker() &&
2172 kernel.getOption(vISA_LocalRA))
2173 {
2174 int sreg;
2175 LocalLiveRange* localLR = NULL;
2176 G4_Declare* topdcl = GetTopDclFromRegRegion(src);
2177
2178 if (topdcl)
2179 localLR = gra.getLocalLR(topdcl);
2180 if (localLR && localLR->getAssigned())
2181 {
2182 G4_VarBase* preg = localLR->getPhyReg(sreg);
2183
2184 MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");
2185 srcReg = preg->asGreg()->getRegNum();
2186 isSrcEvenAlign = (srcReg % 2 == 0);
2187 }
2188 }
2189
2190 if (srcDcl->getRegFile() == G4_INPUT &&
2191 srcDcl->getRegVar()->getPhyReg() != NULL &&
2192 srcDcl->getRegVar()->getPhyReg()->isGreg())
2193 {
2194 srcReg = srcDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
2195 isSrcEvenAlign = (srcReg % 2 == 0);
2196 }
2197
2198 if (dstOpndNumRows || srcOpndNumRows)
2199 {
2200 if (!(isDstEvenAlign && isSrcEvenAlign &&
2201 srcOffset % 2 == dstOffset % 2 &&
2202 dstOpndNumRows && srcOpndNumRows))
2203 {
2204 if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
2205 {
2206 unsigned srcId = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
2207 #ifdef DEBUG_VERBOSE_ON
2208 printf("Src%d ", j);
2209 inst->dump();
2210 #endif
2211 if (isDstRegAllocPartaker)
2212 {
2213 if (!varSplitCheckBeforeIntf(dstId, srcId))
2214 {
2215 checkAndSetIntf(dstId, srcId);
2216 buildInterferenceWithAllSubDcl(dstId, srcId);
2217 }
2218 }
2219 else
2220 {
2221 for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++)
2222 {
2223 int k = getGRFDclForHRA(j)->getRegVar()->getId();
2224 if (!varSplitCheckBeforeIntf(k, srcId))
2225 {
2226 checkAndSetIntf(k, srcId);
2227 buildInterferenceWithAllSubDcl(k, srcId);
2228 }
2229 }
2230 }
2231 }
2232 else if (kernel.getOption(vISA_LocalRA) && isDstRegAllocPartaker)
2233 {
2234 LocalLiveRange* localLR = NULL;
2235 G4_Declare* topdcl = GetTopDclFromRegRegion(src);
2236
2237 if (topdcl)
2238 localLR = gra.getLocalLR(topdcl);
2239
2240 if (localLR && localLR->getAssigned())
2241 {
2242 int reg, sreg, numrows;
2243 G4_VarBase* preg = localLR->getPhyReg(sreg);
2244 numrows = localLR->getTopDcl()->getNumRows();
2245
2246 MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");
2247
2248 reg = preg->asGreg()->getRegNum();
2249 #ifdef DEBUG_VERBOSE_ON
2250 printf("Src%d ", j);
2251 inst->dump();
2252 #endif
2253 for (int j = reg, sum = reg + numrows; j < sum; j++)
2254 {
2255 int k = getGRFDclForHRA(j)->getRegVar()->getId();
2256 if (!varSplitCheckBeforeIntf(dstId, k))
2257 {
2258 checkAndSetIntf(dstId, k);
2259 buildInterferenceWithAllSubDcl(dstId, k);
2260 }
2261 }
2262 }
2263 }
2264 }
2265 }
2266 }
2267 else if (srcRgn->getRegAccess() == IndirGRF)
2268 {
2269                         // add interference with every var in the points-to set
2270 const REGVAR_VECTOR& pointsToSet = liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRgn, bb);
2271 for (auto pt : pointsToSet)
2272 {
2273 if (pt.var->isRegAllocPartaker())
2274 {
2275 unsigned srcId = pt.var->getId();
2276 if (isDstRegAllocPartaker)
2277 {
2278 if (!varSplitCheckBeforeIntf(dstId, srcId))
2279 {
2280 checkAndSetIntf(dstId, srcId);
2281 buildInterferenceWithAllSubDcl(dstId, srcId);
2282 }
2283 }
2284 else
2285 {
2286 for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++)
2287 {
2288 int k = getGRFDclForHRA(j)->getRegVar()->getId();
2289 if (!varSplitCheckBeforeIntf(k, srcId))
2290 {
2291 checkAndSetIntf(k, srcId);
2292 buildInterferenceWithAllSubDcl(k, srcId);
2293 }
2294 }
2295 }
2296 }
2297 }
2298 }
2299 }
2300 }
2301 }
2302 }
2303 }
2304
2305 uint32_t GlobalRA::getRefCount(int loopNestLevel)
2306 {
2307 if (loopNestLevel == 0)
2308 {
2309 return 1;
2310 }
2311 return (uint32_t)std::pow(IN_LOOP_REFERENCE_COUNT_FACTOR, std::min(loopNestLevel, 8));
2312 }
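// A worked example of the weighting above: with IN_LOOP_REFERENCE_COUNT_FACTOR
// == 4, a reference at loop nest level 1 is weighted 4^1 = 4, level 3 is
// weighted 4^3 = 64, and nest levels of 8 or more saturate at 4^8 = 65536.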
2313
2314 // handle return value interference for fcall
2315 void Interference::buildInterferenceForFcall(G4_BB* bb, BitSet& live, G4_INST* inst, std::list<G4_INST*>::reverse_iterator i, const G4_VarBase* regVar)
2316 {
2317 assert(inst->opcode() == G4_pseudo_fcall && "expect fcall inst");
2318 unsigned refCount = GlobalRA::getRefCount(kernel.getOption(vISA_ConsiderLoopInfoInRA) ?
2319 bb->getNestLevel() : 0);
2320
2321 if (regVar->isRegAllocPartaker())
2322 {
2323 unsigned id = static_cast<const G4_RegVar*>(regVar)->getId();
2324 lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
2325
2326 buildInterferenceWithLive(live, id);
2327 updateLiveness(live, id, false);
2328 }
2329 }
2330
2331 bool GlobalRA::isReRAPass()
2332 {
2333 auto gtPinInfo = kernel.getGTPinData();
2334 bool reRAPass = gtPinInfo && gtPinInfo->isReRAPass();
2335 return reRAPass;
2336 }
2337
2338 void Interference::buildInterferenceForDst(G4_BB* bb, BitSet& live, G4_INST* inst, std::list<G4_INST*>::reverse_iterator i, G4_DstRegRegion* dst)
2339 {
2340 unsigned refCount = GlobalRA::getRefCount(kernel.getOption(vISA_ConsiderLoopInfoInRA) ?
2341 bb->getNestLevel() : 0);
2342
2343 if (dst->getBase()->isRegAllocPartaker())
2344 {
2345 unsigned id = ((G4_RegVar*)dst->getBase())->getId();
2346 //
2347         // In the following code,
2348 // pseudo_kill V10
2349 // mov (8) V10, V11
2350 //
2351 // V10 and V11 do not interfere and can be assigned
2352 // same register.
2353 //
2354         // The following condition skips marking interference for
2355         // pseudo_kill nodes.
2356 //
2357 if (!inst->isPseudoKill() &&
2358 !inst->isLifeTimeEnd())
2359 {
2360 lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount); // update reference count
2361
2362 buildInterferenceWithLive(live, id);
2363 if (lrs[id]->getIsSplittedDcl())
2364 {
2365 buildInterferenceWithSubDcl(id, (G4_Operand *)dst, live, false, true);
2366 }
2367 }
2368
2369 //
2370         // if the write does not cover the whole dst region, we should continue to let the
2371         // liveness propagate upwards
2372 //
2373 if (liveAnalysis->writeWholeRegion(bb, inst, dst, builder.getOptions()) ||
2374 inst->isPseudoKill())
2375 {
2376 updateLiveness(live, id, false);
2377
2378 if (lrs[id]->getIsSplittedDcl())
2379 {
2380 for (unsigned i = lrs[id]->getDcl()->getSplitVarStartID();
2381 i < lrs[id]->getDcl()->getSplitVarStartID() + gra.getSplitVarNum(lrs[id]->getDcl());
2382 i++)
2383 {
2384                     live.set(i, false); //kill all children; there may be unused children generated by splitting, which are killed as well.
2385 }
2386 }
2387 }
2388
2389 // Indirect defs are actually uses of address reg
2390 lrs[id]->checkForInfiniteSpillCost(bb, i);
2391 }
2392 else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF))
2393 {
2394 //
2395 // add interferences to the list of potential indirect destination accesses.
2396 //
2397 const REGVAR_VECTOR& pointsToSet = liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst, bb);
2398 for (auto pt : pointsToSet)
2399 {
2400 if (pt.var->isRegAllocPartaker())
2401 {
2402 buildInterferenceWithLive(live, pt.var->getId());
2403 }
2404 }
2405 }
2406 }
2407
2408
2409 void Interference::buildInterferenceWithinBB(G4_BB* bb, BitSet& live)
2410 {
2411 DebugInfoState state;
2412 unsigned refCount = GlobalRA::getRefCount(kernel.getOption(vISA_ConsiderLoopInfoInRA) ?
2413 bb->getNestLevel() : 0);
2414
2415 for (auto i = bb->rbegin(); i != bb->rend(); i++)
2416 {
2417 G4_INST* inst = (*i);
2418
2419 G4_DstRegRegion* dst = inst->getDst();
2420 if (dst)
2421 {
2422 buildInterferenceForDst(bb, live, inst, i, dst);
2423 }
2424
2425 if (inst->opcode() == G4_pseudo_fcall)
2426 {
2427 if (liveAnalysis->livenessClass(G4_GRF))
2428 {
2429 G4_FCALL* fcall = kernel.fg.builder->getFcallInfo(bb->back());
2430 G4_Declare* arg = kernel.fg.builder->getStackCallArg();
2431 G4_Declare* ret = kernel.fg.builder->getStackCallRet();
2432 MUST_BE_TRUE(fcall != NULL, "fcall info not found");
2433 uint16_t retSize = fcall->getRetSize();
2434 uint16_t argSize = fcall->getArgSize();
2435 if (ret && retSize > 0 && ret->getRegVar())
2436 {
2437 buildInterferenceForFcall(bb, live, inst, i, ret->getRegVar());
2438 }
2439 if (arg && argSize > 0 && arg->getRegVar())
2440 {
2441 auto id = arg->getRegVar()->getId();
2442 updateLiveness(live, id, true);
2443 }
2444 }
2445 else if (liveAnalysis->livenessClass(G4_ADDRESS))
2446 {
2447 // assume callee will use A0
2448 auto A0Dcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].A0;
2449 buildInterferenceWithLive(live, A0Dcl->getRegVar()->getId());
2450 }
2451 else if (liveAnalysis->livenessClass(G4_FLAG))
2452 {
2453 // assume callee will use both F0 and F1
2454 auto flagDcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].Flag;
2455 buildInterferenceWithLive(live, flagDcl->getRegVar()->getId());
2456 }
2457 }
2458
2459 if ((inst->isSend() || inst->isFillIntrinsic()) && !dst->isNullReg() &&
2460 kernel.fg.builder->WaDisableSendSrcDstOverlap())
2461 {
2462 markInterferenceForSend(bb, inst, dst);
2463 }
2464 else if (kernel.fg.builder->avoidDstSrcOverlap() && dst && !dst->isNullReg())
2465 {
2466 markInterferenceToAvoidDstSrcOverlap(bb, inst);
2467 }
2468
2469 if ((inst->isSend() || inst->isFillIntrinsic()) && !dst->isNullReg())
2470 {
2471             //r127 must not be used for the return address when there is a src and dst overlap in a send instruction.
2472             //This applies to split-send as well.
2473 if (kernel.fg.builder->needsToReserveR127() && liveAnalysis->livenessClass(G4_GRF))
2474 {
2475 if (dst->getBase()->isRegAllocPartaker() && !dst->getBase()->asRegVar()->isPhyRegAssigned())
2476 {
2477 int dstId = dst->getBase()->asRegVar()->getId();
2478 lrs[dstId]->markForbidden(kernel.getNumRegTotal() - 1, 1);
2479 }
2480 }
2481 }
2482
2483 if (inst->isSplitSend() && !inst->getSrc(1)->isNullReg())
2484 {
2485 G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
2486 G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
2487
2488 if (src0->getBase()->isRegAllocPartaker() && src1->getBase()->isRegAllocPartaker())
2489 {
2490                 // src0 and src1 of a split send may not overlap. In normal cases this is handled automatically,
2491                 // as we add an interference edge when we reach src0/src1's def. If one source is an
2492                 // undefined variable (this can happen for a URB write payload) and the other an input, however,
2493                 // we could miss the interference edge between the two. So we add it explicitly here.
2494 int src0Id = src0->getBase()->asRegVar()->getId();
2495 int src1Id = src1->getBase()->asRegVar()->getId();
2496
2497 checkAndSetIntf(src0Id, src1Id);
2498 buildInterferenceWithAllSubDcl(src0Id, src1Id);
2499 }
2500 }
2501
2502         //DPAS: within the same instruction, src1 must not overlap with dst. src0 and src2 are okay to overlap with dst.
2503 if (inst->isDpas() && !inst->getSrc(1)->isNullReg())
2504 {
2505 G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
2506 if (dst->getBase()->isRegAllocPartaker() &&
2507 src1->getBase()->isRegAllocPartaker())
2508 {
2509 int dstId = dst->getBase()->asRegVar()->getId();
2510 int src1Id = src1->getBase()->asRegVar()->getId();
2511 checkAndSetIntf(dstId, src1Id);
2512 buildInterferenceWithAllSubDcl(dstId, src1Id);
2513 }
2514 }
2515
2516 //
2517 // process each source operand
2518 //
2519 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
2520 {
2521 G4_Operand* src = inst->getSrc(j);
2522 if (src == NULL)
2523 {
2524 continue;
2525 }
2526 if (src->isSrcRegRegion())
2527 {
2528 G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
2529 if (srcRegion->getBase()->isRegAllocPartaker())
2530 {
2531 unsigned id = ((G4_RegVar*)(srcRegion)->getBase())->getId();
2532 lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount); // update reference count
2533
2534 if (!inst->isLifeTimeEnd())
2535 {
2536 updateLiveness(live, id, true);
2537 if (lrs[id]->getIsSplittedDcl())
2538 {
2539 buildInterferenceWithSubDcl(id, src, live, true, false);
2540 }
2541 }
2542
2543 if (inst->isEOT() && liveAnalysis->livenessClass(G4_GRF))
2544 {
2545 //mark the liveRange as the EOT source
2546 lrs[id]->setEOTSrc();
2547 if (builder.hasEOTGRFBinding())
2548 {
2549 lrs[id]->markForbidden(0, kernel.getNumRegTotal() - 16);
2550 }
2551 }
2552
2553 if (inst->isReturn())
2554 {
2555 lrs[id]->setRetIp();
2556 }
2557 }
2558 else if (srcRegion->isIndirect() && liveAnalysis->livenessClass(G4_GRF))
2559 {
2560 // make every var in points-to set live
2561 const REGVAR_VECTOR& pointsToSet = liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRegion, bb);
2562 for (auto pt : pointsToSet)
2563 {
2564 if (pt.var->isRegAllocPartaker())
2565 {
2566 updateLiveness(live, pt.var->getId(), true);
2567 }
2568 }
2569 }
2570 }
2571 }
2572
2573 //
2574 // Process register-indirect destination uses of ARF.
2575 //
2576 if (dst) {
2577 if (dst->getBase()->isRegAllocPartaker() &&
2578 dst->getRegAccess() != Direct) {
2579 live.set(dst->getBase()->asRegVar()->getId(), true);
2580 }
2581 }
2582
2583 //
2584 // Process condMod
2585 //
2586 G4_CondMod* mod = inst->getCondMod();
2587 if (mod != NULL) {
2588 G4_VarBase *flagReg = mod->getBase();
2589 if (flagReg != NULL)
2590 {
2591 unsigned id = flagReg->asRegVar()->getId();
2592 if (flagReg->asRegVar()->isRegAllocPartaker())
2593 {
2594 lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount); // update reference count
2595 buildInterferenceWithLive(live, id);
2596
2597 if (liveAnalysis->writeWholeRegion(bb, inst, flagReg))
2598 {
2599 updateLiveness(live, id, false);
2600 }
2601
2602 lrs[id]->checkForInfiniteSpillCost(bb, i);
2603 }
2604 }
2605 else
2606 {
2607 MUST_BE_TRUE((inst->opcode() == G4_sel ||
2608 inst->opcode() == G4_csel) &&
2609 inst->getCondMod() != NULL,
2610 "Invalid CondMod");
2611 }
2612 }
2613
2614 //
2615 // Process predicate
2616 //
2617 G4_Predicate* predicate = inst->getPredicate();
2618 if (predicate != NULL) {
2619 G4_VarBase *flagReg = predicate->getBase();
2620 unsigned id = flagReg->asRegVar()->getId();
2621 if (flagReg->asRegVar()->isRegAllocPartaker())
2622 {
2623 lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount); // update reference count
2624 live.set(id, true);
2625 }
2626 }
2627
2628 // Update debug info intervals based on live set
2629 if (builder.getOption(vISA_GenerateDebugInfo))
2630 {
2631 updateDebugInfo(kernel, inst, *liveAnalysis, lrs, live, &state, inst == bb->front());
2632 }
2633 }
2634 }
2635
2636 void Interference::applyPartitionBias()
2637 {
2638 // Any variable that interferes with a VCA dcl is live through an fcall.
2639 // This function makes such variables callee save biased to avoid save/restore
2640 // code around fcall. Save/restore may still be needed in case this is a
2641 // stack call function (vs kernel), but a single save/restore sequence can
2642 // free the callee save register throughout the function.
2643 for (unsigned int i = 0; i != liveAnalysis->getNumSelectedGlobalVar(); i++)
2644 {
2645 if (kernel.fg.isPseudoVCADcl(lrs[i]->getDcl()))
2646 {
2647 const auto& intfs = sparseIntf[i];
2648 for (const auto edge : intfs)
2649 {
2650 // no point adding bias to any variable already assigned
2651 if (lrs[edge]->getPhyReg())
2652 continue;
2653
2654 lrs[edge]->setCalleeSaveBias(true);
2655 lrs[edge]->setCallerSaveBias(false);
2656 }
2657 }
2658 }
2659 }
2660
2661 void Interference::computeInterference()
2662 {
2663 startTimer(TimerID::INTERFERENCE);
2664 //
2665 // create bool vector, live, to track live ranges that are currently live
2666 //
2667 BitSet live(maxId, false);
2668
2669 buildInterferenceAmongLiveOuts();
2670
2671 for (G4_BB *bb : kernel.fg)
2672 {
2673 //
2674 // mark all live ranges dead
2675 //
2676 live.clear();
2677 //
2678 // start with all live ranges that are live at the exit of BB
2679 //
2680 buildInterferenceAtBBExit(bb, live);
2681 //
2682         // traverse insts in reverse order
2683 //
2684
2685 buildInterferenceWithinBB(bb, live);
2686 }
2687
2688 buildInterferenceAmongLiveIns();
2689
2690 //
2691 // Build interference with physical registers assigned by local RA
2692 //
2693 if (kernel.getOption(vISA_LocalRA))
2694 {
2695 for (auto curBB : kernel.fg)
2696 {
2697 buildInterferenceWithLocalRA(curBB);
2698 }
2699 }
2700
2701 if (builder.getOption(vISA_RATrace))
2702 {
2703 RPE rpe(gra, liveAnalysis);
2704 rpe.run();
2705 std::cout << "\t--max RP: " << rpe.getMaxRP() << "\n";
2706 }
2707
2708     // Augment the interference graph to accommodate non-default masks
2709 Augmentation aug(kernel, *this, *liveAnalysis, lrs, gra);
2710 aug.augmentIntfGraph();
2711
2712 generateSparseIntfGraph();
2713
2714 // apply callee save bias after augmentation as interference graph is up-to-date.
2715 if (kernel.fg.getHasStackCalls())
2716 {
2717 applyPartitionBias();
2718 }
2719 }
2720
2721 #define SPARSE_INTF_VEC_SIZE 64
2722
2723 void Interference::generateSparseIntfGraph()
2724 {
2725 // Generate sparse intf graph from the dense one
2726 unsigned numVars = liveAnalysis->getNumSelectedVar();
2727
2728 sparseIntf.resize(numVars);
2729
2730 for (unsigned row = 0; row < numVars; row++)
2731 {
2732 sparseIntf[row].reserve(SPARSE_INTF_VEC_SIZE);
2733 }
2734
2735 if (useDenseMatrix())
2736 {
2737 // Iterate over intf graph matrix
2738 for (unsigned row = 0; row < numVars; row++)
2739 {
2740 unsigned rowOffset = row * rowSize;
2741 unsigned colStart = (row + 1) / BITS_DWORD;
2742 for (unsigned j = colStart; j < rowSize; j++)
2743 {
2744 unsigned intfBlk = getInterferenceBlk(rowOffset + j);
2745 if (intfBlk != 0)
2746 {
2747 for (unsigned k = 0; k < BITS_DWORD; k++)
2748 {
2749 if (intfBlk & (1 << k))
2750 {
2751 unsigned v2 = (j*BITS_DWORD) + k;
2752 if (v2 != row)
2753 {
2754 sparseIntf[v2].emplace_back(row);
2755 sparseIntf[row].emplace_back(v2);
2756 }
2757 }
2758 }
2759 }
2760 }
2761 }
2762 }
2763 else
2764 {
2765 for (uint32_t v1 = 0; v1 < maxId; ++v1)
2766 {
2767 auto&& intfSet = sparseMatrix[v1];
2768 for (uint32_t v2 : intfSet)
2769 {
2770 sparseIntf[v1].emplace_back(v2);
2771 sparseIntf[v2].emplace_back(v1);
2772 }
2773 }
2774 }
2775
2776 if (builder.getOption(vISA_RATrace))
2777 {
2778 uint32_t numNeighbor = 0;
2779 uint32_t maxNeighbor = 0;
2780 uint32_t maxIndex = 0;
2781 for (int i = 0, numVar = (int) sparseIntf.size(); i < numVar; ++i)
2782 {
2783 if (lrs[i]->getPhyReg() == nullptr)
2784 {
2785 auto intf = sparseIntf[i];
2786 numNeighbor += (uint32_t)intf.size();
2787 maxNeighbor = std::max(maxNeighbor, (uint32_t)intf.size());
2788 if (maxNeighbor == (uint32_t)intf.size())
2789 {
2790 maxIndex = i;
2791 }
2792 }
2793 }
2794 float avgNeighbor = ((float)numNeighbor) / sparseIntf.size();
2795 std::cout << "\t--avg # neighbors: " << std::setprecision(6) << avgNeighbor << "\n";
2796 std::cout << "\t--max # neighbors: " << maxNeighbor << " (" << lrs[maxIndex]->getDcl()->getName() << ")\n";
2797 }
2798
2799 stopTimer(TimerID::INTERFERENCE);
2800 }
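// A worked illustration of the dense-matrix walk above: only the upper
// triangle is stored, so row r begins scanning at dword (r + 1) / BITS_DWORD;
// a set bit k in dword j decodes to neighbor v2 = j * 32 + k, and the edge is
// recorded in both adjacency lists so that sparseIntf stays symmetric.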
2801
2802 // This function can be invoked before local RA or after augmentation.
2803 // It updates sub-reg data only for non-NoMask vars and leaves others
2804 // unchanged, i.e. their value remains as set by HW conformity or an
2805 // earlier phase.
2806 void GlobalRA::updateSubRegAlignment(G4_SubReg_Align subAlign)
2807 {
2808 // Update alignment of all GRF declares to sub-align
2809 for (auto dcl : kernel.Declares)
2810 {
2811 if (dcl->getRegFile() & G4_GRF && !dcl->getIsPartialDcl())
2812 {
2813 G4_Declare* topdcl = dcl->getRootDeclare();
2814
2815 if (!areAllDefsNoMask(topdcl) &&
2816 getAugmentationMask(topdcl) != AugmentationMasks::NonDefault)
2817 {
2818 dcl->setSubRegAlign(subAlign);
2819 setSubRegAlign(dcl, subAlign);
2820 }
2821 }
2822 }
2823 }
2824
2825 bool GlobalRA::evenAlignNeeded(G4_Declare* dcl)
2826 {
2827 if (GlobalRA::useGenericAugAlign())
2828 {
2829 // Return true if even alignment is needed
2830 // Even align needed if for given SIMD size and elem type,
2831 // a complete def uses between 1-2 GRFs.
2832 auto kernelSimdSizeToUse = kernel.getSimdSizeWithSlicing();
2833 G4_Declare* topdcl = dcl->getRootDeclare();
2834 auto topdclAugMask = getAugmentationMask(topdcl);
2835
2836 if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
2837 topdclAugMask != AugmentationMasks::NonDefault)
2838 {
2839 auto elemSizeToUse = topdcl->getElemSize();
2840 if (elemSizeToUse < 4 && topdclAugMask == AugmentationMasks::Default32Bit)
2841 // :uw with hstride 2 can also be Default32Bit and hence needs even alignment
2842 elemSizeToUse = 4;
2843 else if (elemSizeToUse < 8 && topdclAugMask == AugmentationMasks::Default64Bit)
2844 elemSizeToUse = 8;
2845
2846             if (// Even align if size is between 1 and 2 GRFs; for >2 GRF sizes use weak edges
2847 (elemSizeToUse * kernelSimdSizeToUse) > (unsigned)numEltPerGRF<Type_UB>() &&
2848 (elemSizeToUse * kernelSimdSizeToUse) <= (unsigned)(2 * numEltPerGRF<Type_UB>()) &&
2849 !(kernel.fg.builder->getOption(vISA_enablePreemption) &&
2850 dcl == kernel.fg.builder->getBuiltinR0()))
2851 {
2852 return true;
2853 }
2854 }
2855 }
2856 else
2857 {
2858 if (dcl->getRegFile() & G4_GRF)
2859 {
2860 G4_Declare* topdcl = dcl->getRootDeclare();
2861 auto topdclAugMask = getAugmentationMask(topdcl);
2862
2863 if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
2864 topdclAugMask != AugmentationMasks::NonDefault &&
2865 topdclAugMask != AugmentationMasks::Default64Bit)
2866 {
2867 if ((topdcl->getElemSize() >= 4 || topdclAugMask == AugmentationMasks::Default32Bit) &&
2868 topdcl->getByteSize() >= numEltPerGRF<Type_UB>() &&
2869 !(kernel.fg.builder->getOption(vISA_enablePreemption) &&
2870 dcl == kernel.fg.builder->getBuiltinR0()))
2871 {
2872 return true;
2873 }
2874 }
2875 }
2876 }
2877
2878 return false;
2879 }
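// A worked example for the generic path above, assuming 32-byte GRFs: a
// SIMD16 kernel writing a 4-byte element type defines 16 * 4 = 64 bytes,
// which is more than one GRF and at most two, so the declare is even-aligned;
// a SIMD8 def of the same type (32 bytes) fits in one GRF and is left as-is.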
2880
2881 // This function can be invoked before local RA or after augmentation.
2882 void GlobalRA::evenAlign()
2883 {
2884 // Update alignment of all GRF declares to align
2885 for (auto dcl : kernel.Declares)
2886 {
2887 if (dcl->getRegFile() & G4_GRF)
2888 {
2889 if (evenAlignNeeded(dcl))
2890 {
2891 setEvenAligned(dcl, true);
2892 }
2893 }
2894 }
2895 }
2896
2897 void GlobalRA::getBankAlignment(LiveRange* lr, BankAlign &align)
2898 {
2899 G4_Declare *dcl = lr->getDcl();
2900 if (kernel.getSimdSize() < g4::SIMD16)
2901 {
2902 return;
2903 }
2904
2905 if (dcl->getRegFile() & G4_GRF)
2906 {
2907 G4_Declare* topdcl = dcl->getRootDeclare();
2908 auto topdclBC = getBankConflict(topdcl);
2909
2910 if (topdclBC != BANK_CONFLICT_NONE)
2911 {
2912 if (topdcl->getElemSize() >= 4 &&
2913 topdcl->getNumRows() > 1 &&
2914 !(kernel.fg.builder->getOption(vISA_enablePreemption) &&
2915 dcl == kernel.fg.builder->getBuiltinR0()))
2916 {
2917 if (topdclBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
2918 topdclBC == BANK_CONFLICT_SECOND_HALF_ODD)
2919 {
2920 align = BankAlign::Odd;
2921 }
2922 }
2923 }
2924 }
2925 }
2926
2927 Augmentation::Augmentation(G4_Kernel& k, Interference& i, const LivenessAnalysis& l, LiveRange* const ranges[], GlobalRA& g) :
2928 kernel(k), intf(i), gra(g), liveAnalysis(l), lrs(ranges), fcallRetMap(g.fcallRetMap), m(kernel.fg.mem)
2929 {
2930 }
2931
2932 // For a scatter read, the channels are not handled the same way as for a block read.
2933 // Update the emask according to the vISA definition.
2934 bool Augmentation::updateDstMaskForGather(G4_INST* inst, std::vector<unsigned char>& mask)
2935 {
2936 if (const G4_SendDescRaw *d = inst->getMsgDescRaw()) {
2937 return updateDstMaskForGatherRaw(inst, mask, d);
2938 } else if (const G4_SendDescLdSt *d = inst->getMsgDescLdSt()) {
2939 return updateDstMaskForGatherLdSt(inst, mask, d);
2940 } else {
2941 ASSERT_USER(false, "unexpected descriptor");
2942 return false;
2943 }
2944 }
2945
2946 static void updateMaskSIMT(
2947 unsigned char curEMBit,
2948 unsigned char execSize,
2949 std::vector<unsigned char>& mask,
2950 unsigned dataSizeBytes, unsigned vecElems)
2951 {
2952 unsigned blockSize = dataSizeBytes;
2953 unsigned blockNum = vecElems;
2954 for (unsigned i = 0; i < execSize; i++)
2955 {
2956 for (unsigned j = 0; j < blockNum; j++)
2957 {
2958 for (unsigned k = 0; k < blockSize; k++)
2959 {
2960 mask[(j * execSize + i) * blockSize + k] = curEMBit;
2961 }
2962 }
2963 if (curEMBit != NOMASK_BYTE)
2964 {
2965 curEMBit++;
2966 ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
2967 }
2968 }
2969 }
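// A worked illustration: for execSize = 4, dataSizeBytes = 2, vecElems = 2
// and a starting EM bit of 0, byte (j * 4 + i) * 2 + k of the mask receives
// channel id i, giving the layout 0 0 1 1 2 2 3 3 | 0 0 1 1 2 2 3 3
// (one 8-byte group per vector element).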
2970
2971 bool Augmentation::updateDstMaskForGatherRaw(
2972 G4_INST* inst, std::vector<unsigned char>& mask, const G4_SendDescRaw* msgDesc)
2973 {
2974 unsigned char execSize = inst->getExecSize();
2975 const G4_DstRegRegion* dst = inst->getDst();
2976 unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
2977 unsigned short elemSize = dst->getElemSize();
2978
2979 if (inst->isWriteEnableInst())
2980 {
2981 curEMBit = NOMASK_BYTE;
2982 }
2983
2984 SFID funcID = msgDesc->getFuncId();
2985
2986 switch (funcID)
2987 {
2988 case SFID::DP_DC1:
2989 switch (msgDesc->getHdcMessageType())
2990 {
2991 case DC1_A64_SCATTERED_READ: //a64 scattered read: svm_gather
2992 {
2993 unsigned blockNum = msgDesc->getBlockNum();
2994 unsigned blockSize = msgDesc->getBlockSize();
2995
2996 for (unsigned i = 0; i < execSize; i++)
2997 {
2998 for (unsigned j = 0; j < blockNum; j++)
2999 {
3000 for (unsigned k = 0; k < blockSize; k++)
3001 {
3002 mask[(j * execSize + i) * blockSize + k] = curEMBit;
3003 }
3004 }
3005 if (curEMBit != NOMASK_BYTE)
3006 {
3007 curEMBit++;
3008 ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
3009 }
3010 }
3011 return true;
3012 }
3013 break;
3014
3015 case DC1_A64_UNTYPED_SURFACE_READ: //SVM gather 4
3016 case DC1_UNTYPED_SURFACE_READ: //VISA gather 4
3017 case DC1_TYPED_SURFACE_READ: //Gather 4 typed
3018 {
3019 unsigned channelNum = msgDesc->getEnabledChannelNum();
3020 if (channelNum == 0)
3021 {
3022 return false;
3023 }
3024 if (elemSize < 4)
3025 {
3026 elemSize = 4;
3027 }
3028
3029 for (unsigned i = 0; i < channelNum; i++)
3030 {
3031 for (unsigned j = 0; j < execSize; j++)
3032 {
3033 for (unsigned k = 0; k < elemSize; k++)
3034 {
3035 mask[(i * execSize + j)*elemSize + k] = curEMBit;
3036 }
3037 if (curEMBit != NOMASK_BYTE)
3038 {
3039 curEMBit++;
3040 ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
3041 }
3042 }
3043 curEMBit = (unsigned char)inst->getMaskOffset();
3044 }
3045 return true;
3046 }
3047 break;
3048
3049 default: return false;
3050 }
3051 break;
3052 case SFID::DP_DC2:
3053 switch (msgDesc->getHdcMessageType())
3054 {
3055 case DC2_UNTYPED_SURFACE_READ: //gather 4 scaled
3056 case DC2_A64_UNTYPED_SURFACE_READ: //SVM gather 4 scaled
3057 {
3058 unsigned channelNum = msgDesc->getEnabledChannelNum();
3059 if (channelNum == 0)
3060 {
3061 return false;
3062 }
3063 if (elemSize < 4)
3064 {
3065 elemSize = 4;
3066 }
3067
3068 for (unsigned i = 0; i < channelNum; i++)
3069 {
3070 for (unsigned j = 0; j < execSize; j++)
3071 {
3072 for (unsigned k = 0; k < elemSize; k++)
3073 {
3074 mask[(i * execSize + j)*elemSize + k] = curEMBit;
3075 }
3076 if (curEMBit != NOMASK_BYTE)
3077 {
3078 curEMBit++;
3079 ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
3080 }
3081 }
3082 curEMBit = (unsigned char)inst->getMaskOffset();
3083 }
3084 return true;
3085 }
3086
3087 case DC2_BYTE_SCATTERED_READ: //scaled byte scattered read: gather_scaled, handled as block read write
3088 default: return false;
3089 }
3090 break;
3091 case SFID::DP_DC0:
3092 switch (msgDesc->getHdcMessageType())
3093 {
3094 case DC_DWORD_SCATTERED_READ: //dword scattered read: gather(dword), handled as block read write
3095 case DC_BYTE_SCATTERED_READ: //byte scattered read: gather(byte), handled as block read write
3096 default: return false;
3097 }
3098 break;
3099
3100 case SFID::SAMPLER:
3101 {
3102 unsigned respLength = msgDesc->ResponseLength();
3103 if (respLength * numEltPerGRF<Type_UB>() != dst->getTopDcl()->getByteSize() &&
3104 msgDesc->isFence())
3105 {
3106 // since send dst size is not exactly equal to ResponseLength encoded in
3107 // the descriptor, conservatively treat the send as being non-default
3108 auto sz = dst->getTopDcl()->getByteSize();
3109 for (unsigned int i = 0; i != sz; ++i)
3110 mask[i] = NOMASK_BYTE;
3111 return true;
3112 }
3113 unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
3114 elemSize = msgDesc->is16BitReturn() ? 2 : 4;
3115 unsigned warpNum = respLength * numEltPerGRF<Type_UB>() / (execSize * elemSize);
3116 if (inst->isWriteEnableInst())
3117 {
3118 curEMBit = NOMASK_BYTE;
3119 }
3120 for (unsigned i = 0; i < warpNum; i++)
3121 {
3122 for (unsigned j = 0; j < execSize; j++)
3123 {
3124 for (unsigned k = 0; k < elemSize; k++)
3125 {
3126 mask[(i * execSize + j)*elemSize + k] = curEMBit;
3127 }
3128 if (curEMBit != NOMASK_BYTE)
3129 {
3130 curEMBit++;
3131 ASSERT_USER(curEMBit <= 32, "Illegal mask channel");
3132 }
3133 }
3134 curEMBit = (unsigned char)inst->getMaskOffset();
3135 }
3136 return true;
3137 }
3138
3139 break;
3140
3141 case SFID::UGM:
3142 case SFID::UGML:
3143 case SFID::SLM:
3144 {
3145 uint32_t desc = msgDesc->getDesc();
3146 uint32_t op = (desc & 0x3F); // [5:0]
3147 uint32_t dszEncd = (desc >> 9) & 0x7; // [11:9]
3148 bool isTranspose = ((desc >> 15) & 0x1) != 0; // [15]
3149 if (op == LSC_LOAD && !isTranspose) { // transpose not supported yet
3150 int dataSzReg = 0;
3151             switch (dszEncd) { // data size [11:9]
3152 case 0: dataSzReg = 1; break; // d8
3153 case 1: dataSzReg = 2; break; // d16
3154 default: dataSzReg = 4; break; // d32, d8u32, d16u32, d16u32h
3155 case 3: dataSzReg = 8; break; // d64
3156 }
3157 int vecSz = 0;
3158 int vecSzEncd = (desc >> 12) & 0x7; // [14:12]
3159 if (vecSzEncd <= 3) {
3160 vecSz = vecSzEncd + 1; // V1, V2, V3, V4
3161 } else {
3162 vecSz = 4 << (vecSzEncd - 3); // V8, V16, V32, V64
3163 }
3164 updateMaskSIMT(curEMBit, execSize, mask,
3165 (unsigned)dataSzReg,
3166 (unsigned)vecSz);
3167 return true;
3168 }
3169 }
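    // not an LSC load, or a transposed one: fall through to the default case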
3170 default: return false;
3171 }
3172
3173 return false;
3174 }
3175
3176 bool Augmentation::updateDstMaskForGatherLdSt(
3177 G4_INST* inst, std::vector<unsigned char>& mask, const G4_SendDescLdSt *msgDesc)
3178 {
3179 // as in the raw case only support SIMT
3180 if (msgDesc->op != LdStOp::LOAD || msgDesc->order == LdStOrder::SCALAR) {
3181 return false;
3182 }
3183 unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
3184 unsigned char execSize = inst->getExecSize();
3185 updateMaskSIMT(curEMBit, execSize, mask,
3186 msgDesc->elemBitsReg, msgDesc->elemPerAddr);
3187
3188 return true;
3189 }
3190
3191 // The value stored at each byte in the mask determines which bits
3192 // of the EM enable that byte for writing. When checkCmodOnly
3193 // is set, dst is ignored and the mask is set only for the cmod. For
3194 // flag declares, the mask is at bit granularity rather than byte.
3195 // The function updates the mask field in the declaration of the
3196 // corresponding variable - dst or cmod.
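// A worked illustration: for a SIMD8 dword write at mask offset 0, e.g.
// "mov (8) V(0,0)<1>:d ...", bytes 0-3 of V's mask receive EM value 0,
// bytes 4-7 receive 1, and so on through bytes 28-31 receiving 7; a NoMask
// def instead stamps every written byte with NOMASK_BYTE.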
3197 void Augmentation::updateDstMask(G4_INST* inst, bool checkCmodOnly)
3198 {
3199 G4_DstRegRegion* dst = inst->getDst();
3200 G4_CondMod* cmod = inst->getCondMod();
3201
3202 if ((checkCmodOnly == false && dst &&
3203 dst->getBase() &&
3204 dst->getBase()->isRegVar()) ||
3205 (checkCmodOnly == true && cmod != NULL && cmod->getBase() != NULL))
3206 {
3207 int dclOffset = 0;
3208 G4_Declare* topdcl = NULL;
3209
3210 if (checkCmodOnly == false)
3211 {
3212 topdcl = dst->getBase()->asRegVar()->getDeclare();
3213 }
3214 else
3215 {
3216 topdcl = cmod->asCondMod()->getTopDcl();
3217 }
3218
3219 while (topdcl->getAliasDeclare() != nullptr)
3220 {
3221 dclOffset += topdcl->getAliasOffset();
3222 topdcl = topdcl->getAliasDeclare();
3223 }
3224
3225 auto& mask = const_cast<std::vector<unsigned char>&>(gra.getMask(topdcl));
3226
3227 unsigned size = topdcl->getByteSize();
3228 if (checkCmodOnly == true || dst->isFlag())
3229 {
3230 size *= BITS_PER_BYTE;
3231 }
3232
3233 if (mask.size() == 0)
3234 {
3235 mask.resize(size);
3236 }
3237
3238 MUST_BE_TRUE(mask.size() > 0, "Valid mask not found for dcl " << topdcl->getName());
3239
3240 unsigned short hstride, elemSize;
3241 short row, subReg;
3242 unsigned startByte;
3243
3244 if (checkCmodOnly == false)
3245 {
3246 hstride = dst->getHorzStride();
3247
3248 row = dst->getRegOff();
3249 subReg = dst->getSubRegOff();
3250 elemSize = dst->getElemSize();
3251
3252 if (inst->isSend() && !inst->isEOT())
3253 {
3254 if (updateDstMaskForGather(inst, mask))
3255 {
3256 return;
3257 }
3258 }
3259
3260 if (dst->isFlag())
3261 {
3262 elemSize = 1;
3263 }
3264
3265 startByte = (row * getGRFSize()) + (subReg * elemSize);
3266
3267 if (dst->isFlag())
3268 {
3269 startByte = (row * 32) + (subReg * 8);
3270 }
3271 }
3272 else
3273 {
3274 hstride = 1;
3275 row = 0;
3276 elemSize = 1;
3277 startByte = cmod->asCondMod()->getLeftBound();
3278 }
3279
3280 unsigned rb = 0xffffffff;
3281
3282 if (checkCmodOnly == true)
3283 {
3284 rb = cmod->asCondMod()->getRightBound();
3285 }
3286 else
3287 {
3288 rb = dst->getRightBound();
3289 }
3290
3291 unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
3292 if (inst->isWriteEnableInst())
3293 {
3294 curEMBit = NOMASK_BYTE;
3295 }
3296
3297 for (unsigned i = dclOffset + startByte;
3298 i <= rb;
3299 i += (hstride * elemSize))
3300 {
3301 for (int j = 0; j < elemSize; j++)
3302 {
3303 MUST_BE_TRUE2(i + j < size, "updateDstMask writing past end of mask array size:" << size, inst);
3304 mask[i + j] |= curEMBit;
3305 }
3306 if (curEMBit != NOMASK_BYTE)
3307 {
3308 curEMBit++;
3309 }
3310 }
3311 }
3312 }
3313
3314 unsigned Augmentation::getByteSizeFromMask(AugmentationMasks type)
3315 {
3316 if (type == AugmentationMasks::Default16Bit)
3317 {
3318 return 2;
3319 }
3320 else if (type == AugmentationMasks::Default32Bit)
3321 {
3322 return 4;
3323 }
3324 else if (type == AugmentationMasks::Default64Bit)
3325 {
3326 return 8;
3327 }
3328
3329 MUST_BE_TRUE(false, "Unexpected type of mask");
3330
3331 return 0;
3332 }
3333
3334 bool Augmentation::isDefaultMaskDcl(G4_Declare* dcl, unsigned simdSize, AugmentationMasks type)
3335 {
3336 // default mask is one where dst's hstride is 1 and
3337 // elem size is 4
3338 bool isDefault = false;
3339 auto& mask = gra.getMask(dcl);
3340
3341 unsigned byteSize = getByteSizeFromMask(type);
3342
3343     // treat simd32 as simd16, as the instruction is always split into 2 simd16
3344 if (simdSize == 32)
3345 {
3346 simdSize = 16;
3347 }
3348 if (mask.size() > 0)
3349 {
3350 G4_Declare* topdcl = dcl->getRootDeclare();
3351 bool isFlagDcl = (topdcl->getRegFile() == G4_FLAG);
3352
3353 unsigned size = topdcl->getByteSize();
3354 unsigned char curEMBit = 0;
3355 bool found = true;
3356 unsigned wrapAround = simdSize*byteSize;
3357
3358 if (isFlagDcl == true)
3359 {
3360 size *= BITS_PER_BYTE;
3361 wrapAround = 16;
3362 }
3363
3364 for (unsigned i = 0; i < size; i += 1)
3365 {
3366 if (isFlagDcl == true)
3367 {
3368 curEMBit++;
3369 }
3370 else
3371 {
3372 if (byteSize && i%byteSize == 0)
3373 {
3374 curEMBit++;
3375 }
3376 }
3377
3378 if (i%wrapAround == 0)
3379 {
3380 // Wrap around based on simd size
3381 // For SIMD8 wrap around each row,
3382 // for SIMD16 wrap around every other row
3383 curEMBit = 0;
3384 }
3385
3386 if (mask[i] != curEMBit &&
3387 // For flags, we set bytesize = 2 although
3388 // the kernel is SIMD8. This means higher 8
3389 // bits of mask will be set to 0 since those
3390 // bits are never defined. Such masks need
3391 // not be considered non-default.
3392 !(isFlagDcl == true && mask[i] == 0))
3393 {
3394 found = false;
3395 break;
3396 }
3397 }
3398
3399 if (found == true)
3400 {
3401 isDefault = true;
3402 }
3403 }
3404
3405 return isDefault;
3406 }
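// A worked illustration, assuming 32-byte GRFs: for a Default32Bit dcl in a
// SIMD16 kernel, wrapAround = 16 * 4 = 64 bytes, so the expected pattern is
// bytes 0-3 holding 0, bytes 4-7 holding 1, ... bytes 60-63 holding 15,
// repeating every two GRFs; any deviation makes the dcl non-default.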
3407
3408 bool Augmentation::isDefaultMaskSubDeclare(unsigned char* mask, unsigned lb, unsigned rb, G4_Declare* dcl, unsigned simdSize)
3409 {
3410 bool isDefault = false;
3411
3412     // treat simd32 as simd16, as the instruction is always split into 2 simd16
3413 if (simdSize == 32)
3414 {
3415 simdSize = 16;
3416 }
3417
3418 if (mask != NULL)
3419 {
3420 unsigned size = dcl->getByteSize();
3421 unsigned char curEMBit = 0;
3422 bool found = true;
3423 unsigned wrapAround = simdSize * 4;
3424 unsigned leftBound = gra.getSubOffset(dcl);
3425 unsigned rightBound = leftBound + size - 1;
3426
3427 ASSERT_USER(rightBound <= rb, "Wrong sub declare right bound!");
3428
3429 for (unsigned i = lb; i < rightBound + 1; i += 1)
3430 {
3431 if ((i - lb) % 4 == 0)
3432 {
3433 curEMBit++;
3434 }
3435
3436 if ((i - lb) % wrapAround == 0)
3437 {
3438 curEMBit = 0;
3439 }
3440
3441 if (i >= leftBound)
3442 {
3443 if (mask[i] != curEMBit)
3444 {
3445 found = false;
3446 break;
3447 }
3448 }
3449 }
3450
3451 if (found == true)
3452 {
3453 isDefault = true;
3454 }
3455 }
3456
3457 return isDefault;
3458 }
3459
3460
3461 bool Augmentation::verifyMaskIfInit(G4_Declare* dcl, AugmentationMasks mask)
3462 {
3463 // Return true if dcl mask is either undetermined or same as mask
3464 auto m = gra.getAugmentationMask(dcl);
3465 if (m == mask ||
3466 m == AugmentationMasks::Undetermined)
3467 {
3468 return true;
3469 }
3470
3471 return false;
3472 }
3473
3474 bool Augmentation::checkGRFPattern2(G4_Declare* dcl, G4_DstRegRegion* dst, unsigned maskOff,
3475 unsigned lb, unsigned rb, unsigned execSize)
3476 {
3477 auto opndByteSize = dst->getTypeSize();
3478 unsigned modWith = opndByteSize*kernel.getSimdSize();
3479 if (lb % modWith - (maskOff * opndByteSize * dst->getHorzStride()) <= opndByteSize)
3480 {
3481 if ((lb + (execSize * opndByteSize * dst->getHorzStride() - dst->getHorzStride()) - rb) < opndByteSize)
3482 {
3483 if (opndByteSize == 2 &&
3484 verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit))
3485 {
3486 gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
3487 return true;
3488 }
3489 else if (opndByteSize == 4 &&
3490 verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit))
3491 {
3492 gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
3493 return true;
3494 }
3495 else
3496 {
3497 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3498 return true;
3499 }
3500 }
3501 }
3502
3503 return false;
3504 }
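// A worked example for the hstride-2 pattern above: in a SIMD8 kernel, a
// GRF-aligned "mov (8) V(0,0)<2>:uw ..." at mask offset 0 touches bytes
// lb .. lb+29, so lb % 16 - 0 == 0 <= 2 and lb + 30 - rb == 1 < 2; the
// 2-byte elements strided by 2 occupy 4-byte lanes and V is classified
// Default32Bit.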
3505
3506 // Returns true if the dcl mask is deemed to be non-default, false otherwise.
3507 bool Augmentation::checkGRFPattern1(G4_Declare* dcl, G4_DstRegRegion* dst, unsigned maskOff,
3508 unsigned lb, unsigned rb, unsigned execSize)
3509 {
3510 auto opndByteSize = dst->getTypeSize();
3511 unsigned modWith = opndByteSize*kernel.getSimdSize();
3512 if (dst->getHorzStride() == 1)
3513 {
3514 if ((lb%modWith == (maskOff * opndByteSize) &&
3515 rb == (lb + (execSize * opndByteSize) - 1)))
3516 {
3517 // This will be taken only when hstride = 1
3518 if (opndByteSize == 2 &&
3519 verifyMaskIfInit(dcl, AugmentationMasks::Default16Bit))
3520 {
3521 gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
3522 return true;
3523 }
3524 else if (opndByteSize == 4 &&
3525 verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit))
3526 {
3527 gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
3528 return true;
3529 }
3530 else if (opndByteSize == 8 &&
3531 verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit))
3532 {
3533 gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
3534 return true;
3535 }
3536 else
3537 {
3538 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3539 return true;
3540 }
3541 }
3542 }
3543
3544 return false;
3545 }
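// A worked example for the hstride-1 pattern above: in a SIMD8 kernel, a
// GRF-aligned "mov (8) V(0,0)<1>:d ..." at mask offset 0 gives
// lb % 32 == 0 == maskOff * 4 and rb == lb + 8 * 4 - 1, so V is marked
// Default32Bit; a def whose lb does not line up with maskOff * 4 fails this
// pattern and falls through to checkGRFPattern2.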
3546
3547 void Augmentation::markNonDefaultDstRgn(G4_INST* inst, G4_Operand* opnd)
3548 {
3549 if (inst->isPseudoKill())
3550 {
3551 return;
3552 }
3553
3554 G4_DstRegRegion* dst = nullptr;
3555 G4_CondMod* condMod = nullptr;
3556 if (opnd->isDstRegRegion())
3557 {
3558 dst = opnd->asDstRegRegion();
3559 }
3560 else if (opnd->isCondMod())
3561 {
3562 condMod = opnd->asCondMod();
3563 }
3564 else
3565 {
3566         MUST_BE_TRUE(false, "Don't know how to handle this type of operand");
3567 }
3568
3569 // Handle condMod
3570 if (condMod && condMod->getBase())
3571 {
3572 G4_Declare* dcl = condMod->getTopDcl();
3573 dcl = dcl->getRootDeclare();
3574
3575 if (inst->isWriteEnableInst() ||
3576 opnd->getLeftBound() != inst->getMaskOffset())
3577 {
3578 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3579 return;
3580 }
3581
3582 if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask))
3583 {
3584 gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
3585 }
3586 return;
3587 }
3588
3589 // Handle dst
3590 if (inst->isCall() || inst->isCallerSave())
3591 {
3592 const G4_Declare* dcl = dst->getBase()->asRegVar()->getDeclare();
3593 if (dcl && liveAnalysis.livenessClass(dcl->getRegFile()))
3594 {
3595 gra.setAugmentationMask(dcl->getRootDeclare(), AugmentationMasks::NonDefault);
3596 }
3597 return;
3598 }
3599
3600 bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);
3601 if (dst &&
3602 dst->getBase() &&
3603 dst->getBase()->isRegVar())
3604 {
3605 G4_Declare* dcl = dst->getBase()->asRegVar()->getDeclare();
3606 if (!liveAnalysis.livenessClass(dcl->getRegFile()))
3607 {
3608 return;
3609 }
3610 unsigned offTopDcl = 0;
3611 while (dcl->getAliasDeclare())
3612 {
3613 offTopDcl += dcl->getAliasOffset();
3614 dcl = dcl->getAliasDeclare();
3615 }
3616
3617         // A NoMask instruction's dst is always non-default
3618 if (inst->isWriteEnableInst())
3619 {
3620 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3621 return;
3622 }
3623
3624 if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
3625 return;
3626
3627 unsigned maskOff = inst->getMaskOffset();
3628 unsigned lb = dst->getLeftBound() + offTopDcl;
3629 unsigned rb = dst->getRightBound() + offTopDcl;
3630 unsigned execSize = inst->getExecSize();
3631
3632 if (dcl->getAddressed())
3633 {
3634 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3635 return;
3636 }
3637
3638 if (!isFlagRA)
3639 {
3640             // Treat send as a special case because updating the mask
3641             // for scatter involves some special checks.
3642 if (inst->isSend())
3643 {
3644 if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
3645 {
3646 return;
3647 }
3648
3649 updateDstMask(inst, false);
3650 if (isDefaultMaskDcl(dcl, kernel.getSimdSize(), AugmentationMasks::Default16Bit))
3651 {
3652 gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
3653 }
3654 else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(), AugmentationMasks::Default32Bit))
3655 {
3656 gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
3657 }
3658 else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(), AugmentationMasks::Default64Bit))
3659 {
3660 bool useNonDefault = false;
3661 useNonDefault |= (kernel.getSimdSize() >= g4::SIMD16 && dcl->getTotalElems() > 8);
3662 useNonDefault |= (kernel.getSimdSize() == g4::SIMD8 && dcl->getTotalElems() > 4);
3663
3664 if (useNonDefault)
3665 {
3666 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3667 }
3668 else
3669 {
3670 gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
3671 }
3672 }
3673 else
3674 {
3675 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3676 return;
3677 }
3678 }
3679 else
3680 {
3681 bool found = false;
3682 // default one
3683 found |= checkGRFPattern1(dcl, dst, maskOff, lb, rb, execSize);
3684 if (!found ||
3685 gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined)
3686 {
3687 // hstride = 2 case
3688 found |= checkGRFPattern2(dcl, dst, maskOff, lb, rb, execSize);
3689 }
3690
3691 if (!found ||
3692 gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined)
3693 {
3694 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3695 }
3696 }
3697 }
3698 else
3699 {
3700 // Handle flag register as destination here
3701 if (!(lb == maskOff && rb == (lb + execSize - 1)))
3702 {
3703 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3704 return;
3705 }
3706
3707 if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask))
3708 {
3709 gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
3710 }
3711 }
3712 }
3713 }
3714
3715 // Returns true if any inst is found using a non-default mask.
3716 // This function also sets up the lexical id of all instructions.
3717 bool Augmentation::markNonDefaultMaskDef()
3718 {
3719     // Iterate the dcl list and mark obvious ones as non-default.
3720     // An obvious non-default is a 1-element, i.e. uniform, dcl.
3721 for (auto dcl : kernel.Declares)
3722 {
3723 auto dclRegFile = dcl->getRegFile();
3724 if (!liveAnalysis.livenessClass(dclRegFile))
3725 continue;
3726
3727 if (dclRegFile == G4_GRF || dclRegFile == G4_INPUT || dclRegFile == G4_ADDRESS)
3728 {
3729 if (dcl->getTotalElems() < 8 || dclRegFile == G4_INPUT)
3730 {
3731 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3732 }
3733 }
3734 else if (dclRegFile == G4_FLAG)
3735 {
3736 // Flags are processed when processing instructions
3737 }
3738 }
3739
3740 unsigned id = 0;
3741 bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);
3742
3743 for (auto bb : kernel.fg)
3744 {
3745 for (auto inst : *bb)
3746 {
3747 inst->setLexicalId(id++);
3748
3749 G4_DstRegRegion* dst = inst->getDst();
3750
3751 if (dst)
3752 {
3753 markNonDefaultDstRgn(inst, dst);
3754 }
3755
3756 if (isFlagRA &&
3757 inst->getCondMod())
3758 {
3759 markNonDefaultDstRgn(inst, inst->getCondMod());
3760 }
3761 }
3762 }
3763
3764 // Update whether each dcl is default/not
3765 AugmentationMasks prevAugMask = AugmentationMasks::Undetermined;
3766 bool nonDefaultMaskDefFound = false;
3767
3768 for (auto dcl : kernel.Declares)
3769 {
3770 if (liveAnalysis.livenessClass(dcl->getRegFile()))
3771 {
3772 if (gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined)
3773 {
3774 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3775 nonDefaultMaskDefFound = true;
3776 }
3777
3778 if(kernel.getOption(vISA_forceBCR) && gra.getBankConflict(dcl) != BANK_CONFLICT_NONE)
3779 {
3780 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3781 nonDefaultMaskDefFound = true;
3782 }
3783
3784 if (!nonDefaultMaskDefFound &&
3785 gra.getAugmentationMask(dcl) != prevAugMask &&
3786 prevAugMask != AugmentationMasks::Undetermined)
3787 {
3788 nonDefaultMaskDefFound = true;
3789 }
3790
3791 prevAugMask = gra.getAugmentationMask(dcl);
3792 }
3793
3794 bool checkLRAAlign = false;
3795 if (liveAnalysis.livenessClass(G4_GRF))
3796 {
3797 if ((GlobalRA::useGenericAugAlign() && gra.evenAlignNeeded(dcl)))
3798 checkLRAAlign = true;
3799 else if (gra.getAugmentationMask(dcl) == AugmentationMasks::Default32Bit &&
3800 kernel.getSimdSize() > numEltPerGRF<Type_UD>())
3801 checkLRAAlign = true;
3802 }
3803
3804 if (checkLRAAlign)
3805 {
3806 auto dclLR = gra.getLocalLR(dcl);
3807 if (dclLR)
3808 {
3809 int s;
3810 auto phyReg = dclLR->getPhyReg(s);
3811 if (phyReg && phyReg->asGreg()->getRegNum() % 2 != 0)
3812 {
3813                     // If the LRA assignment is not 2-GRF aligned then
3814                     // mark it as non-default. GRA candidates cannot fully
3815                     // overlap with such ranges, and partial overlap is illegal.
3816 gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
3817 nonDefaultMaskDefFound = true;
3818 }
3819 }
3820 }
3821 }
3822
3823 return nonDefaultMaskDefFound;
3824 }
3825
3826 G4_BB* Augmentation::getTopmostBBDst(G4_BB* src, G4_BB* end, G4_BB* origSrc, unsigned traversal)
3827 {
3828     // Start from the src BB and do a DFS. If any back-edges
3829     // are found then recursively invoke this function with the dst
3830     // of the back-edge. Any path that reaches BB "end"
3831     // will not be propagated forward.
3832 unsigned topLexId = src->front()->getLexicalId();
3833 G4_BB* topmostBB = src;
3834
3835 if (src != end)
3836 {
3837 src->markTraversed(traversal);
3838 src->setNestLevel();
3839
3840 for (G4_BB* succ : src->Succs)
3841 {
3842 if (succ == origSrc)
3843 {
3844                 // The src of the traversal was reached again without
3845                 // ever traversing the end node, so abort this path.
3846 return nullptr;
3847 }
3848
3849 if (succ->isAlreadyTraversed(traversal) == true)
3850 continue;
3851
3852 G4_BB* recursiveTopMostBB = getTopmostBBDst(succ, end, origSrc, traversal);
3853
3854 if (recursiveTopMostBB != NULL)
3855 {
3856 unsigned recursiveTopMostBBLexId = recursiveTopMostBB->front()->getLexicalId();
3857
3858 if (recursiveTopMostBBLexId < topLexId)
3859 {
3860 topmostBB = recursiveTopMostBB;
3861 topLexId = recursiveTopMostBBLexId;
3862 }
3863 }
3864 else
3865 {
3866 if (src != origSrc)
3867 {
3868 topmostBB = NULL;
3869 topLexId = 0;
3870 }
3871 }
3872
3873 succ->markTraversed(traversal);
3874 succ->setNestLevel();
3875 }
3876 }
3877
3878 return topmostBB;
3879 }
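// Illustrative sketch (not from the source): for a CFG with lexical order
// BB1 < BB2 < BB3 and edges BB1->BB2->BB3->BB2 (back-edge BB3->BB2),
// getTopmostBBDst(BB2, end, BB2, t) explores BB2's successors and returns
// the reachable block whose first instruction has the smallest lexical id.
// Paths that wrap around to origSrc without ever passing "end" yield
// nullptr and are discarded.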
3880
3881 void Augmentation::updateStartIntervalForSubDcl(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3882 {
3883 for (const G4_Declare *subDcl : gra.getSubDclList(dcl))
3884 {
3885 unsigned leftBound = gra.getSubOffset(subDcl);
3886 unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
3887 if (!(opnd->getRightBound() < leftBound || rightBound < opnd->getLeftBound()))
3888 {
3889 auto subDclStartInterval = gra.getStartInterval(subDcl);
3890 if (subDclStartInterval == NULL ||
3891 (subDclStartInterval->getLexicalId() > curInst->getLexicalId()))
3892 {
3893 gra.setStartInterval(subDcl, curInst);
3894 }
3895
3896 auto subDclEndIntrval = gra.getEndInterval(subDcl);
3897 if (subDclEndIntrval == NULL ||
3898 (subDclEndIntrval->getLexicalId() < curInst->getLexicalId()))
3899 {
3900 gra.setEndInterval(subDcl, curInst);
3901 }
3902 }
3903 }
3904
3905 return;
3906 }
3907
3908 void Augmentation::updateEndIntervalForSubDcl(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3909 {
3910 for (const G4_Declare *subDcl : gra.getSubDclList(dcl))
3911 {
3912 unsigned leftBound = gra.getSubOffset(subDcl);
3913 unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
3914 if (!(opnd->getRightBound() < leftBound || rightBound < opnd->getLeftBound()))
3915 {
3916 auto subDclEndInterval = gra.getEndInterval(subDcl);
3917 if (subDclEndInterval == NULL ||
3918 (subDclEndInterval->getLexicalId() < curInst->getLexicalId()))
3919 {
3920 gra.setEndInterval(subDcl, curInst);
3921 }
3922
3923 auto subDclStartInterval = gra.getStartInterval(subDcl);
3924 if (subDclStartInterval == NULL ||
3925 (subDclStartInterval->getLexicalId() > curInst->getLexicalId()))
3926 {
3927 gra.setStartInterval(subDcl, curInst);
3928 }
3929 }
3930 }
3931
3932 return;
3933 }
3934
3935 void Augmentation::updateStartInterval(const G4_Declare* dcl, G4_INST* curInst)
3936 {
3937 auto dclStartInterval = gra.getStartInterval(dcl);
3938 if (dclStartInterval == NULL ||
3939 (dclStartInterval->getLexicalId() > curInst->getLexicalId()))
3940 {
3941 gra.setStartInterval(dcl, curInst);
3942 }
3943
3944 auto dclEndInterval = gra.getEndInterval(dcl);
3945 if (dclEndInterval == NULL ||
3946 (dclEndInterval->getLexicalId() < curInst->getLexicalId()))
3947 {
3948 gra.setEndInterval(dcl, curInst);
3949 }
3950 }
3951
3952 void Augmentation::updateEndInterval(const G4_Declare* dcl, G4_INST* curInst)
3953 {
3954 auto dclEndInterval = gra.getEndInterval(dcl);
3955 if (dclEndInterval == NULL ||
3956 (dclEndInterval->getLexicalId() < curInst->getLexicalId()))
3957 {
3958 gra.setEndInterval(dcl, curInst);
3959 }
3960
3961 auto dclStartInterval = gra.getStartInterval(dcl);
3962 if (dclStartInterval == NULL ||
3963 (dclStartInterval->getLexicalId() > curInst->getLexicalId()))
3964 {
3965 gra.setStartInterval(dcl, curInst);
3966 }
3967 }
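// The two helpers above are symmetric: each call widens a dcl's
// [start, end] interval just enough to cover curInst. A sketch, using
// hypothetical instructions named by their lexical ids:
//   updateStartInterval(dcl, inst20); // interval becomes [20, 20]
//   updateEndInterval(dcl, inst35);   // interval becomes [20, 35]
//   updateStartInterval(dcl, inst10); // widens to [10, 35]
//   updateEndInterval(dcl, inst30);   // no change since 30 < 35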
3968
3969 void Augmentation::updateStartIntervalForLocal(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3970 {
3971 updateStartInterval(dcl, curInst);
3972 if (dcl->getIsSplittedDcl())
3973 {
3974 updateStartIntervalForSubDcl(dcl, curInst, opnd);
3975 }
3976 }
3977
3978 void Augmentation::updateEndIntervalForLocal(G4_Declare* dcl, G4_INST* curInst, G4_Operand *opnd)
3979 {
3980 updateEndInterval(dcl, curInst);
3981 if (dcl->getIsSplittedDcl())
3982 {
3983 updateEndIntervalForSubDcl(dcl, curInst, opnd);
3984 }
3985 }
3986
3987
3988
3989 void GlobalRA::printLiveIntervals()
3990 {
3991 for (const G4_Declare * dcl : kernel.Declares)
3992 {
3993 if (getStartInterval(dcl) != nullptr ||
3994 getEndInterval(dcl) != nullptr)
3995 {
3996 DEBUG_VERBOSE(dcl->getName() << " (");
3997
3998 if (getStartInterval(dcl) != nullptr)
3999 {
4000 DEBUG_VERBOSE(getStartInterval(dcl)->getLexicalId());
4001 }
4002 else
4003 {
4004 DEBUG_VERBOSE("*");
4005 }
4006
4007 DEBUG_VERBOSE(", ");
4008
4009 if (getEndInterval(dcl) != nullptr)
4010 {
4011 DEBUG_VERBOSE(getEndInterval(dcl)->getLexicalId());
4012 }
4013 else
4014 {
4015 DEBUG_VERBOSE("*");
4016 }
4017
4018 DEBUG_VERBOSE("] " << std::endl);
4019 }
4020 }
4021 }
4022
4023 #ifdef DEBUG_VERBOSE_ON
4024 static int calculateBankConflictsInBB(G4_BB* bb, int &even_odd_num, int &low_high_num, int &threeSourceNum, bool twoSrcsBank)
4025 {
4026 int conflict_num = 0;
4027
4028 for (std::list<G4_INST*>::reverse_iterator i = bb->rbegin();
4029 i != bb->rend();
4030 i++)
4031 {
4032 bool hasSrc0 = false;
4033 int regNum0 = 0;
4034 int regNum1 = 0;
4035 int regNum2 = 0;
4036
4037 const G4_INST* inst = (*i);
4038
4039 if (!(inst->getNumSrc() == 3 && !inst->isSend()))
4040 continue;
4041
4042 const G4_Operand* src0 = inst->getSrc(0);
4043 const G4_Operand* src1 = inst->getSrc(1);
4044 const G4_Operand* src2 = inst->getSrc(2);
4045
4046
4047 if (src1 && src1->isSrcRegRegion() &&
4048 src1->getBase() && src1->getBase()->asRegVar()->isPhyRegAssigned())
4049 {
4050 regNum1 = src1->getBase()->asRegVar()->getPhyReg()->getRegNum();
4051 }
4052 if (src2 && src2->isSrcRegRegion() &&
4053 src2->getBase() && src2->getBase()->asRegVar()->isPhyRegAssigned())
4054 {
4055 regNum2 = src2->getBase()->asRegVar()->getPhyReg()->getRegNum();
4056 }
4057
4058 if ((src0 && src0->isSrcRegRegion()) &&
4059 src0->getBase() && src0->getBase()->asRegVar()->isPhyRegAssigned())
4060 {
4061 regNum0 = src0->getBase()->asRegVar()->getPhyReg()->getRegNum();
4062 }
4063
4064 if (regNum1 == regNum2 && regNum0 == regNum1)
4065 continue;
4066
4067 if (!twoSrcsBank)
4068 {
4069 if ((regNum0 < SECOND_HALF_BANK_START_GRF &&
4070 regNum1 < SECOND_HALF_BANK_START_GRF &&
4071 regNum2 < SECOND_HALF_BANK_START_GRF) ||
4072 (regNum0 >= SECOND_HALF_BANK_START_GRF &&
4073 regNum1 >= SECOND_HALF_BANK_START_GRF &&
4074 regNum2 >= SECOND_HALF_BANK_START_GRF))
4075 {
4076 if (regNum1 % 2 == regNum2 % 2 &&
4077 regNum0 % 2 == regNum1 % 2)
4078 {
4079 conflict_num++;
4080 }
4081 }
4082 }
4083 else
4084 {
4085 if ((regNum1 % 2) == (regNum2 % 2))
4086 {
4087 if ((regNum1 < SECOND_HALF_BANK_START_GRF &&
4088 regNum2 < SECOND_HALF_BANK_START_GRF) ||
4089 (regNum1 >= SECOND_HALF_BANK_START_GRF &&
4090 regNum2 >= SECOND_HALF_BANK_START_GRF))
4091 {
4092 conflict_num++;
4093 }
4094 else
4095 {
4096 low_high_num++;
4097 }
4098 }
4099 else
4100 {
4101 even_odd_num++;
4102 }
4103 }
4104 threeSourceNum++;
4105 }
4106
4107 return conflict_num;
4108 }
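// Worked example for the !twoSrcsBank path above (assuming all registers
// are below SECOND_HALF_BANK_START_GRF): src regs r2, r4, r6 share a half
// and a parity, so conflict_num is incremented; r2, r4, r5 differ in
// parity and are not counted as a conflict.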
4109
4110 static int calculateBankConflicts(G4_Kernel& kernel)
4111 {
4112 bool SIMD16 = (kernel.getSimdSize() >= 16);
4113 bool twoSrcsConflict = kernel.fg.builder->twoSourcesCollision();
4114
4115 for (G4_BB* curBB : kernel.fg)
4116 {
4117 int even_odd_num = 0;
4118 int low_high_num = 0;
4119 int threeSourceNum = 0;
4120
4121 int conflict_num = calculateBankConflictsInBB(curBB, even_odd_num, low_high_num, threeSourceNum, twoSrcsConflict);
4122 if (threeSourceNum)
4123 {
4124 printf("%s, BB: %d, Even_odd: %d, low_high: %d, Conflicts: %d, Three: %d, Insts: %d, kernel: %s\n",
4125 SIMD16 ? "SIMD16" : "SIMD8", curBB->getId(), even_odd_num, low_high_num,
4126 conflict_num, threeSourceNum, (int)curBB->size(), kernel.getName());
4127 }
4133 }
4134
4135 return 0;
4136 }
4137 #endif
4138
4139 void Augmentation::buildLiveIntervals()
4140 {
4141 // Treat variables live-in to program first
4142 G4_BB* entryBB = kernel.fg.getEntryBB();
4143
4144 // Live-in variables have their live interval start at the
4145 // first instruction of the entry BB
4146 for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
4147 {
4148 if (liveAnalysis.isLiveAtEntry(entryBB, i))
4149 {
4150 const G4_Declare* dcl = lrs[i]->getDcl()->getRootDeclare();
4151
4152 updateStartInterval(dcl, entryBB->front());
4153 }
4154 }
4155
4156 unsigned funcCnt = 0;
4157
4158 for (G4_BB* curBB : kernel.fg)
4159 {
4160 for (G4_INST* inst : *curBB)
4161 {
4162 if (inst->isPseudoKill() == true)
4163 {
4164 continue;
4165 }
4166
4167 G4_DstRegRegion* dst = inst->getDst();
4168
4169 if (inst->isCall() == true)
4170 {
4171 const char* name = kernel.fg.builder->getNameString(kernel.fg.builder->mem, 32, "SCALL_%d", funcCnt++);
4172 G4_Declare* scallDcl = kernel.fg.builder->createDeclareNoLookup(name, G4_GRF, 1, 1, Type_UD);
4173
4174 updateStartInterval(scallDcl, inst);
4175 updateEndInterval(scallDcl, inst);
4176
4177 FuncInfo* callee = curBB->getCalleeInfo();
4178 std::pair<G4_INST*, FuncInfo*> callInfo(inst, callee);
4179 callDclMap.emplace(scallDcl, callInfo);
4180
4181 continue;
4182 }
4183
4184 if (dst &&
4185 dst->getRegAccess() == Direct &&
4186 dst->getBase())
4187 {
4188 // Destination
4189 G4_Declare* defdcl = GetTopDclFromRegRegion(dst);
4190
4191 if (dst->getBase()->isRegAllocPartaker())
4192 {
4193 if (defdcl &&
4194 gra.getLocalLR(defdcl))
4195 {
4196 updateStartIntervalForLocal(defdcl, inst, dst);
4197 }
4198 else
4199 {
4200 updateStartInterval(defdcl, inst);
4201 }
4202 }
4203 else if (liveAnalysis.livenessClass(G4_GRF))
4204 {
4205 LocalLiveRange* defdclLR;
4206
4207 // Handle ranges allocated by local RA
4208 if (defdcl &&
4209 (defdclLR = gra.getLocalLR(defdcl)) &&
4210 defdclLR->getAssigned() == true &&
4211 !defdclLR->isEOT())
4212 {
4213 updateStartInterval(defdcl, inst);
4214 }
4215 }
4216 }
4217 else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
4218 dst &&
4219 dst->getRegAccess() == IndirGRF &&
4220 dst->getBase() &&
4221 dst->getBase()->isRegVar())
4222 {
4223 // Destination is indirect
4224 G4_Declare* defdcl = dst->getBaseRegVarRootDeclare();
4225
4226 updateEndInterval(defdcl, inst);
4227 }
4228
4229 if (liveAnalysis.livenessClass(G4_FLAG))
4230 {
4231 G4_CondMod* cmod = inst->getCondMod();
4232
4233 if (cmod != nullptr &&
4234 cmod->getBase() != nullptr)
4235 {
4236 // Conditional modifier
4237 G4_Declare* dcl = cmod->getBaseRegVarRootDeclare();
4238
4239 updateStartInterval(dcl, inst);
4240 }
4241 }
4242
4243 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
4244 {
4245 G4_Operand* src = inst->getSrc(i);
4246 if (!src || !src->isSrcRegRegion())
4247 {
4248 continue;
4249 }
4250 G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
4251
4252 if (srcRegion->getRegAccess() == Direct && srcRegion->getBase())
4253 {
4254 G4_Declare* usedcl = GetTopDclFromRegRegion(src);
4255
4256 if (srcRegion->getBase()->isRegAllocPartaker())
4257 {
4258 if (gra.getLocalLR(usedcl))
4259 {
4260 updateEndIntervalForLocal(usedcl, inst, src);
4261 }
4262 else
4263 {
4264 updateEndInterval(usedcl, inst);
4265 }
4266 }
4267 else if (liveAnalysis.livenessClass(G4_GRF))
4268 {
4269 LocalLiveRange* usedclLR = nullptr;
4270 if (usedcl &&
4271 (usedclLR = gra.getLocalLR(usedcl)) &&
4272 usedclLR->getAssigned() == true &&
4273 !usedclLR->isEOT())
4274 {
4275 updateEndInterval(usedcl, inst);
4276 }
4277 }
4278 }
4279 else if (liveAnalysis.livenessClass(G4_GRF) && srcRegion->isIndirect())
4280 {
4281 const REGVAR_VECTOR& pointsToSet = liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRegion, curBB);
4282 for (auto pointsToVar : pointsToSet)
4283 {
4284 if (pointsToVar.var->isRegAllocPartaker())
4285 {
4286 updateEndInterval(pointsToVar.var->getDeclare()->getRootDeclare(), inst);
4287 }
4288 }
4289 }
4290 else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
4291 srcRegion->getRegAccess() == IndirGRF &&
4292 srcRegion->getBase() &&
4293 srcRegion->getBase()->isRegVar())
4294 {
4295 G4_Declare* usedcl = src->getBaseRegVarRootDeclare();
4296
4297 updateEndInterval(usedcl, inst);
4298 }
4299 }
4300
4301 if (liveAnalysis.livenessClass(G4_FLAG))
4302 {
4303 G4_Predicate* pred = inst->getPredicate();
4304
4305 if (pred != NULL)
4306 {
4307 // Predicate
4308 G4_Declare* dcl = pred->getBaseRegVarRootDeclare();
4309
4310 updateEndInterval(dcl, inst);
4311 }
4312 }
4313 }
4314 }
4315
4316 // extend all variables that are live at bb entry to the given inst
4317 // ToDo: this seems very slow when # variable is large, should look for sparse implementation
4318 auto extendVarLiveness = [this](G4_BB* bb, G4_INST* inst)
4319 {
4320 for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
4321 {
4322 if (liveAnalysis.isLiveAtEntry(bb, i) == true)
4323 {
4324 // Extend ith live-interval
4325 G4_Declare* dcl = lrs[i]->getDcl()->getRootDeclare();
4326
4327 #ifdef DEBUG_VERBOSE_ON
4328 unsigned oldStart = gra.getStartInterval(dcl)->getLexicalId();
4329 #endif
4330
4331 updateStartInterval(dcl, inst);
4332
4333 #ifdef DEBUG_VERBOSE_ON
4334 if (oldStart > gra.getStartInterval(dcl)->getLexicalId())
4335 {
4336 std::cout << "Extending " << dcl->getName() << " from old start " <<
4337 oldStart << " to " <<
4338 gra.getStartInterval(dcl)->getLexicalId() <<
4339 " due to back-edge" <<
4340 std::endl;
4341 }
4342 #endif
4343 }
4344 }
4345 };
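// Sketch of the intended effect: for a loop whose head instruction has
// lexical id 100, a variable live into a loop-exit BB whose first def
// appears at id 150 gets its start pulled back to 100 by
// extendVarLiveness(exitBB, headInst), so the scan below treats it as
// live across the entire loop rather than only from its first def.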
4346
4347 if (!kernel.fg.isReducible())
4348 {
4349 // use SCC instead
4350 // FIXME: does augmentation work in the presence of subroutines? Neither SCCAnalysis
4351 // nor findNaturalLoops considers the call graph
4352 SCCAnalysis SCCFinder(kernel.fg);
4353 SCCFinder.run();
4354 for (auto iter = SCCFinder.SCC_begin(), iterEnd = SCCFinder.SCC_end(); iter != iterEnd; ++iter)
4355 {
4356 auto&& anSCC = *iter;
4357 std::unordered_set<G4_BB*> SCCSucc; // any successor BB of the SCC
4358 G4_BB* headBB = anSCC.getEarliestBB();
4359 for (auto BI = anSCC.body_begin(), BIEnd = anSCC.body_end(); BI != BIEnd; ++BI)
4360 {
4361 G4_BB* bb = *BI;
4362 for (auto succ : bb->Succs)
4363 {
4364 if (!anSCC.isMember(succ))
4365 {
4366 SCCSucc.insert(succ);
4367 }
4368 }
4369 }
4370 for (auto exitBB : SCCSucc)
4371 {
4372 extendVarLiveness(exitBB, headBB->front());
4373 }
4374 }
4375 }
4376 else
4377 {
4378 // process each natural loop
4379 for (auto&& iter : kernel.fg.naturalLoops)
4380 {
4381 auto&& backEdge = iter.first;
4382 G4_INST* startInst = (backEdge.second)->front();
4383 const std::set<G4_BB*>& loopBody = iter.second;
4384
4385 for (auto block : loopBody)
4386 {
4387 // FIXME: this may process a BB multiple times
4388 for (auto succBB : block->Succs)
4389 {
4390 if (loopBody.find(succBB) == loopBody.end())
4391 {
4392 G4_BB* exitBB = succBB;
4393
4394 unsigned latchBBId = (backEdge.first)->getId();
4395 unsigned exitBBId = succBB->getId();
4396 if (exitBBId < latchBBId &&
4397 succBB->Succs.size() == 1)
4398 {
4399 exitBB = succBB->Succs.front();
4400 }
4401
4402 #ifdef DEBUG_VERBOSE_ON
4403 std::cout << "==> Extend live-in for BB" << exitBB->getId() << std::endl;
4404 exitBB->emit(std::cout);
4405 #endif
4406 extendVarLiveness(exitBB, startInst);
4407 }
4408 }
4409 }
4410
4411 G4_BB* startBB = backEdge.second;
4412 G4_BB* EndBB = backEdge.first;
4413 for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
4414 {
4415 if (liveAnalysis.isLiveAtEntry(startBB, i) == true &&
4416 liveAnalysis.isLiveAtExit(EndBB, i) == true)
4417 {
4418 const G4_Declare* dcl = lrs[i]->getDcl()->getRootDeclare();
4419
4420 #ifdef DEBUG_VERBOSE_ON
4421 unsigned oldEnd = gra.getEndInterval(dcl)->getLexicalId();
4422 #endif
4423
4424 updateEndInterval(dcl, EndBB->back());
4425
4426 #ifdef DEBUG_VERBOSE_ON
4427 if (oldEnd < gra.getEndInterval(dcl)->getLexicalId())
4428 {
4429 std::cout << "Extending " << dcl->getName() << " from old end " <<
4430 oldEnd << " to " <<
4431 gra.getEndInterval(dcl)->getLexicalId() <<
4432 " due to back-edge" <<
4433 std::endl;
4434 }
4435 #endif
4436 }
4437 }
4438
4439 }
4440 }
4441
4442 #ifdef DEBUG_VERBOSE_ON
4443 // Print calculated live-ranges
4444 gra.printLiveIntervals();
4445 #endif
4446 }
4447
4448 void Augmentation::clearIntervalInfo()
4449 {
4450 // Clear out calculated information so that subsequent RA
4451 // iterations don't have stale information
4452 for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin(), end = kernel.Declares.end();
4453 dcl_it != end;
4454 dcl_it++)
4455 {
4456 gra.setStartInterval(*dcl_it, nullptr);
4457 gra.setEndInterval(*dcl_it, nullptr);
4458 gra.setMask(*dcl_it, {});
4459 gra.setAugmentationMask(*dcl_it, AugmentationMasks::Undetermined);
4460 }
4461 }
4462
4463 class compareInterval
4464 {
4465 public:
4466 GlobalRA& gra;
4467
4468 compareInterval(GlobalRA& g) : gra(g)
4469 {
4470 }
4471
4472 bool operator()(G4_Declare* dcl1, G4_Declare* dcl2)
4473 {
4474 return gra.getStartInterval(dcl1)->getLexicalId() < gra.getStartInterval(dcl2)->getLexicalId();
4475 }
4476 };
4477
4478 void Augmentation::sortLiveIntervals()
4479 {
4480 // Sort all intervals in the kernel by their start point in
4481 // ascending order and store them in the sortedIntervals vector.
4482 // According to vTune this is more efficient than an O(N) bucket
4483 // sort, since it avoids most of the malloc/free overhead of vector.resize()
4484 for (G4_Declare* dcl : kernel.Declares)
4485 {
4486 if (gra.getStartInterval(dcl) != NULL)
4487 {
4488 sortedIntervals.push_back(dcl);
4489 }
4490 }
4491
4492 std::sort(sortedIntervals.begin(), sortedIntervals.end(), compareInterval(gra));
4493
4494 #ifdef DEBUG_VERBOSE_ON
4495 DEBUG_VERBOSE("Live-intervals in sorted order: " << std::endl);
4496 for (const G4_Declare* dcl : sortedIntervals)
4497 {
4498 DEBUG_VERBOSE(dcl->getName() << " - " <<
4499 "(" << gra.getStartInterval(dcl)->getLexicalId() <<
4500 ", " << gra.getEndInterval(dcl)->getLexicalId() <<
4501 "]" << std::endl);
4502 }
4503 #endif
4504 }
4505
4506 unsigned Augmentation::getEnd(const G4_Declare* dcl) const
4507 {
4508 return gra.getEndInterval(dcl)->getLexicalId();
4509 }
4510
4511 // Mark interference between dcls. Either dcl may have a
4512 // register assigned by local RA, so handle those cases too.
4513 // Re-entrant function.
4514 void Augmentation::handleSIMDIntf(G4_Declare* firstDcl, G4_Declare* secondDcl, bool isCall)
4515 {
4516 auto markIntfWithLRAAssignment = [](const G4_Declare* firstDcl, const G4_Declare* lraAssigned, Interference& intf)
4517 {
4518 unsigned numRows = lraAssigned->getNumRows();
4519 const G4_VarBase* preg = lraAssigned->getRegVar()->getPhyReg();
4520 MUST_BE_TRUE(preg->isGreg(), "Expecting a physical register during building interference among incompatible masks");
4521 unsigned start = preg->asGreg()->getRegNum();
4522
4523 for (unsigned i = start; i < (start + numRows); i++)
4524 {
4525 auto GRFDcl = intf.getGRFDclForHRA(i);
4526 intf.checkAndSetIntf(firstDcl->getRegVar()->getId(), GRFDcl->getRegVar()->getId());
4527
4528 #ifdef DEBUG_VERBOSE_ON
4529 DEBUG_VERBOSE("Marking interference between " << firstDcl->getName() <<
4530 " and " << GRFDcl->getName() << std::endl);
4531 #endif
4532 }
4533 };
4534
4535 if (firstDcl->getRegFile() == G4_INPUT &&
4536 firstDcl->getRegVar()->getPhyReg() != NULL &&
4537 secondDcl->getRegFile() == G4_INPUT &&
4538 secondDcl->getRegVar()->getPhyReg() != NULL)
4539 {
4540 return;
4541 }
4542
4543 auto contain = [](const auto& C, auto pred)
4544 {
4545 return std::find_if(C.cbegin(), C.cend(), pred) != C.cend();
4546 };
4547
4548 bool isFirstDcl = true;
4549
4550 auto pred = [firstDcl, secondDcl, &isFirstDcl](const auto& el)
4551 {
4552 if (el.second.VCA == firstDcl) return true;
4553 if (el.second.VCA == secondDcl)
4554 {
4555 isFirstDcl = false;
4556 return true;
4557 }
4558 return false;
4559 };
4560
4561 if (contain(kernel.fg.fcallToPseudoDclMap, pred))
4562 {
4563 // Mark intf for following pattern:
4564 // V33 =
4565 // ...
4566 // if
4567 // = V33
4568 // fcall
4569 // ...
4570 // else
4571 // = V33
4572 // endif
4573 //
4574 // V33 will interfere with VCA_SAVE pseudo node.
4575 // It also needs to interfere with retval to
4576 // ensure V33 and retval don't get the same allocation.
4577 // Note that if V33 is actually live after fcall
4578 // then graph coloring will do this for us. In this
4579 // case however we need to rely on augmentation.
4580 FCALL_RET_MAP_ITER retIter = isFirstDcl ? fcallRetMap.find(firstDcl) : fcallRetMap.find(secondDcl);
4581 if (retIter != fcallRetMap.end())
4582 {
4583 G4_Declare* retVar = retIter->second;
4584 LocalLiveRange* otherDclLR;
4585 G4_Declare* otherDcl = isFirstDcl ? secondDcl : firstDcl;
4586 if (otherDcl->getRegVar()->isRegAllocPartaker())
4587 intf.checkAndSetIntf(otherDcl->getRegVar()->getId(), retVar->getRegVar()->getId());
4588 else if ((otherDclLR = gra.getLocalLR(otherDcl)) &&
4589 otherDclLR->getAssigned() &&
4590 !otherDclLR->isEOT())
4591 {
4592 markIntfWithLRAAssignment(retVar, otherDcl, intf);
4593 }
4594 }
4595 }
4596
4597 if (firstDcl->getRegVar()->isRegAllocPartaker() &&
4598 secondDcl->getRegVar()->isRegAllocPartaker())
4599 {
4600 if (!intf.varSplitCheckBeforeIntf(firstDcl->getRegVar()->getId(), secondDcl->getRegVar()->getId()))
4601 {
4602 intf.checkAndSetIntf(firstDcl->getRegVar()->getId(), secondDcl->getRegVar()->getId());
4603 if (isCall)
4604 {
4605 intf.buildInterferenceWithAllSubDcl(firstDcl->getRegVar()->getId(), secondDcl->getRegVar()->getId());
4606 }
4607 #ifdef DEBUG_VERBOSE_ON
4608 DEBUG_VERBOSE("Marking interference between " << firstDcl->getName() <<
4609 " and " << secondDcl->getName() << std::endl);
4610 #endif
4611 }
4612 }
4613 else if (liveAnalysis.livenessClass(G4_GRF))
4614 {
4615 LocalLiveRange* secondDclLR = nullptr, *firstDclLR = nullptr;
4616
4617 if (firstDcl->getRegVar()->isRegAllocPartaker() &&
4618 (secondDclLR = gra.getLocalLR(secondDcl)) &&
4619 secondDclLR->getAssigned() &&
4620 !secondDclLR->isEOT())
4621 {
4622 // secondDcl was assigned by local RA; mark interference against the GRFs it occupies
4623 markIntfWithLRAAssignment(firstDcl, secondDcl, intf);
4624 }
4625 else if (secondDcl->getRegVar()->isRegAllocPartaker() &&
4626 (firstDclLR = gra.getLocalLR(firstDcl)) &&
4627 firstDclLR->getAssigned() &&
4628 !firstDclLR->isEOT())
4629 {
4630 // Call self with reversed parameters instead of re-implementing
4631 // above code
4632 handleSIMDIntf(secondDcl, firstDcl, isCall);
4633 }
4634 }
4635 }
4636
4637 bool Augmentation::isNoMask(const G4_Declare* dcl, unsigned size) const
4638 {
4639 auto& mask = gra.getMask(dcl);
4640 bool result = false;
4641
4642 if (mask.size() > 0)
4643 {
4644 result = true;
4645
4646 for (unsigned i = 0; i < size; i++)
4647 {
4648 if (mask[i] != NOMASK_BYTE)
4649 {
4650 result = false;
4651 }
4652 }
4653 }
4654
4655 return result;
4656 }
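// Sketch: a dcl written only under NoMask records NOMASK_BYTE (0x80) for
// every byte it defines, so isNoMask(dcl, size) returns true. If even one
// byte was instead defined under an ordinary EM bit (the mask then records
// that bit instead), the check fails.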
4657
4658 bool Augmentation::isConsecutiveBits(const G4_Declare* dcl, unsigned size) const
4659 {
4660 auto& mask = gra.getMask(dcl);
4661 bool result = false;
4662
4663 if (mask.size() > 0)
4664 {
4665 result = true;
4666
4667 for (unsigned i = 0; i < size; i++)
4668 {
4669 if (mask[i] != i)
4670 {
4671 result = false;
4672 }
4673 }
4674 }
4675
4676 return result;
4677 }
4678
4679 bool Augmentation::isCompatible(const G4_Declare* testDcl, const G4_Declare* biggerDcl) const
4680 {
4681 bool compatible = false;
4682
4683 unsigned testSize = testDcl->getRegVar()->isFlag() ? testDcl->getNumberFlagElements() : testDcl->getByteSize();
4684 unsigned biggerSize = biggerDcl->getRegVar()->isFlag() ? biggerDcl->getNumberFlagElements() : biggerDcl->getByteSize();
4685 unsigned size = (testSize < biggerSize ? testSize : biggerSize);
4686
4687 // Masks are compatible when:
4688 // i. Both decls have exactly 1 EM bit defining each byte
4689 // (This means a dcl with Q1 in one inst and Q2 in another
4690 // instruction writing same subregisters is not a candidate
4691 // for next step).
4692 // ii. Bytes at common indices are enabled by same EM bit
4693 // (This means NoMask dcl is compatible with NoMask dcl and
4694 // not with any other dcl).
4695 // UPDATE: (ii) above is now altered such that NoMask dcls
4696 // that overlap are considered to be incompatible. This is to
4697 // handle removal of JIP edges (then->else edge).
4698
4699 auto& testMask = gra.getMask(testDcl);
4700 auto& biggerMask = gra.getMask(biggerDcl);
4701
4702 if (testMask.size() > 0 && biggerMask.size() > 0)
4703 {
4704 // Let's pattern match
4705 if (testDcl->getRegFile() == G4_FLAG)
4706 {
4707 if (isConsecutiveBits(testDcl, size) &&
4708 isConsecutiveBits(biggerDcl, size))
4709 {
4710 compatible = true;
4711 }
4712 }
4713 else
4714 {
4715 // Add another pattern to check here
4716 }
4717 }
4718
4719 return compatible;
4720 }
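// Sketch of the flag pattern above: two flag dcls whose masks are both
// {0, 1, ..., size-1}, i.e. flag bit i is defined exactly by EM bit i,
// pass isConsecutiveBits() and are treated as compatible, so no
// augmentation edge is needed between them.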
4721
4722 void Augmentation::expireIntervals(unsigned startIdx)
4723 {
4724 // Expire elements from both lists
4725 while (defaultMask.size() > 0)
4726 {
4727 if (gra.getEndInterval(defaultMask.front())->getLexicalId() <= startIdx)
4728 {
4729 #ifdef DEBUG_VERBOSE_ON
4730 DEBUG_VERBOSE("Expiring " << defaultMask.front()->getName() << std::endl);
4731 #endif
4732 defaultMask.pop_front();
4733 }
4734 else
4735 {
4736 break;
4737 }
4738 }
4739
4740 while (nonDefaultMask.size() > 0)
4741 {
4742 if (gra.getEndInterval(nonDefaultMask.front())->getLexicalId() <= startIdx)
4743 {
4744 #ifdef DEBUG_VERBOSE_ON
4745 DEBUG_VERBOSE("Expiring " << nonDefaultMask.front()->getName() << std::endl);
4746 #endif
4747 nonDefaultMask.pop_front();
4748 }
4749 else
4750 {
4751 break;
4752 }
4753 }
4754 }
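// Expiry sketch: if defaultMask holds intervals ending at lexical ids
// {12, 18, 40} (the list is kept sorted by end point), then
// expireIntervals(20) pops the first two entries and stops at 40. Since
// the outer scan visits start points in increasing order, only the list
// front ever needs to be inspected.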
4755
4756 // Return true if edge between dcl1 and dcl2 is strong.
4757 bool Interference::isStrongEdgeBetween(const G4_Declare* dcl1, const G4_Declare* dcl2) const
4758 {
4759 auto dcl1RegVar = dcl1->getRegVar();
4760 auto dcl2RegVar = dcl2->getRegVar();
4761 auto dcl1RAPartaker = dcl1RegVar->isRegAllocPartaker();
4762 auto dcl2RAPartaker = dcl2RegVar->isRegAllocPartaker();
4763
4764 if (dcl1RAPartaker &&
4765 dcl2RAPartaker)
4766 {
4767 if (interfereBetween(dcl1RegVar->getId(),
4768 dcl2RegVar->getId()))
4769 {
4770 return true;
4771 }
4772 else
4773 {
4774 return false;
4775 }
4776 }
4777
4778 if (dcl1RAPartaker)
4779 {
4780 auto dcl2NumRows = dcl2->getNumRows();
4781 auto startPhyReg = dcl2RegVar->getPhyReg()->asGreg()->getRegNum();
4782 auto dcl2LR = gra.getLocalLR(dcl2);
4783
4784 if (dcl2LR &&
4785 dcl2LR->getAssigned())
4786 {
4787 bool allEdgesStrong = true;
4788 for (unsigned i = startPhyReg; i < (startPhyReg + dcl2NumRows); i++)
4789 {
4790 const G4_Declare* lraPreg = getGRFDclForHRA(i);
4791 allEdgesStrong &= interfereBetween(lraPreg->getRegVar()->getId(), dcl1RegVar->getId());
4792 }
4793
4794 if (allEdgesStrong)
4795 return true;
4796 }
4797 }
4798 else
4799 {
4800 return isStrongEdgeBetween(dcl2, dcl1);
4801 }
4802
4803 return false;
4804 }
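// Sketch: if dcl1 is an RA partaker and dcl2 was assigned r10..r11 by
// local RA, the edge is strong only when dcl1 interferes with the HRA
// placeholder dcls for both r10 and r11; a single missing edge makes the
// relation weak, and a weak edge may be recorded instead.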
4805
4806 bool Augmentation::weakEdgeNeeded(AugmentationMasks defaultDclMask, AugmentationMasks newDclMask)
4807 {
4808 if (GlobalRA::useGenericAugAlign())
4809 {
4810 // Weak edge needed when the variable's size exceeds 2 GRFs
4811 if (newDclMask == AugmentationMasks::Default64Bit)
4812 return (TypeSize(Type_Q) * kernel.getSimdSizeWithSlicing()) > (unsigned)(2 * numEltPerGRF<Type_UB>());
4813
4814 if (newDclMask == AugmentationMasks::Default32Bit)
4815 {
4816 // Even align up to 2 GRFs size variable, use weak edges beyond
4817 return (TypeSize(Type_D) * kernel.getSimdSizeWithSlicing()) > (unsigned)(2 * numEltPerGRF<Type_UB>());
4818 }
4819 }
4820 else
4821 {
4822 return (defaultDclMask == AugmentationMasks::Default64Bit &&
4823 newDclMask == AugmentationMasks::Default64Bit);
4824 }
4825
4826 return false;
4827 }
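// Worked example for the generic-align path, assuming a 32-byte GRF:
// Default64Bit at SIMD16 needs TypeSize(Type_Q) * 16 = 128 bytes > 2 * 32,
// so a weak edge is used; Default32Bit at SIMD8 needs 4 * 8 = 32 bytes,
// which fits in 2 GRFs, so no weak edge is needed.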
4828
4829 //
4830 // Mark interference between newDcl and other incompatible dcls in current active lists.
4831 //
4832 void Augmentation::addSIMDIntfDclForCallSite(MaskDeclares* maskDeclares)
4833 {
4834 for (auto defaultDcl : defaultMask)
4835 {
4836 maskDeclares->first.set(defaultDcl->getRegVar()->getId(), true);
4837 }
4838
4839 for (auto nonDefaultDcl : nonDefaultMask)
4840 {
4841 maskDeclares->second.set(nonDefaultDcl->getRegVar()->getId(), true);
4842 }
4843 }
4844
4845 void Augmentation::addSIMDIntfForRetDclares(G4_Declare* newDcl)
4846 {
4847 auto dclIt = retDeclares.find(newDcl);
4848 MaskDeclares* mask = nullptr;
4849 if (dclIt == retDeclares.end())
4850 {
4851 MaskDeclares newMask;
4852 newMask.first.resize(liveAnalysis.getNumSelectedGlobalVar());
4853 newMask.second.resize(liveAnalysis.getNumSelectedGlobalVar());
4854 retDeclares[newDcl] = std::move(newMask);
4855 mask = &retDeclares[newDcl];
4856 }
4857 else
4858 {
4859 mask = &dclIt->second;
4860 }
4861 addSIMDIntfDclForCallSite(mask);
4862 }
4863
4864 //
4865 // Mark interference between newDcl and other incompatible dcls in current active lists.
4866 //
4867 void Augmentation::buildSIMDIntfDcl(G4_Declare* newDcl, bool isCall)
4868 {
4869 auto newDclAugMask = gra.getAugmentationMask(newDcl);
4870
4871 for (auto defaultDcl : defaultMask)
4872 {
4873 if (gra.getAugmentationMask(defaultDcl) != newDclAugMask)
4874 {
4875 handleSIMDIntf(defaultDcl, newDcl, isCall);
4876 }
4877 else
4878 {
4879 if (liveAnalysis.livenessClass(G4_GRF) &&
4880 // Populate compatible sparse intf data structure
4881 // only for weak edges.
4882 weakEdgeNeeded(gra.getAugmentationMask(defaultDcl), newDclAugMask))
4883 {
4884 if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
4885 newDcl->getRegVar()->isPhyRegAssigned())
4886 {
4887 continue;
4888 }
4889
4890 if (intf.isStrongEdgeBetween(defaultDcl, newDcl))
4891 {
4892 // No need to add weak edge
4893 continue;
4894 }
4895
4896 // defaultDcl and newDcl are compatible live-ranges and can have weak edge in intf graph
4897 auto it = intf.compatibleSparseIntf.find(defaultDcl);
4898 if (it != intf.compatibleSparseIntf.end())
4899 {
4900 it->second.push_back(newDcl);
4901 }
4902 else
4903 {
4904 std::vector<G4_Declare*> v(1, newDcl);
4905 intf.compatibleSparseIntf.insert(
4906 std::make_pair(defaultDcl, v));
4907 }
4908
4909 it = intf.compatibleSparseIntf.find(newDcl);
4910 if (it != intf.compatibleSparseIntf.end())
4911 {
4912 it->second.push_back(defaultDcl);
4913 }
4914 else
4915 {
4916 std::vector<G4_Declare*> v(1, defaultDcl);
4917 intf.compatibleSparseIntf.insert(
4918 std::make_pair(newDcl, v));
4919 }
4920 }
4921 }
4922 }
4923
4924 // Mark interference among non-default mask variables
4925 for (auto nonDefaultDcl : nonDefaultMask)
4926 {
4927 // Non-default masks are different so mark interference
4928 handleSIMDIntf(nonDefaultDcl, newDcl, isCall);
4929 }
4930 }
4931
4932 //
4933 // Mark interference between newDcl and other incompatible dcls in current active lists.
4934 // If newDcl was created for a subroutine call, do this for all variables in the function summary.
4935 //
4936 void Augmentation::buildSIMDIntfAll(G4_Declare* newDcl)
4937 {
4938 auto callDclMapIt = callDclMap.find(newDcl);
4939 if (callDclMapIt != callDclMap.end())
4940 {
4941
4942 G4_Declare* varDcl = NULL;
4943
4944 if (liveAnalysis.livenessClass(G4_GRF)) //For return value
4945 {
4946 G4_INST* callInst = callDclMapIt->second.first;
4947 varDcl = callInst->getDst()->getBase()->asRegVar()->getDeclare();
4948 addSIMDIntfForRetDclares(varDcl);
4949 }
4950
4951 auto& func = callDclMapIt->second.second;
4952 addSIMDIntfDclForCallSite(&callsiteDeclares[func]);
4953
4954 return;
4955 }
4956
4957 buildSIMDIntfDcl(newDcl, false);
4958 return;
4959 }
4960
4961 void Augmentation::buildSIMDIntfAllOld(G4_Declare* newDcl)
4962 {
4963 auto callDclMapIt = callDclMap.find(newDcl);
4964 if (callDclMapIt != callDclMap.end())
4965 {
4966
4967 G4_Declare* varDcl = NULL;
4968
4969 if (liveAnalysis.livenessClass(G4_GRF)) //For return value
4970 {
4971 G4_INST* callInst = callDclMapIt->second.first;
4972 varDcl = callInst->getDst()->getBase()->asRegVar()->getDeclare();
4973 buildSIMDIntfDcl(varDcl, false);
4974 }
4975
4976 auto& func = callDclMapIt->second.second;
4977 for (unsigned i = 0; i < liveAnalysis.getNumSelectedVar(); i++)
4978 {
4979 auto maydef = liveAnalysis.subroutineMaydef.find(func);
4980 if (maydef != liveAnalysis.subroutineMaydef.end() && maydef->second.isSet(i))
4981 {
4982 varDcl = lrs[i]->getDcl();
4983 buildSIMDIntfDcl(varDcl, true);
4984 }
4985 }
4986 }
4987 else
4988 {
4989 buildSIMDIntfDcl(newDcl, false);
4990 }
4991 }
4992
4993 void Augmentation::updateActiveList(G4_Declare* newDcl, std::list<G4_Declare*>* dclMaskList)
4994 {
4995 bool done = false;
4996
4997 for (auto defaultIt = dclMaskList->begin();
4998 defaultIt != dclMaskList->end();
4999 defaultIt++)
5000 {
5001 G4_Declare* defaultDcl = (*defaultIt);
5002
5003 if (gra.getEndInterval(defaultDcl)->getLexicalId() >= gra.getEndInterval(newDcl)->getLexicalId())
5004 {
5005 dclMaskList->insert(defaultIt, newDcl);
5006 done = true;
5007 break;
5008 }
5009 }
5010
5011 if (done == false)
5012 {
5013 dclMaskList->push_back(newDcl);
5014 }
5015 }
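// Sketch: with dclMaskList ordered by interval end as {A(end=10), B(end=30)},
// updateActiveList(newDcl with end=20, ...) inserts newDcl between A and B.
// Keeping the active lists sorted by end point is what lets
// expireIntervals() retire intervals by popping only from the front.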
5016
5017 //
5018 // Perform linear scan and mark interference between conflicting dcls with incompatible masks.
5019 //
5020 void Augmentation::buildInterferenceIncompatibleMask()
5021 {
5022 // Create 2 active lists: one holds active live-intervals with
5023 // non-default mask, the other those with default mask
5024
5025 for (G4_Declare *newDcl : sortedIntervals)
5026 {
5027 unsigned startIdx = gra.getStartInterval(newDcl)->getLexicalId();
5028 #ifdef DEBUG_VERBOSE_ON
5029 DEBUG_VERBOSE("New idx " << startIdx << std::endl);
5030 #endif
5031
5032 expireIntervals(startIdx);
5033 if (!kernel.fg.builder->getOption(vISA_UseOldSubRoutineAugIntf))
5034 {
5035 buildSIMDIntfAll(newDcl);
5036 }
5037 else
5038 {
5039 buildSIMDIntfAllOld(newDcl);
5040 }
5041
5042 // Add newDcl to correct list
5043 if (gra.getHasNonDefaultMaskDef(newDcl) || newDcl->getAddressed() == true)
5044 {
5045 updateActiveList(newDcl, &nonDefaultMask);
5046 #ifdef DEBUG_VERBOSE_ON
5047 DEBUG_VERBOSE("Adding " << newDcl->getName() <<
5048 " to non-default list" << std::endl);
5049 #endif
5050 }
5051 else
5052 {
5053 updateActiveList(newDcl, &defaultMask);
5054 #ifdef DEBUG_VERBOSE_ON
5055 DEBUG_VERBOSE("Adding " << newDcl->getName() <<
5056 " to default list" << std::endl);
5057 #endif
5058 }
5059 }
5060
5061 if (!kernel.fg.builder->getOption(vISA_UseOldSubRoutineAugIntf))
5062 {
5063 for (auto func : kernel.fg.funcInfoTable)
5064 {
5065 buildInteferenceForCallsite(func);
5066 }
5067 buildInteferenceForRetDeclares();
5068 }
5069 }
5070
5071 void Augmentation::buildInteferenceForCallSiteOrRetDeclare(G4_Declare* newDcl, MaskDeclares* mask)
5072 {
5073
5074 for (unsigned i = 0; i < liveAnalysis.getNumSelectedGlobalVar(); i++)
5075 {
5076 auto newDclAugMask = gra.getAugmentationMask(newDcl);
5077
5078 if (mask->first.isSet(i))
5079 {
5080 G4_Declare* defaultDcl = lrs[i]->getDcl();
5081 if (gra.getAugmentationMask(defaultDcl) != newDclAugMask)
5082 {
5083 handleSIMDIntf(defaultDcl, newDcl, true);
5084 }
5085 else
5086 {
5087 if (liveAnalysis.livenessClass(G4_GRF) &&
5088 // Populate compatible sparse intf data structure
5089 // only for weak edges.
5090 weakEdgeNeeded(gra.getAugmentationMask(defaultDcl), newDclAugMask))
5091 {
5092 if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
5093 newDcl->getRegVar()->isPhyRegAssigned())
5094 {
5095 continue;
5096 }
5097
5098 if (intf.isStrongEdgeBetween(defaultDcl, newDcl))
5099 {
5100 // No need to add weak edge
5101 continue;
5102 }
5103
5104 // defaultDcl and newDcl are compatible live-ranges and can have weak edge in intf graph
5105 auto it = intf.compatibleSparseIntf.find(defaultDcl);
5106 if (it != intf.compatibleSparseIntf.end())
5107 {
5108 it->second.push_back(newDcl);
5109 }
5110 else
5111 {
5112 std::vector<G4_Declare*> v(1, newDcl);
5113 intf.compatibleSparseIntf.insert(
5114 std::make_pair(defaultDcl, v));
5115 }
5116
5117 it = intf.compatibleSparseIntf.find(newDcl);
5118 if (it != intf.compatibleSparseIntf.end())
5119 {
5120 it->second.push_back(defaultDcl);
5121 }
5122 else
5123 {
5124 std::vector<G4_Declare*> v(1, defaultDcl);
5125 intf.compatibleSparseIntf.insert(
5126 std::make_pair(newDcl, v));
5127 }
5128 }
5129 }
5130 }
5131
5132 // Mark interference among non-default mask variables
5133 if (mask->second.isSet(i))
5134 {
5135 G4_Declare* nonDefaultDcl = lrs[i]->getDcl();
5136 // Non-default masks are different so mark interference
5137 handleSIMDIntf(nonDefaultDcl, newDcl, true);
5138 }
5139 }
5140 }
5141
5142 void Augmentation::buildInteferenceForCallsite(FuncInfo* func)
5143 {
5144 for (unsigned i = 0; i < liveAnalysis.getNumSelectedVar(); i++)
5145 {
5146 auto maydef = liveAnalysis.subroutineMaydef.find(func);
5147 if (maydef != liveAnalysis.subroutineMaydef.end() && maydef->second.isSet(i))
5148 {
5149 G4_Declare* varDcl = lrs[i]->getDcl();
5150 buildInteferenceForCallSiteOrRetDeclare(varDcl, &callsiteDeclares[func]);
5151 }
5152 }
5153 if (kernel.getOption(vISA_LocalRA))
5154 {
5155 for (uint32_t j = 0; j < kernel.getNumRegTotal(); j++)
5156 {
5157 if (localSummaryOfCallee[func].isGRFBusy(j))
5158 {
5159 G4_Declare* varDcl = gra.getGRFDclForHRA(j);
5160 buildInteferenceForCallSiteOrRetDeclare(varDcl, &callsiteDeclares[func]);
5161 }
5162 }
5163 }
5164 }
5165
5166 void Augmentation::buildInteferenceForRetDeclares()
5167 {
5168 for (auto retDclIt : retDeclares)
5169 {
5170 buildInteferenceForCallSiteOrRetDeclare(retDclIt.first, &retDclIt.second);
5171 }
5172 }
5173
5174 void Augmentation::buildSummaryForCallees()
5175 {
5176 int totalGRFNum = kernel.getNumRegTotal();
5177
5178 for (auto func : kernel.fg.sortedFuncTable)
5179 {
5180 unsigned fid = func->getId();
5181 if (fid == UINT_MAX)
5182 {
5183 // entry kernel
5184 continue;
5185 }
5186 PhyRegSummary funcSummary(totalGRFNum);
5187 for (auto&& bb : func->getBBList())
5188 {
5189 if (auto summary = kernel.fg.getBBLRASummary(bb))
5190 {
5191 for (int i = 0; i < totalGRFNum; i++)
5192 {
5193 if (summary->isGRFBusy(i))
5194 {
5195 funcSummary.setGRFBusy(i);
5196 }
5197 }
5198 }
5199 }
5200
5201 for (auto&& callee : func->getCallees())
5202 {
5203 PhyRegSummary* summary = &localSummaryOfCallee[callee];
5204 if (summary)
5205 {
5206 for (int i = 0; i < totalGRFNum; i++)
5207 {
5208 if (summary->isGRFBusy(i))
5209 {
5210 funcSummary.setGRFBusy(i);
5211 }
5212 }
5213 }
5214 }
5215 localSummaryOfCallee[func] = funcSummary;
5216 }
5217 }
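// Sketch: if callee F1's blocks use r4-r5 and its own callee F2 uses r8,
// F1's summary ends up with r4, r5 and r8 busy, and F1's callers treat
// those GRFs as clobbered at the call site. This assumes
// kernel.fg.sortedFuncTable is ordered bottom-up so that callee summaries
// are complete before their callers are processed.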
5218
5219 void Augmentation::augmentIntfGraph()
5220 {
5221 if (!(kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
5222 !liveAnalysis.livenessClass(G4_ADDRESS) &&
5223 kernel.fg.size() > 2))
5224 {
5225 if (!kernel.getOption(vISA_DumpRegChart))
5226 {
5227 return;
5228 }
5229 }
5230
5231 for (auto func : kernel.fg.funcInfoTable)
5232 {
5233 auto& item = callsiteDeclares[func];
5234 item.first.resize(liveAnalysis.getNumSelectedGlobalVar());
5235 item.second.resize(liveAnalysis.getNumSelectedGlobalVar());
5236 }
5237
5238 if (kernel.getOption(vISA_LocalRA))
5239 {
5240 buildSummaryForCallees();
5241 }
5242
5243 // First check whether any definitions exist with incompatible mask
5244 bool nonDefaultMaskDef = markNonDefaultMaskDef();
5245
5246 if (nonDefaultMaskDef == true)
5247 {
5248 // At least one definition with a non-default mask was found, so
5249 // perform steps to augment the intf graph with such defs
5250
5251 // Now build live-intervals globally. This function will
5252 // calculate live-intervals and assign start/end inst
5253 // for respective declares.
5254 buildLiveIntervals();
5255
5256 // Sort live-intervals based on their start
5257 sortLiveIntervals();
5258
5259 if (kernel.getOption(vISA_DumpRegChart))
5260 {
5261 gra.regChart = std::make_unique<RegChartDump>(gra);
5262 gra.regChart->recordLiveIntervals(sortedIntervals);
5263 }
5264
5265 if (gra.verifyAugmentation)
5266 {
5267 gra.verifyAugmentation->loadAugData(sortedIntervals, lrs, intf.liveAnalysis->getNumSelectedVar(), &intf, gra);
5268 }
5269
5270 if (kernel.getOption(vISA_SpillAnalysis))
5271 {
5272 if (gra.spillAnalysis.get())
5273 gra.spillAnalysis->LoadAugIntervals(sortedIntervals, gra);
5274 }
5275
5276 if (kernel.fg.builder->getOption(vISA_GenerateDebugInfo))
5277 {
5278 // Build the tuples here to avoid passing GlobalRA into the debug
5279 // info function, keeping its interface clean.
5280 std::vector<std::tuple<G4_Declare*, G4_INST*, G4_INST*>> dclIntervals;
5281 dclIntervals.reserve(sortedIntervals.size());
5282 for (auto& dcl : sortedIntervals)
5283 {
5284 dclIntervals.push_back(std::make_tuple(dcl, gra.getStartInterval(dcl), gra.getEndInterval(dcl)));
5285 }
5286 updateDebugInfo(kernel, dclIntervals);
5287 }
5288
5289 // Perform linear scan to augment graph
5290 buildInterferenceIncompatibleMask();
5291
5292 if (liveAnalysis.livenessClass(G4_GRF))
5293 {
5294 if ((GlobalRA::useGenericAugAlign() && kernel.getSimdSize() >= numEltPerGRF<Type_UD>()) ||
5295 (!GlobalRA::useGenericAugAlign() && kernel.getSimdSize() > numEltPerGRF<Type_UD>()))
5296 {
5297 // Set alignment of all GRF candidates
5298 // to 2GRF except for NoMask variables
5299 #ifdef DEBUG_VERBOSE_ON
5300 DEBUG_VERBOSE("Kernel size is SIMD" << kernel.getSimdSize() << " so updating all GRFs to be 2GRF aligned" << std::endl);
5301 #endif
5302 gra.evenAlign();
5303 }
5304 gra.updateSubRegAlignment(GRFALIGN);
5305 }
5306
5307 // Clear information calculated in this iteration of RA so
5308 // a later RA iteration does not use stale information
5309 clearIntervalInfo();
5310 }
5311 }
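// Informal recap of augmentIntfGraph's pipeline: markNonDefaultMaskDef ->
// buildLiveIntervals -> sortLiveIntervals -> buildInterferenceIncompatibleMask
// -> evenAlign / updateSubRegAlignment -> clearIntervalInfo. Everything is
// recomputed per RA iteration; nothing here survives into the next round.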
5312
5313 void Interference::buildInterferenceWithLocalRA(G4_BB* bb)
5314 {
5315 auto LRASummary = kernel.fg.getBBLRASummary(bb);
5316 if (LRASummary == nullptr)
5317 {
5318 return;
5319 }
5320
5321 BitSet cur(kernel.getNumRegTotal(), true);
5322 BitSet live(maxId, false);
5323 std::vector<int> curUpdate;
5324
5325 buildInterferenceAtBBExit(bb, live);
5326
5327 #ifdef DEBUG_VERBOSE_ON
5328 DEBUG_VERBOSE("BB" << bb->getId() << std::endl);
5329 #endif
5330
5331 for (INST_LIST_RITER rit = bb->rbegin(), rend = bb->rend();
5332 rit != rend;
5333 rit++)
5334 {
5335 bool update = false;
5336 G4_INST* inst = (*rit);
5337 curUpdate.clear();
5338
5339 #ifdef DEBUG_VERBOSE_ON
5340 inst->emit(COUT_ERROR);
5341 DEBUG_VERBOSE(" //" << inst->getLineNo() << ":$" << inst->getCISAOff());
5342 DEBUG_VERBOSE(std::endl);
5343 #endif
5344
5345 // Any physical registers defined will be marked available if
5346 // current inst is first def or if complete region is written
5347 G4_DstRegRegion* dst = inst->getDst();
5348
5349 if (dst &&
5350 dst->getBase()->isRegVar())
5351 {
5352 LocalLiveRange* localLR = NULL;
5353 G4_Declare* topdcl = GetTopDclFromRegRegion(dst);
5354 unsigned t;
5355
5356 if (topdcl)
5357 localLR = gra.getLocalLR(topdcl);
5358
5359 if (localLR && localLR->getAssigned() && !localLR->isEOT())
5360 {
5361 int reg, sreg, numrows;
5362 G4_VarBase* preg = localLR->getPhyReg(sreg);
5363 numrows = localLR->getTopDcl()->getNumRows();
5364
5365 MUST_BE_TRUE(preg->isGreg(), "Register in dst was not GRF");
5366
5367 reg = preg->asGreg()->getRegNum();
5368
5369 // Check whether the dst physical register is busy/available.
5370 // If it is available, and we still see a def that means there was no
5371 // corresponding use. In such cases mark the physical register as
5372 // busy, so interference building can take place correctly.
5373 for (int j = reg, sum = reg + numrows; j < sum; j++)
5374 {
5375 int k = getGRFDclForHRA(j)->getRegVar()->getId();
5376
5377 if (cur.isSet(j) == true)
5378 {
5379 buildInterferenceWithLive(live, k);
5380 #ifdef DEBUG_VERBOSE_ON
5381 DEBUG_VERBOSE("Found no use for r" << j << ".0 so marking it as interfering with live set" << std::endl);
5382 #endif
5383 }
5384 }
5385
5386 if ((localLR->getFirstRef(t) == inst) ||
5387 liveAnalysis->writeWholeRegion(bb, inst, dst, builder.getOptions()))
5388 {
5389 // The last row may be only partially used by the current dcl,
5390 // so we still need to pessimistically mark it as busy, because
5391 // some other live src opnd may still be using the remaining
5392 // part of the GRF.
5393 if (localLR->getSizeInWords() % numEltPerGRF<Type_UW>() != 0)
5394 numrows--;
5395
5396 for (int j = reg, sum = reg + numrows; j < sum; j++)
5397 {
5398 cur.set(j, true);
5399 #ifdef DEBUG_VERBOSE_ON
5400 DEBUG_VERBOSE("Setting r" << j << ".0 available" << std::endl);
5401 #endif
5402 }
5403
5404 // Build interference only for point ranges, which ideally shouldn't exist.
5405 // These are ranges that have a def but no use.
5406 if (localLR->getFirstRef(t) == localLR->getLastRef(t))
5407 {
5408 for (int j = reg; j < reg + localLR->getTopDcl()->getNumRows(); j++)
5409 {
5410 int k = getGRFDclForHRA(j)->getRegVar()->getId();
5411 buildInterferenceWithLive(live, k);
5412 }
5413 }
5414 }
5415 }
5416 else if (dst->getBase()->isRegAllocPartaker()) {
5417 // Global range
5418
5419 // In bottom-up order, if the live-range has not started yet then
5420 // no use was seen for this def. We still need to ensure this
5421 // variable interferes with all other live vars.
5422 bool isPointRange = !live.isSet(dst->getBase()->asRegVar()->getId());
5423
5424 if (isPointRange)
5425 {
5426 // Mark interference with all busy physical registers
5427 for (unsigned i = 0; i < kernel.getNumRegTotal(); i++)
5428 {
5429 if (cur.isSet(i) == false)
5430 {
5431 int k = getGRFDclForHRA(i)->getRegVar()->getId();
5432 checkAndSetIntf(dst->getBase()->asRegVar()->getId(), k);
5433 }
5434 }
5435 }
5436
5437 if (liveAnalysis->writeWholeRegion(bb, inst, dst, builder.getOptions()) ||
5438 inst->isPseudoKill())
5439 {
5440 // Whole write or first def found so mark this operand as not live for earlier instructions
5441 auto id = dst->getBase()->asRegVar()->getId();
5442 updateLiveness(live, id, false);
5443 }
5444 }
5445 }
5446
5447 // Any physical registers used by src opnds will be busy before the current inst
5448 for (int i = 0; i < G4_MAX_SRCS; i++)
5449 {
5450 G4_Operand* src = inst->getSrc(i);
5451
5452 if (src &&
5453 src->isSrcRegRegion() &&
5454 src->asSrcRegRegion()->getBase()->isRegVar())
5455 {
5456 LocalLiveRange* localLR = NULL;
5457 G4_Declare* topdcl = GetTopDclFromRegRegion(src);
5458
5459 if (topdcl)
5460 localLR = gra.getLocalLR(topdcl);
5461
5462 if (localLR && localLR->getAssigned() && !localLR->isEOT())
5463 {
5464 int sreg;
5465 G4_VarBase* preg = localLR->getPhyReg(sreg);
5466 int numrows = localLR->getTopDcl()->getNumRows();
5467
5468 MUST_BE_TRUE(preg->isGreg(), "Register in src was not GRF");
5469
5470 int reg = preg->asGreg()->getRegNum();
5471
5472 for (int j = reg, sum = reg + numrows; j < sum; j++)
5473 {
5474 int k = getGRFDclForHRA(j)->getRegVar()->getId();
5475
5476 if (cur.isSet(j) == true)
5477 {
5478 // G4_RegVar with id k was marked free, but becomes
5479 // busy at this instruction. For incremental updates
5480 // push this to a vector and use it while updating
5481 // interference graph incrementally.
5482 curUpdate.push_back(k);
5483 }
5484
5485 cur.set(j, false);
5486 #ifdef DEBUG_VERBOSE_ON
5487 DEBUG_VERBOSE("Setting r" << j << ".0 busy" << std::endl);
5488 #endif
5489 }
5490 }
5491 else if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker())
5492 {
5493 if (live.isSet(src->asSrcRegRegion()->getBase()->asRegVar()->getId()) == false)
5494 update = true;
5495
5496 // Mark operand as live from this inst upwards
5497 auto id = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
5498 updateLiveness(live, id, true);
5499 }
5500 }
5501 }
5502
5503 if (update == true)
5504 {
5505 // Mark interference with all live
5506 for (unsigned i = 0; i < kernel.getNumRegTotal(); i++)
5507 {
5508 if (cur.isSet(i) == false)
5509 {
5510 int k = getGRFDclForHRA(i)->getRegVar()->getId();
5511 buildInterferenceWithLive(live, k);
5512 }
5513 }
5514 }
5515 else {
5516 if (curUpdate.size() > 0)
5517 {
5518 // Perform incremental update. This code is executed when:
5519 // 1) live set is unchanged, ie no new global range was started in inst
5520 // 2) cur set has changed, ie an earlier free GRF has become busy
5521 // Any new busy GRFs will have to be marked as interfering with
5522 // currently live-ranges. There is no need to iterate over all
5523 // busy GRFs. Instead only those GRFs that have got busy in this iteration
5524 // can be considered for incremental updates.
5525 for (int k : curUpdate)
5526 {
5527 buildInterferenceWithLive(live, k);
5528 }
5529 }
5530 }
5531 }
5532
5533 for (unsigned i = 0; i < maxId; i++)
5534 {
5535 bool isAddrSensitive = liveAnalysis->isAddressSensitive(i);
5536
5537 // If a range is Address taken AND (live-in or live-out or killed)
5538 // mark it to interfere with all physical registers used by local RA
5539 // FIXME: need to check if this is actually needed
5540 if (isAddrSensitive)
5541 {
5542 bool assigned = (lrs[i]->getVar()->getPhyReg() != NULL);
5543 if (!assigned)
5544 {
5545 bool isLiveIn = liveAnalysis->isLiveAtEntry(bb, i);
5546 bool isLiveOut = liveAnalysis->isLiveAtExit(bb, i);
5547 bool isKilled = liveAnalysis->use_kill[bb->getId()].isSet(i);
5548 if (isLiveIn || isLiveOut || isKilled)
5549 {
5550 // Make it to interfere with all physical registers used in the BB
5551 for (uint32_t j = 0, numReg = kernel.getNumRegTotal(); j < numReg; j++)
5552 {
5553 if (LRASummary->isGRFBusy(j))
5554 {
5555 int k = getGRFDclForHRA(j)->getRegVar()->getId();
5556 checkAndSetIntf(i, k);
5557 }
5558 }
5559 }
5560 }
5561 }
5562 }
5563 }
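// Sketch of the bottom-up scan above: walking the BB in reverse, a GRF
// assigned by local RA turns busy at its (lexically) last use and free
// again at its first def, so "cur" tracks GRFs free below the current
// inst. Global ranges live inside that window get edges to the HRA
// placeholder dcls of every busy GRF.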
5564
5565
5566 void Interference::interferenceVerificationForSplit() const
5567 {
5568
5569 std::cout << "\n\n **** Interference Verification Table ****\n";
5570 for (unsigned i = 0; i < maxId; i++)
5571 {
5572 std::cout << "(" << i << ") ";
5573 //lrs[i]->dump();
5574 for (unsigned j = 0; j < maxId; j++)
5575 {
5576 if (interfereBetween(i, j))
5577 {
5578 if (!interfereBetween(gra.getSplittedDeclare(lrs[i]->getDcl())->getRegVar()->getId(), j) &&
5579 (gra.getSplittedDeclare(lrs[i]->getDcl()) != lrs[j]->getDcl()))
5580 {
5581 std::cout << "\t";
5582 lrs[j]->getVar()->emit(std::cout);
5583 }
5584 }
5585 }
5586 std::cout << "\n";
5587 }
5588 }
5589
5590 bool Interference::linearScanVerify() const
5591 {
5592 std::cout << "--------------- " << kernel.getName() << " ----------------" << "\n";
5593
5594 for (unsigned i = 0; i < maxId; i++)
5595 {
5596 G4_VarBase* phyReg_i = lrs[i]->getVar()->getPhyReg();
5597 if (!phyReg_i || !phyReg_i->isGreg() || gra.isUndefinedDcl(lrs[i]->getDcl()) || lrs[i]->getDcl()->getRegVar()->isNullReg())
5598 {
5599 continue;
5600 }
5601 unsigned regOff_i = lrs[i]->getVar()->getPhyRegOff() * lrs[i]->getVar()->getDeclare()->getElemSize();
5602 unsigned GRFStart_i = phyReg_i->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() + regOff_i;
5603 unsigned elemsSize_i = lrs[i]->getVar()->getDeclare()->getNumElems() * lrs[i]->getVar()->getDeclare()->getElemSize();
5604 unsigned GRFEnd_i = GRFStart_i + elemsSize_i - 1;
5605
5606 for (unsigned j = 0; j < maxId; j++)
5607 {
5608 if (interfereBetween(i, j))
5609 {
5610 if (gra.isUndefinedDcl(lrs[j]->getDcl()) || builder.kernel.fg.isPseudoDcl(lrs[j]->getDcl()) || lrs[j]->getDcl()->getRegVar()->isNullReg())
5611 {
5612 continue;
5613 }
5614
5615 G4_VarBase* phyReg_j = lrs[j]->getVar()->getPhyReg();
5616 unsigned regOff_j = lrs[j]->getVar()->getPhyRegOff() * lrs[j]->getVar()->getDeclare()->getElemSize();
5617 unsigned GRFStart_j = phyReg_j->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() + regOff_j;
5618 unsigned elemsSize_j = lrs[j]->getVar()->getDeclare()->getNumElems() * lrs[j]->getVar()->getDeclare()->getElemSize();
5619 unsigned GRFEnd_j = GRFStart_j + elemsSize_j - 1;
5620 if (!(GRFEnd_i < GRFStart_j || GRFEnd_j < GRFStart_i))
5621 {
5622 LSLiveRange* i_LSLR = gra.getLSLR(lrs[i]->getDcl());
5623 LSLiveRange* j_LSLR = gra.getLSLR(lrs[j]->getDcl());
5624 unsigned i_start = 0;
5625 unsigned i_end = 0;
5626 if (i_LSLR) // may be null for stack-call and other declares added after allocation
5627 {
5628 i_LSLR->getFirstRef(i_start);
5629 i_LSLR->getLastRef(i_end);
5630 }
5631
5632 unsigned j_start = 0;
5633 unsigned j_end = 0;
5634 if (j_LSLR)
5635 {
5636 j_LSLR->getFirstRef(j_start);
5637 j_LSLR->getLastRef(j_end);
5638 }
5639
5640 std::cout << "(" << i << "," << j << ")" << lrs[i]->getDcl()->getName() << "(" << GRFStart_i << ":" << GRFEnd_i << ")[" << i_start << "," << i_end << "] vs "
5641 << lrs[j]->getDcl()->getName() << "(" << GRFStart_j << ":" << GRFEnd_j << ")[" << j_start << "," << j_end << "]" << "\n";
5642 }
5643 }
5644 }
5645 }
5646
5647 return true;
5648 }
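// The overlap test above is the usual closed-interval rule:
// [s1, e1] and [s2, e2] overlap iff !(e1 < s2 || e2 < s1). For example,
// byte ranges [64, 95] and [96, 127] (two adjacent GRFs at 32 bytes each)
// are disjoint, while [64, 95] and [80, 143] overlap and get reported.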
5649
5650 void Interference::dumpInterference() const
5651 {
5652
5653 std::cout << "\n\n **** Interference Table ****\n";
5654 for (unsigned i = 0; i < maxId; i++)
5655 {
5656 std::cout << "(" << i << ") ";
5657 lrs[i]->dump();
5658 std::cout << "\n";
5659 for (unsigned j = 0; j < maxId; j++)
5660 {
5661 if (interfereBetween(i, j))
5662 {
5663 std::cout << "\t";
5664 lrs[j]->getVar()->emit(std::cout);
5665 }
5666 }
5667 std::cout << "\n\n";
5668 }
5669 }
5670
5671 void Interference::dumpVarInterference() const
5672 {
5673
5674 std::cout << "\n\n **** Var Interference Table ****\n";
5675 for (G4_Declare* decl : gra.kernel.Declares)
5676 {
5677 if (decl->getRegVar()->isRegAllocPartaker())
5678 {
5679 unsigned i = decl->getRegVar()->getId();
5680 //std::cout << "(" << i << ") ";
5681 lrs[i]->dump();
5682 std::cout << "\n";
5683 for (G4_Declare* decl : gra.kernel.Declares)
5684 {
5685 if (decl->getRegVar()->isRegAllocPartaker())
5686 {
5687 unsigned j = decl->getRegVar()->getId();
5688 if (interfereBetween(i, j))
5689 {
5690 std::cout << "\t";
5691 lrs[j]->getVar()->emit(std::cout);
5692 }
5693 }
5694 }
5695 std::cout << "\n\n";
5696 }
5697 }
5698 }
5699
5700 GraphColor::GraphColor(LivenessAnalysis& live, unsigned totalGRF, bool hybrid, bool forceSpill_) :
5701 gra(live.gra), totalGRFRegCount(totalGRF), numVar(live.getNumSelectedVar()), numSplitStartID(live.getNumSplitStartID()), numSplitVar(live.getNumSplitVar()),
5702 intf(&live, lrs, live.getNumSelectedVar(), live.getNumSplitStartID(), live.getNumSplitVar(), gra), regPool(gra.regPool),
5703 builder(gra.builder), isHybrid(hybrid),
5704 forceSpill(forceSpill_), mem(GRAPH_COLOR_MEM_SIZE),
5705 kernel(gra.kernel), liveAnalysis(live)
5706 {
5707 spAddrRegSig = (unsigned*)mem.alloc(getNumAddrRegisters() * sizeof(unsigned));
5708 m_options = builder.getOptions();
5709 }
5710
5711 //
5712 // lrs[i] gives the live range whose id is i
5713 //
5714 void GraphColor::createLiveRanges(unsigned reserveSpillSize)
5715 {
5716 lrs = (LiveRange**)mem.alloc(sizeof(LiveRange*)*numVar);
5717 bool hasStackCall = builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc();
5718 // Modification For Alias Dcl
5719 for (auto dcl : gra.kernel.Declares)
5720 {
5721 G4_RegVar* var = dcl->getRegVar();
5722 // Do not include alias var in liverange creation
5723 if (!var->isRegAllocPartaker() || dcl->getAliasDeclare() != NULL)
5724 {
5725 continue;
5726 }
5727 lrs[var->getId()] = new (mem)LiveRange(var, this->gra);
5728 unsigned reservedGRFNum = m_options->getuInt32Option(vISA_ReservedGRFNum);
5729
5730 if (builder.kernel.fg.isPseudoDcl(dcl))
5731 {
5732 lrs[var->getId()]->setIsPseudoNode();
5733 }
5734 if (dcl->getIsPartialDcl())
5735 {
5736 if (G4_Declare * parentDcl = this->gra.getSplittedDeclare(dcl))
5737 {
5738 lrs[var->getId()]->setParentLRID(parentDcl->getRegVar()->getId());
5739 lrs[var->getId()]->setIsPartialDcl();
5740 }
5741 }
5742 if (dcl->getIsSplittedDcl())
5743 {
5744 lrs[var->getId()]->setIsSplittedDcl(true);
5745 }
5746 lrs[var->getId()]->setBC(gra.getBankConflict(dcl));
5747
5748 lrs[var->getId()]->allocForbidden(mem, hasStackCall, reserveSpillSize, reservedGRFNum);
5749 lrs[var->getId()]->setCallerSaveBias(hasStackCall);
5750 G4_Declare* varDcl = lrs[var->getId()]->getDcl();
5751 if (builder.kernel.fg.isPseudoVCADcl(varDcl))
5752 {
5753 lrs[var->getId()]->allocForbiddenCallerSave(mem, &builder.kernel);
5754 }
5755 else if (builder.kernel.fg.isPseudoVCEDcl(varDcl))
5756 {
5757 lrs[var->getId()]->allocForbiddenCalleeSave(mem, &builder.kernel);
5758 }
5759 else if (varDcl == gra.getOldFPDcl())
5760 {
5761 lrs[var->getId()]->allocForbiddenCallerSave(mem, &builder.kernel);
5762 }
5763 }
5764 }
5765
5766 void GraphColor::computeDegreeForGRF()
5767 {
5768 for (unsigned i = 0; i < numVar; i++)
5769 {
5770 unsigned degree = 0;
5771
5772 if (!(lrs[i]->getIsPseudoNode()) &&
5773 !(lrs[i]->getIsPartialDcl()))
5774 {
5775 const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(i);
5776 unsigned bankDegree = 0;
5777 auto lraBC = lrs[i]->getBC();
5778 bool isOdd = (lraBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
5779 lraBC == BANK_CONFLICT_SECOND_HALF_ODD);
5780
5781
5782 auto computeDegree = [&](LiveRange* lr1)
5783 {
5784 if (!lr1->getIsPartialDcl())
5785 {
5786 unsigned edgeDegree = edgeWeightGRF(lrs[i], lr1);
5787
5788 degree += edgeDegree;
5789
5790 auto lrsitBC = lr1->getBC();
5791 bool isOddBC = (lrsitBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
5792 lrsitBC == BANK_CONFLICT_SECOND_HALF_ODD);
5793
5794 if ((isOdd && isOddBC) ||
5795 (!isOdd && !isOddBC))
5796 {
5797 bankDegree += edgeDegree;
5798 }
5799 }
5800 };
5801
5802 for (auto it : intfs)
5803 {
5804 computeDegree(lrs[it]);
5805 }
5806
5807 // consider weak edges in degree computation
5808 auto* weakEdges = intf.getCompatibleSparseIntf(lrs[i]->getDcl());
5809 if (weakEdges)
5810 {
5811 for (auto weakNeighbor : *weakEdges)
5812 {
5813 if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
5814 continue;
5815
5816 computeDegree(lrs[weakNeighbor->getRegVar()->getId()]);
5817 }
5818 }
5819
5820 if (isOdd)
5821 {
5822 oddTotalDegree += bankDegree; //std::max(bankDegree, oddMaxDegree);
5823 oddTotalRegNum += lrs[i]->getNumRegNeeded();
5824 oddMaxRegNum = std::max(oddMaxRegNum, lrs[i]->getNumRegNeeded());
5825 }
5826 else
5827 {
5828 evenTotalDegree += bankDegree; //std::max(bankDegree, evenMaxDegree);
5829 evenTotalRegNum += lrs[i]->getNumRegNeeded();
5830 evenMaxRegNum = std::max(evenMaxRegNum, lrs[i]->getNumRegNeeded());
5831 }
5832 }
5833
5834 lrs[i]->setDegree(degree);
5835 }
5836
5837 if (kernel.getOption(vISA_SpillAnalysis))
5838 {
5839 for (unsigned int i = 0; i != numVar; ++i)
5840 {
5841 auto dcl = lrs[i]->getDcl();
5842 auto degree = lrs[i]->getDegree();
5843 gra.spillAnalysis->LoadDegree(dcl, degree);
5844 }
5845 }
5846 }
5847
5848 void GraphColor::computeDegreeForARF()
5849 {
5850 for (unsigned i = 0; i < numVar; i++)
5851 {
5852 unsigned degree = 0;
5853
5854 if (!(lrs[i]->getIsPseudoNode()))
5855 {
5856 const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(i);
5857 for (auto it : intfs)
5858 {
5859 degree += edgeWeightARF(lrs[i], lrs[it]);
5860 }
5861 }
5862
5863 lrs[i]->setDegree(degree);
5864 }
5865 }
5866
5867 void GraphColor::computeSpillCosts(bool useSplitLLRHeuristic)
5868 {
5869 std::vector <LiveRange *> addressSensitiveVars;
5870 float maxNormalCost = 0.0f;
5871
5872 for (unsigned i = 0; i < numVar; i++)
5873 {
5874 G4_Declare* dcl = lrs[i]->getDcl();
5875
5876 if (dcl->getIsPartialDcl())
5877 {
5878 continue;
5879 }
5880 //
5881 // The spill cost of pseudo nodes inserted to aid generation of save/restore code
5882 // must be the minimum so that such nodes go to the bottom of the color stack.
5883 //
5884 if (builder.kernel.fg.isPseudoDcl(dcl))
5885 {
5886 if (builder.kernel.fg.isPseudoVCADcl(dcl))
5887 {
5888 lrs[i]->setSpillCost(MINSPILLCOST + 1);
5889 }
5890 else
5891 {
5892 lrs[i]->setSpillCost(MINSPILLCOST);
5893 }
5894 }
5895
5896 auto dclLR = gra.getLocalLR(dcl);
5897 if (dclLR != NULL &&
5898 dclLR->getSplit())
5899 {
5900 lrs[i]->setSpillCost(MINSPILLCOST + 2);
5901 }
5902 //
5903 // Give the tiny spill/fill ranges an infinite spill cost, so that they are
5904 // picked first for coloring.
5905 // Also ARF live ranges with exclusively sequential references within the code are
5906 // assigned an infinite spill cost as spilling them will not lower the register
5907 // pressure in the region they are referenced. This does not necessarily hold for
5908 // GRF live ranges, as these are potentially large in size but the portions
5909 // accessed by each sequential use are limited to 2 registers for general instructions
5910 // and 8 registers for SEND instructions.
5911 //
5912 else if (gra.isAddrFlagSpillDcl(dcl) ||
5913 lrs[i]->isRetIp() ||
5914 lrs[i]->getIsInfiniteSpillCost() == true ||
5915 ((lrs[i]->getVar()->isRegVarTransient() == true ||
5916 lrs[i]->getVar()->isRegVarTmp() == true) &&
5917 lrs[i]->getVar()->isSpilled() == false) ||
5918 dcl == gra.getOldFPDcl() ||
5919 (m_options->getOption(vISA_enablePreemption) &&
5920 dcl == builder.getBuiltinR0()))
5921 {
5922 lrs[i]->setSpillCost(MAXSPILLCOST);
5923 }
5924 else if (dcl->isDoNotSpill())
5925 {
5926 lrs[i]->setSpillCost(MAXSPILLCOST);
5927 }
5928 //
5929 // Calculate spill costs of regular nodes.
5930 //
5931 else
5932 {
5933 float spillCost = 0.0f;
5934 // NOTE: Add 1 to degree to avoid divide-by-0, as a live range may have no neighbors
5935 if (builder.kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D)
5936 {
5937 if (useSplitLLRHeuristic)
5938 {
5939 spillCost = 1.0f*lrs[i]->getRefCount() / (lrs[i]->getDegree() + 1);
5940 }
5941 else
5942 {
5943 assert(lrs[i]->getDcl()->getTotalElems() > 0);
5944 unsigned short numRows = lrs[i]->getDcl()->getNumRows();
5945 spillCost = 1.0f * lrs[i]->getRefCount() * lrs[i]->getRefCount() * lrs[i]->getDcl()->getByteSize() *
5946 (float)sqrt(lrs[i]->getDcl()->getByteSize())
5947 / ((float)sqrt(lrs[i]->getDegree() + 1) * (float)(sqrt(sqrt(numRows))));
5948 }
5949 }
5950 else
5951 {
5952 spillCost =
5953 liveAnalysis.livenessClass(G4_GRF) ?
5954 lrs[i]->getDegree() : 1.0f*lrs[i]->getRefCount()*lrs[i]->getRefCount() / (lrs[i]->getDegree() + 1);
5955 }
5956
5957 lrs[i]->setSpillCost(spillCost);
5958
5959 // Track address sensitive live range.
5960 if (liveAnalysis.isAddressSensitive(i))
5961 {
5962 addressSensitiveVars.push_back(lrs[i]);
5963 }
5964 else
5965 {
5966 // Set the spill cost of all other normal live ranges, and
5967 // track the max normal cost.
5968 if (maxNormalCost < spillCost)
5969 {
5970 maxNormalCost = spillCost;
5971 }
5972 }
5973 }
5974 }
5975
5976 //
5977 // Set the spill cost of address sensitive live ranges above all the
5978 // normal live ranges, so that they get colored before all the normal
5979 // live ranges.
5980 //
5981 for (LiveRange *lr : addressSensitiveVars)
5982 {
5983 if (lr->getSpillCost() != MAXSPILLCOST)
5984 {
5985 lr->setSpillCost(maxNormalCost + lr->getSpillCost());
5986 }
5987 }
5988 }
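
//
// Illustrative sketch (not called by the allocator): the VISA_3D spill-cost
// heuristic above, isolated as a pure function. The name and parameters are
// hypothetical stand-ins for the LiveRange/G4_Declare accessors used in
// computeSpillCosts.
//
static float spillCostSketch3D(unsigned refCount, unsigned byteSize,
    unsigned degree, unsigned short numRows)
{
    // cost grows with use count and size; shrinks with interference degree
    return 1.0f * refCount * refCount * byteSize * (float)sqrt(byteSize) /
        ((float)sqrt(degree + 1) * (float)sqrt(sqrt(numRows)));
}
// e.g. refCount=4, byteSize=64, degree=63, numRows=2 gives
// 16 * 64 * 8 / (8 * 1.19) ~= 861: heavily referenced, low-degree ranges get
// high cost (colored early), while large rarely-used ranges get low cost and
// become spill candidates first.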
5989
5990
5991 //
5992 // subtract lr's neighbors that are still in work list
5993 //
5994 void GraphColor::relaxNeighborDegreeGRF(LiveRange* lr)
5995 {
5996 if (!(lr->getIsPseudoNode()) &&
5997 !(lr->getIsPartialDcl()))
5998 {
5999 unsigned lr_id = lr->getVar()->getId();
6000
6001 // relax degree between 2 nodes
6002 auto relaxDegree = [&](LiveRange* lr1)
6003 {
6004 if (lr1->getActive() &&
6005 !lr1->getIsPseudoNode() &&
6006 !(lr1->getIsPartialDcl()))
6007 {
6008 unsigned w = edgeWeightGRF(lr1, lr);
6009
6010 #ifdef DEBUG_VERBOSE_ON
6011 DEBUG_VERBOSE("\t relax ");
6012 lr1->dump();
6013 DEBUG_VERBOSE(" degree(" << lr1->getDegree() << ") - " << w << std::endl);
6014 #endif
6015 lr1->subtractDegree(w);
6016
6017 unsigned availColor = numColor - lr1->getNumForbidden();
6019
6020 if (lr1->getDegree() + lr1->getNumRegNeeded() <= availColor)
6021 {
6022 unconstrainedWorklist.push_back(lr1);
6023 lr1->setActive(false);
6024 }
6025 }
6026 };
6027
6028 const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(lr_id);
6029 for (auto it : intfs)
6030 {
6031 LiveRange* lrs_it = lrs[it];
6032
6033 relaxDegree(lrs_it);
6034 }
6035
6036 auto* weakEdges = intf.getCompatibleSparseIntf(lr->getDcl());
6037 if (weakEdges)
6038 {
6039 for (auto weakNeighbor : *weakEdges)
6040 {
6041 if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
6042 continue;
6043 auto lr1 = lrs[weakNeighbor->getRegVar()->getId()];
6044 relaxDegree(lr1);
6045 }
6046 }
6047 }
6048 }
6049 void GraphColor::relaxNeighborDegreeARF(LiveRange* lr)
6050 {
6051 if (!(lr->getIsPseudoNode()))
6052 {
6053 unsigned lr_id = lr->getVar()->getId();
6054 const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(lr_id);
6055 for (auto it : intfs)
6056 {
6057 LiveRange* lrs_it = lrs[it];
6058
6059 if (lrs_it->getActive() &&
6060 !lrs_it->getIsPseudoNode())
6061 {
6062 unsigned w = edgeWeightARF(lrs_it, lr);
6063
6064 #ifdef DEBUG_VERBOSE_ON
6065 DEBUG_VERBOSE("\t relax ");
6066 lrs_it->dump();
6067 DEBUG_VERBOSE(" degree(" << lrs_it->getDegree() << ") - " << w << std::endl);
6068 #endif
6069 lrs_it->subtractDegree(w);
6070
6071 unsigned availColor = numColor;
6072
6073 if (lrs_it->getDegree() + lrs_it->getNumRegNeeded() <= availColor)
6074 {
6075 unconstrainedWorklist.push_back(lrs_it);
6076 lrs_it->setActive(false);
6077 }
6078 }
6079 }
6080 }
6081 }
6082
6083
6084 static bool compareSpillCost(LiveRange* lr1, LiveRange* lr2)
6085 {
6086 return lr1->getSpillCost() < lr2->getSpillCost() ||
6087 (lr1->getSpillCost() == lr2->getSpillCost() && lr1->getVar()->getId() < lr2->getVar()->getId());
6088 }
6089
6090 //
6091 // All nodes remaining in the work list are constrained (degree + registers needed exceeds the available colors);
6092 // pick one constrained node and move it to the color order list
6093 //
6094 void GraphColor::removeConstrained()
6095 {
6096 if (!constrainedWorklist.empty())
6097 {
6098 LiveRange* lr = constrainedWorklist.front();
6099 constrainedWorklist.pop_front();
6100
6101 if (lr->getActive())
6102 {
6103
6104 #ifdef DEBUG_VERBOSE_ON
6105 DEBUG_VERBOSE(".... Remove Constrained ");
6106 lr->dump();
6107 DEBUG_VERBOSE(std::endl);
6108 #endif
6109
6110 if (liveAnalysis.livenessClass(G4_GRF))
6111 {
6112 relaxNeighborDegreeGRF(lr);
6113 }
6114 else
6115 {
6116 relaxNeighborDegreeARF(lr);
6117 }
6118 colorOrder.push_back(lr);
6119 lr->setActive(false);
6120 }
6121 }
6122 }
6123
6124
6125 void GraphColor::determineColorOrdering()
6126 {
6127 numColor = 0;
6128 if (liveAnalysis.livenessClass(G4_GRF))
6129 numColor = totalGRFRegCount;
6130 else if (liveAnalysis.livenessClass(G4_ADDRESS))
6131 numColor = getNumAddrRegisters();
6132 else if (liveAnalysis.livenessClass(G4_FLAG))
6133 numColor = builder.getNumFlagRegisters();
6134
6135 unsigned numUnassignedVar = liveAnalysis.getNumUnassignedVar();
6136
6137 //
6138 // create an array for sorting live ranges
6139 //
6140 std::vector<LiveRange*> sorted;
6141 sorted.reserve(numUnassignedVar);
6142 unsigned j = 0;
6143 for (unsigned i = 0; i < numVar; i++)
6144 {
6145 if (lrs[i]->getPhyReg() == nullptr && !lrs[i]->getIsPartialDcl())
6146 {
6147 sorted.push_back(lrs[i]);
6148 j++;
6149 }
6150 }
6151 MUST_BE_TRUE(j == numUnassignedVar, ERROR_GRAPHCOLOR);
6152
6153 //
6154 // sort the live range array
6155 //
6156 std::sort(sorted.begin(), sorted.end(), compareSpillCost);
6157
6158 for (unsigned i = 0; i < numUnassignedVar; i++)
6159 {
6160 LiveRange* lr = sorted[i];
6161 unsigned availColor = numColor - lr->getNumForbidden();
6163
6164 if (lr->getDegree() + lr->getNumRegNeeded() <= availColor)
6165 {
6166 unconstrainedWorklist.push_back(lr);
6167 lr->setActive(false);
6168 }
6169 else
6170 {
6171 constrainedWorklist.push_back(lr);
6172 lr->setActive(true);
6173 }
6174 }
6175
6176 #ifdef DEBUG_VERBOSE_ON
6177 DEBUG_VERBOSE("\nSPILL COST" << std::endl);
6178 for (unsigned i = 0; i < numUnassignedVar; i++)
6179 {
6180 sorted[i]->dump();
6181 DEBUG_VERBOSE("\t spillCost=" << sorted[i]->getSpillCost());
6182 DEBUG_VERBOSE("\t degree=" << sorted[i]->getDegree());
6183 DEBUG_VERBOSE("\t refCnt=" << sorted[i]->getRefCount());
6184 DEBUG_VERBOSE("\t size=" << sorted[i]->getDcl()->getByteSize());
6185 DEBUG_VERBOSE(std::endl);
6186 }
6187 DEBUG_VERBOSE(std::endl);
6188 #endif
6189
6190 while (!constrainedWorklist.empty() ||
6191 !unconstrainedWorklist.empty())
6192 {
6193 while (!unconstrainedWorklist.empty())
6194 {
6195 LiveRange* lr = unconstrainedWorklist.front();
6196 unconstrainedWorklist.pop_front();
6197
6198 #ifdef DEBUG_VERBOSE_ON
6199 DEBUG_VERBOSE(".... Remove Unconstrained ");
6200 lr->dump();
6201 DEBUG_VERBOSE(std::endl);
6202 #endif
6203
6204 if (liveAnalysis.livenessClass(G4_GRF))
6205 {
6206 relaxNeighborDegreeGRF(lr);
6207 }
6208 else
6209 {
6210 relaxNeighborDegreeARF(lr);
6211 }
6212 colorOrder.push_back(lr);
6213 }
6214
6215 removeConstrained();
6216 }
6217 }
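
//
// Worked example of the constraint test above (illustrative numbers): with
// numColor = 128 and a live range that needs 2 registers and has 2 forbidden
// registers, availColor = 126; a weighted degree of 120 gives 120 + 2 <= 126,
// so the range is unconstrained and guaranteed to receive a color, while a
// degree of 125 would make it constrained and ordered by spill cost instead.
//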
6218
6219 void PhyRegUsage::updateRegUsage(LiveRange* lr)
6220 {
6221 G4_Declare* dcl = lr->getDcl();
6222 G4_VarBase* pr;
6223 if (lr->getIsPartialDcl())
6224 {
6225 pr = lrs[lr->getParentLRID()]->getPhyReg();
6226 }
6227 else
6228 {
6229 pr = lr->getPhyReg();
6230 }
6231
6232 if (!pr)
6233 {
6234 return;
6235 }
6236 if (pr->isGreg())
6237 {
6238 if (dcl->getIsPartialDcl())
6239 {
6240 //Assumptions:
6241 // 1. the offset of the sub declare must be G4_WSIZE aligned
6242 // 2. the size of the subdeclare must be G4_WSIZE aligned
6243 markBusyForDclSplit(G4_GRF,
6244 ((G4_Greg*)pr)->getRegNum(),
6245 (lrs[lr->getParentLRID()]->getPhyRegOff() * TypeSize(dcl->getElemType()) + gra.getSubOffset(dcl)) / G4_WSIZE,
6246 dcl->getByteSize() / G4_WSIZE,
6247 dcl->getNumRows());
6248 }
6249 else
6250 {
6251 markBusyGRF(((G4_Greg*)pr)->getRegNum(),
6252 PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
6253 dcl->getWordSize(),
6254 lr->getNumRegNeeded(), dcl->isPreDefinedVar());
6255 }
6256 }
6257 else if (pr->isFlag())
6258 {
6259 auto flagWordOffset = lr->getPhyReg()->asAreg()->getFlagNum() * 2;
6260 markBusyFlag(0,
6261 PhyRegUsage::offsetAllocUnit(
6262 flagWordOffset + lr->getPhyRegOff(),
6263 dcl->getElemType()),
6264 PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
6265 dcl->getNumRows());
6266 }
6267 else if (pr->isAreg())
6268 {
6269 markBusyAddress(0,
6270 PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
6271 PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
6272 dcl->getNumRows());
6273 }
6274 else
6275 {
6276 MUST_BE_TRUE(false, ERROR_GRAPHCOLOR); // un-handled reg type
6277 }
6278 }
6279
6280 bool GraphColor::assignColors(ColorHeuristic colorHeuristicGRF, bool doBankConflict, bool highInternalConflict, bool honorHints)
6281 {
6282 if (builder.getOption(vISA_RATrace))
6283 {
6284 std::cout << "\t--" << (colorHeuristicGRF == ROUND_ROBIN ? "round-robin" : "first-fit") <<
6285 (doBankConflict ? " BCR" : "") << " graph coloring\n";
6286 }
6287
6288 unsigned startARFReg = 0;
6289 unsigned startFLAGReg = 0;
6290 unsigned startGRFReg = 0;
6291 unsigned bank1_end = 0;
6292 unsigned bank2_end = totalGRFRegCount - 1;
6293 unsigned bank1_start = 0;
6294 unsigned bank2_start = totalGRFRegCount - 1;
6295 unsigned totalGRFNum = kernel.getNumRegTotal();
6296 bool oneGRFBankDivision = gra.kernel.fg.builder->oneGRFBankDivision();
6297 bool allocFromBanks = liveAnalysis.livenessClass(G4_GRF) && builder.lowHighBundle() &&
6298 !builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum) &&
6299 doBankConflict &&
6300 ((oneGRFBankDivision && gra.kernel.getSimdSize() >= g4::SIMD16) || (!oneGRFBankDivision && highInternalConflict));
6301
6302 if (allocFromBanks &&
6303 (colorHeuristicGRF == ROUND_ROBIN))
6304 {
6305 bank1_end = (unsigned)((totalGRFRegCount - 1) * (((float)evenTotalDegree / evenTotalRegNum) / (((float)evenTotalDegree / evenTotalRegNum) + ((float)oddTotalDegree / oddTotalRegNum))));
6306 if (bank1_end < evenMaxRegNum ||
6307 totalGRFRegCount - bank1_end < oddMaxRegNum ||
6308 bank1_end == totalGRFRegCount - 1 ||
6309 bank1_end == 0)
6310 {
6311 return false;
6312 }
6313
6314 bank2_end = bank1_end + 1;
6315 }
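
// Worked example of the boundary above (illustrative numbers): with 128 GRFs,
// evenTotalDegree/evenTotalRegNum = 10 and oddTotalDegree/oddTotalRegNum = 5,
// bank1_end = 127 * 10 / (10 + 5) ~= 84, so round-robin allocation serves the
// first-half banks from rows 0..84 and the second-half banks from rows
// 127 down to 85.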
6316
6317 bool* availableGregs = (bool *)mem.alloc(sizeof(bool)* totalGRFNum);
6318 uint32_t* availableSubRegs = (uint32_t *)mem.alloc(sizeof(uint32_t)* totalGRFNum);
6319 bool* availableAddrs = (bool *)mem.alloc(sizeof(bool)* getNumAddrRegisters());
6320 bool* availableFlags = (bool *)mem.alloc(sizeof(bool)* builder.getNumFlagRegisters());
6321 uint8_t* weakEdgeUsage = (uint8_t*)mem.alloc(sizeof(uint8_t)*totalGRFNum);
6322 G4_RegFileKind rFile = G4_GRF;
6323 if (liveAnalysis.livenessClass(G4_FLAG))
6324 rFile = G4_FLAG;
6325 else if (liveAnalysis.livenessClass(G4_ADDRESS))
6326 rFile = G4_ADDRESS;
6327
6328 unsigned maxGRFCanBeUsed = totalGRFRegCount;
6329 PhyRegUsageParms parms(gra, lrs, rFile, maxGRFCanBeUsed, startARFReg, startFLAGReg, startGRFReg, bank1_start, bank1_end, bank2_start, bank2_end,
6330 doBankConflict, availableGregs, availableSubRegs, availableAddrs, availableFlags, weakEdgeUsage);
6331 bool noIndirForceSpills = builder.getOption(vISA_NoIndirectForceSpills);
6332
6333 auto& varSplitPass = *gra.getVarSplitPass();
6334
6335 // Returns true when a valid assignment is found or when lr is added to the spilled set.
6336 // Adding to the spilled set happens only if the heuristic is not round-robin
6337 // (round-robin aborts on failure rather than spilling). Parameter returnFalseOnFail
6338 // is set when the function is required to return false on assignment failure.
6339 // When parameter spillAllowed is set to true, this function adds lr to the spilled set. If
6340 // spillAllowed is false, lr is not added to the spilled set. This logic is useful to
6341 // retry allocation of a child/parent dcl when splitting is enabled.
6342 // ignoreChildrenIntf is set to true when all children are assigned to consecutive ranges
6343 // and we want a fully coalesceable assignment for the parent. In that case we
6344 // don't want to account for parent/child interference since doing so cannot result
6345 // in a coalesceable assignment.
6346 auto assignColor = [&](LiveRange* lr, bool ignoreChildrenIntf = false, bool spillAllowed = true, bool returnFalseOnFail = false)
6347 {
6348 auto lrVar = lr->getVar();
6349
6350 //
6351 // assign register to live ranges
6352 //
6353 if (lr->getPhyReg() == NULL && !lrVar->isSpilled() && !lr->getIsPartialDcl()) // no assigned register yet and not spilled
6354 {
6355 G4_Declare* parentDcl = nullptr;
6356 bool skipParentIntf = false;
6357 if (lr->hasAllocHint())
6358 {
6359 parms.startGRFReg = (lr->getAllocHint() >= maxGRFCanBeUsed ? 0 : lr->getAllocHint());
6360 if (varSplitPass.isPartialDcl(lr->getDcl()))
6361 {
6362 parentDcl = varSplitPass.getParentDcl(lr->getDcl());
6363 if (parentDcl)
6364 {
6365 auto parentGRF = parentDcl->getRegVar()->getPhyReg();
6366 if (!parentGRF && parentDcl->getRegVar()->isRegAllocPartaker())
6367 {
6368 parentGRF = lrs[parentDcl->getRegVar()->getId()]->getPhyReg();
6369 }
6370 if (parentGRF)
6371 {
6372 // mark interference between partial lr and all
6373 // other GRFs allocated to parent dcl. this logic
6374 // allows either coalesceable allocation or a
6375 // fully non-overlapping assignment.
6376 auto siblingNum = varSplitPass.getSiblingNum(lr->getDcl());
6377 auto parentGRFNum = parentGRF->asGreg()->getRegNum();
6378 auto parentNumRows = parentDcl->getNumRows();
6379 auto numRows = lr->getDcl()->getNumRows();
6380 for (unsigned i = parentGRFNum; i != (parentGRFNum + parentNumRows); i += numRows)
6381 {
6382 if ((i - parentGRFNum) == siblingNum * numRows)
6383 continue;
6384 lr->markForbidden(i, numRows);
6385 }
6386 skipParentIntf = true;
6387 }
6388 }
6389 }
6390 }
6391
6392 unsigned lr_id = lrVar->getId();
6393 //
6394 // compute what registers are already assigned
6395 //
6396 PhyRegUsage regUsage(parms);
6397
6398 const std::vector<unsigned>& intfs = intf.getSparseIntfForVar(lr_id);
6399 auto weakEdgeSet = intf.getCompatibleSparseIntf(lrVar->getDeclare()->getRootDeclare());
6400 for (auto it : intfs)
6401 {
6402 LiveRange* lrTemp = lrs[it];
6403 if (lrTemp->getPhyReg() != nullptr || lrTemp->getIsPartialDcl())
6404 {
6405 if (lrTemp->getIsSplittedDcl()) //Only interfere with children declares
6406 {
6407 continue;
6408 }
6409
6410 if (skipParentIntf && lrTemp->getDcl() == parentDcl)
6411 continue;
6412
6413 if (ignoreChildrenIntf && varSplitPass.isParentChildRelation(lr->getDcl(), lrTemp->getDcl()))
6414 continue;
6415
6416 regUsage.updateRegUsage(lrTemp);
6417 }
6418 }
6419
6420 if (weakEdgeSet)
6421 {
6422 regUsage.runOverlapTest(true);
6423 for (auto weakDcl : *weakEdgeSet)
6424 {
6425 auto regVar = weakDcl->getRootDeclare()->getRegVar();
6426 unsigned pvar = 0, numRegs = 0;
6427 if (regVar->isPhyRegAssigned())
6428 {
6429 // This branch will be taken for dcls assigned
6430 // regs by LRA.
6431 pvar = regVar->getPhyReg()->asGreg()->getRegNum();
6432 numRegs = weakDcl->getNumRows();
6433 }
6434 else
6435 {
6436 // For dcls not assigned regs by LRA, look up temp
6437 // registers assigned to LiveRange instances.
6438 auto id = regVar->getId();
6439 auto lr = lrs[id];
6440 auto phyReg = lr->getPhyReg();
6441 if (phyReg)
6442 {
6443 pvar = phyReg->asGreg()->getRegNum();
6444 numRegs = weakDcl->getNumRows();
6445 }
6446 }
6447
6448 // For now it is assumed only 8-byte types will appear
6449 // here. If types of other sizes can appear as well, the
6450 // augmentation mask also needs to be carried in the
6451 // weak edge data structure below.
6452 for (unsigned r = pvar; r < (pvar + numRegs); r++)
6453 {
6454 auto use = regUsage.getWeakEdgeUse(r);
6455 if (use == 0 || use == (r - pvar + 1))
6456 {
6457 regUsage.setWeakEdgeUse(r, r - pvar + 1);
6458 }
6459 else
6460 {
6461 // Indicates two neighbors use a physical
6462 // register with different overlap.
6463 regUsage.setWeakEdgeUse(r, 0xff);
6464 }
6465 }
6466 }
6467 }
6468
6469 ColorHeuristic heuristic = colorHeuristicGRF;
6470
6471 bool failed_alloc = false;
6472 G4_Declare* dcl = lrVar->getDeclare();
6473
6474 if (!(noIndirForceSpills &&
6475 liveAnalysis.isAddressSensitive(lr_id)) &&
6476 forceSpill &&
6477 (dcl->getRegFile() == G4_GRF || dcl->getRegFile() == G4_FLAG) &&
6478 lr->getRefCount() != 0 &&
6479 lr->getSpillCost() != MAXSPILLCOST)
6480 {
6481 failed_alloc = true;
6482 }
6483
6484 if (dcl->getNumRows() > totalGRFNum)
6485 {
6486 // a variable larger than the entire register file can never be assigned
6487 failed_alloc = true;
6488 }
6489
6490 if (!failed_alloc)
6491 {
6492 // When evenAlignNeeded is true, it is binding for correctness
6493 bool evenAlignNeeded = gra.isEvenAligned(lrVar->getDeclare());
6494 BankAlign align = evenAlignNeeded ? BankAlign::Even : BankAlign::Either;
6495 if (allocFromBanks && !lr->hasAllocHint())
6496 {
6497
6498 if (!isHybrid && oneGRFBankDivision &&
6499 (!evenAlignNeeded || getPlatformGeneration(builder.getPlatform()) == PlatformGen::GEN9))
6500 {
6501 gra.getBankAlignment(lr, align);
6502 }
6503 failed_alloc |= !regUsage.assignGRFRegsFromBanks(lr, align, lr->getForbidden(),
6504 heuristic, oneGRFBankDivision);
6505 }
6506 else
6507 {
6508 failed_alloc |= !regUsage.assignRegs(highInternalConflict, lr, lr->getForbidden(),
6509 align, gra.getSubRegAlign(lrVar->getDeclare()), heuristic, lr->getSpillCost(),
6510 lr->hasAllocHint());
6511 }
6512 }
6513
6514 //
6515 // handle failure to assign a color
6516 //
6517 if (failed_alloc)
6518 {
6519 //
6520 // for GRF register assignment, if we are performing round-robin (1st pass) then abort on spill
6521 //
6522 if ((heuristic == ROUND_ROBIN || (doBankConflict && !kernel.getOption(vISA_forceBCR))) &&
6523 (lr->getRegKind() == G4_GRF || lr->getRegKind() == G4_FLAG))
6524 {
6525 return false;
6526 }
6527 else if (kernel.fg.isPseudoDcl(dcl))
6528 {
6529 // these pseudo dcls are not (and cannot be) spilled, but instead save/restore code will
6530 // be inserted in stack call prolog/epilog
6531 }
6532 else
6533 {
6534 // for first-fit register assignment track spilled live ranges
6535 if (spillAllowed)
6536 {
6537 // When retrying a coalesceable assignment, don't spill
6538 // if there is no GRF available.
6539 spilledLRs.push_back(lr);
6540 lr->setSpilled(true);
6541 }
6542 }
6543
6544 if (returnFalseOnFail)
6545 {
6546 return false;
6547 }
6548 }
6549 else
6550 {
6551 // Allocation succeeded, set hint if this is a split/child dcl
6552 if (!ignoreChildrenIntf &&
6553 (varSplitPass.isSplitDcl(dcl) || varSplitPass.isPartialDcl(dcl)))
6554 {
6555 varSplitPass.writeHints(dcl, lrs);
6556 }
6557 }
6558 }
6559 #ifdef DEBUG_VERBOSE_ON
6560 lr->dump();
6561 COUT_ERROR << std::endl;
6562 #endif
6563 return true;
6564 };
6565
6566 // colorOrder is in reverse order (unconstrained at front)
6567 for (auto iter = colorOrder.rbegin(), iterEnd = colorOrder.rend(); iter != iterEnd; ++iter)
6568 {
6569 auto lr = (*iter);
6570
6571 // in case child/parent was already spilled earlier, don't recolor
6572 if (lr->isSpilled())
6573 continue;
6574
6575 bool ret = assignColor(lr);
6576
6577 // early exit
6578 if (!ret)
6579 return false;
6580
6581 if (lr->getSpillCost() == MAXSPILLCOST &&
6582 !lr->getPhyReg() &&
6583 honorHints)
6584 {
6585 // an infinite-spill-cost range was spilled:
6586 // undo all allocations done to split vars
6587 // and stop adhering to hints, to preserve
6588 // correctness.
6589 resetTemporaryRegisterAssignments();
6590 return assignColors(colorHeuristicGRF, doBankConflict, highInternalConflict, false);
6591 }
6592
6593 if (honorHints && gra.getIterNo() == 0)
6594 {
6595 // attempt coalescing in non-spill iteration only
6596 if (varSplitPass.isSplitDcl(lr->getDcl()))
6597 {
6598 // Try allocating children out of order, in the hope
6599 // of getting a coalesceable assignment
6600 auto children = varSplitPass.getChildren(lr->getDcl());
6601 for (auto child : *children)
6602 {
6603 if (child->getRegVar()->isRegAllocPartaker())
6604 {
6605 auto childLR = lrs[child->getRegVar()->getId()];
6606 if (!childLR->getPhyReg())
6607 {
6608 auto isChildSpilled = childLR->isSpilled();
6609 assignColor(childLR, false, !isChildSpilled);
6610 // if the allocated GRF is different from the hint, then
6611 // undo the allocation and let coloring take its course.
6612 // this can be done only if the childLR wasn't
6613 // already processed in colorOrder.
6614 if (!isChildSpilled && childLR->getPhyReg())
6615 {
6616 auto hint = childLR->getAllocHint();
6617 if (childLR->getPhyReg()->asGreg()->getRegNum() != hint)
6618 {
6619 // this is executed only if childLR is guaranteed to be
6620 // processed later on in colorOrder.
6621 childLR->resetPhyReg();
6622 }
6623 }
6624 else if (isChildSpilled && childLR->getPhyReg())
6625 {
6626 // was spilled earlier, got allocation now
6627 spilledLRs.remove(childLR);
6628 }
6629 }
6630 else
6631 {
6632 // retry allocating as per hint
6633 auto oldPhyReg = childLR->getPhyReg();
6634 auto oldPhySubReg = childLR->getPhyRegOff();
6635 auto hint = childLR->getAllocHint();
6636 if (oldPhyReg->asGreg()->getRegNum() == hint)
6637 continue;
6638 childLR->resetPhyReg();
6639 bool success = assignColor(childLR, false, false, true);
6640 if (!success || childLR->getPhyReg()->asGreg()->getRegNum() != hint)
6641 childLR->setPhyReg(oldPhyReg, oldPhySubReg);
6642 }
6643 }
6644 }
6645 }
6646
6647 // if all children are assigned consecutive GRFs but the parent isn't,
6648 // then try re-assigning the parent
6649 if (varSplitPass.isPartialDcl(lr->getDcl()) &&
6650 varSplitPass.reallocParent(lr->getDcl(), getLiveRanges()))
6651 {
6652 auto parentDcl = varSplitPass.getParentDcl(lr->getDcl());
6653 auto parentLR = getLiveRanges()[parentDcl->getRegVar()->getId()];
6654 auto oldPhyReg = parentLR->getPhyReg();
6655 auto oldPhySubReg = parentLR->getPhyRegOff();
6656 bool isParentSpilled = parentLR->isSpilled();
6657 parentLR->resetPhyReg();
6658 varSplitPass.writeHints(lr->getDcl(), getLiveRanges());
6659 assignColor(parentLR, true, !isParentSpilled);
6660 // If the parent's assigned GRF is a non-coalesceable assignment then
6661 // undo it, as it is risky to keep because the parent's interference
6662 // doesn't include its children.
6663 auto newParentAssignment = parentLR->getPhyReg();
6664 if ((newParentAssignment && newParentAssignment->asGreg()->getRegNum() != parentLR->getAllocHint()) ||
6665 !newParentAssignment)
6666 parentLR->setPhyReg(oldPhyReg, oldPhySubReg);
6667
6668 if (isParentSpilled && parentLR->getPhyReg())
6669 {
6670 // remove parent from spill list since it got an allocation this time
6671 spilledLRs.remove(parentLR);
6672 parentLR->setSpilled(false);
6673 }
6674 }
6675 }
6676 }
6677
6678 // record RA type
6679 if (liveAnalysis.livenessClass(G4_GRF))
6680 {
6681 if (colorHeuristicGRF == ROUND_ROBIN)
6682 {
6683 kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_RR_BC_RA : RA_Type::GRAPH_COLORING_RR_RA);
6684 }
6685 else
6686 {
6687 kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_FF_BC_RA : RA_Type::GRAPH_COLORING_FF_RA);
6688 }
6689 }
6690
6691 #ifdef _DEBUG
6692 // Verify that spilledLRs has no duplicate
6693 for (auto item : spilledLRs)
6694 {
6695 unsigned count = 0;
6696 for (auto checkItem : spilledLRs)
6697 {
6698 if (checkItem == item)
6699 {
6700 MUST_BE_TRUE(count == 0, "Duplicate entry found in spilledLRs");
6701 count++;
6702 }
6703 }
6704 }
6705
6706 // Verify that none of spilledLRs have an allocation
6707 for (auto lr : spilledLRs)
6708 {
6709 MUST_BE_TRUE(lr->getPhyReg() == nullptr, "Spilled LR contains valid allocation");
6710 }
6711
6712 // Verify that all spilled LRs are synced
6713 for (auto lr : spilledLRs)
6714 {
6715 MUST_BE_TRUE(lr->isSpilled(), "LR not marked as spilled, but inserted in spilledLRs list");
6716 }
6717
6718 // Verify if all LRs have either an allocation or are spilled
6719 for (auto lr : colorOrder)
6720 {
6721 if (!kernel.fg.isPseudoDcl(lr->getDcl()))
6722 {
6723 MUST_BE_TRUE(lr->isSpilled() || lr->getPhyReg() || lr->getDcl()->isSpilled(), "Range without allocation and not spilled");
6724 }
6725 }
6726 #endif
6727
6728 return true;
6729 }
6730
6731 template <class REGION_TYPE>
6732 unsigned GlobalRA::getRegionDisp(
6733 REGION_TYPE * region
6734 )
6735 {
6736 unsigned rowOffset = numEltPerGRF<Type_UB>() * region->getRegOff();
6737 unsigned columnOffset = region->getSubRegOff() * region->getElemSize();
6738 return rowOffset + columnOffset;
6739 }
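
// For example (illustrative, assuming a 32-byte GRF): a region with
// regOff = 2, subRegOff = 3 and a 4-byte element type has displacement
// 2*32 + 3*4 = 76 bytes from the variable's base.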
6740
6741 void GlobalRA::addEUFusionWAInsts(G4_INST* inst)
6742 {
6743 if(EUFusionWANeeded())
6744 EUFusionWAInsts.insert(inst);
6745 }
6746
6747 unsigned GlobalRA::getRegionByteSize(
6748 G4_DstRegRegion * region,
6749 unsigned execSize
6750 )
6751 {
6752 unsigned size = region->getHorzStride() * region->getElemSize() *
6753 (execSize - 1) + region->getElemSize();
6754
6755 return size;
6756 }
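
// For example (illustrative): a dst region with horzStride = 2, 4-byte
// elements and execSize = 8 spans 2*4*(8-1) + 4 = 60 bytes.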
6757
6758 #define OWORD_BYTE_SIZE 16
6759
6760 template <class REGION_TYPE>
6761 bool GlobalRA::isUnalignedRegion(
6762 REGION_TYPE * region,
6763 unsigned execSize
6764 )
6765 {
6766 unsigned regionDisp = getRegionDisp(region);
6767 unsigned regionByteSize = getRegionByteSize(region, execSize);
6768
6769 if (regionDisp%numEltPerGRF<Type_UB>() == 0 && regionByteSize%numEltPerGRF<Type_UB>() == 0)
6770 {
6771 return
6772 regionByteSize / numEltPerGRF<Type_UB>() != 1 &&
6773 regionByteSize / numEltPerGRF<Type_UB>() != 2 &&
6774 regionByteSize / numEltPerGRF<Type_UB>() != 4;
6775 }
6776 return true;
6777
6778 }
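
// In other words, a region is considered aligned only when it starts on a GRF
// boundary and covers exactly 1, 2 or 4 whole GRFs. For example (illustrative,
// 32-byte GRFs): disp = 64 with size 64 (2 GRFs) is aligned, while disp = 76,
// or disp = 64 with size 96 (3 GRFs), is treated as unaligned and triggers a
// pre-load in shouldPreloadDst below.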
6779
6780 bool GlobalRA::shouldPreloadDst(
6781 G4_INST * instContext,
6782 G4_BB* curBB
6783 )
6784 {
6785 // Check for partial and unaligned regions and add pre-load code, if
6786 // necessary.
6787 auto spilledRangeRegion = instContext->getDst();
6788 uint8_t execSize = instContext->getExecSize();
6789
6790 if (isPartialRegion(spilledRangeRegion, execSize) ||
6791 isUnalignedRegion(spilledRangeRegion, execSize) ||
6792 instContext->isPartialWriteForSpill(!curBB->isAllLaneActive())) {
6793 return true;
6794 }
6795 // No pre-load for whole and aligned region writes
6796 else {
6797 return false;
6798 }
6799 }
6800
6801 bool GlobalRA::livenessCandidate(const G4_Declare* decl) const
6802 {
6803 if (decl->getAliasDeclare())
6804 {
6805 return false;
6806 }
6807
6808 if ((G4_GRF & decl->getRegFile()))
6809 {
6810 if ((decl->getRegFile() & G4_INPUT) && decl->getRegVar()->isPhyRegAssigned() && !decl->getRegVar()->isGreg())
6811 {
6812 return false;
6813 }
6814 if (decl->getByteSize() == 0)
6815 {
6816 // regrettably, this can happen for arg/retval pre-defined variables
6817 return false;
6818 }
6819 return true;
6820 }
6821 else
6822 {
6823 return false;
6824 }
6825 }
6826
6827 void GlobalRA::determineSpillRegSize(unsigned& spillRegSize, unsigned& indrSpillRegSize)
6828 {
6829 // Iterate over all BBs
6830 for (auto curBB : kernel.fg)
6831 {
6832 // Iterate over all insts
6833 for (INST_LIST_ITER inst_it = curBB->begin(), iend = curBB->end(); inst_it != iend; ++inst_it)
6834 {
6835 unsigned currentSpillRegSize = 0;
6836 unsigned currentIndrSpillRegSize = 0;
6837
6838 G4_INST* curInst = (*inst_it);
6839
6840 if (curInst->isPseudoKill() ||
6841 curInst->isLifeTimeEnd() ||
6842 curInst->opcode() == G4_pseudo_fcall ||
6843 curInst->opcode() == G4_pseudo_fret)
6844 {
6845 continue;
6846 }
6847
6848 if (curInst->isSend())
6849 {
6850 G4_SendDesc* msgDesc = curInst->getMsgDesc();
6851
6852 unsigned dstSpillRegSize = 0;
6853 dstSpillRegSize = msgDesc->getDstLenRegs();
6854
6855 unsigned src0FillRegSize = 0;
6856 src0FillRegSize = msgDesc->getSrc0LenRegs();
6857
6858 unsigned src1FillRegSize = 0;
6859 if (curInst->isSplitSend())
6860 {
6861 src1FillRegSize = msgDesc->getSrc1LenRegs();
6862 }
6863
6864 if (!kernel.fg.builder->useSends())
6865 {
6866 dstSpillRegSize++;
6867 }
6868
6869 currentSpillRegSize = dstSpillRegSize + src0FillRegSize + src1FillRegSize;
6870 }
6871 else if (curInst->isDpas())
6872 {
6873 unsigned dstSpillRegSize = 0;
6874 G4_DstRegRegion* dst = curInst->getDst();
6875 if (dst && dst->getBase()->isRegVar())
6876 {
6877 dstSpillRegSize = dst->getBase()->asRegVar()->getDeclare()->getNumRows();
6878 }
6879
6880 unsigned srcFillRegSize = 0;
6881 for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++)
6882 {
6883 G4_Operand* src = curInst->getSrc(i);
6884
6885 if (src &&
6886 src->isSrcRegRegion() &&
6887 src->asSrcRegRegion()->getBase()->isRegVar())
6888 {
6889 if (src->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRegFile() == G4_GRF)
6890 {
6891 unsigned srcSize = src->getBase()->asRegVar()->getDeclare()->getNumRows();
6892 // FIXME: currently we only use the max src size.
6893 // To save spill registers, the space would better be determined by checking whether the variable is actually spilled.
6894 srcFillRegSize += srcSize;
6895 }
6896 }
6897 }
6898 currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
6899 }
6900 else
6901 {
6902 ORG_REGVAR_VECTOR indrVars;
6903
6904 unsigned dstSpillRegSize = 0;
6905 unsigned indrDstSpillRegSize = 0;
6906 if (G4_Inst_Table[curInst->opcode()].n_dst == 1)
6907 {
6908 G4_DstRegRegion* dst = curInst->getDst();
6909
6910 if (dst &&
6911 dst->getBase()->isRegVar())
6912 {
6913 if (dst->getBase()->asRegVar()->getDeclare()->getRegFile() == G4_GRF)
6914 {
6915 if (dst->isCrossGRFDst())
6916 {
6917 dstSpillRegSize = 2;
6918 }
6919 else
6920 {
6921 dstSpillRegSize = 1;
6922 }
6923
6924 if (shouldPreloadDst(curInst, curBB))
6925 {
6926 dstSpillRegSize *= 3;
6927 }
6928 else
6929 {
6930 dstSpillRegSize *= 2;
6931 }
6932
6933 if (!kernel.fg.builder->useSends())
6934 {
6935 dstSpillRegSize++;
6936 }
6937 }
6938 else if (dst->getRegAccess() == IndirGRF)
6939 {
6940 auto pointsToSet = pointsToAnalysis.getAllInPointsTo(dst->getBase()->asRegVar());
6941 if (pointsToSet != nullptr)
6942 {
6943 for (auto pt : *pointsToSet)
6944 {
6945 if (pt.var->isRegAllocPartaker() ||
6946 ((builder.getOption(vISA_HybridRAWithSpill) || builder.getOption(vISA_FastCompileRA)) && livenessCandidate(pt.var->getDeclare())))
6947 {
6948 indrVars.push_back(pt.var);
6949 indrDstSpillRegSize += pt.var->getDeclare()->getNumRows();
6950 }
6951 }
6952 }
6953 }
6954 }
6955 }
6956
6957 unsigned srcFillRegSize = 0;
6958 unsigned indirSrcFillRegSize = 0;
6959 // Scan srcs
6960 for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++)
6961 {
6962 G4_Operand* src = curInst->getSrc(i);
6963
6964 if (src &&
6965 src->isSrcRegRegion() &&
6966 src->asSrcRegRegion()->getBase()->isRegVar())
6967 {
6968 if (src->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getRegFile() == G4_GRF)
6969 {
6970 if (src->asSrcRegRegion()->crossGRF())
6971 {
6972 srcFillRegSize += 2;
6973 }
6974 else
6975 {
6976 srcFillRegSize += 1;
6977 }
6978 }
6979 else if (src->asSrcRegRegion()->getRegAccess() == IndirGRF)
6980 {
6981 auto pointsToSet = pointsToAnalysis.getAllInPointsTo(src->asSrcRegRegion()->getBase()->asRegVar());
6982 if (pointsToSet != nullptr)
6983 {
6984 for (auto pt : *pointsToSet)
6985 {
6986 if (pt.var->isRegAllocPartaker() ||
6987 ((builder.getOption(vISA_HybridRAWithSpill) || builder.getOption(vISA_FastCompileRA)) && livenessCandidate(pt.var->getDeclare())))
6988 {
6989 if (std::find(indrVars.begin(), indrVars.end(), pt.var) == indrVars.end())
6990 {
6991 indrVars.push_back(pt.var);
6992 indirSrcFillRegSize += pt.var->getDeclare()->getNumRows();
6993 }
6994 }
6995 }
6996 }
6997 }
6998 }
6999 }
7000
7001 if (builder.avoidDstSrcOverlap())
7002 {
7003 currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
7004 }
7005 else
7006 {
7007 currentSpillRegSize = srcFillRegSize > dstSpillRegSize ? srcFillRegSize : dstSpillRegSize;
7008 }
7009 currentIndrSpillRegSize = indrDstSpillRegSize + indirSrcFillRegSize;
7010 }
7011
7012 spillRegSize = std::max(spillRegSize, currentSpillRegSize);
7013 indrSpillRegSize = std::max(indrSpillRegSize, currentIndrSpillRegSize);
7014 }
7015 }
7016 }
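
// For example (illustrative): a split send with dstLen = 4, src0Len = 2 and
// src1Len = 2 requires 8 reserved GRFs; a non-send instruction with a
// cross-GRF dst that must be pre-loaded (2 * 3 = 6) and 4 GRFs of source
// fills contributes max(6, 4) = 6, or 6 + 4 = 10 when dst/src overlap must
// be avoided.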
7017
7018
7019 bool GraphColor::regAlloc(
7020 bool doBankConflictReduction,
7021 bool highInternalConflict,
7022 bool reserveSpillReg, unsigned& spillRegSize, unsigned& indrSpillRegSize,
7023 const RPE* rpe)
7024 {
7025
7026 bool useSplitLLRHeuristic = false;
7027
7028 if (builder.getOption(vISA_RATrace))
7029 {
7030 std::cout << "\t--# variables: " << liveAnalysis.getNumSelectedVar() << "\n";
7031 }
7032
7033 unsigned reserveSpillSize = 0;
7034 if (reserveSpillReg)
7035 {
7036 gra.determineSpillRegSize(spillRegSize, indrSpillRegSize);
7037 reserveSpillSize = spillRegSize + indrSpillRegSize;
7038 MUST_BE_TRUE(reserveSpillSize < kernel.getNumCalleeSaveRegs(), "Invalid reserveSpillSize in fail-safe RA!");
7039 totalGRFRegCount -= reserveSpillSize;
7040 }
7041
7042 // Copy over alignment for vars inserted by RA
7043 gra.copyMissingAlignment();
7044
7045 //
7046 // create an array of live ranges.
7047 //
7048 createLiveRanges(reserveSpillSize);
7049 //
7050 // set the pre-assigned registers
7051 //
7052 for (unsigned i = 0; i < numVar; i++)
7053 {
7054 if (lrs[i]->getVar()->getPhyReg())
7055 {
7056 lrs[i]->setPhyReg(lrs[i]->getVar()->getPhyReg(), lrs[i]->getVar()->getPhyRegOff());
7057 }
7058
7059 G4_Declare* dcl = lrs[i]->getDcl();
7060 if (!useSplitLLRHeuristic)
7061 {
7062 auto dclLR = gra.getLocalLR(dcl);
7063
7064 if (dclLR != nullptr &&
7065 dclLR->getSplit())
7066 {
7067 useSplitLLRHeuristic = true;
7068 }
7069 }
7070
7071 }
7072
7073 //
7074 // compute interference matrix
7075 //
7076 intf.init(mem);
7077 intf.computeInterference();
7078
7079 TIME_SCOPE(COLORING);
7080 //
7081 // compute degree and spill costs for each live range
7082 //
7083 if (liveAnalysis.livenessClass(G4_GRF))
7084 {
7085 computeDegreeForGRF();
7086 }
7087 else
7088 {
7089 computeDegreeForARF();
7090 }
7091 computeSpillCosts(useSplitLLRHeuristic);
7092
7093 if (kernel.getOption(vISA_DumpRAIntfGraph))
7094 intf.dumpInterference();
7095 //
7096 // determine coloring order
7097 //
7098 determineColorOrdering();
7099
7100 //
7101 // Set up the sub-reg alignment from declare information
7102 //
7103 for (unsigned i = 0; i < numVar; i++)
7104 {
7105 G4_Declare* dcl = lrs[i]->getDcl();
7106
7107 if (gra.getSubRegAlign(dcl) == Any && !dcl->getIsPartialDcl())
7108 {
7109 //
7110 // multi-row, subreg alignment = 16 words
7111 //
7112 if (dcl->getNumRows() > 1)
7113 {
7114 gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(), GRFALIGN);
7115 }
7116 //
7117 // single-row
7118 //
7119 else if (gra.getSubRegAlign(lrs[i]->getVar()->getDeclare()) == Any)
7120 {
7121 //
7122 // set up Odd word or Even word sub reg alignment
7123 //
7124 unsigned nbytes = dcl->getNumElems() * TypeSize(dcl->getElemType());
7125 unsigned nwords = nbytes / G4_WSIZE + nbytes % G4_WSIZE;
7126 if (nwords >= 2 && lrs[i]->getRegKind() == G4_GRF)
7127 {
7128 gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(), Even_Word);
7129 }
7130 }
7131 }
7132 }
7133 //
7134 // assign registers for GRFs: they are first assigned using round-robin, and if that fails
7135 // we retry using a first-fit heuristic.
7136 //
7137 if (liveAnalysis.livenessClass(G4_GRF))
7138 {
7139 bool hasStackCall = kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();
7140
7141 bool willSpill = ((builder.getOption(vISA_FastCompileRA) || builder.getOption(vISA_HybridRAWithSpill)) && !hasStackCall) ||
7142 (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
7143 rpe->getMaxRP() >= kernel.getNumRegTotal() + 24);
7144 if (willSpill)
7145 {
7146 // go straight to first_fit to save compile time since we are definitely spilling
7147 // we do this for 3D only since with indirect/subroutine the RP pressure can be very unreliable
7148 // FIXME: due to factors like local split and scalar variables that are not accurately modeled in RP estimate,
7149 // RA may succeed even when RP is > total #GRF. We should investigate these cases and fix RPE
7150 assignColors(FIRST_FIT, false, false);
7151 //assert(requireSpillCode() && "inaccurate GRF pressure estimate");
7152 return !requireSpillCode();
7153 }
7154
7155 if (kernel.getOption(vISA_RoundRobin) && !hasStackCall && !gra.isReRAPass())
7156 {
7157 if (assignColors(ROUND_ROBIN, doBankConflictReduction, highInternalConflict) == false)
7158 {
7159 resetTemporaryRegisterAssignments();
7160 bool success = assignColors(FIRST_FIT, doBankConflictReduction, highInternalConflict);
7161
7162 if (!success && doBankConflictReduction && isHybrid)
7163 {
7164 return false;
7165 }
7166
7167 if (!kernel.getOption(vISA_forceBCR))
7168 {
7169 if (!success && doBankConflictReduction)
7170 {
7171 resetTemporaryRegisterAssignments();
7172 kernel.getOptions()->setOption(vISA_enableBundleCR, false);
7173 assignColors(FIRST_FIT, false, false);
7174 kernel.getOptions()->setOption(vISA_enableBundleCR, true);
7175 }
7176 }
7177 }
7178 }
7179 else
7180 {
7181 bool success = assignColors(FIRST_FIT, true, highInternalConflict);
7182 if (!success)
7183 {
7184 resetTemporaryRegisterAssignments();
7185 assignColors(FIRST_FIT, false, false);
7186 }
7187 }
7188 }
7189 else if (liveAnalysis.livenessClass(G4_FLAG))
7190 {
7191 if (kernel.getOption(vISA_RoundRobin))
7192 {
7193 if (assignColors(ROUND_ROBIN, false, false) == false)
7194 {
7195 resetTemporaryRegisterAssignments();
7196 assignColors(FIRST_FIT, false, false);
7197 }
7198 }
7199 else
7200 {
7201 assignColors(FIRST_FIT, false, false);
7202 }
7203 }
7204 else
7205 {
7206 // assign registers for ARFs using a first-fit heuristic
7207 assignColors(FIRST_FIT, false, false);
7208 }
7209
7210 return (requireSpillCode() == false);
7211 }
7212
7213 void GraphColor::confirmRegisterAssignments()
7214 {
7215 for (unsigned i = 0; i < numVar; i++)
7216 {
7217 if (lrs[i]->getPhyReg()) {
7218 if (lrs[i]->getVar()->getPhyReg()) {
7219 MUST_BE_TRUE((lrs[i]->getVar()->getPhyReg() == lrs[i]->getPhyReg()), ERROR_GRAPHCOLOR);
7220 }
7221 else {
7222 lrs[i]->getVar()->setPhyReg(lrs[i]->getPhyReg(), lrs[i]->getPhyRegOff());
7223 }
7224 }
7225 }
7226 }
7227
7228 void GraphColor::resetTemporaryRegisterAssignments()
7229 {
7230 for (unsigned i = 0; i < numVar; i++)
7231 {
7232 if (lrs[i]->getVar()->getPhyReg() == NULL) {
7233 lrs[i]->resetPhyReg();
7234 lrs[i]->resetAllocHint();
7235 lrs[i]->setSpilled(false);
7236 }
7237 }
7238 spilledLRs.clear();
7239 }
7240
7241 void GraphColor::cleanupRedundantARFFillCode()
7242 {
7243 for (G4_BB *bb : builder.kernel.fg)
7244 {
7245 clearSpillAddrLocSignature();
7246
7247 for (std::list<G4_INST*>::iterator i = bb->begin(); i != bb->end();)
7248 {
7249 G4_INST* inst = (*i);
7250
7251 //
7252 // process writes to spill storage (GRF) of addr regs
7253 //
7254 G4_DstRegRegion* dst = inst->getDst();
7255
7256 if (dst && dst->getBase() &&
7257 dst->getBase()->isRegVar() &&
7258 (kernel.fg.isPseudoA0Dcl(dst->getBase()->asRegVar()->getDeclare()) ||
7259 inst->isPseudoKill()))
7260 {
7261 i++;
7262 continue;
7263 }
7264
7265 if (dst != NULL &&
7266 dst->getRegAccess() == Direct) {
7267
7268 if (dst->getBase()->isRegVar() &&
7269 dst->getBase()->asRegVar()->isRegVarAddrSpillLoc())
7270 {
7271 pruneActiveSpillAddrLocs(dst, inst->getExecSize(), inst->getExecType());
7272 }
7273 //
7274 // process writes to (allocated) addr regs
7275 //
7276 else if (dst->getBase()->isRegAllocPartaker())
7277 {
7278 G4_RegVar* addrReg = dst->getBase()->asRegVar();
7279
7280 if (gra.isAddrFlagSpillDcl(addrReg->getDeclare()))
7281 {
7282 G4_SrcRegRegion* srcRgn = inst->getSrc(0)->asSrcRegRegion();
7283
7284 if (redundantAddrFill(dst, srcRgn, inst->getExecSize())) {
7285 std::list<G4_INST*>::iterator j = i++;
7286 bb->erase(j);
7287 continue;
7288 }
7289 else {
7290 updateActiveSpillAddrLocs(dst, srcRgn, inst->getExecSize());
7291 }
7292 }
7293 else {
7294 pruneActiveSpillAddrLocs(dst, inst->getExecSize(), inst->getExecType());
7295 }
7296 }
7297 }
7298
7299 i++;
7300 }
7301 }
7302 }
7303
7304 void GraphColor::pruneActiveSpillAddrLocs(G4_DstRegRegion* dstRegion, unsigned exec_size, G4_Type exec_type)
7305 {
7306 if (dstRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc()) {
7307 MUST_BE_TRUE(((exec_type == Type_UW || exec_type == Type_W) && exec_size <= getNumAddrRegisters()) ||
7308 (exec_size == 1), "Unexpected ADDR spill loc update format!");
7309 MUST_BE_TRUE(dstRegion->getRegAccess() == Direct, "Unexpected ADDR spill loc");
7310
7311 G4_RegVarAddrSpillLoc * spillLocReg = static_cast<G4_RegVarAddrSpillLoc*>(dstRegion->getBase());
7312 unsigned startId = spillLocReg->getLocId() + dstRegion->getSubRegOff();
7313 unsigned endId = startId + exec_size * dstRegion->getHorzStride();
7314
7315 for (unsigned i = 0, horzStride = dstRegion->getHorzStride(); i < getNumAddrRegisters(); i += horzStride)
7316 {
7317 if (spAddrRegSig[i] >= startId && spAddrRegSig[i] < endId)
7318 {
7319 spAddrRegSig[i] = 0;
7320 }
7321 }
7322 }
7323 else if (dstRegion->getBase()->asRegVar()->isPhyRegAssigned()) {
7324 G4_RegVar* addrReg = dstRegion->getBase()->asRegVar();
7325 MUST_BE_TRUE(addrReg->getPhyReg()->isA0(), "Unknown error in ADDR reg spill code cleanup!");
7326 unsigned startId = addrReg->getPhyRegOff();
7327 unsigned endId = startId + exec_size * dstRegion->getHorzStride();
7328 MUST_BE_TRUE(endId <= getNumAddrRegisters(), "Unknown error in ADDR reg spill code cleanup!");
7329
7330 for (unsigned i = startId; i < endId; i += dstRegion->getHorzStride())
7331 {
7332 spAddrRegSig[i] = 0;
7333 }
7334 }
7335 else {
7336 MUST_BE_TRUE(false, "Unknown error in ADDR reg spill code cleanup!");
7337 }
7338 }
7339
7340 void GraphColor::updateActiveSpillAddrLocs(G4_DstRegRegion* tmpDstRegion, G4_SrcRegRegion* srcRegion, unsigned exec_size)
7341 {
7342 MUST_BE_TRUE(gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()), "Unknown error in ADDR reg spill code cleanup!");
7343 G4_RegVar* addrReg = tmpDstRegion->getBase()->asRegVar();
7344 MUST_BE_TRUE(addrReg->getPhyReg()->isA0(), "Unknown error in ADDR reg spill code cleanup!");
7345 unsigned startAddrId = addrReg->getPhyRegOff();
7346 unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
7347 MUST_BE_TRUE(endAddrId <= getNumAddrRegisters(), "Unknown error in ADDR reg spill code cleanup!");
7348
7349 MUST_BE_TRUE(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(), "Unknown error in ADDR reg spill code cleanup!");
7350 G4_RegVarAddrSpillLoc * spillLocReg = static_cast<G4_RegVarAddrSpillLoc*>(srcRegion->getBase());
7351 unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();
7352
7353 for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
7354 i += tmpDstRegion->getHorzStride(), j += srcRegion->getRegion()->horzStride)
7355 {
7356 spAddrRegSig[i] = j;
7357 }
7358 }
7359
7360 bool GraphColor::redundantAddrFill(G4_DstRegRegion* tmpDstRegion, G4_SrcRegRegion* srcRegion, unsigned exec_size)
7361 {
7362 bool match = true;
7363
7364 MUST_BE_TRUE(gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()), "Unknown error in ADDR reg spill code cleanup!");
7365 G4_RegVar* addrReg = tmpDstRegion->getBase()->asRegVar();
7366 MUST_BE_TRUE(addrReg->getPhyReg()->isA0(), "Unknown error in ADDR reg spill code cleanup!");
7367 unsigned startAddrId = addrReg->getPhyRegOff();
7368 unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
7369 MUST_BE_TRUE(endAddrId <= getNumAddrRegisters(), "Unknown error in ADDR reg spill code cleanup!");
7370
7371 MUST_BE_TRUE(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(), "Unknown error in ADDR reg spill code cleanup!");
7372 G4_RegVarAddrSpillLoc * spillLocReg = static_cast<G4_RegVarAddrSpillLoc*>(srcRegion->getBase());
7373 unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();
7374
7375 for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
7376 i += tmpDstRegion->getHorzStride(), j += srcRegion->getRegion()->horzStride)
7377 {
7378 if (spAddrRegSig[i] != j)
7379 {
7380 match = false;
7381 break;
7382 }
7383 }
7384
7385 return match;
7386 }
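
// Example of the signature scheme (illustrative): a fill of a0.0..a0.3 from
// spill locations 5..8 records spAddrRegSig[0..3] = {5,6,7,8}; a later fill
// of the same range matches the recorded signature and is erased by
// cleanupRedundantARFFillCode, while any intervening write to those address
// registers clears the affected entries via pruneActiveSpillAddrLocs.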
7387
7388 unsigned GlobalRA::sendBlockSizeCode(unsigned owordSize)
7389 {
7390 unsigned code;
7391
7392 switch (owordSize) {
7393 case 1:
7394 code = 0;
7395 break;
7396 case 2:
7397 code = 2;
7398 break;
7399 case 4:
7400 code = 3;
7401 break;
7402 case 8:
7403 code = 4;
7404 break;
7405 case 16:
7406 code = 5;
7407 break;
7408 default:
7409 MUST_BE_TRUE(false, ERROR_REGALLOC);
7410 code = 0;
7411 }
7412
7413 return code;
7414 }
7415
7416 #define STATELESS_SURFACE_INDEX 0xFF
7417 #define HEADER_PRESENT 0x80000
7418 #define SEND_OWORD_READ_TYPE 0
7419 #define SEND_OWORD_WRITE_TYPE 8
7420 #define SEND_MSG_TYPE_BIT_OFFSET 14
7421 #define SEND_RSP_LENGTH_BIT_OFFSET 20
7422 #define SEND_MSG_LENGTH_BIT_OFFSET 25
7423 #define SEND_DESC_DATA_SIZE_BIT_OFFSET 8
7424
7425 G4_Imm* GlobalRA::createMsgDesc(unsigned owordSize, bool writeType, bool isSplitSend)
7426 {
7427 // If isSplitSend = true then messageLength = 1 and extMesLength = (owordSize/2) GRFs
7428 unsigned message = STATELESS_SURFACE_INDEX;
7429 message |= HEADER_PRESENT;
7430 if (writeType)
7431 {
7432 unsigned messageType = SEND_OWORD_WRITE_TYPE;
7433 message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
7434 unsigned messageLength = 1;
7435 if (!isSplitSend)
7436 {
7437 messageLength += owordToGRFSize(ROUND(owordSize, numEltPerGRF<Type_UB>()/OWORD_BYTE_SIZE));
7438 }
7439 message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
7440 }
7441 else
7442 {
7443 unsigned messageType = SEND_OWORD_READ_TYPE;
7444 message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
7445 unsigned responseLength = owordToGRFSize(ROUND(owordSize, numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE));
7446 message |= responseLength << SEND_RSP_LENGTH_BIT_OFFSET;
7447 unsigned messageLength = 1;
7448 message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
7449 }
7450 unsigned writeOwordSize = sendBlockSizeCode(owordSize);
7451 message |= writeOwordSize << SEND_DESC_DATA_SIZE_BIT_OFFSET;
7452 return builder.createImm(message, Type_UD);
7453 }
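
// Worked example (illustrative, assuming 32-byte GRFs): an 8-oword,
// non-split-send write packs message length 1 + owordToGRFSize(8) = 5 into
// bits 25+, the oword write type (8) into bits 14+, and block size code 4
// into bits 8+, on top of the stateless surface and header-present bits:
// 0xFF | 0x80000 | (8 << 14) | (5 << 25) | (4 << 8) = 0x0A0A04FF.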
7454
7455 void GlobalRA::stackCallProlog()
7456 {
7457 // mov (8) r126.0<1>:ud r0.0<8;8,1>:ud
7458 // This sets up the header for oword block r/w used for caller/callee-save
7459
7460 // Kernel should've already set up r0 in r126.
7461 // Useful data in r126 is expected to be preserved by all functions.
7462 if (kernel.fg.getIsStackCallFunc())
7463 {
7464 if (kernel.getOption(vISA_skipFDE))
7465 return;
7466
7467 // emit frame descriptor
7468 auto payload = builder.createHardwiredDeclare(8, Type_UD, kernel.getFPSPGRF(), 0);
7469 payload->setName(builder.getNameString(builder.kernel.fg.mem, 24, "FrameDescriptorGRF"));
7470 auto payloadSrc = builder.createSrcRegRegion(payload, builder.getRegionStride1());
7471 const unsigned execSize = 8;
7472 G4_DstRegRegion* postDst = builder.createNullDst(Type_UD);
7473 G4_INST* store = nullptr;
7474 if (builder.supportsLSC())
7475 {
7476 auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
7477 store = builder.createSpill(postDst, headerOpnd, payloadSrc, G4_ExecSize(execSize), 1, 0, builder.getBESP(), InstOpt_WriteEnable, false);
7478 }
7479 else
7480 {
7481 store = builder.createSpill(postDst, payloadSrc, G4_ExecSize(execSize), 1, 0, builder.getBESP(), InstOpt_WriteEnable, false);
7482 }
7483 builder.setFDSpillInst(store);
7484 G4_BB* entryBB = builder.kernel.fg.getEntryBB();
7485 auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
7486 entryBB->insertBefore(iter, store);
7487
7488 if (EUFusionWANeeded())
7489 {
7490 auto oldSaveInst = builder.getPartFDSaveInst();
7491 builder.setPartFDSaveInst(store);
7492 entryBB->remove(oldSaveInst);
7493 }
7494
7495 addEUFusionWAInsts(store);
7496
7497 return;
7498 }
7499
7500 auto dstRgn = builder.createDstRegRegion(builder.kernel.fg.scratchRegDcl, 1);
7501 auto srcRgn = builder.createSrcRegRegion(builder.getBuiltinR0(), builder.getRegionStride1());
7502
7503 G4_INST* mov = builder.createMov(G4_ExecSize(numEltPerGRF<Type_UD>()), dstRgn, srcRgn, InstOpt_WriteEnable, false);
7504
7505 G4_BB* entryBB = builder.kernel.fg.getEntryBB();
7506 auto iter = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
7507 entryBB->insertBefore(iter, mov);
7508 }
7509
7510 //
7511 // Generate the save code for startReg to startReg+owordSize/2.
7512 //
7513 void GlobalRA::saveRegs(
7514 unsigned startReg, unsigned owordSize, G4_Declare* scratchRegDcl, G4_Declare* framePtr,
7515 unsigned frameOwordOffset, G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group)
7516 {
7517
7518 assert(builder.getPlatform() >= GENX_SKL && "stack call only supported on SKL+");
7519
7520 if (owordSize == 8 || owordSize == 4 || owordSize == 2)
7521 {
7522 // add (1) r126.2<1>:ud r125.3<0;1,0>:ud 0x2:ud
7523 // sends (8) null<1>:ud r126.0 r1.0 ...
7524 G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
7525 unsigned messageLength = GlobalRA::owordToGRFSize(owordSize);
7526 G4_Declare* msgDcl = builder.createTempVar(messageLength * GENX_DATAPORT_IO_SZ,
7527 Type_UD, GRFALIGN, StackCallStr);
7528 msgDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
7529 auto sendSrc2 = builder.createSrc(msgDcl->getRegVar(), 0, 0,
7530 builder.getRegionStride1(), Type_UD);
7531 G4_DstRegRegion* dst = builder.createNullDst((execSize > 8) ? Type_UW : Type_UD);
7532 G4_INST* spillIntrinsic = nullptr;
7533 if (builder.supportsLSC())
7534 {
7535 auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
7536 spillIntrinsic = builder.createSpill(dst, headerOpnd, sendSrc2, execSize, messageLength, frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
7537 }
7538 else
7539 spillIntrinsic = builder.createSpill(dst, sendSrc2, execSize, messageLength, frameOwordOffset/2, framePtr, InstOpt_WriteEnable, false);
7540 spillIntrinsic->inheritDIFrom(*insertIt);
7541 bb->insertBefore(insertIt, spillIntrinsic);
7542 group.insert(spillIntrinsic);
7543 }
7544 else if (owordSize > 8)
7545 {
7546 saveRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group);
7547 saveRegs(startReg + GlobalRA::owordToGRFSize(8), owordSize - 8, scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt, group);
7548 }
7549 //
7550 // Split into chunks of sizes 4 and remaining owords.
7551 //
7552 else if (owordSize > 4)
7553 {
7554 saveRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group);
7555 saveRegs(startReg + GlobalRA::owordToGRFSize(4), owordSize - 4, scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt, group);
7556 }
7557 //
7558 // Split into chunks of sizes 2 and remaining owords.
7559 //
7560 else if (owordSize > 2)
7561 {
7562 saveRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group);
7563 saveRegs(startReg + GlobalRA::owordToGRFSize(2), owordSize - 2, scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt, group);
7564 }
7565 else
7566 {
7567 MUST_BE_TRUE(false, ERROR_REGALLOC);
7568 }
7569 }
7570
7571 //
7572 // Generate the save code for the input saveRegs vector.
7573 //
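// A sketch of the scan (assuming a 32-byte GRF, so 1 GRF = 2 owords):
// saveRegs = {1,1,0,1}, startReg = 60, frameOffset = 0 emits two saves:
//   saveRegs(60, 4 /*owords*/, ..., 0, ...)  // GRFs 60..61
//   saveRegs(63, 2 /*owords*/, ..., 4, ...)  // GRF 63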
7574 void GlobalRA::saveActiveRegs(
7575 std::vector<bool>& saveRegs, unsigned startReg, unsigned frameOffset,
7576 G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group)
7577 {
7578 G4_Declare* scratchRegDcl = builder.kernel.fg.scratchRegDcl;
7579 G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
7580
7581 unsigned frameOwordPos = frameOffset;
7582 unsigned startPos = 0;
7583
7584 while (startPos < saveRegs.size())
7585 {
7586 for (; startPos < saveRegs.size() && saveRegs[startPos] == false; startPos++);
7587 if (startPos < saveRegs.size() && saveRegs[startPos]) {
7588 unsigned endPos = startPos + 1;
7589 for (; endPos < saveRegs.size() && saveRegs[endPos] == true; endPos++);
7590 unsigned owordSize = (endPos - startPos) * GlobalRA::GRFSizeToOwords(1);
7591 owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1));
7592 this->saveRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr, frameOwordPos, bb, insertIt, group);
7593 frameOwordPos += owordSize;
7594 startPos = endPos;
7595 }
7596 }
7597 }
7598
7599 G4_SrcRegRegion* GraphColor::getScratchSurface() const
7600 {
7601 if (builder.hasScratchSurface())
7602 {
7603 return builder.createSrcRegRegion(builder.getBuiltinScratchSurface(), builder.getRegionScalar());
7604 }
7605 return nullptr; // use stateless access
7606 }
7607
7608 //
7609 // Generate the restore code for startReg to startReg+owordSize/2.
7610 //
7611 void GlobalRA::restoreRegs(
7612 unsigned startReg, unsigned owordSize, G4_Declare* scratchRegDcl, G4_Declare* framePtr,
7613 unsigned frameOwordOffset, G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group, bool caller)
7614 {
7615 //
7616 // Process chunks of 8, 4, or 2 owords.
7617 //
7618 if (owordSize == 8 || owordSize == 4 || owordSize == 2)
7619 {
7620 G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
7621 unsigned responseLength = GlobalRA::owordToGRFSize(owordSize);
7622 G4_Declare* dstDcl = builder.createTempVar(responseLength * GENX_DATAPORT_IO_SZ,
7623 Type_UD, GRFALIGN, StackCallStr);
7624 if (caller)
7625 {
7626 kernel.callerRestoreDecls.push_back(dstDcl);
7627 }
7628 dstDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
7629 G4_DstRegRegion* dstRgn = builder.createDst(dstDcl->getRegVar(), 0, 0, 1, (execSize > 8) ? Type_UW : Type_UD);
7630 G4_INST* fillIntrinsic = nullptr;
7631 if (builder.supportsLSC())
7632 {
7633 auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
7634 fillIntrinsic = builder.createFill(headerOpnd, dstRgn, execSize, responseLength, frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
7635 }
7636 else
7637 fillIntrinsic = builder.createFill(dstRgn, execSize, responseLength, frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
7638 fillIntrinsic->inheritDIFrom(*insertIt);
7639 bb->insertBefore(insertIt, fillIntrinsic);
7640 group.insert(fillIntrinsic);
7641 }
7642 //
7643 // Split into chunks of sizes 8 and remaining owords.
7644 //
7645 else if (owordSize > 8)
7646 {
7647 restoreRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group, caller);
7648 restoreRegs(startReg + GlobalRA::owordToGRFSize(8), owordSize - 8, scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt, group, caller);
7649 }
7650 //
7651 // Split into chunks of sizes 4 and remaining owords.
7652 //
7653 else if (owordSize > 4)
7654 {
7655 restoreRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group, caller);
7656 restoreRegs(startReg + GlobalRA::owordToGRFSize(4), owordSize - 4, scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt, group, caller);
7657 }
7658 //
7659 // Split into chunks of sizes 2 and remaining owords.
7660 //
7661 else if (owordSize > 2)
7662 {
7663 restoreRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb, insertIt, group, caller);
7664 restoreRegs(startReg + GlobalRA::owordToGRFSize(2), owordSize - 2, scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt, group, caller);
7665 }
7666 else
7667 {
7668 MUST_BE_TRUE(false, ERROR_REGALLOC);
7669 }
7670 }
7671
7672 //
7673 // Generate the restore code for the input restoreRegs vector.
7674 //
7675 void GlobalRA::restoreActiveRegs(
7676 std::vector<bool>& restoreRegs, unsigned startReg, unsigned frameOffset,
7677 G4_BB* bb, INST_LIST_ITER insertIt, std::unordered_set<G4_INST*>& group, bool caller)
7678 {
7679 G4_Declare* scratchRegDcl = builder.kernel.fg.scratchRegDcl;
7680 G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
7681
7682 unsigned frameOwordPos = frameOffset;
7683 unsigned startPos = 0;
7684
7685 while (startPos < restoreRegs.size())
7686 {
7687 for (; startPos < restoreRegs.size() && restoreRegs[startPos] == false; startPos++);
7688 if (startPos < restoreRegs.size() && restoreRegs[startPos]) {
7689 unsigned endPos = startPos + 1;
7690 for (; endPos < restoreRegs.size() && restoreRegs[endPos] == true; endPos++);
7691 unsigned owordSize = (endPos - startPos) * GlobalRA::GRFSizeToOwords(1);
7692 owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1));
7693 this->restoreRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr, frameOwordPos, bb, insertIt, group, caller);
7694 frameOwordPos += owordSize;
7695 startPos = endPos;
7696 }
7697 }
7698 }
7699
7700 //
7701 // Optimize the reg footprint so as to reduce the number of "send" instructions required for
7702 // save/restore, at the cost of a little additional save/restore memory (if any). Since we
7703 // are using oword reads/writes for save/restore, we can only read/write in units of 1, 2,
7704 // or 4 GRFs per "send" instruction.
7705 //
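// E.g., a footprint of {1,0,1,0} is widened to {1,1,1,1} by the first case
// below, turning two 1-GRF saves into a single 4-GRF save; likewise
// {1,0,0,1} is widened when the window is not followed by another live GRF.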
7706 void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool>& saveRegs)
7707 {
7708 unsigned startPos = 0;
7709 while (startPos < saveRegs.size())
7710 {
7711 for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos);
7712 if (startPos == saveRegs.size())
7713 {
7714 break;
7715 }
7716 if (startPos + 4 <= saveRegs.size())
7717 {
7718 if (saveRegs[startPos] & saveRegs[startPos + 2] & !saveRegs[startPos + 3])
7719 {
7720 saveRegs[startPos + 1] = saveRegs[startPos + 3] = true;
7721 }
7722 else if (saveRegs[startPos] & saveRegs[startPos + 3])
7723 {
7724 if (startPos + 4 < saveRegs.size())
7725 {
7726 if (!saveRegs[startPos + 4])
7727 {
7728 saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
7729 }
7730 }
7731 else
7732 {
7733 saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
7734 }
7735 }
7736 }
7737 unsigned winBound = std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
7738 for (; startPos < winBound && saveRegs[startPos]; ++startPos);
7739 }
7740 }
7741
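// This variant does the same gap-filling as above, but a hole is only
// widened when the GRF is not in retRegs; presumably saving and restoring
// a return-value register around the call would clobber the callee's
// return value.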
7742 void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool>& saveRegs, std::vector<bool>& retRegs)
7743 {
7744 unsigned startPos = 0;
7745 while (startPos < saveRegs.size())
7746 {
7747 for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos);
7748 if (startPos == saveRegs.size())
7749 {
7750 break;
7751 }
7752 if (startPos + 4 <= saveRegs.size())
7753 {
7754 if (saveRegs[startPos] & saveRegs[startPos + 2])
7755 {
7756 if (!saveRegs[startPos + 1] & !retRegs[startPos + 1])
7757 {
7758 saveRegs[startPos + 1] = true;
7759 }
7760 if (!saveRegs[startPos + 3] & !retRegs[startPos + 3])
7761 {
7762 saveRegs[startPos + 3] = true;
7763 }
7764 }
7765 else if (saveRegs[startPos] & saveRegs[startPos + 3])
7766 {
7767 if (startPos + 4 < saveRegs.size())
7768 {
7769 if (!saveRegs[startPos + 4])
7770 {
7771 if (!saveRegs[startPos + 1] & !retRegs[startPos + 1])
7772 {
7773 saveRegs[startPos + 1] = true;
7774 }
7775 if (!saveRegs[startPos + 2] & !retRegs[startPos + 2])
7776 {
7777 saveRegs[startPos + 2] = true;
7778 }
7779 }
7780 }
7781 else
7782 {
7783 if (!saveRegs[startPos + 1] & !retRegs[startPos + 1])
7784 {
7785 saveRegs[startPos + 1] = true;
7786 }
7787 if (!saveRegs[startPos + 2] & !retRegs[startPos + 2])
7788 {
7789 saveRegs[startPos + 2] = true;
7790 }
7791 }
7792 }
7793 }
7794 unsigned winBound = std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
7795 for (; startPos < winBound && saveRegs[startPos]; ++startPos);
7796 }
7797 }
7798
7799 void GraphColor::getCallerSaveRegisters()
7800 {
7801 unsigned callerSaveNumGRF = builder.kernel.getCallerSaveLastGRF() + 1;
7802
7803 for (BB_LIST_ITER it = builder.kernel.fg.begin(); it != builder.kernel.fg.end(); ++it)
7804 {
7805 if ((*it)->isEndWithFCall())
7806 {
7807 //
7808 // Determine the caller-save registers per call site.
7809 //
7810 gra.callerSaveRegsMap[(*it)].resize(callerSaveNumGRF, false);
7811 gra.retRegsMap[(*it)].resize(callerSaveNumGRF, false);
7812 unsigned callerSaveRegCount = 0;
7813 G4_INST* callInst = (*it)->back();
7814 unsigned pseudoVCAId = builder.kernel.fg.fcallToPseudoDclMap[callInst->asCFInst()].VCA->getRegVar()->getId();
7815 ASSERT_USER((*it)->Succs.size() == 1, "fcall basic block cannot have more than 1 successor");
7816
7817 for (unsigned i = 0; i < numVar; i++)
7818 {
7819 if (i != pseudoVCAId &&
7820 kernel.fg.isPseudoVCEDcl(lrs[i]->getDcl()) != true &&
7821 intf.interfereBetween(pseudoVCAId, i) == true)
7822 {
7823 if (!builder.isPreDefArg(lrs[i]->getDcl()))
7824 {
7825 // NOTE: Spilled live ranges should not be caller-save.
7826 MUST_BE_TRUE(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
7827 unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
7828 unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
7829 startReg = (startReg < callerSaveNumGRF) ? startReg : callerSaveNumGRF;
7830 startReg = (startReg > 0) ? startReg : 1;
7831 endReg = (endReg < callerSaveNumGRF) ? endReg : callerSaveNumGRF;
7832 endReg = (endReg > 0) ? endReg : 1;
7833 for (unsigned j = startReg; j < endReg; j++)
7834 {
7835 if (builder.isPreDefRet(lrs[i]->getDcl()))
7836 {
7837 if (gra.retRegsMap[(*it)][j] == false)
7838 {
7839 gra.retRegsMap[(*it)][j] = true;
7840 }
7841 }
7842 else
7843 {
7844 if (gra.callerSaveRegsMap[(*it)][j] == false)
7845 {
7846 gra.callerSaveRegsMap[(*it)][j] = true;
7847 callerSaveRegCount++;
7848 }
7849 }
7850 }
7851 }
7852 }
7853 }
7854
7855 gra.callerSaveRegCountMap[(*it)] = callerSaveRegCount;
7856
7857 if (builder.kernel.getOption(vISA_OptReport))
7858 {
7859 std::ofstream optreport;
7860 getOptReportStream(optreport, builder.kernel.getOptions());
7861 optreport << "Caller save size: " << callerSaveRegCount * getGRFSize() <<
7862 " bytes for fcall at cisa id " <<
7863 (*it)->back()->getCISAOff() << std::endl;
7864 closeOptReportStream(optreport);
7865 }
7866 }
7867 }
7868 }
7869
7870 //
7871 // Add caller save/restore code before/after each stack call.
7872 //
7873 void GlobalRA::addCallerSaveRestoreCode()
7874 {
7875 uint32_t maxCallerSaveSize = 0;
7876
7877 for (G4_BB* bb : builder.kernel.fg)
7878 {
7879 if (bb->isEndWithFCall())
7880 {
7881 //
7882 // Determine the caller-save registers per call site.
7883 //
7884 G4_INST* callInst = bb->back();
7885 G4_BB* afterFCallBB = bb->Succs.front();
7886
7887 OptimizeActiveRegsFootprint(callerSaveRegsMap[bb], retRegsMap[bb]);
7888
7889 unsigned callerSaveRegsWritten = 0;
7890 for (bool csr : callerSaveRegsMap[bb])
7891 callerSaveRegsWritten += (csr ? 1 : 0);
7892
7893 INST_LIST_ITER insertSaveIt = bb->end();
7894 --insertSaveIt, --insertSaveIt;
7895 while ((*insertSaveIt)->isPseudoKill())
7896 {
7897 --insertSaveIt;
7898 }
7899 MUST_BE_TRUE((*insertSaveIt)->isCallerSave(), ERROR_REGALLOC);
7900 INST_LIST_ITER rmIt = insertSaveIt;
7901 if (insertSaveIt == bb->begin())
7902 {
7903 insertSaveIt = bb->end();
7904 }
7905
7906 if (insertSaveIt != bb->end())
7907 {
7908 ++insertSaveIt;
7909 }
7910 else
7911 {
7912 insertSaveIt = bb->begin();
7913 }
7914 if (callerSaveRegCountMap[bb] > 0)
7915 {
7916 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
7917 {
7918 builder.kernel.getKernelDebugInfo()->clearOldInstList();
7919 builder.kernel.getKernelDebugInfo()->setOldInstList(bb);
7920 }
7921
7922 saveActiveRegs(callerSaveRegsMap[bb], 0, builder.kernel.fg.callerSaveAreaOffset,
7923 bb, insertSaveIt, callerSaveInsts[callInst]);
7924
7925 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
7926 {
7927 auto deltaInstList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(bb);
7928 for (auto jt : deltaInstList)
7929 {
7930 builder.kernel.getKernelDebugInfo()->addCallerSaveInst(bb, jt);
7931 }
7932 }
7933 }
7934 bb->erase(rmIt);
7935 INST_LIST_ITER insertRestIt = afterFCallBB->begin();
7936 for (; !(*insertRestIt)->isCallerRestore(); ++insertRestIt);
7937 if (callerSaveRegCountMap[bb] > 0)
7938 {
7939 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
7940 {
7941 builder.kernel.getKernelDebugInfo()->clearOldInstList();
7942 builder.kernel.getKernelDebugInfo()->setOldInstList(afterFCallBB);
7943 }
7944
7945 restoreActiveRegs(callerSaveRegsMap[bb], 0, builder.kernel.fg.callerSaveAreaOffset,
7946 afterFCallBB, insertRestIt, callerRestoreInsts[callInst], true);
7947
7948 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
7949 {
7950 auto deltaInsts = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(afterFCallBB);
7951 for (auto jt : deltaInsts)
7952 {
7953 builder.kernel.getKernelDebugInfo()->addCallerRestoreInst(bb, jt);
7954 }
7955 }
7956 }
7957 afterFCallBB->erase(insertRestIt);
7958
7959 maxCallerSaveSize = std::max(maxCallerSaveSize, callerSaveRegsWritten * getGRFSize());
7960 }
7961 }
7962
7963 auto byteOffset = builder.kernel.fg.callerSaveAreaOffset * 16 + maxCallerSaveSize;
7964 builder.kernel.fg.frameSizeInOWord = ROUND(byteOffset, 64) / 16;
7965
7966 builder.instList.clear();
7967 }
7968
7969 void GraphColor::getCalleeSaveRegisters()
7970 {
7971 unsigned callerSaveNumGRF = builder.kernel.getCallerSaveLastGRF() + 1;
7972 unsigned numCalleeSaveRegs = builder.kernel.getNumCalleeSaveRegs();
7973
7974 // Determine the callee-save registers.
7975
7976 gra.calleeSaveRegs.resize(numCalleeSaveRegs, false);
7977 gra.calleeSaveRegCount = 0;
7978
7979 unsigned pseudoVCEId = builder.kernel.fg.pseudoVCEDcl->getRegVar()->getId();
7980 unsigned stackCallStartReg = builder.kernel.getStackCallStartReg();
7981 for (unsigned i = 0; i < numVar; i++)
7982 {
7983 if (pseudoVCEId != i && intf.interfereBetween(pseudoVCEId, i))
7984 {
7985 if (lrs[i]->getPhyReg())
7986 {
7987 MUST_BE_TRUE(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
7988 unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
7989 unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
7990 startReg = (startReg >= callerSaveNumGRF) ? startReg : callerSaveNumGRF;
7991 startReg = (startReg < stackCallStartReg) ? startReg : stackCallStartReg;
7992 endReg = (endReg >= callerSaveNumGRF) ? endReg : callerSaveNumGRF;
7993 endReg = (endReg < stackCallStartReg) ? endReg : stackCallStartReg;
7994 for (unsigned j = startReg; j < endReg; j++)
7995 {
7996 if (gra.calleeSaveRegs[j - callerSaveNumGRF] == false)
7997 {
7998 gra.calleeSaveRegs[j - callerSaveNumGRF] = true;
7999 gra.calleeSaveRegCount++;
8000 }
8001 }
8002 }
8003 }
8004 }
8005 }
8006
8007 //
8008 // Add callee save/restore code at stack call function entry/exit.
8009 //
8010 void GlobalRA::addCalleeSaveRestoreCode()
8011 {
8012 unsigned callerSaveNumGRF = builder.kernel.getCallerSaveLastGRF() + 1;
8013
8014 OptimizeActiveRegsFootprint(calleeSaveRegs);
8015 unsigned calleeSaveRegsWritten = 0;
8016 for (bool b : calleeSaveRegs)
8017 calleeSaveRegsWritten += (b ? 1 : 0);
8018
8019 INST_LIST_ITER insertSaveIt = builder.kernel.fg.getEntryBB()->end();
8020 for (--insertSaveIt; !(*insertSaveIt)->isCalleeSave(); --insertSaveIt);
8021 if (calleeSaveRegCount > 0)
8022 {
8023 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8024 {
8025 // Store old inst list so we can separate callee save
8026 // instructions that get inserted.
8027 builder.kernel.getKernelDebugInfo()->clearOldInstList();
8028 builder.kernel.getKernelDebugInfo()->setOldInstList
8029 (builder.kernel.fg.getEntryBB());
8030 }
8031 saveActiveRegs(calleeSaveRegs, callerSaveNumGRF, builder.kernel.fg.calleeSaveAreaOffset,
8032 builder.kernel.fg.getEntryBB(), insertSaveIt, calleeSaveInsts);
8033
8034 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8035 {
8036 // Delta of oldInstList and current instList are all
8037 // callee save instructions.
8038 auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions
8039 (builder.kernel.fg.getEntryBB());
8040 for (auto inst : instList)
8041 {
8042 builder.kernel.getKernelDebugInfo()->addCalleeSaveInst(inst);
8043 }
8044 }
8045 }
8046 builder.kernel.fg.getEntryBB()->erase(insertSaveIt);
8047 INST_LIST_ITER insertRestIt = builder.kernel.fg.getUniqueReturnBlock()->end();
8048 for (--insertRestIt; !(*insertRestIt)->isCalleeRestore(); --insertRestIt);
8049 INST_LIST_ITER eraseIt = insertRestIt++;
8050 if (calleeSaveRegCount > 0)
8051 {
8052 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8053 {
8054 // Store old inst list so we can separate callee save
8055 // instructions that get inserted.
8056 builder.kernel.getKernelDebugInfo()->clearOldInstList();
8057 builder.kernel.getKernelDebugInfo()->setOldInstList
8058 (builder.kernel.fg.getUniqueReturnBlock());
8059 }
8060
8061 restoreActiveRegs(calleeSaveRegs, callerSaveNumGRF, builder.kernel.fg.calleeSaveAreaOffset,
8062 builder.kernel.fg.getUniqueReturnBlock(), insertRestIt, calleeRestoreInsts, false);
8063
8064 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8065 {
8066 auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions
8067 (builder.kernel.fg.getUniqueReturnBlock());
8068 for (auto inst : instList)
8069 {
8070 builder.kernel.getKernelDebugInfo()->addCalleeRestoreInst(inst);
8071 }
8072 }
8073 }
8074 builder.kernel.fg.getUniqueReturnBlock()->erase(eraseIt);
8075
8076 builder.instList.clear();
8077
8078 // caller-save starts after callee-save and is 64-byte aligned
8079 auto byteOffset = builder.kernel.fg.calleeSaveAreaOffset * 16 + calleeSaveRegsWritten * getGRFSize();
8080 builder.kernel.fg.callerSaveAreaOffset = ROUND(byteOffset, 64) / 16;
8081 if (builder.kernel.getOption(vISA_OptReport))
8082 {
8083 std::ofstream optreport;
8084 getOptReportStream(optreport, builder.kernel.getOptions());
8085 optreport << "Callee save size: " << calleeSaveRegCount * getGRFSize() <<
8086 " bytes" << std::endl;
8087 closeOptReportStream(optreport);
8088 }
8089 }
8090
8091 //
8092 // Add code to set up the stack frame in the kernel (genx main).
8093 //
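// Schematically, the emitted sequence is (factor = 16 on the LSC path,
// since LSC offsets are in bytes rather than owords):
//   (W) mov (1) FP<1>:ud  fpInitVal
//   (W) mov (1) SP<1>:ud  fpInitVal + frameSize*factor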
8094 void GlobalRA::addGenxMainStackSetupCode()
8095 {
8096 uint32_t fpInitVal = (uint32_t)kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
8097 // FIXME: a potential failure here is that frameSizeInOWord may already be an offset based on
8098 // GlobalScratchOffset, which is the value of fpInitVal. In that case the code below computes
8099 // SP = fpInitVal + frameSize, which does not make sense. It happens to be correct today because
8100 // when there is a stack call IGC does not use scratch, so fpInitVal is 0.
8101 unsigned frameSize = builder.kernel.fg.frameSizeInOWord;
8102 uint16_t factor = 1;
8103 if (useLscForSpillFill)
8104 factor = 16;
8105 G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
8106 G4_Declare* stackPtr = builder.kernel.fg.stackPtrDcl;
8107
8108 auto entryBB = builder.kernel.fg.getEntryBB();
8109 auto insertIt = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
8110 //
8111 // FP = spillMemOffset
8112 //
8113 {
8114 G4_DstRegRegion* dst = builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
8115 G4_Imm * src = builder.createImm(fpInitVal, Type_UD);
8116 G4_INST* fpInst = builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8117 insertIt = entryBB->insertBefore(insertIt, fpInst);
8118
8119 setBEFPSetupInst(fpInst);
8120
8121 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8122 {
8123 builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(fpInst);
8124 builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
8125 }
8126 }
8127 //
8128 // SP = FP + FrameSize (overflow-area offset + overflow-area size)
8129 //
8130 {
8131 G4_DstRegRegion* dst = builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
8132 G4_Imm * src = builder.createImm(fpInitVal + frameSize*factor, Type_UD);
8133 G4_INST* spIncInst = builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8134 entryBB->insertBefore(++insertIt, spIncInst);
8135 }
8136
8137 if (builder.kernel.getOption(vISA_OptReport))
8138 {
8139 std::ofstream optreport;
8140 getOptReportStream(optreport, builder.kernel.getOptions());
8141 optreport << "Total frame size: " << frameSize * 16 << " bytes" << std::endl;
8142 closeOptReportStream(optreport);
8143 }
8144 }
8145
8146 //
8147 // Add code to set up the stack frame in the callee.
8148 //
8149 void GlobalRA::addCalleeStackSetupCode()
8150 {
8151 int frameSize = (int)builder.kernel.fg.frameSizeInOWord;
8152 uint16_t factor = 1;
8153 // convert framesize to bytes from oword for LSC
8154 if (useLscForSpillFill)
8155 factor = 16;
8156 G4_Declare* framePtr = builder.kernel.fg.framePtrDcl;
8157 G4_Declare* stackPtr = builder.kernel.fg.stackPtrDcl;
8158
8159 MUST_BE_TRUE(frameSize > 0, "frame size cannot be 0");
8160
8161 //
8162 // BE_FP = BE_SP
8163 // BE_SP += FrameSize
8164 //
8165 {
8166 G4_DstRegRegion* dst = builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
8167 G4_DstRegRegion* fp_dst = builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
8168 const RegionDesc* rDesc = builder.getRegionScalar();
8169 G4_Operand* src0 = builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
8170 G4_Operand* sp_src = builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
8171 G4_Imm * src1 = builder.createImm(frameSize*factor, Type_UD);
8172 auto createBEFP = builder.createMov(g4::SIMD1, fp_dst, sp_src, InstOpt_WriteEnable, false);
8173 createBEFP->addComment("vISA_FP = vISA_SP");
8174 auto addInst = builder.createBinOp(G4_add, g4::SIMD1,
8175 dst, src0, src1, InstOpt_WriteEnable, false);
8176 addInst->addComment("vISA_SP += vISA_frameSize");
8177 G4_BB* entryBB = builder.kernel.fg.getEntryBB();
8178 auto insertIt = std::find(entryBB->begin(), entryBB->end(), getSaveBE_FPInst());
8179 MUST_BE_TRUE(insertIt != entryBB->end(), "Can't find BE_FP store inst");
8180
8181 setBEFPSetupInst(createBEFP);
8182
8183 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8184 {
8185 builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(createBEFP);
8186 builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
8187 }
8188
8189 addEUFusionWAInsts(createBEFP);
8190 addEUFusionWAInsts(addInst);
8191
8192 insertIt++;
8193 entryBB->insertBefore(insertIt, createBEFP);
8194 entryBB->insertBefore(insertIt, addInst);
8195 }
8196
8197 // Stack is destroyed in function addStoreRestoreToReturn() where part FDE is restored before fret.
8198 // This is an optimization as 1 SIMD4 instruction restores ret %ip, ret EM, caller's BE_FP, BE_SP.
8199
8200 builder.instList.clear();
8201
8202 if (builder.kernel.getOption(vISA_OptReport))
8203 {
8204 std::ofstream optreport;
8205 getOptReportStream(optreport, builder.kernel.getOptions());
8206 optreport << std::endl << "Total frame size: "
8207 << frameSize * 16 << " bytes" << std::endl;
8208 closeOptReportStream(optreport);
8209 }
8210 }
8211
8212 //
8213 // Add A0 save/restore code for stack calls.
8214 //
8215 void GraphColor::addA0SaveRestoreCode()
8216 {
8217 uint8_t numA0Elements = (uint8_t)getNumAddrRegisters();
8218
8219 int count = 0;
8220 for (auto bb : builder.kernel.fg)
8221 {
8222 if (bb->isEndWithFCall())
8223 {
8224 G4_BB* succ = bb->Succs.front();
8225 auto fcallInst = bb->back()->asCFInst();
8226 G4_RegVar* assocPseudoA0 = bb->getParent().fcallToPseudoDclMap[fcallInst].A0->getRegVar();
8227
8228 if (!assocPseudoA0->getPhyReg())
8229 {
8230 // Insert save/restore code because the pseudo node did not get an allocation
8231 const char* name = builder.getNameString(builder.mem, 20, "SA0_%d", count++);
8232 G4_Declare* savedDcl = builder.createDeclareNoLookup(name, G4_GRF, numA0Elements, 1, Type_UW);
8233
8234 {
8235 //
8236 // (W) mov (16) TMP_GRF<1>:uw a0.0<16;16,1>:uw
8237 //
8238 G4_DstRegRegion* dst = builder.createDst(savedDcl->getRegVar(), 0, 0, 1, Type_UW);
8239 const RegionDesc* rDesc = builder.getRegionStride1();
8240 G4_Operand* src = builder.createSrc(regPool.getAddrReg(), 0, 0, rDesc, Type_UW);
8241 G4_INST* saveInst = builder.createMov(
8242 G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
8243 INST_LIST_ITER insertIt = std::prev(bb->end());
8244 bb->insertBefore(insertIt, saveInst);
8245 }
8246
8247 {
8248 //
8249 // (W) mov (16) a0.0<1>:uw TMP_GRF<16;16,1>:uw
8250 //
8251 G4_DstRegRegion* dst = builder.createDst(regPool.getAddrReg(), 0, 0, 1, Type_UW);
8252 const RegionDesc* rDesc = builder.getRegionStride1();
8253 G4_Operand* src = builder.createSrc(savedDcl->getRegVar(), 0, 0, rDesc, Type_UW);
8254 G4_INST* restoreInst = builder.createMov(
8255 G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
8256 auto insertIt = std::find_if(succ->begin(), succ->end(), [](G4_INST* inst) { return !inst->isLabel(); });
8257 succ->insertBefore(insertIt, restoreInst);
8258 }
8259 }
8260 }
8261 }
8262
8263 builder.instList.clear();
8264 }
8265
8266 //
8267 // Add Flag save/restore code for stack calls.
8268 //
8269 void GraphColor::addFlagSaveRestoreCode()
8270 {
8271 int count = 0;
8272 int num32BitFlags = builder.getNumFlagRegisters() / 2;
8273
8274 // each 32-bit flag gets a declare
8275 // ToDo: should we use flag ARF directly here?
8276 std::vector<G4_Declare*> tmpFlags;
8277 for (int i = 0; i < num32BitFlags; ++i)
8278 {
8279 G4_Declare* tmpFlag = builder.createTempFlag(2);
8280 tmpFlag->getRegVar()->setPhyReg(regPool.getFlagAreg(i), 0);
8281 tmpFlags.push_back(tmpFlag);
8282 }
8283
8284 for (auto bb : builder.kernel.fg)
8285 {
8286 if (bb->isEndWithFCall())
8287 {
8288 G4_BB* succ = bb->Succs.front();
8289 auto fcallInst = bb->back()->asCFInst();
8290 G4_RegVar* assocPseudoFlag = bb->getParent().fcallToPseudoDclMap[fcallInst].Flag->getRegVar();
8291
8292 if (!assocPseudoFlag->getPhyReg())
8293 {
8294 // Insert save/restore code because the pseudo node did not get an allocation
8295 const char* name = builder.getNameString(builder.mem, 32, "SFLAG_%d", count++);
8296 G4_Declare* savedDcl1 = builder.createDeclareNoLookup(name, G4_GRF, num32BitFlags, 1, Type_UD);
8297 {
8298 //
8299 // (W) mov (1) TMP_GRF.0<1>:ud f0.0:ud
8300 // (W) mov (1) TMP_GRF.1<1>:ud f1.0:ud
8301 //
8302 auto createFlagSaveInst = [&](int index)
8303 {
8304 auto flagDcl = tmpFlags[index];
8305 G4_DstRegRegion* dst = builder.createDst(savedDcl1->getRegVar(), 0, index, 1, Type_UD);
8306 G4_Operand* src = builder.createSrc(flagDcl->getRegVar(), 0, 0,
8307 builder.getRegionScalar(), Type_UD);
8308 return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8309 };
8310
8311 auto iter = std::prev(bb->end());
8312 for (int i = 0; i < num32BitFlags; ++i)
8313 {
8314 auto saveInst = createFlagSaveInst(i);
8315 bb->insertBefore(iter, saveInst);
8316 }
8317 }
8318
8319 {
8320 //
8321 // mov (1) f0.0:ud TMP_GRF.0<0;1,0>:ud
8322 // mov (1) f1.0:ud TMP_GRF.1<0;1,0>:ud
8323 //
8324 auto createRestoreFlagInst = [&](int index)
8325 {
8326 auto flagDcl = tmpFlags[index];
8327 G4_DstRegRegion* dst = builder.createDst(flagDcl->getRegVar(), 0, 0, 1, Type_UD);
8328 const RegionDesc* rDesc = builder.getRegionScalar();
8329 G4_Operand* src = builder.createSrc(savedDcl1->getRegVar(), 0, index, rDesc, Type_UD);
8330 return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
8331 };
8332 auto insertIt = std::find_if(succ->begin(), succ->end(), [](G4_INST* inst) { return !inst->isLabel(); });
8333 for (int i = 0; i < num32BitFlags; ++i)
8334 {
8335 auto restoreInst = createRestoreFlagInst(i);
8336 succ->insertBefore(insertIt, restoreInst);
8337 }
8338 }
8339 }
8340 }
8341 }
8342
8343 builder.instList.clear();
8344 }
8345
8346 void GraphColor::getSaveRestoreRegister()
8347 {
8348 if (!builder.getIsKernel())
8349 {
8350 getCalleeSaveRegisters();
8351 }
8352 getCallerSaveRegisters();
8353 }
8354 //
8355 // Add GRF caller/callee save/restore code for stack calls.
8356 // localSpillAreaOwordSize specifies the starting offset of the caller/callee-save area in this frame.
8357 // It is 64-byte aligned.
8358 //
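// Resulting frame layout for a stack-call function (offsets in owords,
// each area 64-byte aligned):
//   | local spill area | callee-save area | caller-save area |
//   ^ 0                ^ localSpillAreaOwordSize
// For a kernel there is no callee-save area, so the caller-save area
// starts directly at localSpillAreaOwordSize.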
8359 void GlobalRA::addSaveRestoreCode(unsigned localSpillAreaOwordSize)
8360 {
8361 auto gtpin = builder.kernel.getGTPinData();
8362 if (gtpin &&
8363 gtpin->isFirstRAPass())
8364 {
8365 gtpin->markInsts();
8366 }
8367
8368 if (builder.getIsKernel())
8369 {
8370 builder.kernel.fg.callerSaveAreaOffset = localSpillAreaOwordSize;
8371 }
8372 else
8373 {
8374 builder.kernel.fg.calleeSaveAreaOffset = localSpillAreaOwordSize;
8375 addCalleeSaveRestoreCode();
8376 }
8377 addCallerSaveRestoreCode();
8378 if (builder.getIsKernel())
8379 {
8380 addGenxMainStackSetupCode();
8381 }
8382 else
8383 {
8384 addCalleeStackSetupCode();
8385 }
8386 stackCallProlog();
8387 builder.instList.clear();
8388 }
8389
8390 //
8391 // If the graph has stack calls, then add the caller-save pseudo code immediately before and
8392 // after the stack call. The pseudo code is either converted to actual save/restore code or
8393 // is eliminated at the end of coloringRegAlloc().
8394 //
8395 void GlobalRA::addCallerSavePseudoCode()
8396 {
8397 unsigned retID = 0;
8398
8399 for (G4_BB* bb : builder.kernel.fg)
8400 {
8401 if (bb->isEndWithFCall())
8402 {
8403 // GRF caller save/restore
8404 auto fcallInst = bb->back()->asCFInst();
8405 G4_Declare* pseudoVCADcl = bb->getParent().fcallToPseudoDclMap[fcallInst].VCA;
8406 G4_DstRegRegion* dst = builder.createDst(pseudoVCADcl->getRegVar(), 0, 0, 1, Type_UD);
8407 G4_INST* saveInst = builder.createInternalIntrinsicInst(
8408 nullptr, Intrinsic::CallerSave, g4::SIMD1, dst, nullptr, nullptr, nullptr, InstOpt_WriteEnable);
8409 saveInst->inheritDIFrom(fcallInst);
8410 INST_LIST_ITER callBBIt = bb->end();
8411 bb->insertBefore(--callBBIt, saveInst);
8412
8413 G4_FCALL* fcall = builder.getFcallInfo(bb->back());
8414 MUST_BE_TRUE(fcall != NULL, "fcall info not found");
8415 uint16_t retSize = fcall->getRetSize();
8416 if (retSize > 0)
8417 {
8418 const char* name = builder.getNameString(builder.mem, 32, "FCALL_RETVAL_%d", retID++);
8419 auto retDcl = builder.createHardwiredDeclare(numEltPerGRF<Type_UD>() * retSize, Type_UD, IR_Builder::ArgRet_Stackcall::Ret, 0);
8420 retDcl->setName(name);
8421 fcallRetMap.emplace(pseudoVCADcl, retDcl);
8422 }
8423
8424 ASSERT_USER(bb->Succs.size() == 1, "fcall basic block cannot have more than 1 successor node");
8425
8426 G4_BB* retBB = bb->Succs.front();
8427 const RegionDesc* rd = builder.getRegionScalar();
8428 G4_Operand* src = builder.createSrc(pseudoVCADcl->getRegVar(), 0, 0, rd, Type_UD);
8429 INST_LIST_ITER retBBIt = retBB->begin();
8430 for (; retBBIt != retBB->end() && (*retBBIt)->isLabel(); ++retBBIt);
8431 G4_INST* restoreInst =
8432 builder.createInternalIntrinsicInst(
8433 nullptr, Intrinsic::CallerRestore, g4::SIMD1, nullptr, src, nullptr, nullptr, InstOpt_WriteEnable);
8434 restoreInst->inheritDIFrom(fcallInst);
8435 retBB->insertBefore(retBBIt, restoreInst);
8436 }
8437 }
8438 builder.instList.clear();
8439 }
8440
8441 //
8442 // If the graph has stack calls, then add the callee-save pseudo code at the entry/exit blocks
8443 // of the function. The pseudo code is either converted to actual save/restore code or is
8444 // eliminated at the end of coloringRegAlloc().
8445 //
8446 void GlobalRA::addCalleeSavePseudoCode()
8447 {
8448 G4_Declare* pseudoVCEDcl = builder.kernel.fg.pseudoVCEDcl;
8449
8450 G4_DstRegRegion* dst = builder.createDst(pseudoVCEDcl->getRegVar(), 0, 0, 1, Type_UD);
8451 auto saveInst = builder.createInternalIntrinsicInst(
8452 nullptr, Intrinsic::CalleeSave, g4::SIMD1, dst, nullptr, nullptr, nullptr, InstOpt_WriteEnable);
8453 INST_LIST_ITER insertIt = builder.kernel.fg.getEntryBB()->begin();
8454 for (; insertIt != builder.kernel.fg.getEntryBB()->end() && (*insertIt)->isLabel();
8455 ++insertIt)
8456 { /* void */
8457 };
8458 builder.kernel.fg.getEntryBB()->insertBefore(insertIt, saveInst);
8459
8460 G4_BB* exitBB = builder.kernel.fg.getUniqueReturnBlock();
8461 const RegionDesc* rDesc = builder.getRegionScalar();
8462 G4_Operand* src = builder.createSrc(pseudoVCEDcl->getRegVar(), 0, 0, rDesc, Type_UD);
8463 G4_INST* restoreInst =
8464 builder.createInternalIntrinsicInst(
8465 nullptr, Intrinsic::CalleeRestore, g4::SIMD1, nullptr, src, nullptr, nullptr, InstOpt_WriteEnable);
8466 INST_LIST_ITER exitBBIt = exitBB->end();
8467 --exitBBIt;
8468 MUST_BE_TRUE((*exitBBIt)->isFReturn(), ERROR_REGALLOC);
8469 exitBB->insertBefore(exitBBIt, restoreInst);
8470 builder.instList.clear();
8471 }
8472
8473 //
8474 // Insert a store of r125.[0-3] at entry and a restore before return.
8475 // Dst of store will be a hardwired temp at upper end of caller save area.
8476 // This method emits:
8477 // (W) mov (4) SR_BEStack<1>:ud r125.0<4;4,1>:ud <-- in prolog
8478 // (W) mov (4) r125.0<1>:ud SR_BEStack<4;4,1>:ud <-- in epilog
8479 void GlobalRA::addStoreRestoreToReturn()
8480 {
8481
8482 unsigned regNum = builder.kernel.getCallerSaveLastGRF();
8483 unsigned subRegNum = numEltPerGRF<Type_UD>() - 4;
8484 oldFPDcl = builder.createHardwiredDeclare(4, Type_UD, regNum, subRegNum);
8485 oldFPDcl->setName(builder.getNameString(builder.kernel.fg.mem, 24, "CallerSaveRetIp_BE_FP"));
8486
8487 G4_DstRegRegion* oldFPDst = builder.createDst(oldFPDcl->getRegVar(), 0, 0, 1, Type_UD);
8488 const RegionDesc* rd = builder.getRegionStride1();
8489 G4_Operand* oldFPSrc = builder.createSrc(oldFPDcl->getRegVar(), 0, 0, rd, Type_UD);
8490
8491 auto SRDecl = builder.createHardwiredDeclare(4, Type_UD, builder.kernel.getFPSPGRF(), IR_Builder::SubRegs_Stackcall::Ret_IP);
8492 SRDecl->setName(builder.getNameString(builder.kernel.fg.mem, 24, "SR_BEStack"));
8493 G4_DstRegRegion* FPdst = builder.createDst(SRDecl->getRegVar(), 0, 0, 1, Type_UD);
8494 rd = builder.getRegionStride1();
8495 G4_Operand* FPsrc = builder.createSrc(SRDecl->getRegVar(), 0, 0, rd, Type_UD);
8496
8497 saveBE_FPInst = builder.createMov(g4::SIMD4, oldFPDst, FPsrc, InstOpt_WriteEnable, false);
8498 saveBE_FPInst->addComment("save vISA SP/FP to temp");
8499 builder.setPartFDSaveInst(saveBE_FPInst);
8500
8501 auto entryBB = builder.kernel.fg.getEntryBB();
8502 auto insertIt = std::find_if(entryBB->begin(), entryBB->end(), [](G4_INST* inst) { return !inst->isLabel(); });
8503 entryBB->insertBefore(insertIt, saveBE_FPInst);
8504
8505 auto fretBB = builder.kernel.fg.getUniqueReturnBlock();
8506 auto iter = std::prev(fretBB->end());
8507 assert((*iter)->isFReturn() && "fret BB must end with fret");
8508
8509 if (!EUFusionWANeeded())
8510 {
8511 restoreBE_FPInst = builder.createMov(g4::SIMD4, FPdst, oldFPSrc, InstOpt_WriteEnable, false);
8512 fretBB->insertBefore(iter, restoreBE_FPInst);
8513 }
8514 else
8515 {
8516 // emit frame descriptor
8517 auto dstDcl = builder.createHardwiredDeclare(8, Type_UD, kernel.getFPSPGRF(), 0);
8518 dstDcl->setName(builder.getNameString(builder.kernel.fg.mem, 24, "FrameDescriptorGRF"));
8519 auto dstData = builder.createDstRegRegion(dstDcl, 1);
8520 const unsigned execSize = 8;
8521 G4_INST* load = nullptr;
8522 if (builder.supportsLSC())
8523 {
8524 auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
8525 load = builder.createFill(headerOpnd, dstData, G4_ExecSize(execSize), 1, 0, builder.getBEFP(), InstOpt_WriteEnable, false);
8526 }
8527 else
8528 {
8529 load = builder.createFill(dstData, G4_ExecSize(execSize), 1, 0, builder.getBEFP(), InstOpt_WriteEnable, false);
8530 }
8531 fretBB->insertBefore(iter, load);
8532 addEUFusionWAInsts(load);
8533 restoreBE_FPInst = load;
8534 }
8535
8536 restoreBE_FPInst->addComment("restore vISA SP/FP from temp");
8537
8538 if (builder.kernel.getOption(vISA_GenerateDebugInfo))
8539 {
8540 builder.kernel.getKernelDebugInfo()->setCallerBEFPRestoreInst(restoreBE_FPInst);
8541 builder.kernel.getKernelDebugInfo()->setCallerSPRestoreInst(restoreBE_FPInst);
8542 builder.kernel.getKernelDebugInfo()->setCallerBEFPSaveInst(saveBE_FPInst);
8543 }
8544
8545 auto gtpin = builder.kernel.getGTPinData();
8546 if (gtpin &&
8547 gtpin->isFirstRAPass())
8548 {
8549 gtpin->markInst(saveBE_FPInst);
8550 gtpin->markInst(restoreBE_FPInst);
8551 }
8552 }
8553
8554 void GlobalRA::reportUndefinedUses(
8555 LivenessAnalysis& liveAnalysis, G4_BB* bb, G4_INST* inst, G4_Declare* referencedDcl,
8556 std::set<G4_Declare*>& defs, std::ofstream& optreport, Gen4_Operand_Number opndNum)
8557 {
8558 // Get topmost dcl
8559 while (referencedDcl->getAliasDeclare() != NULL)
8560 {
8561 referencedDcl = referencedDcl->getAliasDeclare();
8562 }
8563
8564 if (referencedDcl->getAddressed() == true)
8565 {
8566 // Don't run the analysis for addressed operands.
8567 // Specifically, we don't analyze the following:
8568 //
8569 // A0 = &V1
8570 // r[A0] = 0 <-- V1 indirectly defined
8571 // ... = V1 <-- Use-before-def warning for V1 skipped due to indirect def
8572 //
8573
8574 return;
8575 }
8576
8577 if (referencedDcl->getRegVar()->isRegAllocPartaker())
8578 {
8579 const char* opndName = "";
8580
8581 if (opndNum == Opnd_pred)
8582 {
8583 opndName = "predicate";
8584 }
8585 else if (opndNum == Opnd_src0)
8586 {
8587 opndName = "src0";
8588 }
8589 else if (opndNum == Opnd_src1)
8590 {
8591 opndName = "src1";
8592 }
8593 else if (opndNum == Opnd_src2)
8594 {
8595 opndName = "src2";
8596 }
8597
8598 unsigned id = referencedDcl->getRegVar()->getId();
8599 if (liveAnalysis.def_in[bb->getId()].isSet(id) == false &&
8600 defs.find(referencedDcl) == defs.end())
8601 {
8602 // Def not found for use so report it
8603 optreport << "Def not found for use " << referencedDcl->getName() <<
8604 " (" << opndName << ") at CISA offset " << inst->getCISAOff() << ", src line " <<
8605 inst->getLineNo() << ":" << std::endl;
8606 inst->emit(optreport);
8607 optreport << std::endl << std::endl;
8608 }
8609 }
8610 }
8611
8612 void GlobalRA::updateDefSet(std::set<G4_Declare*>& defs, G4_Declare* referencedDcl)
8613 {
8614 // Get topmost dcl
8615 while (referencedDcl->getAliasDeclare() != NULL)
8616 {
8617 referencedDcl = referencedDcl->getAliasDeclare();
8618 }
8619
8620 defs.insert(referencedDcl);
8621 }
8622
8623 void GlobalRA::detectUndefinedUses(LivenessAnalysis& liveAnalysis, G4_Kernel& kernel)
8624 {
8625 // This function iterates over each inst and checks whether there is
8626 // a reaching def for each src operand. If not, it reports it to
8627 // opt report.
8628 std::ofstream optreport;
8629 getOptReportStream(optreport, kernel.getOptions());
8630
8631 optreport << std::endl;
8632 if (liveAnalysis.livenessClass(G4_FLAG))
8633 {
8634 optreport << "=== Uses without reaching def - Flags ===" << std::endl;
8635 }
8636 else if (liveAnalysis.livenessClass(G4_ADDRESS))
8637 {
8638 optreport << "=== Uses without reaching def - Address ===" << std::endl;
8639 }
8640 else
8641 {
8642 optreport << "=== Uses without reaching def - GRF ===" << std::endl;
8643 }
8644 if (kernel.getOption(vISA_LocalRA))
8645 {
8646 optreport << "(Use -nolocalra switch for accurate results of uses without reaching defs)" << std::endl;
8647 }
8648
8649 for (G4_BB* bb : kernel.fg)
8650 {
8651 std::set<G4_Declare*> defs;
8652 std::set<G4_Declare*>::iterator defs_it;
8653 G4_Declare* referencedDcl = NULL;
8654
8655 for (G4_INST* inst : *bb)
8656 {
8657 // Src/predicate opnds are uses
8658 if (inst->getPredicate() &&
8659 inst->getPredicate()->getBase() &&
8660 inst->getPredicate()->getBase()->isRegVar() &&
8661 inst->getPredicate()->getBase()->isRegAllocPartaker())
8662 {
8663 referencedDcl = inst->getPredicate()->asPredicate()->getBase()->asRegVar()->getDeclare();
8664 reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs, optreport, Opnd_pred);
8665 }
8666
8667 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
8668 {
8669 G4_Operand* opnd = inst->getSrc(i);
8670
8671 if (opnd &&
8672 opnd->isAddrExp() == false &&
8673 opnd->getBase() &&
8674 opnd->getBase()->isRegVar() &&
8675 opnd->getBase()->isRegAllocPartaker())
8676 {
8677 referencedDcl = opnd->getBase()->asRegVar()->getDeclare();
8678 reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs, optreport, (Gen4_Operand_Number)(i + Opnd_src0));
8679 }
8680 }
8681
8682 // Dst/cond modifier opnds are defs
8683 if (inst->getCondModBase() &&
8684 inst->getCondMod()->getBase()->isRegVar() &&
8685 inst->getCondMod()->getBase()->isRegAllocPartaker())
8686 {
8687 referencedDcl = inst->getCondMod()->asCondMod()->getBase()->asRegVar()->getDeclare();
8688 updateDefSet(defs, referencedDcl);
8689 }
8690
8691 if (inst->getDst() &&
8692 inst->getDst()->getBase() &&
8693 inst->getDst()->getBase()->isRegVar() &&
8694 inst->getDst()->getBase()->isRegAllocPartaker())
8695 {
8696 referencedDcl = inst->getDst()->getBase()->asRegVar()->getDeclare();
8697 updateDefSet(defs, referencedDcl);
8698 }
8699 }
8700 }
8701
8702 optreport << std::endl << std::endl;
8703
8704 closeOptReportStream(optreport);
8705 }
8706
8707 void GlobalRA::detectNeverDefinedUses()
8708 {
8709 // Detect variables that are used but never defined in the entire CFG.
8710 // This does not use liveness information.
8711 // Hold all decls from symbol table as key.
8712 // Boolean mapped value determines whether the dcl is
8713 // defined in kernel or not.
8714 std::map<G4_Declare*, bool> vars;
8715 std::map<G4_Declare*, bool>::iterator map_it;
8716
8717 for (auto bb : kernel.fg)
8718 {
8719 for (G4_INST* inst : *bb)
8720 {
8721 G4_Declare* referencedDcl = nullptr;
8722
8723 if (inst->getDst() &&
8724 inst->getDst()->getBase() &&
8725 inst->getDst()->getBase()->isRegVar())
8726 {
8727 referencedDcl = inst->getDst()->getBaseRegVarRootDeclare();
8728
8729 // Always insert top-most dcl
8730 map_it = vars.find(referencedDcl);
8731 if (map_it == vars.end())
8732 {
8733 vars.emplace(referencedDcl, true);
8734 }
8735 else
8736 {
8737 map_it->second = true;
8738 }
8739 }
8740
8741 if (inst->getCondModBase() &&
8742 inst->getCondMod()->getBase()->isRegVar())
8743 {
8744 referencedDcl = inst->getCondMod()->getBaseRegVarRootDeclare();
8745
8746 map_it = vars.find(referencedDcl);
8747 if (map_it == vars.end())
8748 {
8749 vars.emplace(referencedDcl, true);
8750 }
8751 else
8752 {
8753 map_it->second = true;
8754 }
8755 }
8756
8757 if (inst->getPredicate() &&
8758 inst->getPredicate()->getBase() &&
8759 inst->getPredicate()->getBase()->isRegVar())
8760 {
8761 referencedDcl = inst->getPredicate()->getBaseRegVarRootDeclare();
8762
8763 // Check whether dcl was already added to list.
8764 // If not, add it with flag set to false to indicate
8765 // that a use was found but a def hasn't been seen yet.
8766 map_it = vars.find(referencedDcl);
8767 if (map_it == vars.end())
8768 {
8769 vars.emplace(referencedDcl, false);
8770 }
8771 }
8772
8773 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
8774 {
8775 G4_Operand* opnd = inst->getSrc(i);
8776
8777 if (opnd &&
8778 opnd->getBase() &&
8779 opnd->getBase()->isRegVar())
8780 {
8781 referencedDcl = opnd->getBaseRegVarRootDeclare();
8782
8783 map_it = vars.find(referencedDcl);
8784 if (map_it == vars.end())
8785 {
8786 vars.emplace(referencedDcl, false);
8787 }
8788 }
8789 }
8790 }
8791 }
8792
8793 std::ofstream optreport;
8794 getOptReportStream(optreport, kernel.getOptions());
8795 optreport << std::endl << "=== Variables used but never defined ===" << std::endl << std::endl;
8796
8797 for (auto dcl : kernel.Declares)
8798 {
8799 while (dcl->getAliasDeclare() != NULL)
8800 {
8801 dcl = dcl->getAliasDeclare();
8802 }
8803
8804 map_it = vars.find(dcl);
8805 if (map_it != vars.end())
8806 {
8807 if (map_it->second == false &&
8808 dcl->getRegFile() != G4_INPUT &&
8809 dcl->getAddressed() == false)
8810 {
8811 // No def was found for this non-input variable in
8812 // the entire CFG, so report it.
8813 optreport << dcl->getName();
8814 if (dcl->getRegFile() == G4_GRF)
8815 {
8816 optreport << " (General)";
8817 }
8818 else if (dcl->getRegFile() == G4_ADDRESS)
8819 {
8820 optreport << " (Address)";
8821 }
8822 else if (dcl->getRegFile() == G4_FLAG)
8823 {
8824 optreport << " (Flag)";
8825 }
8826
8827 optreport << std::endl;
8828 }
8829 }
8830 }
8831
8832 optreport << std::endl << std::endl;
8833
8834 closeOptReportStream(optreport);
8835 }
8836
8837 void GlobalRA::emitVarLiveIntervals()
8838 {
8839 for (auto dcl : kernel.Declares)
8840 {
8841 std::vector<std::pair<uint32_t, uint32_t>> liveIntervals;
8842 LiveIntervalInfo* lr = kernel.getKernelDebugInfo()->getLiveIntervalInfo(dcl, false);
8843
8844 if (lr != NULL)
8845 {
8846 lr->getLiveIntervals(liveIntervals);
8847
8848 if (liveIntervals.size() > 0)
8849 {
8850 DEBUG_VERBOSE(dcl->getName() << " - ");
8851 }
8852
8853 for (auto&& i : liveIntervals)
8854 {
8855 std::cerr << "(" << i.first << ", " << i.second << ")\n";
8856 }
8857 }
8858 }
8859 }
8860
8861 //
8862 // Check the overlap of two sources' ranges and do range splitting.
8863 // E.g., range1: 0~63, range2: 32~95 --> 0~31, 32~63, 64~95
8864 // or, range1: 0~63, range2: 32~63 --> 0~31, 32~63
8865 //
8866 VarRange* VarSplit::splitVarRange(VarRange *src1,
8867 VarRange *src2,
8868 std::stack<VarRange*> *toDelete)
8869 {
8870 VarRange * new_var_range = nullptr;
8871
8872 ASSERT_USER(!(src1->leftBound == src2->leftBound && src1->rightBound == src2->rightBound), "Identical ranges cannot be split");
8873
8874 if (src1->leftBound > src2->rightBound ||
8875 src1->rightBound < src2->leftBound) //No overlap
8876 {
8877 return NULL;
8878 }
8879
8880 unsigned left1 = std::min(src1->leftBound, src2->leftBound); //left
8881 unsigned right1 = std::max(src1->leftBound, src2->leftBound);
8882
8883 unsigned left2 = std::min(src1->rightBound, src2->rightBound); //right
8884 unsigned right2 = std::max(src1->rightBound, src2->rightBound);
8885
8886 if (left1 == right1) //Same left
8887 {
8888 src1->leftBound = left1;
8889 src1->rightBound = left2;
8890
8891 src2->leftBound = left2 + 1;
8892 src2->rightBound = right2;
8893 }
8894 else if (left2 == right2) //Same right
8895 {
8896 src1->leftBound = left1;
8897 src1->rightBound = right1 - 1;
8898 src2->leftBound = right1;
8899 src2->rightBound = right2;
8900 }
8901 else //No same boundary
8902 {
8903 src1->leftBound = left1; //Left one: in list already
8904 src1->rightBound = right1 - 1;
8905
8906 src2->leftBound = left2 + 1; //Right one: keep in list
8907 src2->rightBound = right2;
8908
8909 new_var_range = new VarRange;
8910 new_var_range->leftBound = right1; //Middle one: need add one range object
8911 new_var_range->rightBound = left2;
8912 toDelete->push(new_var_range);
8913 }
8914
8915 return new_var_range;
8916 }
8917
8918 //
8919 // Scan the range list, Insert the new range into the range list.
8920 // Range splitting is applied if required.
8921 //
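// E.g., inserting an operand range 32~95 into the list {0~63}:
// splitVarRange() trims the existing entry to 0~31, creates the middle
// range 32~63 (inserted after it), and the trailing part 64~95 is
// appended at the end, giving {0~31, 32~63, 64~95}.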
8922 void VarSplit::rangeListSpliting(VAR_RANGE_LIST *rangeList, G4_Operand *opnd, std::stack<VarRange*> *toDelete)
8923 {
8924 VarRange *range = new VarRange;
8925 range->leftBound = opnd->getLeftBound();
8926 range->rightBound = opnd->getRightBound();
8927 toDelete->push(range);
8928
8929 VAR_RANGE_LIST_ITER it = rangeList->begin();
8930
8931 //The ranges in the list are ordered from low to high
8932 while (it != rangeList->end())
8933 {
8934 if ((*it)->leftBound == range->leftBound &&
8935 ((*it)->rightBound == range->rightBound))
8936 {
8937 //Same range exists in the list already
8938 return;
8939 }
8940
8941 if ((*it)->leftBound > range->rightBound)
8942 {
8943 //The range item in the list is to the right of the current range, so insert before this position.
8944 //Since the whole range is inserted first, all the ranges should be contiguous.
8945 ASSERT_USER((*it)->leftBound - range->rightBound == 1, "non-contiguous splitting happened\n");
8946 rangeList->insert(it, range);
8947 return;
8948 }
8949
8950 //Overlap happened, do splitting.
8951 //(*it) is updated to the left range
8952 //"range" is updated to the right range
8953 //If "newRange" is not NULL, it's the middle range.
8954 VarRange *newRange = splitVarRange((*it), range, toDelete);
8955
8956 //Insert the middle one
8957 it++;
8958 if (newRange)
8959 {
8960 it = rangeList->insert(it, newRange);
8961 }
8962 }
8963
8964 rangeList->push_back(range); //Insert the right one
8965
8966 return;
8967 }
8968
8969 void VarSplit::getHeightWidth(G4_Type type, unsigned numberElements, unsigned short &dclWidth, unsigned short &dclHeight, int &totalByteSize)
8970 {
8971 dclWidth = 1, dclHeight = 1;
8972 totalByteSize = numberElements * TypeSize(type);
8973 if (totalByteSize <= (int)numEltPerGRF<Type_UB>())
8974 {
8975 dclWidth = (uint16_t)numberElements;
8976 }
8977 else {
8978 // Here we assume the start of the var is the beginning of a GRF,
8979 // so the subregister offset must be 0.
8980 dclWidth = numEltPerGRF<Type_UB>() / TypeSize(type);
8981 dclHeight = totalByteSize / numEltPerGRF<Type_UB>();
8982 if (totalByteSize % numEltPerGRF<Type_UB>() != 0) {
8983 dclHeight++;
8984 }
8985 }
8986 }
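// For example (assuming a 32-byte GRF): 24 elements of Type_D (4 bytes each)
// give totalByteSize = 96 > 32, so dclWidth = 32/4 = 8 and dclHeight = 96/32 = 3.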
8987
8988
8989 void VarSplit::createSubDcls(G4_Kernel& kernel, G4_Declare* oldDcl, std::vector<G4_Declare*> &splitDclList)
8990 {
8991 if (oldDcl->getByteSize() <= numEltPerGRF<Type_UB>() || oldDcl->getByteSize() % numEltPerGRF<Type_UB>())
8992 {
8993 return;
8994 }
8995
8996 int splitVarSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
8997 for (unsigned i = 0, bSizePerGRFSize = (oldDcl->getByteSize() / numEltPerGRF<Type_UB>()); i < bSizePerGRFSize; i += splitVarSize)
8998 {
8999 G4_Declare* splitDcl = NULL;
9000 unsigned leftBound = i * numEltPerGRF<Type_UB>();
9001 unsigned rightBound = (i + splitVarSize) * numEltPerGRF<Type_UB>() - 1;
9002 unsigned short dclWidth = 0;
9003 unsigned short dclHeight = 0;
9004 int dclTotalSize = 0;
9005
9006 getHeightWidth(oldDcl->getElemType(), (rightBound - leftBound + 1) / oldDcl->getElemSize(), dclWidth, dclHeight, dclTotalSize);
9007 const char* splitDclName = kernel.fg.builder->getNameString(kernel.fg.builder->mem, 16, "split_%d_%s", i, oldDcl->getName());
9008 splitDcl = kernel.fg.builder->createDeclareNoLookup(splitDclName, G4_GRF, dclWidth, dclHeight, oldDcl->getElemType());
9009 gra.setSubOffset(splitDcl, leftBound);
9010 splitDcl->copyAlign(oldDcl);
9011 gra.copyAlignment(splitDcl, oldDcl);
9012 unsigned nElementSize = (rightBound - leftBound + 1) / oldDcl->getElemSize();
9013 if ((rightBound - leftBound + 1) % oldDcl->getElemSize())
9014 {
9015 nElementSize++;
9016 }
9017 splitDcl->setTotalElems(nElementSize);
9018 splitDclList.push_back(splitDcl);
9019 }
9020
9021 return;
9022 }
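// E.g., with a 32-byte GRF and SIMD16 (splitVarSize = 2), a 4-GRF declare V
// is split into two 2-GRF sub-declares: split_0_V at sub-offset 0 and
// split_2_V at sub-offset 64 bytes.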
9023
9024 void VarSplit::insertMovesToTemp(
9025 IR_Builder& builder, G4_Declare* oldDcl, G4_Operand *dstOpnd, G4_BB* bb,
9026 INST_LIST_ITER instIter, std::vector<G4_Declare*> &splitDclList)
9027 {
9028 G4_INST *inst = (*instIter);
9029 INST_LIST_ITER iter = instIter;
9030 iter++;
9031
9032 for (size_t i = 0, size = splitDclList.size(); i < size; i++)
9033 {
9034 G4_Declare * subDcl = splitDclList[i];
9035 unsigned leftBound = gra.getSubOffset(subDcl);
9036 unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
9037
9038 if (!(dstOpnd->getRightBound() < leftBound || rightBound < dstOpnd->getLeftBound()))
9039 {
9040 unsigned maskFlag = (inst->getOption() & 0xFFF010C);
9041 G4_DstRegRegion* dst = builder.createDstRegRegion(subDcl, 1);
9042 auto src = builder.createSrc(oldDcl->getRegVar(),
9043 (gra.getSubOffset(subDcl)) / numEltPerGRF<Type_UB>(), 0, builder.getRegionStride1(), oldDcl->getElemType());
9044 G4_INST* splitInst = builder.createMov(G4_ExecSize(subDcl->getTotalElems()), dst, src, maskFlag, false);
9045 bb->insertBefore(iter, splitInst);
9046 }
9047 }
9048
9049 return;
9050 }
9051
9052 void VarSplit::insertMovesFromTemp(G4_Kernel& kernel, G4_Declare* oldDcl, int index, G4_Operand *srcOpnd, int pos, G4_BB* bb, INST_LIST_ITER instIter, std::vector<G4_Declare*> &splitDclList)
9053 {
9054 G4_INST *inst = (*instIter);
9055
9056 int sizeInGRF = (srcOpnd->getRightBound() - srcOpnd->getLeftBound() + numEltPerGRF<Type_UB>() - 1) /
9057 numEltPerGRF<Type_UB>();
9058 int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
9059 if (sizeInGRF != splitSize)
9060 {
9061 unsigned short dclWidth = 0;
9062 unsigned short dclHeight = 0;
9063 int dclTotalSize = 0;
9064 G4_SrcRegRegion* oldSrc = srcOpnd->asSrcRegRegion();
9065 getHeightWidth(oldSrc->getType(), (srcOpnd->getRightBound() - srcOpnd->getLeftBound() + 1) / oldSrc->getElemSize(), dclWidth, dclHeight, dclTotalSize);
9066 const char* newDclName = kernel.fg.builder->getNameString(kernel.fg.builder->mem, 16, "copy_%d_%s", index, oldDcl->getName());
9067 G4_Declare * newDcl = kernel.fg.builder->createDeclareNoLookup(newDclName, G4_GRF, dclWidth, dclHeight, oldSrc->getType());
9068 newDcl->copyAlign(oldDcl);
9069 gra.copyAlignment(newDcl, oldDcl);
9070
9071 unsigned newLeftBound = 0;
9072
9073 for (size_t i = 0, size = splitDclList.size(); i < size; i++)
9074 {
9075 G4_Declare * subDcl = splitDclList[i];
9076 unsigned leftBound = gra.getSubOffset(subDcl);
9077 unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
9078
9079 if (!(srcOpnd->getRightBound() < leftBound || rightBound < srcOpnd->getLeftBound()))
9080 {
9081
9082 G4_DstRegRegion* dst = kernel.fg.builder->createDst(
9083 newDcl->getRegVar(),
9084 newLeftBound / numEltPerGRF<Type_UB>(),
9085 0,
9086 1,
9087 oldSrc->getType());
9088 newLeftBound += subDcl->getByteSize();
9089 G4_SrcRegRegion* src = kernel.fg.builder->createSrc(
9090 subDcl->getRegVar(),
9091 0,
9092 0,
9093 kernel.fg.builder->getRegionStride1(),
9094 oldSrc->getType());
9095 G4_INST* movInst = kernel.fg.builder->createMov(
9096 G4_ExecSize(subDcl->getTotalElems()), dst, src, InstOpt_WriteEnable, false);
9097 bb->insertBefore(instIter, movInst);
9098 }
9099 }
9100 auto newSrc = kernel.fg.builder->createSrcRegRegion(oldSrc->getModifier(), Direct, newDcl->getRegVar(),
9101 0, oldSrc->getSubRegOff(), oldSrc->getRegion(), newDcl->getElemType());
9102 inst->setSrc(newSrc, pos);
9103 }
9104 else
9105 {
9106 for (size_t i = 0, size = splitDclList.size(); i < size; i++)
9107 {
9108 G4_Declare * subDcl = splitDclList[i];
9109 unsigned leftBound = gra.getSubOffset(subDcl);
9110 unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
9111
9112 if (!(srcOpnd->getRightBound() < leftBound || rightBound < srcOpnd->getLeftBound()))
9113 {
9114 G4_SrcRegRegion* oldSrc = srcOpnd->asSrcRegRegion();
9115 G4_SrcRegRegion* newSrc = kernel.fg.builder->createSrcRegRegion(
9116 oldSrc->getModifier(),
9117 Direct,
9118 subDcl->getRegVar(),
9119 0,
9120 oldSrc->getSubRegOff(),
9121 oldSrc->getRegion(),
9122 oldSrc->getType());
9123 inst->setSrc(newSrc, pos);
9124 break;
9125 }
9126 }
9127 }
9128
9129 return;
9130 }
9131
9132 bool VarSplit::canDoGlobalSplit(IR_Builder& builder, G4_Kernel &kernel, uint32_t sendSpillRefCount)
9133 {
9134 if (!builder.getOption(vISA_GlobalSendVarSplit))
9135 {
9136 return false;
9137 }
9138
9139 if (!builder.getOption(vISA_Debug) && //Does not work in debug mode
9140 kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D && //Only applies to 3D/OCL/OGL targets
9141 sendSpillRefCount)
9142 {
9143 return true;
9144 }
9145
9146 return false;
9147 }
9148
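// globalSplit: split large multi-GRF send destinations to reduce spilling.
// Pass 1 below collects splittable send destinations, pass 2 collects the
// source operands that read them; candidates are then filtered by size and by
// a def-use distance heuristic, and finally createSubDcls/insertMovesToTemp/
// insertMovesFromTemp rewrite the IR.
//
// A minimal sketch of the rewrite (illustrative only; variable names are
// hypothetical):
//   send (16) V66(0,0) ...                      // 4-GRF destination
//   add  (16) V70(0,0) V66(2,0)<1;1,0> ...
// becomes
//   send (16) V66(0,0) ...
//   mov  (..) split_0_V66(0,0) V66(0,0)         // one mov per sub-declare
//   ...
//   add  (16) V70(0,0) split_2_V66(0,0)<1;1,0> ...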
9149 void VarSplit::globalSplit(IR_Builder& builder, G4_Kernel &kernel)
9150 {
9151 typedef std::list<std::tuple<G4_BB*, G4_Operand*, int, unsigned, INST_LIST_ITER>> SPLIT_OPERANDS;
9152 typedef std::list<std::tuple<G4_BB*, G4_Operand*, int, unsigned, INST_LIST_ITER>>::iterator SPLIT_OPERANDS_ITER;
9153 typedef std::map<G4_RegVar*, SPLIT_OPERANDS> SPLIT_DECL_OPERANDS;
9154 typedef std::map<G4_RegVar*, SPLIT_OPERANDS>::iterator SPLIT_DECL_OPERANDS_ITER;
9155
9156 SPLIT_DECL_OPERANDS splitDcls;
9157 unsigned instIndex = 0;
9158 int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
9159 for (auto bb : kernel.fg)
9160 {
9161 for (INST_LIST_ITER it = bb->begin(), iend = bb->end(); it != iend; ++it, ++instIndex)
9162 {
9163 G4_INST* inst = (*it);
9164 G4_DstRegRegion* dst = inst->getDst();
9165
9166 if (inst->isLifeTimeEnd() || inst->isPseudoKill())
9167 {
9168 continue;
9169 }
9170
9171 //
9172 // process send destination operand
9173 //
9174 if (inst->isSend() &&
9175 inst->getMsgDesc()->getDstLenRegs() > (size_t)splitSize &&
9176 inst->asSendInst()->isDirectSplittableSend())
9177 {
9178 G4_DstRegRegion* dstrgn = dst;
9179 G4_Declare* topdcl = GetTopDclFromRegRegion(dstrgn);
9180
9181 if (topdcl &&
9182 dstrgn->getRegAccess() == Direct &&
9183 !topdcl->getAddressed() &&
9184 topdcl->getRegFile() != G4_INPUT &&
9185 (dstrgn->getRightBound() - dstrgn->getLeftBound() + 1) == topdcl->getByteSize() &&
9186 (dstrgn->getRightBound() - dstrgn->getLeftBound()) > numEltPerGRF<Type_UB>())
9187 {
9188 //Each tuple<G4_BB*, G4_Operand*, int pos, unsigned instIndex, INST_LIST_ITER>
9189 //records the information needed to generate the split operands/instructions later.
9190 splitDcls[topdcl->getRegVar()].push_front(make_tuple(bb, dst, 0, instIndex, it));
9191 }
9192 }
9193 }
9194 }
9195
9196 instIndex = 0;
9197 for (auto bb : kernel.fg)
9198 {
9199 for (INST_LIST_ITER it = bb->begin(), end = bb->end(); it != end; ++it, ++instIndex)
9200 {
9201
9202 G4_INST* inst = (*it);
9203
9204 if (inst->isLifeTimeEnd() || inst->isPseudoKill())
9205 {
9206 continue;
9207 }
9208
9209 //
9210 // process each source operand
9211 //
9212 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
9213 {
9214 G4_Operand* src = inst->getSrc(j);
9215
9216 if (src == NULL)
9217 {
9218 continue;
9219 }
9220
9221 if (src->isSrcRegRegion())
9222 {
9223 G4_Declare* topdcl = GetTopDclFromRegRegion(src);
9224
9225 if (topdcl &&
9226 topdcl->getRegFile() != G4_INPUT &&
9227 !topdcl->getAddressed() &&
9228 splitDcls.find(topdcl->getRegVar()) != splitDcls.end() &&
9229 ((src->asSrcRegRegion()->getRightBound() - src->asSrcRegRegion()->getLeftBound() + 1) < topdcl->getByteSize()) &&
9230 src->asSrcRegRegion()->getRegAccess() == Direct) //We don't split the indirect access
9231 {
9232 splitDcls[topdcl->getRegVar()].push_back(make_tuple(bb, src, j, instIndex, it));
9233 }
9234 }
9235 }
9236 }
9237 }
9238
9239 for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin();
9240 it != splitDcls.end();)
9241 {
9242 unsigned srcIndex = 0xFFFFFFFF;
9243 unsigned dstIndex = 0;
9244 SPLIT_DECL_OPERANDS_ITER succIt = it;
9245 succIt++;
9246 G4_Declare * topDcl = it->first->getDeclare();
9247 if (topDcl->getByteSize() <= numEltPerGRF<Type_UB>() * 2u)
9248 {
9249 splitDcls.erase(it);
9250 it = succIt;
9251 continue;
9252 }
9253
9254 bool hasSrcOperand = false;
9255 for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end(); vt++)
9256 {
9257 G4_BB *bb = nullptr;
9258 G4_Operand *opnd = nullptr;
9259 INST_LIST_ITER instIter;
9260 int pos = 0;
9261 unsigned iIndex = 0;
9262
9263 std::tie(bb, opnd, pos, iIndex, instIter) = (*vt);
9264
9265 if (opnd == nullptr)
9266 {
9267 continue;
9268 }
9269
9270 if (opnd->isDstRegRegion())
9271 {
9272 dstIndex = std::max(dstIndex, iIndex);
9273 }
9274
9275 if (opnd->isSrcRegRegion())
9276 {
9277 srcIndex = std::min(srcIndex, iIndex);
9278 hasSrcOperand = true;
9279 }
9280 }
9281
9282 if (!hasSrcOperand || (dstIndex > srcIndex && //Heuristic: defs and uses interleave too closely to be worth splitting
9283 dstIndex - srcIndex < it->second.size() + 1))
9284 {
9285 splitDcls.erase(it);
9286 it = succIt;
9287 continue;
9288 }
9289
9290 it++;
9291 }
9292
9293 for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin();
9294 it != splitDcls.end();
9295 it++)
9296 {
9297 G4_Declare * topDcl = it->first->getDeclare();
9298 std::vector<G4_Declare*> splitDclList;
9299 splitDclList.clear();
9300
9301 createSubDcls(kernel, topDcl, splitDclList);
9302 int srcIndex = 0;
9303 for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end(); vt++)
9304 {
9305 G4_BB *bb = nullptr;
9306 G4_Operand *opnd = nullptr;
9307 INST_LIST_ITER instIter;
9308 int pos = 0;
9309 unsigned instIndex = 0;
9310 std::tie(bb, opnd, pos, instIndex, instIter) = (*vt);
9311
9312 if (opnd == nullptr)
9313 {
9314 continue;
9315 }
9316
9317 if (opnd->isDstRegRegion())
9318 {
9319 insertMovesToTemp(builder, topDcl, opnd, bb, instIter, splitDclList);
9320 }
9321
9322 if (opnd->isSrcRegRegion())
9323 {
9324 insertMovesFromTemp(kernel, topDcl, srcIndex, opnd, pos, bb, instIter, splitDclList);
9325 }
9326
9327 srcIndex++;
9328 }
9329 }
9330
9331 return;
9332 }
9333
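// localSplit: for BB-local, send-referenced GRF variables, record the exact
// sub-ranges each instruction touches while walking the BB bottom-up, keep only
// candidates whose sub-ranges are GRF-aligned and not too small, and create one
// partial declare per surviving range. The partial declares then participate in
// RA as sub-declares of the original variable.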
9334 void VarSplit::localSplit(IR_Builder& builder,
9335 G4_BB* bb)
9336 {
9337 class CmpRegVarId
9338 {
9339 public:
9340 bool operator()(G4_RegVar* first, G4_RegVar* second) const
9341 {
9342 return first->getDeclare()->getDeclId() < second->getDeclare()->getDeclId();
9343 }
9344 };
9345 std::map<G4_RegVar*, std::vector<std::pair<G4_Operand*, INST_LIST_ITER>>, CmpRegVarId> localRanges;
9346 std::map<G4_RegVar*, std::vector<std::pair<G4_Operand*, INST_LIST_ITER>>, CmpRegVarId>::iterator localRangesIt;
9347 std::map<G4_RegVar*, VarRangeListPackage, CmpRegVarId> varRanges;
9348 std::map<G4_RegVar*, VarRangeListPackage, CmpRegVarId>::iterator varRangesIt;
9349 std::stack<VarRange*> toDelete;
9350
9351 //
9352 // Iterate instruction in BB from back to front
9353 //
9354 for (INST_LIST::reverse_iterator rit = bb->rbegin(), rend = bb->rend(); rit != rend; ++rit)
9355 {
9356 G4_INST* i = (*rit);
9357 G4_DstRegRegion* dst = i->getDst();
9358
9359 if (i->isLifeTimeEnd() || i->isPseudoKill())
9360 {
9361 continue;
9362 }
9363
9364 //
9365 // process destination operand
9366 //
9367 if (dst != NULL)
9368 {
9369 G4_DstRegRegion* dstrgn = dst;
9370
9371 //It's an RA candidate
9372 G4_Declare* topdcl = GetTopDclFromRegRegion(dstrgn);
9373
9374 LocalLiveRange* topdclLR = nullptr;
9375 //Local only
9376 if ((topdcl &&
9377 (topdclLR = gra.getLocalLR(topdcl)) &&
9378 topdcl->getIsRefInSendDcl() &&
9379 topdclLR->isLiveRangeLocal()) &&
9380 topdcl->getRegFile() == G4_GRF)
9381 {
9382 varRangesIt = varRanges.find(topdcl->getRegVar());
9383 INST_LIST_ITER iterToInsert = rit.base();
9384 iterToInsert--; //Point to the iterator of the current instruction
9385 if (varRangesIt == varRanges.end())
9386 {
9387 VarRange* new_range = new VarRange;
9388 new_range->leftBound = 0;
9389 new_range->rightBound = topdcl->getByteSize() - 1;
9390 toDelete.push(new_range);
9391 varRanges[topdcl->getRegVar()].list.push_back(new_range);
9392 }
9393 else
9394 {
9395 rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), dstrgn, &toDelete);
9396 }
9397
9398 localRanges[topdcl->getRegVar()].emplace_back(dst, iterToInsert); // Ordered from back to front.
9399 }
9400 }
9401
9402 //
9403 // process each source operand
9404 //
9405 for (unsigned j = 0; j < G4_MAX_SRCS; j++)
9406 {
9407 G4_Operand* src = i->getSrc(j);
9408
9409 if (src == NULL)
9410 {
9411 continue;
9412 }
9413
9414 //Local only
9415 if (src->isSrcRegRegion())
9416 {
9417 G4_Declare* topdcl = GetTopDclFromRegRegion(src);
9418 LocalLiveRange* topdclLR = nullptr;
9419
9420 if (topdcl &&
9421 (topdclLR = gra.getLocalLR(topdcl)) &&
9422 topdcl->getIsRefInSendDcl() &&
9423 topdclLR->isLiveRangeLocal() &&
9424 topdcl->getRegFile() == G4_GRF)
9425 {
9426 G4_VarBase* base = (topdcl != NULL ? topdcl->getRegVar() : src->asSrcRegRegion()->getBase());
9427
9428 INST_LIST_ITER iterToInsert = rit.base();
9429 iterToInsert--;
9430
9431 varRangesIt = varRanges.find(base->asRegVar());
9432 if (varRangesIt == varRanges.end())
9433 {
9434 VarRange* new_range = new VarRange;
9435 new_range->leftBound = 0;
9436 new_range->rightBound = topdcl->getByteSize() - 1;
9437 toDelete.push(new_range);
9438 varRanges[topdcl->getRegVar()].list.push_back(new_range);
9439 }
9440
9441 rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), src, &toDelete);
9442
9443 localRanges[topdcl->getRegVar()].emplace_back(src, iterToInsert); // Ordered from back to front.
9444 }
9445 }
9446 }
9447 }
9448
9449 //Remove the variables with no partial usage, or whose partial live ranges are too short
9450 std::map<G4_RegVar*, VarRangeListPackage>::iterator it = varRanges.begin();
9451 while (it != varRanges.end())
9452 {
9453 std::map<G4_RegVar*, VarRangeListPackage>::iterator succ_it = it;
9454 succ_it++;
9455
9456 //No partial
9457 if (it->second.list.size() <= 1)
9458 {
9459 varRanges.erase(it);
9460 it = succ_it;
9461 continue;
9462 }
9463
9464 //If the average partial-range size (total byte size / number of partial ranges) is less than half a GRF, remove it
9465 if (((*it->second.list.rbegin())->rightBound - (*it->second.list.begin())->leftBound) / it->second.list.size() < numEltPerGRF<Type_UW>() * 2 / 2)
9466 {
9467 varRanges.erase(it);
9468 it = succ_it;
9469 continue;
9470 }
9471
9472 G4_Declare * topDcl = it->first->getDeclare();
9473 bool aligned = true;
9474 for (const VarRange *vr : it->second.list)
9475 {
9476 unsigned leftBound = vr->leftBound;
9477 unsigned rightBound = vr->rightBound;
9478 int elementSize = topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
9479 unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
9480
9481 if (!elemsNum)
9482 {
9483 aligned = false;
9484 break;
9485 }
9486
9487 //TODO: we could merge several unaligned sub-declares into one aligned declare, e.g. [0-1], [2-63] --> [0-63]
9488 if (leftBound % numEltPerGRF<Type_UW>() || (rightBound + 1) % numEltPerGRF<Type_UW>())
9489 {
9490 aligned = false;
9491 break;
9492 }
9493 }
9494
9495 if (!aligned)
9496 {
9497 varRanges.erase(it);
9498 it = succ_it;
9499 continue;
9500 }
9501
9502
9503 it = succ_it;
9504 }
9505
9506 int splitid = 0;
9507 for (std::map<G4_RegVar*, VarRangeListPackage>::iterator it = varRanges.begin();
9508 it != varRanges.end();
9509 it++)
9510 {
9511 G4_Declare * topDcl = it->first->getDeclare();
9512 const char * dclName = topDcl->getName();
9513
9514 topDcl->setIsSplittedDcl(true);
9515
9516 // Vertical split: variable split
9517 unsigned splitVarNum = 0;
9518 unsigned pre_rightBound = 0;
9519 for (VAR_RANGE_LIST_ITER vt = it->second.list.begin(); vt != it->second.list.end(); vt++)
9520 {
9521 unsigned leftBound = (*vt)->leftBound;
9522 unsigned rightBound = (*vt)->rightBound;
9523 int elementSize = topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
9524 unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
9525
9526 if (!elemsNum)
9527 {
9528 assert(0);
9529 pre_rightBound = rightBound;
9530 continue;
9531 }
9532
9533 if (leftBound && pre_rightBound + 1 != leftBound)
9534 {
9535 assert(0);
9536 }
9537 pre_rightBound = rightBound;
9538
9539 std::stringstream nameStrm;
9540 nameStrm << dclName << "_" << splitid << "_" << leftBound << "_" << rightBound << std::ends;
9541 int nameLen = unsigned(nameStrm.str().length()) + 1;
9542 const char* name = builder.getNameString(builder.mem, nameLen, "%s_%d_%d_%d", dclName, splitid, leftBound, rightBound);
9543
9544 unsigned short dclWidth = 0;
9545 unsigned short dclHeight = 0;
9546 int dclTotalSize = 0;
9547
9548 getHeightWidth(topDcl->getElemType(), (rightBound - leftBound + 1) / topDcl->getElemSize(), dclWidth, dclHeight, dclTotalSize);
9549 G4_Declare* partialDcl = builder.createDeclareNoLookup(name, G4_GRF, dclWidth, dclHeight, topDcl->getElemType());
9550 gra.setSubOffset(partialDcl, leftBound);
9551 partialDcl->setIsPartialDcl(true);
9552 gra.setSplittedDeclare(partialDcl, topDcl);
9553 unsigned nElementSize = (rightBound - leftBound + 1) / topDcl->getElemSize();
9554 if ((rightBound - leftBound + 1) % topDcl->getElemSize())
9555 {
9556 nElementSize++;
9557 }
9558 partialDcl->setTotalElems(nElementSize);
9559 gra.addSubDcl(topDcl, partialDcl);
9560 splitVarNum++;
9561 #ifdef DEBUG_VERBOSE_ON
9562 std::cout << "==> Sub Declare: " << splitid << "::" << name << std::endl;
9563 #endif
9564 splitid++;
9565 }
9566 if (splitVarNum)
9567 {
9568 gra.setSplitVarNum(topDcl, splitVarNum);
9569 }
9570 }
9571
9572 while (toDelete.size() > 0)
9573 {
9574 delete toDelete.top();
9575 toDelete.pop();
9576 }
9577
9578 return;
9579 }
9580
9581 void GlobalRA::addrRegAlloc()
9582 {
9583 uint32_t addrSpillId = 0;
9584 unsigned maxRAIterations = 10;
9585 unsigned iterationNo = 0;
9586
9587 while (iterationNo < maxRAIterations)
9588 {
9589 if (builder.getOption(vISA_RATrace))
9590 {
9591 std::cout << "--address RA iteration " << iterationNo << "\n";
9592 }
9593 //
9594 // choose reg vars whose reg file kind is ARF
9595 //
9596 LivenessAnalysis liveAnalysis(*this, G4_ADDRESS);
9597 liveAnalysis.computeLiveness();
9598
9599 //
9600 // if no reg var needs to be allocated, then skip reg allocation
9601 //
9602 if (liveAnalysis.getNumSelectedVar() > 0)
9603 {
9604 GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), false, false);
9605 unsigned spillRegSize = 0;
9606 unsigned indrSpillRegSize = 0;
9607 if (!coloring.regAlloc(false, false, false, spillRegSize, indrSpillRegSize, nullptr))
9608 {
9609 SpillManager spillARF(*this, coloring.getSpilledLiveRanges(), addrSpillId);
9610 spillARF.insertSpillCode();
9611 addrSpillId = spillARF.getNextTempDclId();
9612
9613 //
9614 // if new addr temps are created, we need to do RA again so that newly created temps
9615 // can get registers. If there are no more newly created temps, we then commit reg assignments
9616 //
9617 if (spillARF.isAnyNewTempCreated() == false)
9618 {
9619 coloring.confirmRegisterAssignments();
9620 coloring.cleanupRedundantARFFillCode();
9621 if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9622 {
9623 coloring.addA0SaveRestoreCode();
9624 }
9625 break; // no more new addr temps; done with ARF allocation
9626 }
9627 }
9628 else // successfully allocate register without spilling
9629 {
9630 coloring.confirmRegisterAssignments();
9631 coloring.cleanupRedundantARFFillCode();
9632 if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9633 {
9634 coloring.addA0SaveRestoreCode();
9635 }
9636 if (builder.getOption(vISA_OptReport))
9637 {
9638 detectUndefinedUses(liveAnalysis, kernel);
9639 }
9640
9641 break; // done with ARF allocation
9642 }
9643 }
9644 else {
9645 break; // no ARF allocation needed
9646 }
9647 kernel.dumpToFile("after.Address_RA." + std::to_string(iterationNo));
9648 iterationNo++;
9649
9650
9651 }
9652
9653 MUST_BE_TRUE(iterationNo < maxRAIterations, "Address RA has failed.");
9654 }
9655
9656 void GlobalRA::flagRegAlloc()
9657 {
9658 uint32_t flagSpillId = 0;
9659 unsigned maxRAIterations = 10;
9660 uint32_t iterationNo = 0;
9661 bool spillingFlag = false;
9662
9663 while (iterationNo < maxRAIterations)
9664 {
9665 if (builder.getOption(vISA_RATrace))
9666 {
9667 std::cout << "--flag RA iteration " << iterationNo << "\n";
9668 }
9669
9670 //
9671 // choose reg vars whose reg file kind is FLAG
9672 //
9673 LivenessAnalysis liveAnalysis(*this, G4_FLAG);
9674 liveAnalysis.computeLiveness();
9675
9676 //
9677 // if no reg var needs to be allocated, then skip reg allocation
9678 //
9679 if (liveAnalysis.getNumSelectedVar() > 0)
9680 {
9681 GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), false, false);
9682 unsigned spillRegSize = 0;
9683 unsigned indrSpillRegSize = 0;
9684 if (!coloring.regAlloc(false, false, false, spillRegSize, indrSpillRegSize, nullptr))
9685 {
9686 SpillManager spillFlag(*this, coloring.getSpilledLiveRanges(), flagSpillId);
9687 spillFlag.insertSpillCode();
9688 #ifdef DEBUG_VERBOSE_ON
9689 printf("FLAG Spill inst count: %d\n", spillFlag.getNumFlagSpillStore());
9690 printf("FLAG Fill inst count: %d\n", spillFlag.getNumFlagSpillLoad());
9691 printf("*************************\n");
9692 #endif
9693 flagSpillId = spillFlag.getNextTempDclId();
9694
9695 spillingFlag = true;
9696 if (spillFlag.isAnyNewTempCreated() == false)
9697 {
9698 coloring.confirmRegisterAssignments();
9699
9700 if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9701 {
9702 coloring.addFlagSaveRestoreCode();
9703 }
9704 break;
9705 }
9706 builder.getJitInfo()->numFlagSpillStore = spillFlag.getNumFlagSpillStore();
9707 builder.getJitInfo()->numFlagSpillLoad = spillFlag.getNumFlagSpillLoad();
9708 }
9709 else // successfully allocate register without spilling
9710 {
9711 coloring.confirmRegisterAssignments();
9712 if ((builder.kernel.fg.getHasStackCalls() || builder.kernel.fg.getIsStackCallFunc()))
9713 {
9714 coloring.addFlagSaveRestoreCode();
9715 }
9716
9717 if (spillingFlag && builder.getOption(vISA_FlagSpillCodeCleanup))
9718 {
9719 CLEAN_NUM_PROFILE clean_num_profile;
9720
9721 FlagSpillCleanup f(*this);
9722 f.spillFillCodeCleanFlag(builder, kernel, &clean_num_profile);
9723
9724 #ifdef DEBUG_VERBOSE_ON1
9725 for (int i = 0; i < 3; i++)
9726 {
9727 printf("Profiler %d Spill clean: %d\n", i, clean_num_profile.spill_clean_num[i]);
9728 printf("Profiler %d Fill clean: %d\n", i, clean_num_profile.fill_clean_num[i]);
9729 clean_num += clean_num_profile.spill_clean_num[i];
9730 clean_num += clean_num_profile.fill_clean_num[i];
9731 }
9732 printf("**Flag clean num: %d\n", clean_num);
9733 #endif
9734 }
9735
9736 if (builder.getOption(vISA_OptReport))
9737 {
9738 detectUndefinedUses(liveAnalysis, kernel);
9739 }
9740
9741 break; // done with FLAG allocation
9742 }
9743 }
9744 else {
9745 break; // no FLAG allocation needed
9746 }
9747 kernel.dumpToFile("after.Flag_RA." + std::to_string(iterationNo));
9748 iterationNo++;
9749 }
9750
9751 MUST_BE_TRUE(iterationNo < maxRAIterations, "Flag RA has failed.");
9752 }
9753
9754 void GlobalRA::assignRegForAliasDcl()
9755 {
9756 //
9757 // assign Reg for Alias DCL
9758 //
9759 for (G4_Declare *dcl : kernel.Declares)
9760 {
9761 G4_RegVar * AliasRegVar;
9762 G4_RegVar * CurrentRegVar;
9763 unsigned tempoffset;
9764
9765 if (dcl->getAliasDeclare() != NULL)
9766 {
9767 AliasRegVar = dcl->getAliasDeclare()->getRegVar();
9768 CurrentRegVar = dcl->getRegVar();
9769 tempoffset = AliasRegVar->getPhyRegOff()*AliasRegVar->getDeclare()->getElemSize() + dcl->getAliasOffset();
9770 if (AliasRegVar->getPhyReg() != NULL)
9771 {
9772 //
9773 // alias register assignment for A0
9774 //
9775 if (CurrentRegVar->getDeclare()->useGRF())
9776 {
9777 // if the tempoffset is one grf
9778 if (tempoffset < numEltPerGRF<Type_UW>() * 2u)
9779 {
9780 CurrentRegVar->setPhyReg(AliasRegVar->getPhyReg(), tempoffset / CurrentRegVar->getDeclare()->getElemSize());
9781 }
9782 // tempoffset covers several GRFs
9783 else
9784 {
9785 unsigned additionalrow = tempoffset / (numEltPerGRF<Type_UW>() * 2);
9786 unsigned actualoffset = tempoffset % (numEltPerGRF<Type_UW>() * 2);
9787 bool valid = false;
9788 unsigned originalrow = AliasRegVar->ExRegNum(valid);
9789 MUST_BE_TRUE(valid == true, ERROR_REGALLOC);
9790 CurrentRegVar->setPhyReg(regPool.getGreg(originalrow + additionalrow), actualoffset / CurrentRegVar->getDeclare()->getElemSize());
9791 }
9792 }
9793 else if (CurrentRegVar->getDeclare()->getRegFile() == G4_ADDRESS)
9794 {
9795 MUST_BE_TRUE(tempoffset < getNumAddrRegisters() * 2,
9796 ERROR_REGALLOC); // Must hold tempoffset in one A0 reg
9797 CurrentRegVar->setPhyReg(AliasRegVar->getPhyReg(), tempoffset / CurrentRegVar->getDeclare()->getElemSize());
9798 }
9799 else
9800 {
9801 MUST_BE_TRUE(false, ERROR_REGALLOC);
9802 }
9803 }
9804 else {
9805 // Propagate addr taken spill/fill to aliases
9806 CurrentRegVar->getDeclare()->setAddrTakenSpillFill(AliasRegVar->getDeclare()->getAddrTakenSpillFill());
9807
9808 if (dcl->isSpilled() == false)
9809 dcl->setSpillFlag();
9810 }
9811 }
9812 }
9813
9814 return;
9815 }
9816
9817 void GlobalRA::removeSplitDecl()
9818 {
9819 for (auto dcl : kernel.Declares)
9820 {
9821 if (!getSubDclList(dcl).empty())
9822 {
9823 clearSubDcl(dcl);
9824 dcl->setIsSplittedDcl(false);
9825 }
9826 }
9827
9828 kernel.Declares.erase(std::remove_if(kernel.Declares.begin(), kernel.Declares.end(),
9829 [](G4_Declare* dcl) { return dcl->getIsPartialDcl(); }), kernel.Declares.end());
9830 }
9831
9832 // FIXME: doBankConflictReduction and highInternalConflict are computed by local RA
9833 // they should be moved to some common code
9834 bool GlobalRA::hybridRA(bool doBankConflictReduction, bool highInternalConflict, LocalRA& lra)
9835 {
9836 if (builder.getOption(vISA_RATrace))
9837 {
9838 std::cout << "--hybrid RA--\n";
9839 }
9840 uint32_t numOrigDcl = (uint32_t) kernel.Declares.size();
9841 insertPhyRegDecls();
9842
9843 LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
9844 liveAnalysis.computeLiveness();
9845
9846 if (liveAnalysis.getNumSelectedVar() > 0)
9847 {
9848 RPE rpe(*this, &liveAnalysis);
9849 rpe.run();
9850
9851 bool spillLikely = kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
9852 rpe.getMaxRP() >= kernel.getNumRegTotal() - 16;
9853 if (spillLikely)
9854 {
9855 if (builder.getOption(vISA_RATrace))
9856 {
9857 std::cout << "\t--skip hybrid RA due to high pressure: " << rpe.getMaxRP() << "\n";
9858 }
9859 kernel.Declares.resize(numOrigDcl);
9860 lra.undoLocalRAAssignments(false);
9861 return false;
9862 }
9863
9864 GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), true, false);
9865
9866 unsigned spillRegSize = 0;
9867 unsigned indrSpillRegSize = 0;
9868 bool isColoringGood =
9869 coloring.regAlloc(doBankConflictReduction, highInternalConflict, false, spillRegSize, indrSpillRegSize, &rpe);
9870 if (!isColoringGood)
9871 {
9872 if (!kernel.getOption(vISA_Debug))
9873 {
9874 // Keep LRA results when -debug is passed (the reason is unclear)
9875 kernel.Declares.resize(numOrigDcl);
9876 lra.undoLocalRAAssignments(false);
9877 }
9878 // Restore alignment in case LRA modified it
9879 copyAlignment();
9880 return false;
9881 }
9882 coloring.confirmRegisterAssignments();
9883
9884 if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())
9885 {
9886 coloring.getSaveRestoreRegister();
9887 addSaveRestoreCode(0);
9888 }
9889
9890 if (verifyAugmentation)
9891 {
9892 assignRegForAliasDcl();
9893 computePhyReg();
9894 verifyAugmentation->verify();
9895 }
9896 }
9897
9898 kernel.setRAType(doBankConflictReduction ? RA_Type::HYBRID_BC_RA : RA_Type::HYBRID_RA);
9899 return true;
9900 }
9901
9902 bool canDoHRA(G4_Kernel& kernel)
9903 {
9904 bool ret = true;
9905
9906 if (kernel.getVarSplitPass()->splitOccured())
9907 {
9908 ret = false;
9909 }
9910
9911 return ret;
9912 }
9913
9914 //
9915 // Graph coloring entry point. Returns VISA_SUCCESS on success, and a nonzero status (e.g. VISA_SPILL) if RA fails.
9916 //
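// High-level flow: address and flag RA run first; for non-reRA passes the
// faster linear-scan / local / hybrid allocators are tried next, and the
// iterative graph-coloring loop below is the fallback. Each failed iteration
// inserts spill code and retries, until coloring succeeds, fail-safe RA is
// engaged, or maxRAIterations is exhausted.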
9917 int GlobalRA::coloringRegAlloc()
9918 {
9919 if (kernel.getOption(vISA_OptReport))
9920 {
9921 std::ofstream optreport;
9922 getOptReportStream(optreport, builder.getOptions());
9923 optreport << std::endl << "=== Register Allocation ===" << std::endl;
9924 if (builder.getIsKernel() == false)
9925 {
9926 optreport << "Function: " << kernel.getName() << std::endl;
9927 }
9928 else
9929 {
9930 optreport << "Kernel: " << kernel.getName() << std::endl;
9931 }
9932 closeOptReportStream(optreport);
9933
9934 detectNeverDefinedUses();
9935 }
9936
9937 bool hasStackCall = kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();
9938
9939 // this needs to be called before addr/flag RA since it changes their alignment as well
9940 fixAlignment();
9941
9942 {
9943 TIME_SCOPE(ADDR_FLAG_RA);
9944
9945 addrRegAlloc();
9946
9947 flagRegAlloc();
9948 }
9949
9950 // LSC messages are used when:
9951 // a. Stack call is used on PVC+,
9952 // b. Spill size exceeds what can be represented using hword msg on PVC+
9953 if (builder.supportsLSC()) {
9954 useLscForSpillFill = true;
9955 useLscForNonStackCallSpillFill =
9956 builder.getOption(vISA_lscNonStackSpill) != 0;
9957 }
9958
9959 if (builder.hasFusedEUWA() && !builder.getIsPayload())
9960 {
9961 if (G4_BB* entryBB = (*kernel.fg.begin()))
9962 {
9963 INST_LIST_ITER inst_it = entryBB->begin();
9964 const INST_LIST_ITER inst_ie = entryBB->end();
9965 while (inst_it != inst_ie && (*inst_it)->isLabel())
9966 {
9967 inst_it++;
9968 }
9969 G4_INST* euWAInst = builder.createEUWASpill(false);
9970 entryBB->insertBefore(inst_it, euWAInst);
9971 }
9972 }
9973
9974 //
9975 // If the graph has stack calls, then add the caller-save/callee-save pseudo
9976 // declares and code. This currently must be done after flag/addr RA due to
9977 // the assumption about the location of the pseudo save/restore instructions
9978 //
9979 if (hasStackCall)
9980 {
9981 addCallerSavePseudoCode();
9982
9983 // Only GENX sub-graphs require callee-save code.
9984
9985 if (builder.getIsKernel() == false)
9986 {
9987 addCalleeSavePseudoCode();
9988 addStoreRestoreToReturn();
9989 }
9990
9991 // bind builtinR0 to the reserved stack call ABI GRF so that caller and
9992 // callee can agree on which GRF to use for r0
9993 builder.getBuiltinR0()->getRegVar()->setPhyReg(
9994 builder.phyregpool.getGreg(kernel.getThreadHeaderGRF()), 0);
9995 }
9996
9997 if (kernel.getOption(vISA_SpillAnalysis))
9998 {
9999 spillAnalysis = std::make_unique<SpillAnalysis>();
10000 }
10001
10002 if (!isReRAPass())
10003 {
10004 //Global linear scan RA
10005 if (builder.getOption(vISA_LinearScan))
10006 {
10007 copyMissingAlignment();
10008 BankConflictPass bc(*this, false);
10009 LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
10010 liveAnalysis.computeLiveness();
10011
10012 TIME_SCOPE(LINEARSCAN_RA);
10013 LinearScanRA lra(bc, *this, liveAnalysis);
10014 int success = lra.doLinearScanRA();
10015 if (success == VISA_SUCCESS)
10016 {
10017 // TODO: Get correct spillSize from LinearScanRA
10018 unsigned spillSize = 0;
10019 expandSpillFillIntrinsics(spillSize);
10020 assignRegForAliasDcl();
10021 computePhyReg();
10022 if (builder.getOption(vISA_verifyLinearScan))
10023 {
10024 resetGlobalRAStates();
10025 markGraphBlockLocalVars();
10026 LivenessAnalysis live(*this, G4_GRF | G4_INPUT, false, true);
10027 live.computeLiveness();
10028 GraphColor coloring(live, kernel.getNumRegTotal(), false, false);
10029 vISA::Mem_Manager mem(GRAPH_COLOR_MEM_SIZE);
10030 coloring.createLiveRanges(0);
10031 LiveRange** lrs = coloring.getLRs();
10032 Interference intf(&live, lrs, live.getNumSelectedVar(), live.getNumSplitStartID(), live.getNumSplitVar(), *this);
10033 intf.init(mem);
10034 intf.computeInterference();
10035
10036 if(kernel.getOption(vISA_DumpRAIntfGraph))
10037 intf.dumpInterference();
10038 intf.linearScanVerify();
10039 }
10040 return VISA_SUCCESS;
10041 }
10042
10043 if (success == VISA_SPILL)
10044 {
10045 return VISA_SPILL;
10046 }
10047 }
10048 else if (builder.getOption(vISA_LocalRA) && !hasStackCall)
10049 {
10050 copyMissingAlignment();
10051 BankConflictPass bc(*this, false);
10052 LocalRA lra(bc, *this);
10053 bool success = lra.localRA();
10054 if (!success && !builder.getOption(vISA_HybridRAWithSpill))
10055 {
10056 if (canDoHRA(kernel))
10057 {
10058 success = hybridRA(lra.doHybridBCR(), lra.hasHighInternalBC(), lra);
10059 }
10060 else
10061 {
10062 if (builder.getOption(vISA_RATrace))
10063 {
10064 std::cout << "\t--skip HRA due to var split. undo LRA results." << "\n";
10065 }
10066 lra.undoLocalRAAssignments(false);
10067 }
10068 }
10069 if (success)
10070 {
10071 // either local or hybrid RA succeeds
10072 assignRegForAliasDcl();
10073 computePhyReg();
10074 return VISA_SUCCESS;
10075 }
10076 if (builder.getOption(vISA_HybridRAWithSpill))
10077 {
10078 insertPhyRegDecls();
10079 }
10080 }
10081 }
10082
10083 startTimer(TimerID::GRF_GLOBAL_RA);
10084 const unsigned maxRAIterations = 10;
10085 unsigned iterationNo = 0;
10086
10087 int globalScratchOffset = kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
10088 bool useScratchMsgForSpill = !hasStackCall && (globalScratchOffset < (int)(SCRATCH_MSG_LIMIT * 0.6)
10089 // useScratchMsgForSpill is true for
10090 // * the scratch msg
10091 // * the LSC msg
10092 // The spill insertion module decides whether to expand a fill/spill to scratch or LSC
10093 // depending on the spill offset. oword is supported on PVC but is not emitted in
10094 // favor of LSC.
10095 || builder.supportsLSC());
10096 bool enableSpillSpaceCompression = builder.getOption(vISA_SpillSpaceCompression);
10097
10098 uint32_t nextSpillOffset = 0;
10099 uint32_t scratchOffset = 0;
10100
10101 if (kernel.fg.getIsStackCallFunc())
10102 {
10103 // Allocate space to store Frame Descriptor
10104 nextSpillOffset += 32;
10105 scratchOffset += 32;
10106 }
10107
10108 uint32_t GRFSpillFillCount = 0;
10109 uint32_t sendAssociatedGRFSpillFillCount = 0;
10110 unsigned fastCompileIter = 1;
10111 bool fastCompile =
10112 (builder.getOption(vISA_FastCompileRA) || builder.getOption(vISA_HybridRAWithSpill)) &&
10113 !hasStackCall;
10114
10115 if (fastCompile)
10116 {
10117 fastCompileIter = 0;
10118 }
10119
10120 unsigned failSafeRAIteration = (builder.getOption(vISA_FastSpill) || fastCompile) ? fastCompileIter : FAIL_SAFE_RA_LIMIT;
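// When fail-safe RA starts at iteration 0 (fast-spill / fast-compile paths),
// pre-create the spill/fill header and the old-a0.2 temp up front, and pin
// builtin R0 to r0 so it can never become a spill candidate.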
10121 if (failSafeRAIteration == 0)
10122 {
10123 builder.getSpillFillHeader();
10124 builder.getOldA0Dot2Temp();
10125 if (builder.hasScratchSurface())
10126 {
10127 builder.initScratchSurfaceOffset();
10128 }
10129 //BuiltinR0 must not be spilled.
10130 //FIXME: BuiltinR0's spill cost is already set to MAX;
10131 //if it keeps spilling, there is an issue in the cost model
10132 builder.getBuiltinR0()->setLiveOut();
10133 builder.getBuiltinR0()->getRegVar()->setPhyReg(
10134 builder.phyregpool.getGreg(0), 0);
10135 }
10136 bool rematDone = false, alignedScalarSplitDone = false;
10137 bool reserveSpillReg = false;
10138 VarSplit splitPass(*this);
10139
10140 while (iterationNo < maxRAIterations)
10141 {
10142 if (builder.getOption(vISA_RATrace))
10143 {
10144 std::cout << "--GRF RA iteration " << iterationNo << "--" << kernel.getName() << "\n";
10145 }
10146 setIterNo(iterationNo);
10147
10148 if (!builder.getOption(vISA_HybridRAWithSpill))
10149 {
10150 resetGlobalRAStates();
10151 }
10152
10153 if (builder.getOption(vISA_clearScratchWritesBeforeEOT) &&
10154 (globalScratchOffset + nextSpillOffset) > 0)
10155 {
10156 // we need to set r0 be live out for this WA
10157 builder.getBuiltinR0()->setLiveOut();
10158 }
10159
10160 //Identify the local variables to speed up the following analyses
10161 if (!builder.getOption(vISA_HybridRAWithSpill))
10162 {
10163 markGraphBlockLocalVars();
10164 }
10165
10166 if (kernel.getOption(vISA_SpillAnalysis))
10167 {
10168 spillAnalysis->Clear();
10169 }
10170
10171 //Do variable splitting in each iteration
10172 if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA))
10173 {
10174 if (builder.getOption(vISA_RATrace))
10175 {
10176 std::cout << "\t--split local send--\n";
10177 }
10178 for (auto bb : kernel.fg)
10179 {
10180 if (bb->isSendInBB())
10181 {
10182 splitPass.localSplit(builder, bb);
10183 }
10184 }
10185 }
10186
10187 bool doBankConflictReduction = false;
10188 bool highInternalConflict = false; // this is set by setupBankConflictsForKernel
10189
10190 if (builder.getOption(vISA_LocalBankConflictReduction) &&
10191 builder.hasBankCollision())
10192 {
10193 bool reduceBCInRR = false;
10194 bool reduceBCInTAandFF = false;
10195 BankConflictPass bc(*this, true);
10196
10197 reduceBCInRR = bc.setupBankConflictsForKernel(true, reduceBCInTAandFF, SECOND_HALF_BANK_START_GRF * 2, highInternalConflict);
10198 doBankConflictReduction = reduceBCInRR && reduceBCInTAandFF;
10199 }
10200
10201 bool allowAddrTaken = builder.getOption(vISA_FastSpill) || fastCompile ||
10202 !kernel.getHasAddrTaken();
10203 if (builder.getOption(vISA_FailSafeRA) &&
10204 kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
10205 !hasStackCall &&
10206 ((iterationNo == maxRAIterations - 1) ||
10207 (allowAddrTaken &&
10208 iterationNo == failSafeRAIteration)))
10209 {
10210 if (builder.getOption(vISA_RATrace))
10211 {
10212 std::cout << "\t--enable failSafe RA\n";
10213 }
10214 reserveSpillReg = true;
10215 }
10216
10217 LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
10218 liveAnalysis.computeLiveness();
10219 if (builder.getOption(vISA_dumpLiveness))
10220 {
10221 liveAnalysis.dump();
10222 }
10223
10224 #ifdef DEBUG_VERBOSE_ON
10225 emitFGWithLiveness(liveAnalysis);
10226 #endif
10227 //
10228 // if no reg var needs to be allocated, then skip reg allocation
10229 //
10230 if (liveAnalysis.getNumSelectedVar() > 0)
10231 {
10232 // force spill should be done only for the 1st iteration
10233 bool forceSpill = iterationNo > 0 ? false : builder.getOption(vISA_ForceSpills);
10234 RPE rpe(*this, &liveAnalysis);
10235 if (!fastCompile)
10236 {
10237 rpe.run();
10238 }
10239 GraphColor coloring(liveAnalysis, kernel.getNumRegTotal(), false, forceSpill);
10240
10241 if (builder.getOption(vISA_dumpRPE) && iterationNo == 0 && !rematDone)
10242 {
10243 // dump pressure the first time we enter global RA
10244 coloring.dumpRegisterPressure();
10245 }
10246
10247 unsigned spillRegSize = 0;
10248 unsigned indrSpillRegSize = 0;
10249 bool isColoringGood =
10250 coloring.regAlloc(doBankConflictReduction, highInternalConflict, reserveSpillReg, spillRegSize, indrSpillRegSize, &rpe);
10251 if (!isColoringGood)
10252 {
10253 if (isReRAPass())
10254 {
10255 // Don't modify the program if the reRA pass spills
10256 return VISA_SPILL;
10257 }
10258
10259 bool runRemat = kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM
10260 ? true : kernel.getSimdSize() < numEltPerGRF<Type_UB>();
10261 // -noremat takes precedence over -forceremat
10262 bool rematOn = !kernel.getOption(vISA_Debug) &&
10263 !kernel.getOption(vISA_NoRemat) &&
10264 !kernel.getOption(vISA_FastSpill) &&
10265 !fastCompile &&
10266 (kernel.getOption(vISA_ForceRemat) || runRemat);
10267 bool rerunGRA = false;
10268 bool globalSplitChange = false;
10269
10270 if (!rematDone &&
10271 rematOn)
10272 {
10273 if (builder.getOption(vISA_RATrace))
10274 {
10275 std::cout << "\t--rematerialize\n";
10276 }
10277 Rematerialization remat(kernel, liveAnalysis, coloring, rpe, *this);
10278 remat.run();
10279 rematDone = true;
10280
10281 // Re-run GRA loop only if remat caused changes to IR
10282 rerunGRA |= remat.getChangesMade();
10283 }
10284
10285 if (kernel.getOption(vISA_SplitGRFAlignedScalar) &&
10286 !fastCompile &&
10287 !kernel.getOption(vISA_FastSpill) &&
10288 !alignedScalarSplitDone)
10289 {
10290 SplitAlignedScalars split(*this, coloring);
10291 split.run();
10292 alignedScalarSplitDone = true;
10293
10294 // Re-run GRA loop if changes were made to IR
10295 rerunGRA |= split.getChangesMade();
10296 }
10297
10298 //Calculate the spill caused by send to decide if global splitting is required or not
10299 for (auto spilled : coloring.getSpilledLiveRanges())
10300 {
10301 auto spillDcl = spilled->getDcl();
10302 if (spillDcl->getIsRefInSendDcl() && spillDcl->getNumRows() > 1)
10303 {
10304 sendAssociatedGRFSpillFillCount += spilled->getRefCount();
10305 }
10306 }
10307
10308 int instNum = 0;
10309 for (auto bb : kernel.fg)
10310 {
10311 instNum += (int)bb->size();
10312 }
10313
10314 if (iterationNo == 0 && //Only attempted when the first iteration of global RA fails.
10315 !splitPass.didGlobalSplit && //Attempted at most once.
10316 splitPass.canDoGlobalSplit(builder, kernel, sendAssociatedGRFSpillFillCount))
10317 {
10318 if (builder.getOption(vISA_RATrace))
10319 {
10320 std::cout << "\t--global send split\n";
10321 }
10322 splitPass.globalSplit(builder, kernel);
10323 splitPass.didGlobalSplit = true;
10324 globalSplitChange = true;
10325 }
10326
10327 if (iterationNo == 0 &&
10328 (rerunGRA || globalSplitChange || kernel.getOption(vISA_forceBCR)))
10329 {
10330 if (kernel.getOption(vISA_forceBCR))
10331 {
10332 kernel.getOptions()->setOption(vISA_forceBCR, false);
10333 }
10334
10335 continue;
10336 }
10337
10338 if (iterationNo == 0 && !fastCompile &&
10339 kernel.getOption(vISA_DoSplitOnSpill))
10340 {
10341 LoopVarSplit loopSplit(kernel, &coloring, &rpe);
10342 kernel.fg.getLoops().computePreheaders();
10343 loopSplit.run();
10344 }
10345
10346 //Accumulate the total spill/fill reference count to check against the abort-on-spill threshold below
10347 for (auto spilled : coloring.getSpilledLiveRanges())
10348 {
10349 GRFSpillFillCount += spilled->getRefCount();
10350 }
10351
10352 if (builder.getOption(vISA_OptReport) && iterationNo == 0)
10353 {
10354 // Dump out interference graph information of spill candidates
10355 reportSpillInfo(liveAnalysis, coloring);
10356 }
10357
10358 // vISA_AbortOnSpillThreshold is defined as [0..200]
10359 // where 0 means abort on any spill and 200 means never abort
10360 auto underSpillThreshold = [this](int numSpill, int asmCount)
10361 {
10362 int threshold = std::min(builder.getOptions()->getuInt32Option(vISA_AbortOnSpillThreshold), 200u);
10363 return (numSpill * 200) < (threshold * asmCount);
10364 };
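// Worked example: with threshold = 2 and asmCount = 1000 instructions, the
// kernel stays under the threshold while numSpill * 200 < 2 * 1000, i.e. for
// fewer than 10 spill/fill references.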
10365
10366 bool isUnderThreshold = underSpillThreshold(GRFSpillFillCount, instNum);
10367 if (isUnderThreshold)
10368 {
10369 if (auto jitInfo = builder.getJitInfo())
10370 {
10371 jitInfo->avoidRetry = true;
10372 }
10373 }
10374
10375 if (builder.getOption(vISA_AbortOnSpill) && !isUnderThreshold)
10376 {
10377 // update jit metadata information
10378 if (auto jitInfo = builder.getJitInfo())
10379 {
10380 jitInfo->isSpill = true;
10381 jitInfo->spillMemUsed = 0;
10382 jitInfo->numAsmCount = instNum;
10383 jitInfo->numGRFSpillFill = GRFSpillFillCount;
10384 }
10385
10386 // Early exit when -abortonspill is passed, instead of
10387 // spending time inserting spill code and then aborting.
10388 stopTimer(TimerID::GRF_GLOBAL_RA);
10389 return VISA_SPILL;
10390 }
10391
10392 if (iterationNo == 0 &&
10393 enableSpillSpaceCompression &&
10394 kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
10395 !hasStackCall)
10396 {
10397 unsigned spillSize = 0;
10398 const LIVERANGE_LIST& spilledLRs = coloring.getSpilledLiveRanges();
10399 for (auto lr : spilledLRs)
10400 {
10401 spillSize += lr->getDcl()->getByteSize();
10402 }
10403 if ((int)(spillSize * 1.5) < (SCRATCH_MSG_LIMIT - globalScratchOffset))
10404 {
10405 enableSpillSpaceCompression = false;
10406 }
10407 }
10408
10409 startTimer(TimerID::SPILL);
10410 SpillManagerGRF spillGRF(*this,
10411 nextSpillOffset,
10412 liveAnalysis.getNumSelectedVar(),
10413 &liveAnalysis,
10414 coloring.getLiveRanges(),
10415 coloring.getIntf(),
10416 &coloring.getSpilledLiveRanges(),
10417 iterationNo++,
10418 reserveSpillReg,
10419 spillRegSize,
10420 indrSpillRegSize,
10421 enableSpillSpaceCompression,
10422 useScratchMsgForSpill,
10423 builder.avoidDstSrcOverlap());
10424
10425 if (kernel.getOption(vISA_SpillAnalysis))
10426 {
10427 spillAnalysis->Do(&liveAnalysis, &coloring, &spillGRF);
10428 }
10429
10430 bool success = spillGRF.insertSpillFillCode(&kernel, pointsToAnalysis);
10431 nextSpillOffset = spillGRF.getNextOffset();
10432
10433 if (builder.hasScratchSurface() && !hasStackCall &&
10434 (nextSpillOffset + globalScratchOffset) > SCRATCH_MSG_LIMIT)
10435 {
10436 // Create a temp variable to store the old a0.2. It is marked as live-in and live-out
10437 // because the code that preserves the old value of a0.2 is emitted only post-RA.
10438 kernel.fg.builder->getOldA0Dot2Temp();
10439 } else if (useLscForNonStackCallSpillFill) {
10440 kernel.fg.builder->getOldA0Dot2Temp();
10441 }
10442
10443 if (builder.getOption(vISA_RATrace))
10444 {
10445 auto&& spills = coloring.getSpilledLiveRanges();
10446 std::cout << "\t--# variables spilled: " << spills.size() << "\n";
10447 if (spills.size() < 100)
10448 {
10449 std::cout << "\t--spilled variables: ";
10450 for (auto&& lr : spills)
10451 {
10452 std::cout << lr->getDcl()->getName() << " ";
10453 }
10454 std::cout << "\n";
10455 }
10456 std::cout << "\t--current spill size: " << nextSpillOffset << "\n";
10457 }
10458
10459 if (!success)
10460 {
10461 iterationNo = maxRAIterations;
10462 break;
10463 }
10464
10465 kernel.dumpToFile("after.Spill_GRF." + std::to_string(iterationNo));
10466 scratchOffset = std::max(scratchOffset, spillGRF.getNextScratchOffset());
10467
10468 bool disableSpillCoalesce = builder.getOption(vISA_DisableSpillCoalescing) ||
10469 builder.getOption(vISA_FastSpill) || fastCompile || builder.getOption(vISA_Debug) ||
10470 // spill cleanup is not supported when we use the oword msg for spill/fill for non-stack calls.
10471 (!useScratchMsgForSpill && !hasStackCall);
10472
10473 if (!reserveSpillReg && !disableSpillCoalesce && builder.useSends())
10474 {
10475 CoalesceSpillFills c(kernel, liveAnalysis, coloring, spillGRF, iterationNo, rpe, *this);
10476 c.run();
10477 }
10478
10479 if (iterationNo == FAIL_SAFE_RA_LIMIT)
10480 {
10481 if (coloring.getSpilledLiveRanges().size() < 2)
10482 {
10483 // give regular RA one more try as we are close to success
10484 failSafeRAIteration++;
10485 }
10486 }
10487 stopTimer(TimerID::SPILL);
10488 }
10489 // RA successfully allocates regs
10490 if (isColoringGood == true || reserveSpillReg)
10491 {
10492 coloring.confirmRegisterAssignments();
10493
10494 if (hasStackCall)
10495 {
10496 // spill/fill intrinsics expect the offset in HWords, so round up to 64 bytes but keep it in OWord units
10497 // ToDo: we really should change everything to bytes for everyone's sanity..
10498 unsigned localSpillAreaOwordSize = ROUND(scratchOffset, 64) / 16;
10499 coloring.getSaveRestoreRegister();
10500 addSaveRestoreCode(localSpillAreaOwordSize);
10501 }
10502
10503 if (kernel.getOption(vISA_DumpRegChart))
10504 {
10505 assignRegForAliasDcl();
10506 computePhyReg();
10507 // invoke before expanding spill/fill since
10508 // it modifies IR
10509 regChart->dumpRegChart(std::cerr);
10510 }
10511
10512 expandSpillFillIntrinsics(nextSpillOffset);
10513
10514 if (builder.getOption(vISA_OptReport))
10515 {
10516 detectUndefinedUses(liveAnalysis, kernel);
10517 }
10518
10519 if (nextSpillOffset)
10520 {
10521 switch (kernel.getRAType())
10522 {
10523 case RA_Type::GRAPH_COLORING_RR_BC_RA:
10524 kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_BC_RA);
10525 break;
10526 case RA_Type::GRAPH_COLORING_FF_BC_RA:
10527 kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_BC_RA);
10528 break;
10529 case RA_Type::GRAPH_COLORING_RR_RA:
10530 kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_RA);
10531 break;
10532 case RA_Type::GRAPH_COLORING_FF_RA:
10533 kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_RA);
10534 break;
10535 default:
10536 assert(0);
10537 break;
10538 }
10539 }
10540
10541 if (verifyAugmentation)
10542 {
10543 assignRegForAliasDcl();
10544 computePhyReg();
10545 verifyAugmentation->verify();
10546 }
10547 break; // done
10548 }
10549 }
10550 else
10551 {
10552 break;
10553 }
10554 }
10555 assignRegForAliasDcl();
10556 computePhyReg();
10557
10558 stopTimer(TimerID::GRF_GLOBAL_RA);
10559 //
10560 // Report failure to allocate due to excessive register pressure.
10561 //
10562 if (!reserveSpillReg && (iterationNo == maxRAIterations))
10563 {
10564 std::stringstream spilledVars;
10565 for (auto dcl : kernel.Declares)
10566 {
10567 if (dcl->isSpilled() && dcl->getRegFile() == G4_GRF)
10568 {
10569 spilledVars << dcl->getName() << "\t";
10570 }
10571 }
10572
10573 MUST_BE_TRUE(false,
10574 "ERROR: " << kernel.getNumRegTotal() - builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum)
10575 << " GRF registers are NOT enough to compile kernel " << kernel.getName() << "!"
10576 << " The maximum register pressure in the kernel is higher"
10577 << " than the available physical registers in hardware (even"
10578 << " with spill code)."
10579 << " Please consider rewriting the kernel."
10580 << " Compiling with the symbolic register option and inspecting the"
10581 << " spilled registers may help in determining the region of high pressure.\n"
10582 << "The spilling virtual registers are as follows: "
10583 << spilledVars.str());
10584
10585 return VISA_SPILL;
10586 }
10587
10588 // this includes vISA's scratch space use only and does not include whatever IGC may use for private memory
10589 uint32_t spillMemUsed = ROUND(nextSpillOffset, numEltPerGRF<Type_UB>());
10590
10591 if (spillMemUsed &&
10592 !(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()))
10593 {
10594 builder.criticalMsgStream() << "Spill memory used = " << spillMemUsed << " bytes for kernel " <<
10595 kernel.getName() << "\n Compiling the kernel with spill code may degrade performance." <<
10596 " Please consider rewriting the kernel to use less registers.\n";
10597 }
10598
10599 // update jit metadata information for spill
10600 if (auto jitInfo = builder.getJitInfo())
10601 {
10602 jitInfo->isSpill = spillMemUsed > 0;
10603 jitInfo->hasStackcalls = kernel.fg.getHasStackCalls();
10604
10605 if (jitInfo->hasStackcalls && builder.getIsKernel()) {
10606 // jitInfo->spillMemUsed is the entire visa stack size. Include the
10607 // caller/callee save sizes when caller/callee save code is present.
10608 // globalScratchOffset is in bytes; the others are in OWords.
10609 //
10610 // vISA stack
10611 // globalScratchOffset -> ---------------------
10612 // FIXME: should be 0-based | spill |
10613 // | |
10614 // calleeSaveAreaOffset -> ---------------------
10615 // | callee save |
10616 // callerSaveAreaOffset -> ---------------------
10617 // | caller save |
10618 // paramOverflowAreaOffset -> ---------------------
10619
10620 // Since it is difficult to predict the amount of space needed for the stack, we
10621 // reserve 64k. Reserving the full PTSS would be ideal, but it can lead to OOM on
10622 // machines with a large number of threads.
10623 unsigned int scratchAllocation = 1024 * kernel.getOptions()->getuInt32Option(vISA_ScratchAllocForStackInKB);
10624 jitInfo->spillMemUsed = scratchAllocation;
10625 jitInfo->isSpill = true;
10626
10627 // reserve spillMemUsed #bytes at upper end
10628 kernel.getGTPinData()->setScratchNextFree(scratchAllocation - kernel.getGTPinData()->getNumBytesScratchUse());
10629 }
10630 else {
10631 // stack call functions shouldn't report any scratch usage, as it is
10632 // the kernel's responsibility to account for the stack usage of the
10633 // entire call tree.
10634 if (!kernel.fg.getIsStackCallFunc())
10635 {
10636 jitInfo->spillMemUsed = spillMemUsed;
10637 kernel.getGTPinData()->setScratchNextFree(spillMemUsed);
10638 }
10639 }
10640 jitInfo->numGRFSpillFill = GRFSpillFillCount;
10641 }
10642
10643 if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA))
10644 {
10645 removeSplitDecl();
10646 }
10647
10648 return VISA_SUCCESS;
10649 }
10650
10651 /********************************************************************************************************************************************/
10652 /********************************************************Spill Code Clean up ****************************************************************/
10653 /********************************************************************************************************************************************/
10654
10655 #define SPILL_MEMORY_OVERLAP(x, y) \
10656 (!(x->leftOff > y->rightOff || y->leftOff > x->rightOff))
10657
10658 #define SPILL_MEMORY_OVERWRITE(target_memory, overwrite_memory) \
10659 (target_memory->leftOff >= overwrite_memory->leftOff && overwrite_memory->rightOff >= target_memory->rightOff)
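// Example: the memory range [8..15] is overwritten by [0..31] (it is fully
// contained), but it merely overlaps [12..19].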
10660
10661 #define IS_FLAG_MOVE(inst) (\
10662 inst->opcode() == G4_mov && \
10663 (inst->getDst() && inst->getSrc(0)) && \
10664 (inst->getDst()->getTopDcl() && inst->getSrc(0)->getTopDcl()) && \
10665 ((inst->getDst()->getTopDcl()->getRegFile() == G4_FLAG && inst->getSrc(0)->getTopDcl()->getRegFile() == G4_GRF) || \
10666 (inst->getDst()->getTopDcl()->getRegFile() == G4_GRF && inst->getSrc(0)->getTopDcl()->getRegFile() == G4_FLAG)))
10667
10668 #define IS_SPILL_KILL_CANDIDATE(preScratchAccess) \
10669 (preScratchAccess->isSpill && !preScratchAccess->fillInUse) \
10670
10671 #define IS_USE_KILL_CANDIDATE(preScratchAccess) \
10672 (!(preScratchAccess->regKilled || preScratchAccess->regPartialKilled || preScratchAccess->scratchDefined)) \
10673
10674 #define IS_GRF_RANGE_OVERLAP(s1, e1, sa) \
10675 (e1 >= sa->linearizedStart && sa->linearizedEnd >= s1)
10676
10677 #define IS_SCRATCH_RANGE_OVERLAP(s1, e1, sa) \
10678 (!(e1 < sa->leftOff || sa->rightOff < s1))
10679
10680 #define IS_MERGEABLE_SCRATCH_RANGES(r1, r2) \
10681 (!(((int)r1.leftOff - (int)r2.rightOff)> 1 || ((int)r2.leftOff - (int)r1.rightOff) > 1))
10682
10683 #define IS_MERGEABLE_GRF_RANGES(r1, r2) \
10684 (!(((int)r1.linearizedStart - (int)r2.linearizedEnd) > 1 || ((int)r2.linearizedStart - (int)r1.linearizedEnd) > 1))
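// Two ranges are mergeable when they overlap or are adjacent (gap <= 1),
// e.g. the GRF byte ranges [0..15] and [16..31] merge into [0..31].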
10685
10686 #define IS_GRF_RANGE_OVERWRITE(sa, s1, e1) \
10687 (s1 <= sa->linearizedStart && sa->linearizedEnd <= e1)
10688
10689 #define IS_SCRATCH_RANGE_OVERWRITE(sa, s1, e1) \
10690 (s1 <= sa->leftOff && sa->rightOff <= e1)
10691
10692 #define IS_FLAG_RANGE_OVERLAP(s1, e1, sa) \
10693 (!(e1 < sa->linearizedStart || sa->linearizedEnd < s1))
10694
10695 #define IS_FLAG_RANGE_OVERWRITE(t, s, e) \
10696 ((s <= t->linearizedStart && t->linearizedEnd <= e))
10697
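// Flag registers are linearized into a flat byte space: each flag register is
// assumed to occupy 4 bytes (32 bits), hence the getFlagNum() * 4 scaling below.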
10698 void FlagSpillCleanup::FlagLineraizedStartAndEnd(G4_Declare* topdcl,
10699 unsigned& linearizedStart,
10700 unsigned& linearizedEnd)
10701 {
10702 const G4_Areg* areg = topdcl->getRegVar()->getPhyReg()->asAreg();
10703 linearizedStart = areg->getFlagNum() * 4;
10704 linearizedStart += topdcl->getRegVar()->getPhyRegOff() * topdcl->getElemSize();
10705 linearizedEnd = linearizedStart + topdcl->getByteSize();
10706 return;
10707 }
10708
10709 /*
10710 * Reuse previous register
10711 */
10712 bool FlagSpillCleanup::replaceWithPreDcl(
10713 IR_Builder& builder,
10714 SCRATCH_ACCESS* scratchAccess,
10715 SCRATCH_ACCESS* preScratchAccess)
10716 {
10717 int preRegOff = 0;
10718 int payloadHeaderSize = 0;
10719 G4_Operand *reuseOpnd = NULL;
10720 G4_INST *preInst = *preScratchAccess->inst_it;
10721
10722 //Get reuse operand
10723 if (preScratchAccess->isSpill)
10724 {
10725 reuseOpnd = preInst->getSrc(0);
10726 preRegOff = reuseOpnd->asSrcRegRegion()->getSubRegOff();
10728 }
10729 else
10730 {
10731 reuseOpnd = preInst->getDst();
10732 preRegOff = reuseOpnd->asDstRegRegion()->getSubRegOff();//For a flag register, only the subreg offset matters
10733 }
10734 G4_Declare *dcl = reuseOpnd->getBase()->asRegVar()->getDeclare();
10735
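// With the send src/dst overlap HW workaround enabled, reject the reuse if any
// send destination among the operands to be renamed overlaps the register range
// of the previous scratch access.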
10736 if (builder.WaDisableSendSrcDstOverlap())
10737 {
10738 for (auto &renameOpnd : scratchAccess->renameOperandVec)
10739 {
10740 if (renameOpnd.second < -1) //Flag
10741 {
10742 break;
10743 }
10744
10745 G4_INST *inst = renameOpnd.first;
10746
10747 if (renameOpnd.second >= 0)
10748 {
10749 if (inst->isSend() && !inst->getDst()->isNullReg())
10750 {
10751 G4_DstRegRegion* dst = inst->getDst();
10752 bool noOverlap = dst->getLinearizedEnd() < preScratchAccess->linearizedStart ||
10753 preScratchAccess->linearizedEnd < dst->getLinearizedStart();
10754 if (!noOverlap)
10755 {
10756 return false;
10757 }
10758 }
10759 }
10760 }
10761 }
10762
10763 //Replace the declare for all operands associated with this scratch fill.
10764 for (auto &renameOpnd : scratchAccess->renameOperandVec)
10765 {
10766 G4_INST *inst = renameOpnd.first;
10767
10768 if (renameOpnd.second == -3) //Flag modifier
10769 {
10770 G4_CondMod* mod = inst->getCondMod();
10771 int regOff = preRegOff;
10772 G4_CondMod* mod_Opnd = builder.createCondMod(mod->getMod(),
10773 dcl->getRegVar(),
10774 (unsigned short)regOff);
10775
10776 inst->setCondMod(mod_Opnd);
10777
10778 }
10779 else if (renameOpnd.second == -2) //Flag predicate
10780 {
10781 G4_Predicate* predicate = inst->getPredicate();
10782 int regOff = preRegOff;
10783 G4_Predicate * pred_Opnd = builder.createPredicate(predicate->getState(),
10784 dcl->getRegVar(),
10785 (unsigned short)regOff,
10786 predicate->getControl());
10787
10788 inst->setPredicate(pred_Opnd);
10789 }
10790 else if (renameOpnd.second == -1) //GRF dst
10791 {
10792 G4_DstRegRegion *orgDstRegion = inst->getDst();
10793 int regOff = preRegOff + (scratchAccess->leftOff - preScratchAccess->leftOff) / numEltPerGRF<Type_UB>() + payloadHeaderSize / numEltPerGRF<Type_UB>();
10794 G4_DstRegRegion * dstOpnd = builder.createDst(
10795 dcl->getRegVar(),
10796 (short)regOff,
10797 orgDstRegion->getSubRegOff(),
10798 orgDstRegion->getHorzStride(), orgDstRegion->getType());
10799 inst->setDest(dstOpnd);
10800 }
10801 else //GRF src
10802 {
10803 G4_Operand *opnd = inst->getSrc(renameOpnd.second);
10804 G4_SrcRegRegion *orgSrcRegion = opnd->asSrcRegRegion();
10805
10806 int regOff = preRegOff + (scratchAccess->leftOff - preScratchAccess->leftOff) / numEltPerGRF<Type_UB>() + payloadHeaderSize / numEltPerGRF<Type_UB>();
10807 G4_Operand * srcOpnd = builder.createSrcRegRegion(orgSrcRegion->getModifier(),
10808 orgSrcRegion->getRegAccess(),
10809 dcl->getRegVar(),
10810 (short)regOff,
10811 orgSrcRegion->getSubRegOff(),
10812 orgSrcRegion->getRegion(),
10813 orgSrcRegion->getType());
10814
10815 inst->setSrc(srcOpnd, renameOpnd.second);
10816 }
10817 }
10818
10819 return true;
10820 }
10821
10822 /*
10823 * The current fill cannot be removed if either
10824 * 1) the reuse target register of the previous scratch access may be partially killed, or
10825 * 2) the corresponding scratch memory range overlaps the memory of the current scratch access.
10826 */
10827 bool FlagSpillCleanup::scratchKilledByPartial(
10828 SCRATCH_ACCESS* scratchAccess,
10829 SCRATCH_ACCESS* preScratchAccess)
10830 {
10831 bool killed = false;
10832
10833 for (auto &range : preScratchAccess->killedScratchRange)
10834 {
10835 if (!(scratchAccess->leftOff > range.rightOff ||
10836 range.leftOff > scratchAccess->rightOff))
10837 {
10838 killed = true;
10839 }
10840 }
10841
10842 for (auto &range : preScratchAccess->killedRegRange)
10843 {
10844 //Map the register kill to scratch kill
10845 unsigned leftOff = preScratchAccess->leftOff + (range.linearizedStart - preScratchAccess->linearizedStart);
10846 unsigned rightOff = preScratchAccess->leftOff + (range.linearizedEnd - preScratchAccess->linearizedStart);
10847
10848 if (!(scratchAccess->leftOff > rightOff ||
10849 leftOff > scratchAccess->rightOff))
10850 {
10851 killed = true;
10852 }
10853 }
10854
10855 return killed;
10856 }
10857
/*
 * Record all killed GRF ranges,
 * merging ranges when possible.
 */
bool FlagSpillCleanup::addKilledGRFRanges(
10863 unsigned linearizedStart,
10864 unsigned linearizedEnd,
10865 SCRATCH_ACCESS* scratchAccess,
10866 G4_Predicate* predicate)
10867 {
10868 REG_RANGE range;
10869 range.linearizedStart = std::max(scratchAccess->linearizedStart, linearizedStart);
10870 range.linearizedEnd = std::min(scratchAccess->linearizedEnd, linearizedEnd);
10871 range.predicate = predicate ? true : false;
10872
10873 if (scratchAccess->killedRegRange.size() == 0)
10874 {
10875 scratchAccess->killedRegRange.push_back(range);
10876 }
10877 else
10878 {
10879 bool merged = false;
10880 REG_RANGE_VEC_ITER range_iter = scratchAccess->killedRegRange.begin();
10881 REG_RANGE_VEC_ITER range_iter_next;
10882 REG_RANGE *merged_range = NULL;
10883 while (range_iter != scratchAccess->killedRegRange.end())
10884 {
10885 REG_RANGE &killedRange = *(range_iter);
10886 range_iter_next = range_iter;
10887 range_iter_next++;
10888
            if (killedRange.predicate) //With a predicate, the range cannot be merged with others
10890 {
10891 range_iter = range_iter_next;
10892 continue;
10893 }
10894
10895 if (!merged && IS_MERGEABLE_GRF_RANGES(range, killedRange))
10896 {
10897 killedRange.linearizedStart = std::min(killedRange.linearizedStart, range.linearizedStart);
10898 killedRange.linearizedEnd = std::max(killedRange.linearizedEnd, range.linearizedEnd);
10899 merged = true;
10900 merged_range = &killedRange;
10901 }
10902 else if (merged)
10903 {
10904 if (IS_MERGEABLE_GRF_RANGES((*merged_range), killedRange))
10905 {
10906 merged_range->linearizedStart = std::min(killedRange.linearizedStart, merged_range->linearizedStart);
10907 merged_range->linearizedEnd = std::max(killedRange.linearizedEnd, merged_range->linearizedEnd);
10908 }
10909 }
10910 if (IS_GRF_RANGE_OVERWRITE(scratchAccess, killedRange.linearizedStart, killedRange.linearizedEnd))
10911 {
10912 scratchAccess->regKilled = true;
10913 return true;
10914 }
10915 range_iter = range_iter_next;
10916 }
10917 if (!merged)
10918 {
10919 scratchAccess->killedRegRange.push_back(range);
10920 }
10921 }
10922
10923 return false;
10924 }
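
// Illustrative example (an assumption: IS_MERGEABLE_GRF_RANGES accepts
// overlapping or adjacent byte ranges): recording kills [0,31] and then
// [32,63] leaves one merged range [0,63], while a predicated kill stays
// separate since it may not execute.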
10925
/*
 * Check if the register in the previous scratch access is fully killed by the current register define.
 */
bool FlagSpillCleanup::regFullyKilled(
10930 SCRATCH_ACCESS* scratchAccess,
10931 unsigned linearizedStart,
10932 unsigned linearizedEnd,
10933 unsigned short maskFlag)
10934 {
10935
10936 if (IS_FLAG_RANGE_OVERWRITE(scratchAccess, linearizedStart, linearizedEnd))
10937 {
10938 if (maskFlag & InstOpt_WriteEnable) // No mask == all range killed
10939 {
10940 return true;
10941 }
10942
10943 if (linearizedStart == scratchAccess->linearizedStart &&
10944 linearizedEnd == scratchAccess->linearizedEnd &&
10945 scratchAccess->maskFlag == maskFlag)
10946 {
10947 return true;
10948 }
10949 }
10950
10951 return false;
10952 }
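
// In short: a previous access is fully killed only when the new define
// overwrites its entire flag range and either uses WriteEnable (NoMask) or
// matches the recorded range and execution mask exactly.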
10953
/*
 * Check whether only part of the scratch register is killed, with no overlap beyond its range.
 * This determines whether, if the associated fill is removed, the defined register can be replaced with the reuse register.
 */
bool FlagSpillCleanup::inRangePartialKilled(
10959 SCRATCH_ACCESS* scratchAccess,
10960 unsigned linearizedStart,
10961 unsigned linearizedEnd,
10962 unsigned short maskFlag)
10963 {
10964 if ((scratchAccess->linearizedStart <= linearizedStart &&
10965 scratchAccess->linearizedEnd >= linearizedEnd))
10966 {
10967 if (maskFlag & InstOpt_WriteEnable)
10968 {
10969 return true;
10970 }
10971
10972 if (scratchAccess->linearizedStart == linearizedStart &&
10973 scratchAccess->linearizedEnd == linearizedEnd &&
10974 scratchAccess->maskFlag == maskFlag)
10975 {
10976 return true;
10977 }
10978 }
10979
10980 return false;
10981 }
10982
10983 /*
10984 * Register kill analysis
10985 */
bool FlagSpillCleanup::regDefineAnalysis(
10987 SCRATCH_ACCESS* scratchAccess,
10988 unsigned linearizedStart,
10989 unsigned linearizedEnd,
10990 unsigned short maskFlag,
10991 G4_Predicate* predicate)
10992 {
10993 if (regFullyKilled(scratchAccess, linearizedStart, linearizedEnd, maskFlag))
10994 {
10995 return true;
10996 }
10997 else if (!scratchAccess->regKilled)
10998 {
10999 // Handle partial overlap
11000 // What about the mask?
11001 if (addKilledGRFRanges(linearizedStart, linearizedEnd, scratchAccess, predicate))
11002 {
11003 //The register range is killed by accumulated partial range kills
11004 return true;
11005 }
11006 scratchAccess->regPartialKilled = true;
11007 }
11008
11009 return false;
11010 }
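
// Returns true when the access is dead: either overwritten directly, or
// overwritten by the partial kills accumulated in killedRegRange.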
11011
void FlagSpillCleanup::regDefineFlag(
11013 SCRATCH_PTR_LIST* scratchTraceList,
11014 G4_INST* inst,
11015 G4_Operand* opnd)
11016 {
11017 //Get the linearized address in GRF register file
11018 unsigned linearizedStart = 0;
11019 unsigned linearizedEnd = 0;
11020 G4_Predicate* predicate = inst->getPredicate();
11021 G4_Declare* topdcl = opnd->getTopDcl();
11022
11023 FlagLineraizedStartAndEnd(opnd->getTopDcl(), linearizedStart, linearizedEnd);
11024
11025 //Impact on previous scratch access
11026 SCRATCH_PTR_LIST_ITER it = scratchTraceList->begin();
11027 SCRATCH_PTR_LIST_ITER itEnd = scratchTraceList->end();
11028
11029 if (it != itEnd &&
11030 inst == *(scratchTraceList->back()->inst_it))
11031 {
11032 itEnd--;
11033 }
11034
11035 while (it != itEnd)
11036 {
11037 SCRATCH_PTR_LIST_ITER kt = it;
11038 kt++;
11039
11040 SCRATCH_ACCESS * scratchAccess = *it;
11041
        //Not the instruction itself; a def->use cannot happen within a single instruction.
11043 if (scratchAccess->regKilled)
11044 {
11045 it = kt;
11046 continue;
11047 }
11048
        // Check if the registers used in the previous scratch accesses (both spill and fill) are killed (redefined).
11050 if (linearizedEnd &&
11051 IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, scratchAccess))
11052 {
            //Execution mask (emask) bits
11054 unsigned maskFlag = (inst->getOption() & 0xFFF010C);
11055
11056 if (regDefineAnalysis(scratchAccess, linearizedStart, linearizedEnd, (unsigned short)maskFlag, predicate))
11057 {
11058 //Fully killed
11059 scratchAccess->regKilled = true;
11060 if (scratchAccess->evicted) //Not in use
11061 {
11062 scratchTraceList->erase(it); //The previous one is not candidate for future use
11063 }
11064 }
11065
            // For a prefill and its associated define and spill instructions:
            // 1. The same dcl is used.
            // 2. If the prefill register is fully killed,
            //     a. the prefill instruction can be removed;
            //     b. but the registers in the define and spill instructions are kept and will not reuse the previous ones.
            // 3. If the prefill register is partially killed, and the killed register region is part of the prefill region,
            //     a. the prefill instruction can be removed;
            //     b. and the registers in the define and spill instructions can reuse the previous ones.
            // 4. Otherwise, the (pre)fill instruction cannot be removed, and no reuse will happen.
            // 5. A pure fill is not killed by the same declare.
11076 G4_Declare *preDcl = scratchAccess->flagOpnd->getTopDcl();
11077
11078 if (topdcl == preDcl)
11079 {
11080 if (inRangePartialKilled(scratchAccess, linearizedStart, linearizedEnd, (unsigned short)maskFlag))
11081 {
11082 scratchAccess->renameOperandVec.emplace_back(inst, -1);
11083 scratchAccess->inRangePartialKilled = true;
11084 }
11085 else
11086 {
11087 scratchAccess->removeable = false;
11088 }
11089 }
11090 }
11091
11092 it = kt;
11093 }
11094 }
11095
/*
 * Analyze the use of a register to determine
 * whether the scratchAccess can be removed.
 */
bool FlagSpillCleanup::regUseAnalysis(
11101 SCRATCH_ACCESS* scratchAccess,
11102 unsigned linearizedStart,
11103 unsigned linearizedEnd)
11104 {
    //The GRF in a previous fill is used as part of the current reg.
    //In this case, the fill cannot be removed since the reuse cannot happen.
    //The caller guaranteed that the registers overlap.
11108 if (linearizedEnd > scratchAccess->linearizedEnd ||
11109 linearizedStart < scratchAccess->linearizedStart)
11110 {
11111 return true;
11112 }
11113
    //Cannot be removed when the previous scratch access is killed or partially killed
    //before the use of the current scratch access register.
11117 SCRATCH_ACCESS * preScratchAccess = scratchAccess->preScratchAccess;
11118 if (preScratchAccess &&
11119 (preScratchAccess->regKilled ||
11120 scratchKilledByPartial(scratchAccess, preScratchAccess)))
11121 {
11122 return true;
11123 }
11124
11125 //Back trace to update the reachable scratch accesses
11126 if (scratchAccess->prePreScratchAccess)
11127 {
11128 SCRATCH_ACCESS * prePreScratchAccess = preScratchAccess;
11129 preScratchAccess = scratchAccess;
11130
11131 do {
11132 if ((prePreScratchAccess->regKilled ||
11133 scratchKilledByPartial(scratchAccess, prePreScratchAccess)))
11134 {
11135 scratchAccess->prePreScratchAccess = preScratchAccess;
11136 break;
11137 }
11138 preScratchAccess = prePreScratchAccess;
11139 prePreScratchAccess = preScratchAccess->preScratchAccess;
11140 } while (prePreScratchAccess && preScratchAccess != scratchAccess->prePreScratchAccess);
11141 }
11142
11143 return false;
11144 }
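
// The back-trace above clips scratchAccess->prePreScratchAccess at the last
// access that is still reachable (not killed), bounding how far regFillClean
// may later reach back when redirecting a fill to an earlier register.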
11145
void FlagSpillCleanup::regUseFlag(
11147 SCRATCH_PTR_LIST* scratchTraceList,
11148 G4_INST* inst,
11149 G4_Operand* opnd,
11150 int opndIndex)
11151 {
11152 //Get the linearized address in GRF register file
11153 unsigned linearizedStart = 0;
11154 unsigned linearizedEnd = 0;
11155 G4_Declare *topdcl = NULL;
11156
11157 topdcl = opnd->getTopDcl();
11158 FlagLineraizedStartAndEnd(opnd->getTopDcl(), linearizedStart, linearizedEnd);
11159
11160 //Impact on previous scratch access
11161 for (SCRATCH_ACCESS * scratchAccess : *scratchTraceList)
11162 {
11163 if (linearizedEnd &&
11164 IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, scratchAccess))
11165 {
            //EOT and pseudo-use instructions are not handled; keep the access
11167 if (inst->isEOT() ||
11168 inst->isPseudoUse())
11169 {
11170 scratchAccess->removeable = false;
11171 continue;
11172 }
11173
11174 if (scratchAccess->flagOpnd->getTopDcl() == topdcl) //Same declare
11175 {
11176 if (regUseAnalysis(scratchAccess, linearizedStart, linearizedEnd))
11177 {
11178 //The filled register is in use
11179 scratchAccess->removeable = false;
11180 }
11181 else if (scratchAccess->inRangePartialKilled || !scratchAccess->regKilled)
11182 {
11183 //can reuse previous register
11184 scratchAccess->renameOperandVec.emplace_back(inst, opndIndex);
11185 }
11186 }
11187 }
11188 }
11189 }
11190
void FlagSpillCleanup::regUseScratch(
11192 SCRATCH_PTR_LIST* scratchTraceList,
11193 G4_INST* inst,
11194 G4_Operand* opnd,
11195 Gen4_Operand_Number opndNum)
11196 {
11197 const G4_Declare *topdcl = opnd->getTopDcl();
11198
11199 //Impact on previous scratch access
11200 for (SCRATCH_ACCESS *scratchAccess : *scratchTraceList)
11201 {
11202 if (topdcl == scratchAccess->scratchDcl)
11203 {
11204 if (opndNum == Opnd_dst)
11205 {
11206 scratchAccess->scratchDefined = true;
11207 }
11208 else
11209 {
11210 scratchAccess->removeable = false;
11211 }
11212 }
11213 }
11214 }
11215
void FlagSpillCleanup::initializeScratchAccess(
11217 SCRATCH_ACCESS *scratchAccess, INST_LIST_ITER inst_it)
11218 {
11219 #ifdef _DEBUG
11220 scratchAccess->regNum = -1;
11221 #endif
11222 scratchAccess->scratchDcl = NULL;
11223 scratchAccess->flagOpnd = NULL;
11224
11225 scratchAccess->linearizedStart = 0;
11226 scratchAccess->linearizedEnd = 0;
11227 scratchAccess->leftOff = 0;
11228 scratchAccess->rightOff = 0;
11229 scratchAccess->useCount = 0;
11230
11231 scratchAccess->isSpill = false;
11232 scratchAccess->isBlockLocal = false;
11233 scratchAccess->directKill = false;
11234
11235 scratchAccess->regKilled = false;
11236 scratchAccess->regPartialKilled = false;
11237 scratchAccess->regOverKilled = false;
11238 scratchAccess->inRangePartialKilled = false;
11239 scratchAccess->regInUse = false;
11240
11241 scratchAccess->fillInUse = false;
11242 scratchAccess->removeable = true;
11243 scratchAccess->instKilled = false;
11244 scratchAccess->evicted = false;
11245 scratchAccess->scratchDefined = false;
11246
11247 scratchAccess->preScratchAccess = NULL;
11248 scratchAccess->prePreScratchAccess = NULL;
11249 scratchAccess->preFillAccess = NULL;
11250
11251 scratchAccess->inst_it = inst_it;
11252 G4_INST *inst = *inst_it;
11253 scratchAccess->maskFlag = (inst->getOption() & 0xFFF010C);
11254
11255 return;
11256 }
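
// maskFlag caches the execution-mask related bits of the instruction options
// (the 0xFFF010C mask above); later kill checks compare these cached emasks.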
11257
bool FlagSpillCleanup::initializeFlagScratchAccess(
11259 SCRATCH_PTR_VEC* scratchAccessList,
11260 SCRATCH_ACCESS* &scratchAccess,
11261 INST_LIST_ITER inst_it)
11262 {
11263 G4_INST* inst = (*inst_it);
11264
11265 G4_DstRegRegion* dst = inst->getDst();
11266 G4_Operand* src = inst->getSrc(0);
11267 G4_Declare* topDcl_1 = dst->getTopDcl();
11268 G4_Declare* topDcl_2 = src->getTopDcl();
11269
11270 //Create the spill/fill description
11271 if (topDcl_1->getRegFile() == G4_FLAG && topDcl_2->getRegFile() == G4_GRF)
11272 {
11273 if (src->asSrcRegRegion()->getBase()->isRegVar() &&
11274 src->asSrcRegRegion()->getBase()->asRegVar()->isRegVarAddrSpillLoc())
11275 {
11276 scratchAccess = new SCRATCH_ACCESS;
11277 scratchAccessList->push_back(scratchAccess);
11278 initializeScratchAccess(scratchAccess, inst_it);
11279 //Fill
11280 #ifdef _DEBUG
11281 scratchAccess->regNum = topDcl_1->getRegVar()->getPhyReg()->asAreg()->getArchRegType();
11282 #endif
11283 scratchAccess->scratchDcl = topDcl_2; //Spill location
11284
11285 if (gra.isBlockLocal(topDcl_2))
11286 {
11287 scratchAccess->isBlockLocal = true;
11288 }
11289 FlagLineraizedStartAndEnd(topDcl_1, scratchAccess->linearizedStart, scratchAccess->linearizedEnd);
11290 scratchAccess->flagOpnd = dst;
11291 if (inst->getPredicate())
11292 {
                scratchAccess->removeable = false; //Partial spill/fill cannot be removed
                scratchAccess->instKilled = true; //Not really killed; marked so that instructions depending on the current one will not be removed.
11295 }
11296
11297 return true;
11298 }
11299 }
11300 else
11301 { //Spill
11302 if (dst->getBase()->isRegVar() &&
11303 dst->getBase()->asRegVar()->isRegVarAddrSpillLoc())
11304 {
11305 scratchAccess = new SCRATCH_ACCESS;
11306 scratchAccessList->push_back(scratchAccess);
11307 initializeScratchAccess(scratchAccess, inst_it);
11308 #ifdef _DEBUG
11309 scratchAccess->regNum = topDcl_2->getRegVar()->getPhyReg()->asAreg()->getArchRegType();
11310 #endif
11311 scratchAccess->scratchDcl = topDcl_1;
11312
11313 if (gra.isBlockLocal(topDcl_1))
11314 {
11315 scratchAccess->isBlockLocal = true;
11316 }
11317
11318 scratchAccess->isSpill = true;
11319 FlagLineraizedStartAndEnd(topDcl_2, scratchAccess->linearizedStart, scratchAccess->linearizedEnd);
11320 scratchAccess->flagOpnd = src;
11321 if (inst->getPredicate())
11322 {
                scratchAccess->removeable = false; //Partial spill/fill cannot be removed
                scratchAccess->instKilled = true; //Not really killed; marked so that instructions depending on the current one will not be removed.
11325 }
11326
11327 return true;
11328 }
11329 }
11330
11331 return false;
11332 }
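
// In short: a flag fill is "mov flag <- GRF spill location" and a flag spill
// is "mov spill location <- flag"; both create a SCRATCH_ACCESS, and
// predicated moves are recorded but marked non-removable.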
11333
void FlagSpillCleanup::freeScratchAccess(SCRATCH_PTR_VEC *scratchAccessList)
11335 {
11336 for (SCRATCH_ACCESS *scratchAccess : *scratchAccessList)
11337 {
11338 delete scratchAccess;
11339 }
11340
11341 scratchAccessList->clear();
11342
11343 return;
11344 }
11345
11346 //Check the flag define instruction.
void FlagSpillCleanup::flagDefine(
11348 SCRATCH_PTR_LIST& scratchTraceList,
11349 G4_INST* inst)
11350 {
11351 G4_DstRegRegion* dst = inst->getDst();
11352
11353 if (dst)
11354 {
11355 G4_Declare* topdcl = NULL;
11356 topdcl = GetTopDclFromRegRegion(dst);
11357
11358 if (topdcl && topdcl->getRegFile() == G4_FLAG)
11359 {
11360 //Flag register define
11361 regDefineFlag(&scratchTraceList, inst, dst);
11362 }
11363 }
11364
11365 G4_CondMod* mod = inst->getCondMod();
11366 if (!mod)
11367 {
11368 return;
11369 }
11370
11371 // ConMod, handled as register define
    // CondMod, handled as a register define
11373
11374 unsigned linearizedStart = 0;
11375 unsigned linearizedEnd = 0;
11376
11377 G4_VarBase *flagReg = mod->getBase();
11378 if (!flagReg)
11379 {
11380 return;
11381 }
11382
11383 G4_Declare* topdcl = flagReg->asRegVar()->getDeclare();
11384 FlagLineraizedStartAndEnd(topdcl, linearizedStart, linearizedEnd);
11385
11386 SCRATCH_PTR_LIST_ITER it = scratchTraceList.begin();
11387 SCRATCH_PTR_LIST_ITER itEnd = scratchTraceList.end();
11388 while (it != itEnd)
11389 {
11390 SCRATCH_PTR_LIST_ITER kt = it;
11391 kt++;
11392
11393 SCRATCH_ACCESS *preScratchAccess = *it;
11394 if (IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, preScratchAccess))
11395 {
11396 G4_Declare *preDcl = preScratchAccess->flagOpnd->getTopDcl();
11397
11398 if (regDefineAnalysis(preScratchAccess, linearizedStart, linearizedEnd, (unsigned short)maskFlag, NULL))
11399 {
11400 preScratchAccess->regKilled = true;
11401 if (preScratchAccess->evicted) //Not in use
11402 {
11403 scratchTraceList.erase(it); //The previous one is not candidate for reuse
11404 }
11405 }
11406 if (topdcl == preDcl)
11407 {
11408 if (preScratchAccess->inRangePartialKilled)
11409 {
11410 preScratchAccess->renameOperandVec.emplace_back(inst, -3);
11411 }
11412 else
11413 {
11414 preScratchAccess->removeable = false;
11415 }
11416 }
11417 }
11418 it = kt;
11419 }
11420
11421 return;
11422 }
11423
void FlagSpillCleanup::scratchUse(SCRATCH_PTR_LIST& scratchTraceList, G4_INST* inst)
11425 {
11426 G4_DstRegRegion* dst = inst->getDst();
11427
11428 if (dst)
11429 {
11430 G4_Declare* topdcl = NULL;
11431 topdcl = GetTopDclFromRegRegion(dst);
11432
11433 if (topdcl && topdcl->getRegFile() == G4_GRF)
11434 {
11435 //Flag scratch variable is redefined
11436 regUseScratch(&scratchTraceList, inst, dst, Opnd_dst);
11437 }
11438 }
11439
11440 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
11441 {
11442 G4_Operand* src = inst->getSrc(i);
11443
11444 if (src && src->isSrcRegRegion())
11445 {
11446 G4_Declare* topdcl = NULL;
11447
11448 if (inst->getSrc(i)->asSrcRegRegion()->getBase()->isRegVar())
11449 {
11450 topdcl = GetTopDclFromRegRegion(src);
11451 }
11452
11453 if (!topdcl || (topdcl->getRegFile() == G4_FLAG))
11454 {
11455 continue;
11456 }
11457
11458 regUseScratch(&scratchTraceList, inst, src, Opnd_src0);
11459 }
11460 }
11461 }
11462
void FlagSpillCleanup::flagUse(SCRATCH_PTR_LIST& scratchTraceList, G4_INST* inst)
11464 {
11465 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
11466 {
11467 G4_Operand* src = inst->getSrc(i);
11468
11469 if (src && src->isSrcRegRegion())
11470 {
11471 G4_Declare* topdcl = NULL;
11472
11473 if (inst->getSrc(i)->asSrcRegRegion()->getBase()->isRegVar())
11474 {
11475 topdcl = GetTopDclFromRegRegion(src);
11476 }
11477
11478 if (!topdcl || (topdcl->getRegFile() != G4_FLAG))
11479 {
11480 continue;
11481 }
11482
11483 regUseFlag(&scratchTraceList, inst, src, i);
11484 }
11485 }
11486
11487 //Flag register is used as predicate
11488 G4_Predicate* predicate = inst->getPredicate();
11489 if (!predicate)
11490 {
11491 return;
11492 }
11493
11494 G4_VarBase *flagReg = predicate->getBase();
11495 if (!flagReg)
11496 {
11497 return;
11498 }
11499
11500 G4_Declare* topdcl = flagReg->asRegVar()->getDeclare();
11501 unsigned linearizedStart = 0;
11502 unsigned linearizedEnd = 0;
11503 FlagLineraizedStartAndEnd(topdcl, linearizedStart, linearizedEnd);
11504
11505 for (SCRATCH_ACCESS * preScratchAccess : scratchTraceList)
11506 {
11507 if (IS_FLAG_RANGE_OVERLAP(linearizedStart, linearizedEnd, preScratchAccess))
11508 {
11509 G4_Declare *preDcl = preScratchAccess->flagOpnd->getTopDcl();
11510 //Use should have same top declare
11511 if (preDcl == topdcl)
11512 {
11513 if (regUseAnalysis(preScratchAccess, linearizedStart, linearizedEnd))
11514 {
11515 preScratchAccess->removeable = false;
11516 }
11517 else if (preScratchAccess->inRangePartialKilled || !preScratchAccess->regKilled)
11518 {
11519 //can reuse previous register
11520 preScratchAccess->renameOperandVec.emplace_back(inst, -2);
11521 }
11522 }
11523 }
11524 }
11525
11526 return;
11527 }
11528
bool FlagSpillCleanup::flagScratchDefineUse(
11530 G4_BB* bb,
11531 SCRATCH_PTR_LIST* scratchTraceList,
11532 SCRATCH_PTR_VEC* candidateList,
11533 SCRATCH_ACCESS* scratchAccess,
11534 CLEAN_NUM_PROFILE* clean_num_profile)
11535 {
11536 SCRATCH_PTR_LIST_ITER it = scratchTraceList->begin();
11537 SCRATCH_PTR_LIST_ITER itEnd = scratchTraceList->end();
11538
11539 while (it != itEnd)
11540 {
11541 SCRATCH_PTR_LIST_ITER kt = it;
11542 kt++;
11543
11544 SCRATCH_ACCESS * preScratchAccess = *it;
11545
11546 //Evicted
11547 if (preScratchAccess->evicted)
11548 {
11549 it = kt;
11550 continue;
11551 }
11552
11553 //Same scratch declare
11554 if (preScratchAccess->scratchDcl == scratchAccess->scratchDcl) //Same scratch location
11555 {
11556 if (scratchAccess->isSpill) //Current is spill
11557 {
                if (IS_SPILL_KILL_CANDIDATE(preScratchAccess)) //previous is a spill as well, and the previous spill is not used
11559 {
11560 //kill the previous spill
11561 bb->erase(preScratchAccess->inst_it);
11562 preScratchAccess->instKilled = true;
11563 clean_num_profile->spill_clean_num[0]++;
11564 scratchTraceList->erase(it); //The previous one is not candidate for reuse
11565 it = kt;
11566
11567 continue;
11568 }
11569
11570 preScratchAccess->evicted = true;
11571 scratchTraceList->erase(it); //The previous one is not a good candidate for reuse any more
11572 }
11573 else //Current is fill
11574 {
11575 preScratchAccess->fillInUse = true;
11576 preScratchAccess->useCount++;
11577
11578 if (IS_USE_KILL_CANDIDATE(preScratchAccess)) //Is not used before
11579 {
                    scratchAccess->preScratchAccess = preScratchAccess; //set the previous scratch location define
11581 candidateList->push_back(scratchAccess); //Add to candidate list
11582 if (IS_FLAG_RANGE_OVERWRITE(scratchAccess, preScratchAccess->linearizedStart, preScratchAccess->linearizedEnd))
11583 {
                        //Exactly the same GRF; the fill is useless since the previous fill or spill has not been killed
11585 scratchAccess->directKill = true;
11586 scratchTraceList->push_back(scratchAccess);
11587 return true;
11588 }
11589 }
11590 }
11591 }
11592 it = kt;
11593 }
11594
11595 scratchTraceList->push_back(scratchAccess);
11596
11597 return false;
11598 }
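
// The trace list keeps every spill/fill still live in this block, while the
// candidate list collects fills that may reuse the register of an earlier
// access to the same scratch location.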
11599
void FlagSpillCleanup::flagSpillFillClean(
11601 G4_BB* bb,
11602 INST_LIST_ITER inst_it,
11603 SCRATCH_PTR_VEC& scratchAccessList,
11604 SCRATCH_PTR_LIST& scratchTraceList,
11605 SCRATCH_PTR_VEC& candidateList,
11606 CLEAN_NUM_PROFILE* clean_num_profile)
11607 {
11608 G4_INST* inst = (*inst_it);
11609 if (inst->isPseudoKill())
11610 {
11611 return;
11612 }
11613
11614 bool noDefineAnalysis = false;
11615
11616 //Check if there is flag use
11617 flagUse(scratchTraceList, inst);
11618
11619 //Check if it's spill/fill of the flag
11620 if (IS_FLAG_MOVE(inst))
11621 {
11622 SCRATCH_ACCESS *scratchAccess = NULL;
11623
11624 if (initializeFlagScratchAccess(&scratchAccessList, scratchAccess, inst_it))
11625 {
            //Build the trace list and the candidate list.
            //The trace list includes all spills/fills seen so far;
            //the candidate list includes fills that may reuse an earlier access.
            //At the same time, check whether the spill/fill can be removed by comparing it with the previous one.
11630 noDefineAnalysis = flagScratchDefineUse(bb, &scratchTraceList, &candidateList, scratchAccess, clean_num_profile);
11631 }
11632 }
11633 else
11634 {
11635 scratchUse(scratchTraceList, inst);
11636 }
11637
11638 //Check if there is flag define
11639 if (!noDefineAnalysis)
11640 {
11641 flagDefine(scratchTraceList, inst);
11642 }
11643
11644 return;
11645 }
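
// Per-instruction pipeline: account for flag uses first, then record a new
// spill/fill (or plain scratch uses for other instructions), and finally run
// the define analysis unless the fill was already recognized as an exact
// overwrite (directKill) of a previous access.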
11646
11647 #ifdef _DEBUG
11648 #define FILL_DEBUG_THRESHOLD 0xffffffff
11649 #define SPILL_DEBUG_THRESHOLD 0xffffffff //25
11650 #endif
11651
void FlagSpillCleanup::regFillClean(
11653 IR_Builder& builder,
11654 G4_BB* bb,
11655 SCRATCH_PTR_VEC& candidateList,
11656 CLEAN_NUM_PROFILE* clean_num_profile)
11657 {
11658 for (SCRATCH_ACCESS * scratchAccess : candidateList)
11659 {
11660 SCRATCH_ACCESS* preScratchAccess = scratchAccess->preScratchAccess;
11661
        // Reuse happens from front to back, so if the pre scratchAccess is killed,
        // the current candidate can no longer reuse the previous register.
11664 if (!scratchAccess->instKilled &&
11665 (scratchAccess->removeable && scratchAccess->directKill))
11666 {
11667 if (scratchAccess->prePreScratchAccess)
11668 {
11669 while (preScratchAccess &&
11670 preScratchAccess->preScratchAccess &&
11671 preScratchAccess != scratchAccess->prePreScratchAccess)
11672 {
11673 //If possible, propagate to previous scratchAccess
11674 if (preScratchAccess->preFillAccess)
11675 {
11676 //to jump over prefill.
11677 if (preScratchAccess->isSpill &&
11678 preScratchAccess->preFillAccess &&
11679 preScratchAccess->preFillAccess->instKilled &&
11680 preScratchAccess->preScratchAccess)
11681 {
11682 preScratchAccess = preScratchAccess->preScratchAccess;
11683 }
11684 else
11685 {
11686 break;
11687 }
11688 }
11689 else
11690 {
11691 if (!preScratchAccess->instKilled)
11692 {
11693 break;
11694 }
11695 preScratchAccess = preScratchAccess->preScratchAccess;
11696 }
11697 }
11698
11699 if (preScratchAccess)
11700 {
11701 if (preScratchAccess->isSpill &&
11702 preScratchAccess->preFillAccess &&
11703 preScratchAccess->preFillAccess->instKilled)
11704 {
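                        // Intentionally empty: the fill feeding this spill was
                        // already removed, so there is nothing to redirect here.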
11705 }
11706 else if (!preScratchAccess->instKilled)
11707 {
11708 if (replaceWithPreDcl(builder, scratchAccess, preScratchAccess))
11709 {
11710 bb->erase(scratchAccess->inst_it);
11711 scratchAccess->instKilled = true;
11712 scratchAccess->preScratchAccess->useCount--;
11713 clean_num_profile->fill_clean_num[0]++;
11714 }
11715 }
11716 }
11717 }
11718 else
11719 {
11720 if (preScratchAccess && !preScratchAccess->instKilled)
11721 {
11722 if (replaceWithPreDcl(builder, scratchAccess, preScratchAccess))
11723 {
11724 bb->erase(scratchAccess->inst_it);
11725 scratchAccess->instKilled = true;
11726 scratchAccess->preScratchAccess->useCount--;
11727 clean_num_profile->fill_clean_num[0]++;
11728 }
11729 }
11730 }
11731 }
11732 #ifdef _DEBUG
11733 if (clean_num_profile->fill_clean_num[0] > FILL_DEBUG_THRESHOLD)
11734 return;
11735 #endif
11736 }
11737
11738 return;
11739 }
11740
void FlagSpillCleanup::regSpillClean(
11742 IR_Builder& builder,
11743 G4_BB* bb,
11744 SCRATCH_PTR_VEC& candidateList,
11745 CLEAN_NUM_PROFILE* clean_num_profile)
11746 {
11747 for (SCRATCH_ACCESS * scratchAccess : candidateList)
11748 {
11749 if (scratchAccess->instKilled)
11750 {
11751 continue;
11752 }
11753 if (!scratchAccess->instKilled &&
11754 scratchAccess->isSpill &&
11755 scratchAccess->removeable &&
11756 scratchAccess->evicted &&
11757 scratchAccess->useCount == 0)
11758 {
11759 bb->erase(scratchAccess->inst_it);
11760 scratchAccess->instKilled = true;
11761 clean_num_profile->spill_clean_num[0]++;
11762 #ifdef _DEBUG
11763 if (clean_num_profile->spill_clean_num[0] > SPILL_DEBUG_THRESHOLD)
11764 {
11765 return;
11766 }
11767 #endif
11768 }
11769 }
11770
11771 return;
11772 }
11773
11774
// Replace Scratch Block Read/Write messages with OWord Block Read/Write messages.
// For spill code cleanup, a clean target may exist in any of the WAW, RAR, RAW, and WAR relations.
void FlagSpillCleanup::spillFillCodeCleanFlag(
11778 IR_Builder& builder,
11779 G4_Kernel& kernel,
11780 CLEAN_NUM_PROFILE* clean_num_profile)
11781 {
11782 SCRATCH_PTR_VEC scratchAccessList;
11783 SCRATCH_PTR_LIST scratchTraceList;
11784 SCRATCH_PTR_VEC candidateList;
11785 FlowGraph& fg = kernel.fg;
11786
11787 int candidate_size = 0;
11788 for (auto bb : fg)
11789 {
11790 INST_LIST_ITER inst_it = bb->begin();
11791
11792 scratchTraceList.clear();
11793 candidateList.clear();
11794 freeScratchAccess(&scratchAccessList);
11795
11796 //Top down scan within BB
11797 while (inst_it != bb->end())
11798 {
11799 INST_LIST_ITER inst_it_next = inst_it;
11800 inst_it_next++;
11801
11802 flagSpillFillClean(bb, inst_it, scratchAccessList, scratchTraceList, candidateList, clean_num_profile);
11803
11804 inst_it = inst_it_next;
11805 }
11806
11807 #ifdef _DEBUG
11808 candidate_size += (int)candidateList.size();
11809 #endif
11810 //Clean the fills.
11811 regFillClean(builder, bb, candidateList, clean_num_profile);
11812
11813 #ifdef _DEBUG
11814 if (clean_num_profile->fill_clean_num[0] > FILL_DEBUG_THRESHOLD)
11815 return;
11816 #endif
11817 //Clean the spills
11818 regSpillClean(builder, bb, scratchAccessList, clean_num_profile);
11819
11820 #ifdef _DEBUG
11821 if (clean_num_profile->spill_clean_num[0] > SPILL_DEBUG_THRESHOLD)
11822 {
11823 return;
11824 }
11825 #endif
11826 }
11827
11828 freeScratchAccess(&scratchAccessList);
11829 scratchTraceList.clear();
11830 candidateList.clear();
11831
11832 #ifdef DEBUG_VERBOSE_ON
11833 printf("Candidate size: %d\n", candidate_size);
11834 #endif
11835
11836 return;
11837 }
11838
// Insert declarations with pre-assigned registers in the kernel.
// This is needed for HRA; the fake declares will be removed at the end of HRA.
void GlobalRA::insertPhyRegDecls()
11842 {
11843 int numGRF = kernel.getNumRegTotal();
11844 std::vector<bool> grfUsed(numGRF, false);
11845 GRFDclsForHRA.resize(numGRF);
11846
11847 for (auto curBB : kernel.fg)
11848 {
11849 if (auto summary = kernel.fg.getBBLRASummary(curBB))
11850 {
11851 for (int i = 0; i < numGRF; i++)
11852 {
11853 if (summary->isGRFBusy(i))
11854 {
11855 grfUsed[i] = true;
11856 }
11857 }
11858 }
11859 }
11860
11861 // Insert declarations for each GRF that is used
11862 unsigned numGRFsUsed = 0;
11863 for (int i = 0; i < numGRF; i++)
11864 {
11865 if (grfUsed[i] == true)
11866 {
11867 const char* dclName = builder.getNameString(builder.mem, 10, "r%d", i);
11868 G4_Declare* phyRegDcl = builder.createDeclareNoLookup(
11869 dclName, G4_GRF, numEltPerGRF<Type_UD>(), 1, Type_D, Regular, NULL, NULL);
11870 G4_Greg* phyReg = builder.phyregpool.getGreg(i);
11871 phyRegDcl->getRegVar()->setPhyReg(phyReg, 0);
11872 GRFDclsForHRA[i] = phyRegDcl;
11873 numGRFsUsed++;
11874 }
11875 }
11876
11877 if (builder.getOption(vISA_OptReport))
11878 {
11879 std::ofstream optreport;
11880 getOptReportStream(optreport, builder.getOptions());
11881 optreport << "Local RA used " << numGRFsUsed << " GRFs\n";
11882 }
11883 }
11884
// compute physical register info and adjust footprint
// find indexed GRFs and construct a footprint for them
// set live operands in each instruction
void GlobalRA::computePhyReg()
11889 {
11890 auto& fg = kernel.fg;
11891 for (auto bb : fg)
11892 {
11893 for (auto inst : *bb)
11894 {
11895 if (inst->isPseudoKill() ||
11896 inst->isLifeTimeEnd() ||
11897 inst->isPseudoUse())
11898 {
11899 continue;
11900 }
11901
11902 if (inst->getDst() &&
11903 !(inst->hasNULLDst()))
11904 {
11905 G4_DstRegRegion *currDstRegion = inst->getDst();
11906 if (currDstRegion->getBase()->isRegVar() &&
11907 currDstRegion->getBase()->asRegVar()->getDeclare()->getGRFBaseOffset() == 0)
11908 {
11909 // Need to compute linearized offset only once per dcl
11910 currDstRegion->computePReg();
11911 }
11912 }
11913
11914 for (unsigned j = 0, size = inst->getNumSrc(); j < size; j++)
11915 {
11916 G4_Operand *curr_src = inst->getSrc(j);
11917 if (!curr_src || curr_src->isImm() ||
11918 (inst->opcode() == G4_math && j == 1 && curr_src->isNullReg()) ||
11919 curr_src->isLabel())
11920 {
11921 continue;
11922 }
11923
11924 if (curr_src->isSrcRegRegion() &&
11925 curr_src->asSrcRegRegion()->getBase() &&
11926 curr_src->asSrcRegRegion()->getBase()->isRegVar() &&
11927 curr_src->asSrcRegRegion()->getBase()->asRegVar()->getDeclare()->getGRFBaseOffset() == 0)
11928 {
11929 curr_src->asSrcRegRegion()->computePReg();
11930 }
11931 }
11932 }
11933 }
11934 }
11935
void GraphColor::dumpRegisterPressure()
11937 {
11938 RPE rpe(gra, &liveAnalysis);
11939 uint32_t max = 0;
11940 std::vector<G4_INST*> maxInst;
11941 rpe.run();
11942
11943 for (auto bb : builder.kernel.fg)
11944 {
11945 std::cerr << "BB " << bb->getId() << ": (Pred: ";
11946 for (auto pred : bb->Preds)
11947 {
11948 std::cerr << pred->getId() << ",";
11949 }
11950 std::cerr << " Succ: ";
11951 for (auto succ : bb->Succs)
11952 {
11953 std::cerr << succ->getId() << ",";
11954 }
11955 std::cerr << ")\n";
11956 for (auto inst : *bb)
11957 {
11958 uint32_t pressure = rpe.getRegisterPressure(inst);
11959 if (pressure > max)
11960 {
11961 max = pressure;
11962 maxInst.clear();
11963 maxInst.push_back(inst);
11964 }
11965 else if (pressure == max)
11966 {
11967 maxInst.push_back(inst);
11968 }
11969
11970 std::cerr << "[" << pressure << "] ";
11971 inst->dump();
11972 }
11973 }
11974 std::cerr << "max pressure: " << max << ", " << maxInst.size() << " inst(s)\n";
11975 for (auto inst : maxInst)
11976 {
11977 inst->dump();
11978 }
11979 }
11980
void GlobalRA::fixAlignment()
11982 {
11983 // Copy over alignment from G4_RegVar to GlobalRA instance
    // The rest of RA shouldn't have to read/modify the alignment of G4_RegVar
11985 copyAlignment();
11986
11987 if (kernel.getSimdSize() == g4::SIMD32)
11988 {
11989 // we have to force all flags to be 32-bit aligned even if they are < 32-bit,
11990 // due to potential emask usage.
11991 // ToDo: may be better to simply allocate them as 32-bit?
11992 for (auto dcl : kernel.Declares)
11993 {
11994 if (dcl->getRegFile() & G4_FLAG)
11995 {
11996 setSubRegAlign(dcl, G4_SubReg_Align::Even_Word);
11997 }
11998 }
11999 }
12000
12001 if (builder.getPlatform() == GENX_BDW)
12002 {
12003 // BDW requires even_word alignment for scalar HF variables
12004 for (auto dcl : kernel.Declares)
12005 {
12006 if (dcl->getElemType() == Type_HF && dcl->getSubRegAlign() == Any)
12007 {
12008 setSubRegAlign(dcl, Even_Word);
12009 }
12010 }
12011 }
12012
12013 // ToDo: remove these as it should be done by HWConformity
12014 for (auto BB : kernel.fg)
12015 {
12016 for (auto inst : *BB)
12017 {
12018 G4_DstRegRegion* dst = inst->getDst();
12019 if (dst && dst->getTopDcl())
12020 {
12021 G4_RegVar* var = dst->getBase()->asRegVar();
12022 if (inst->isSend() && dst->getRegAccess() == Direct)
12023 {
12024 if (!var->isPhyRegAssigned())
12025 {
12026 setSubRegAlign(dst->getTopDcl(), GRFALIGN);
12027 }
12028 }
12029
12030 if (!var->isPhyRegAssigned() && var->getDeclare()->getNumRows() <= 1
12031 && dst->getRegAccess() == Direct && var->getDeclare()->getSubRegAlign() == Any)
12032 {
12033 if (inst->isAccSrcInst())
12034 {
12035 setSubRegAlign(dst->getTopDcl(), var->getDeclare()->getRegFile() != G4_ADDRESS ? GRFALIGN : Eight_Word);
12036 }
12037 }
12038 }
12039 }
12040 }
12041 }
12042
void VerifyAugmentation::verifyAlign(G4_Declare* dcl)
12044 {
    // Verify that dcls with the Default32Bit augmentation mask are 2-GRF (even) aligned
12046 auto it = masks.find(dcl);
12047 if (it == masks.end())
12048 return;
12049
12050 if (dcl->getByteSize() >= numEltPerGRF<Type_UD>() * TypeSize(Type_UD) &&
12051 dcl->getByteSize() <= 2 * numEltPerGRF<Type_UD>() * TypeSize(Type_UD) &&
12052 kernel->getSimdSize() > numEltPerGRF<Type_UD>())
12053 {
12054 auto assignment = dcl->getRegVar()->getPhyReg();
12055 if (assignment && assignment->isGreg())
12056 {
12057 auto phyRegNum = assignment->asGreg()->getRegNum();
12058 auto augMask = std::get<1>((*it).second);
12059 if (phyRegNum % 2 != 0 &&
12060 augMask == AugmentationMasks::Default32Bit)
12061 {
12062 printf("Dcl %s is Default32Bit but assignment is not Even aligned\n", dcl->getName());
12063 }
12064 }
12065 }
12066 }
12067
void VerifyAugmentation::dump(const char* dclName)
12069 {
12070 std::string dclStr = dclName;
12071 for (auto& m : masks)
12072 {
12073 std::string first = m.first->getName();
12074 if (first == dclStr)
12075 {
12076 printf("%s, %d, %s\n", dclName, m.first->getRegVar()->getId(), getStr(std::get<1>(m.second)));
12077 }
12078 }
12079 }
12080
void VerifyAugmentation::labelBBs()
12082 {
12083 std::string prev = "X:";
12084 unsigned id = 0;
12085 for (auto bb : kernel->fg)
12086 {
12087 if (bbLabels.find(bb) == bbLabels.end())
12088 bbLabels[bb] = prev;
12089 else
12090 prev = bbLabels[bb];
12091
12092 if (bb->back()->opcode() == G4_opcode::G4_if)
12093 {
12094 auto TBB = bb->Succs.front();
12095 auto FBB = bb->Succs.back();
12096
12097 bool hasEndif = false;
12098 for (auto inst : *FBB)
12099 {
12100 if (inst->opcode() == G4_opcode::G4_endif)
12101 {
12102 hasEndif = true;
12103 break;
12104 }
12105 }
12106
12107 bbLabels[TBB] = prev + "T" + std::to_string(id) + ":";
12108
12109 if (!hasEndif)
12110 {
12111 // else
12112 bbLabels[FBB] = prev + "F" + std::to_string(id) + ":";
12113 }
12114 else
12115 {
12116 // endif block
12117 bbLabels[FBB] = prev;
12118 }
12119
12120 prev = prev + "T" + std::to_string(id) + ":";
12121
12122 id++;
12123 }
12124 else if (bb->back()->opcode() == G4_opcode::G4_else)
12125 {
12126 auto succBB = bb->Succs.front();
12127 auto lbl = prev;
12128 lbl.pop_back();
12129 while (lbl.back() != ':')
12130 {
12131 lbl.pop_back();
12132 }
12133
12134 bbLabels[succBB] = lbl;
12135 }
12136 else if (bb->back()->opcode() == G4_opcode::G4_endif)
12137 {
12138
12139 }
12140 }
12141
12142 #if 1
12143 for (auto bb : kernel->fg)
12144 {
12145 printf("BB%d -> %s\n", bb->getId(), bbLabels[bb].data());
12146 }
12147 #endif
12148 }
12149
unsigned getGRFBaseOffset(const G4_Declare* dcl)
12151 {
12152 unsigned regNum = dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
12153 unsigned regOff = dcl->getRegVar()->getPhyRegOff();
12154 auto type = dcl->getElemType();
12155 return (regNum * numEltPerGRF<Type_UB>()) + (regOff * TypeSize(type));
12156 }
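
// Worked example (assuming 32-byte GRFs): a dcl assigned to r3.4 with :d
// (4-byte) elements starts at 3 * 32 + 4 * 4 = 112 bytes from r0.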
12157
bool VerifyAugmentation::interfereBetween(G4_Declare* dcl1, G4_Declare* dcl2)
12159 {
12160 bool interferes = true;
12161 unsigned v1 = dcl1->getRegVar()->getId();
12162 unsigned v2 = dcl2->getRegVar()->getId();
12163 bool v1Partaker = dcl1->getRegVar()->isRegAllocPartaker();
12164 bool v2Partaker = dcl2->getRegVar()->isRegAllocPartaker();
12165
12166 if (v1Partaker && v2Partaker)
12167 {
12168 auto interferes = intf->interfereBetween(v1, v2);
12169 if (!interferes)
12170 {
12171 if (dcl1->getIsPartialDcl())
12172 {
12173 interferes |= intf->interfereBetween(gra->getSplittedDeclare(dcl1)->getRegVar()->getId(), v2);
12174 if (dcl2->getIsPartialDcl())
12175 {
12176 interferes |= intf->interfereBetween(v1,
12177 gra->getSplittedDeclare(dcl2)->getRegVar()->getId());
12178 interferes |= intf->interfereBetween(gra->getSplittedDeclare(dcl1)->getRegVar()->getId(),
12179 gra->getSplittedDeclare(dcl2)->getRegVar()->getId());
12180 }
12181 }
12182 else if (dcl2->getIsPartialDcl())
12183 {
12184 interferes |= intf->interfereBetween(v1, gra->getSplittedDeclare(dcl2)->getRegVar()->getId());
12185 }
12186 }
12187 return interferes;
12188 }
12189 else if (!v1Partaker && v2Partaker)
12190 {
12191 // v1 is assigned by LRA
12192 unsigned startGRF = dcl1->getRegVar()->getPhyReg()->asGreg()->getRegNum();
12193 unsigned numGRFs = dcl1->getNumRows();
12194
12195 for (unsigned grf = startGRF; grf != (startGRF + numGRFs); grf++)
12196 {
12197 for (unsigned var = 0; var != numVars; var++)
12198 {
12199 if (lrs[var] &&
12200 lrs[var]->getPhyReg() == kernel->fg.builder->phyregpool.getGreg(grf) &&
12201 std::string(lrs[var]->getVar()->getName()) == "r" + std::to_string(grf))
12202 {
12203 if (!intf->interfereBetween(var, v2))
12204 {
12205 interferes = false;
12206 }
12207 }
12208 }
12209 }
12210 }
12211 else if (v1Partaker && !v2Partaker)
12212 {
12213 return interfereBetween(dcl2, dcl1);
12214 }
12215 else if (!v1Partaker && !v2Partaker)
12216 {
12217 // both assigned by LRA
12218 if (dcl1->getRegFile() == G4_RegFileKind::G4_GRF && dcl2->getRegFile() == G4_RegFileKind::G4_GRF)
12219 {
12220 auto lr1 = gra->getLocalLR(dcl1);
12221 auto lr2 = gra->getLocalLR(dcl2);
12222
12223 if (lr1->getAssigned() && lr2->getAssigned())
12224 {
12225 auto preg1Start = getGRFBaseOffset(dcl1);
12226 auto preg2Start = getGRFBaseOffset(dcl2);
12227 auto preg1End = preg1Start + dcl1->getByteSize();
12228 auto preg2End = preg2Start + dcl2->getByteSize();
12229
12230 if (preg2Start >= preg1Start && preg2Start < preg1End)
12231 {
12232 return false;
12233 }
12234 else if (preg1Start >= preg2Start && preg1Start < preg2End)
12235 {
12236 return false;
12237 }
12238 }
12239 }
12240
12241 interferes = true;
12242 }
12243
12244 return interferes;
12245 }
12246
void VerifyAugmentation::verify()
12248 {
12249 std::cerr << "Start verification for kernel: " << kernel->getOptions()->getOptionCstr(VISA_AsmFileName) << std::endl;
12250
12251 for (auto dcl : kernel->Declares)
12252 {
12253 if (dcl->getIsSplittedDcl())
12254 {
12255 auto& tup = masks[dcl];
12256 std::cerr << dcl->getName() << "(" << getStr(std::get<1>(tup)) << ") is split" << std::endl;
12257 for (const G4_Declare *subDcl : gra->getSubDclList(dcl))
12258 {
12259 auto& tupSub = masks[subDcl];
12260 std::cerr << "\t" << subDcl->getName() << " (" << getStr(std::get<1>(tupSub)) << ")" << std::endl;
12261 }
12262 }
12263 }
12264
12265 std::cerr << std::endl << std::endl << std::endl;
12266
12267 auto overlapDcl = [](G4_Declare* dcl1, G4_Declare* dcl2)
12268 {
12269 if (dcl1->getRegFile() == G4_RegFileKind::G4_GRF && dcl2->getRegFile() == G4_RegFileKind::G4_GRF)
12270 {
12271 auto preg1Start = getGRFBaseOffset(dcl1);
12272 auto preg2Start = getGRFBaseOffset(dcl2);
12273 auto preg1End = preg1Start + dcl1->getByteSize();
12274 auto preg2End = preg2Start + dcl2->getByteSize();
12275
12276 if (preg2Start >= preg1Start && preg2Start < preg1End)
12277 {
12278 return true;
12279 }
12280 else if (preg1Start >= preg2Start && preg1Start < preg2End)
12281 {
12282 return true;
12283 }
12284 }
12285 return false;
12286 };
12287
12288 std::list<G4_Declare*> active;
12289 for (auto dcl : sortedLiveRanges)
12290 {
12291 auto& tup = masks[dcl];
12292 unsigned startIdx = std::get<2>(tup)->getLexicalId();
12293 auto dclMask = std::get<1>(tup);
12294
12295 auto getMaskStr = [](AugmentationMasks m)
12296 {
12297 std::string str = "Undetermined";
12298 if (m == AugmentationMasks::Default16Bit)
12299 str = "Default16Bit";
12300 else if (m == AugmentationMasks::Default32Bit)
12301 str = "Default32Bit";
12302 else if (m == AugmentationMasks::Default64Bit)
12303 str = "Default64Bit";
12304 else if (m == AugmentationMasks::NonDefault)
12305 str = "NonDefault";
12306 else if (m == AugmentationMasks::DefaultPredicateMask)
12307 str = "DefaultPredicateMask";
12308 str.append("\n");
12309
12310 return str;
12311 };
12312
12313 std::cerr << dcl->getName() << " - " << getMaskStr(dclMask);
12314
12315 verifyAlign(dcl);
12316
12317 for (auto it = active.begin(); it != active.end();)
12318 {
12319 auto activeDcl = (*it);
12320 auto& tupActive = masks[activeDcl];
12321 if (startIdx >= std::get<3>(tupActive)->getLexicalId())
12322 {
12323 it = active.erase(it);
12324 continue;
12325 }
12326 it++;
12327 }
12328
12329 for (auto activeDcl : active)
12330 {
12331 auto& tupActive = masks[activeDcl];
12332 auto aDclMask = std::get<1>(tupActive);
12333
12334 if (dclMask != aDclMask)
12335 {
12336 bool interfere = interfereBetween(activeDcl, dcl);
12337
12338 if (activeDcl->getIsPartialDcl() || dcl->getIsPartialDcl())
12339 continue;
12340
12341 if (!interfere)
12342 {
12343 std::cerr << dcl->getRegVar()->getName() << "(" << getStr(dclMask) << ") and " << activeDcl->getRegVar()->getName() << "(" <<
                        getStr(aDclMask) << ") are overlapping with incompatible emask but not marked as interfering" << std::endl;
12345 }
12346
12347 if (overlapDcl(activeDcl, dcl))
12348 {
12349 if (!interfere)
12350 {
12351 std::cerr << dcl->getRegVar()->getName() << "(" << getStr(dclMask) << ") and " << activeDcl->getName() << "(" <<
12352 getStr(aDclMask) << ") use overlapping physical assignments but not marked as interfering" << std::endl;
12353 }
12354 }
12355 }
12356 }
12357
12358 active.push_back(dcl);
12359 }
12360
    std::cerr << "End verification for kernel: " << kernel->getOptions()->getOptionCstr(VISA_AsmFileName) << std::endl << std::endl << std::endl;
12362
12363 return;
12364
12365 #if 0
12366 // Following is useful for debugging when test has only if-else-endif constructs
12367 labelBBs();
12368 populateBBLexId();
12369 std::string msg;
12370 for (auto dcl : sortedLiveRanges)
12371 {
12372 auto lr = DclLRMap[dcl];
12373 if (lr->getPhyReg() && isClobbered(lr, msg))
12374 {
12375 printf("%s clobbered:\n\t%s\n\n", dcl->getName(), msg.data());
12376 }
12377 }
12378 #endif
12379 }
12380
void VerifyAugmentation::populateBBLexId()
12382 {
12383 for (auto bb : kernel->fg)
12384 {
12385 if (bb->size() > 0)
12386 BBLexId.push_back(std::make_tuple(bb, bb->front()->getLexicalId(), bb->back()->getLexicalId()));
12387 }
12388 }
12389
bool VerifyAugmentation::isClobbered(LiveRange* lr, std::string& msg)
12391 {
12392 msg.clear();
12393
12394 auto& tup = masks[lr->getDcl()];
12395
12396 auto startLexId = std::get<2>(tup)->getLexicalId();
12397 auto endLexId = std::get<3>(tup)->getLexicalId();
12398
12399 std::vector<std::pair<G4_INST*, G4_BB*>> insts;
12400 std::vector<std::tuple<INST_LIST_ITER, G4_BB*>> defs;
12401 std::vector<std::tuple<INST_LIST_ITER, G4_BB*>> uses;
12402
12403 for (auto bb : kernel->fg)
12404 {
12405 if (bb->size() == 0)
12406 continue;
12407
12408 if (bb->back()->getLexicalId() > endLexId && bb->front()->getLexicalId() > endLexId)
12409 continue;
12410
12411 if (bb->back()->getLexicalId() < startLexId && bb->front()->getLexicalId() < startLexId)
12412 continue;
12413
12414 // lr is active in current bb
12415 for (auto instIt = bb->begin(), end = bb->end(); instIt != end; instIt++)
12416 {
12417 auto inst = (*instIt);
12418 if (inst->isPseudoKill())
12419 continue;
12420
12421 if (inst->getLexicalId() > startLexId && inst->getLexicalId() <= endLexId)
12422 {
12423 insts.push_back(std::make_pair(inst, bb));
12424 auto dst = inst->getDst();
12425 if (dst &&
12426 dst->isDstRegRegion())
12427 {
12428 auto topdcl = dst->asDstRegRegion()->getTopDcl();
12429 if (topdcl == lr->getDcl())
12430 defs.push_back(std::make_tuple(instIt, bb));
12431 }
12432
12433 for (unsigned i = 0; i != G4_MAX_SRCS; i++)
12434 {
12435 auto src = inst->getSrc(i);
12436 if (src && src->isSrcRegRegion())
12437 {
12438 auto topdcl = src->asSrcRegRegion()->getTopDcl();
12439 if (topdcl == lr->getDcl())
12440 uses.push_back(std::make_tuple(instIt, bb));
12441 }
12442 }
12443 }
12444 }
12445 }
12446
12447 for (auto& use : uses)
12448 {
12449 auto& useStr = bbLabels[std::get<1>(use)];
12450 auto inst = *std::get<0>(use);
12451 MUST_BE_TRUE(useStr.size() > 0, "empty string found");
12452 std::list<std::tuple<G4_INST*, G4_BB*>> rd;
12453
12454 for (unsigned i = 0; i != G4_MAX_SRCS; i++)
12455 {
12456 auto src = inst->getSrc(i);
12457 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getTopDcl() == lr->getDcl())
12458 {
12459 unsigned lb = 0, rb = 0;
12460 lb = lr->getPhyReg()->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() + (lr->getPhyRegOff()*lr->getDcl()->getElemSize());
12461 lb += src->getLeftBound();
12462 rb = lb + src->getRightBound() - src->getLeftBound();
12463
12464 for (auto& otherInsts : insts)
12465 {
12466 if (otherInsts.first->getLexicalId() > inst->getLexicalId())
12467 break;
12468
12469 auto oiDst = otherInsts.first->getDst();
12470 auto oiBB = otherInsts.second;
12471 if (oiDst && oiDst->isDstRegRegion() && oiDst->getTopDcl())
12472 {
12473 unsigned oilb = 0, oirb = 0;
12474 auto oiLR = DclLRMap[oiDst->getTopDcl()];
12475 if (oiLR && !oiLR->getPhyReg())
12476 continue;
12477
12478 oilb = oiLR->getPhyReg()->asGreg()->getRegNum()*numEltPerGRF<Type_UB>() +
12479 (oiLR->getPhyRegOff()*oiLR->getDcl()->getElemSize());
12480 oilb += oiDst->getLeftBound();
12481 oirb = oilb + oiDst->getRightBound() - oiDst->getLeftBound();
12482
12483 if (oilb <= (unsigned)rb && oirb >= (unsigned)lb)
12484 {
12485 rd.push_back(std::make_tuple(otherInsts.first, oiBB));
12486 }
12487 }
12488 }
12489 }
12490 }
12491
12492 auto isComplementary = [](std::string& cur, std::string& other)
12493 {
12494 if (cur.size() < other.size())
12495 return false;
12496
12497 if (cur.substr(0, other.size() - 1) == other.substr(0, other.size() - 1))
12498 {
12499 char lastAlphabet = cur.at(other.size() - 1);
12500 if (lastAlphabet == 'T' && other.back() == 'F')
12501 return true;
12502 if (lastAlphabet == 'F' && other.back() == 'T')
12503 return true;
12504 }
12505
12506 return false;
12507 };
12508
12509 auto isSameEM = [](G4_INST* inst1, G4_INST* inst2)
12510 {
12511 if (inst1->getMaskOption() == inst2->getMaskOption() &&
12512 inst1->getMaskOffset() == inst2->getMaskOffset())
12513 return true;
12514 return false;
12515 };
12516
12517 if (rd.size() > 0)
12518 {
12519 printf("Current use str = %s for inst:\t", useStr.data());
12520 inst->emit(std::cerr);
12521 printf("\t$%d\n", inst->getCISAOff());
12522 }
12523 // process all reaching defs
12524 for (auto rid = rd.begin(); rid != rd.end();)
12525 {
12526 auto& reachingDef = (*rid);
12527
12528 auto& str = bbLabels[std::get<1>(reachingDef)];
12529
12530 // skip rd if it is from complementary branch
12531 if (isComplementary(str, useStr) && isSameEM(inst, std::get<0>(reachingDef)))
12532 {
12533 #if 0
12534 printf("\tFollowing in complementary branch %s, removed:\t", str.data());
12535 std::get<0>(reachingDef)->emit(std::cerr);
12536 printf("\t$%d\n", std::get<0>(reachingDef)->getCISAOff());
12537 #endif
12538 rid = rd.erase(rid);
12539 continue;
12540 }
12541 rid++;
12542 }
12543
12544 // keep rd that appears last in its BB
12545 for (auto rid = rd.begin(); rid != rd.end();)
12546 {
12547 auto ridBB = std::get<1>(*rid);
12548 for (auto rid1 = rd.begin(); rid1 != rd.end();)
12549 {
12550 if (*rid == *rid1)
12551 {
12552 rid1++;
12553 continue;
12554 }
12555
12556 auto rid1BB = std::get<1>(*rid1);
12557 if (ridBB == rid1BB &&
12558 std::get<0>(*rid)->getLexicalId() > std::get<0>(*rid1)->getLexicalId())
12559 {
12560 #if 0
12561 printf("\tErasing inst at $%d due to later def at $%d\n", std::get<0>(*rid1)->getLexicalId(),
12562 std::get<0>(*rid)->getLexicalId());
12563 #endif
12564 rid1 = rd.erase(rid1);
12565 continue;
12566 }
12567 rid1++;
12568 }
12569
12570 if (rid != rd.end())
12571 rid++;
12572 }
12573
12574 if (rd.size() > 0)
12575 {
12576 bool printed = false;
            // display leftover rds that come from a different dcl
12578 for (auto& reachingDef : rd)
12579 {
12580 if (std::get<0>(reachingDef)->getDst()->getTopDcl() == lr->getDcl()->getRootDeclare())
12581 continue;
12582
12583 if (inst->getCISAOff() == std::get<0>(reachingDef)->getCISAOff())
12584 continue;
12585
12586 if (!printed)
12587 {
12588 printf("\tLeft-over rd:\n");
12589 printed = true;
12590 }
12591 printf("\t");
12592 std::get<0>(reachingDef)->emit(std::cerr);
12593 printf("\t$%d\n", std::get<0>(reachingDef)->getCISAOff());
12594 }
12595 }
12596 }
12597
12598 return false;
12599 }
12600
void VerifyAugmentation::loadAugData(std::vector<G4_Declare*>& s, LiveRange* const * l, unsigned n, const Interference* i, GlobalRA& g)
12602 {
12603 reset();
12604 sortedLiveRanges = s;
12605 gra = &g;
12606 kernel = &gra->kernel;
12607 lrs = l;
12608 numVars = n;
12609 intf = i;
12610
12611 for (unsigned i = 0; i != numVars; i++)
12612 {
12613 DclLRMap[lrs[i]->getDcl()] = lrs[i];
12614 }
12615 for (auto dcl : kernel->Declares)
12616 {
12617 if (dcl->getRegFile() == G4_RegFileKind::G4_GRF ||
12618 dcl->getRegFile() == G4_RegFileKind::G4_INPUT)
12619 {
12620 LiveRange* lr = nullptr;
12621 auto it = DclLRMap.find(dcl);
12622 if (it != DclLRMap.end())
12623 {
12624 lr = it->second;
12625 }
12626 auto start = gra->getStartInterval(dcl);
12627 auto end = gra->getEndInterval(dcl);
12628 masks[dcl] = std::make_tuple(lr, gra->getAugmentationMask(dcl), start, end);
12629 }
12630 }
12631 }
12632
12633 //
12634 // DFS to check if there is any conflict in subroutine return location
12635 //
bool GlobalRA::isSubRetLocConflict(G4_BB *bb, std::vector<unsigned> &usedLoc, unsigned stackTop)
12637 {
12638 auto& fg = kernel.fg;
12639 if (bb->isAlreadyTraversed(fg.getTraversalNum()))
12640 return false;
12641 bb->markTraversed(fg.getTraversalNum());
12642
12643 G4_INST* lastInst = bb->size() == 0 ? NULL : bb->back();
12644 if (lastInst && lastInst->isReturn())
12645 {
12646 if (lastInst->getPredicate() == NULL)
12647 return false;
12648 else
12649 {
12650 return isSubRetLocConflict(bb->fallThroughBB(), usedLoc, stackTop);
12651 }
12652 }
12653 else if (lastInst && lastInst->isCall()) // need to traverse to next level
12654 {
12655 unsigned curSubRetLoc = getSubRetLoc(bb);
12656 //
        // check for conflicts first
12658 //
12659 for (unsigned i = 0; i<stackTop; i++)
12660 if (usedLoc[i] == curSubRetLoc)
12661 return true;
12662 //
12663 // then traverse all the subroutines and return BB
12664 //
12665 usedLoc[stackTop] = curSubRetLoc;
12666 unsigned afterCallId = bb->BBAfterCall()->getId();
12667
12668 // call can have 1 or 2 successors
        // If it has 1, it is the sub-entry block; if it has 2,
        // the call has to be predicated. In case of predication,
12671 // 1st successor is physically following BB, 2nd is
12672 // sub-entry.
12673 if (lastInst->getPredicate())
12674 {
12675 MUST_BE_TRUE(bb->Succs.size() == 2, "Expecting 2 successor BBs for predicated call");
12676 if (isSubRetLocConflict(bb->Succs.back(), usedLoc, stackTop))
12677 return true;
12678 }
12679
12680 if (bb->BBAfterCall()->getId() == afterCallId)
12681 {
12682 if (isSubRetLocConflict(bb->BBAfterCall(), usedLoc, stackTop))
12683 return true;
12684 }
12685 }
12686 else
12687 {
12688 for (G4_BB *succ : bb->Succs)
12689 if (isSubRetLocConflict(succ, usedLoc, stackTop))
12690 return true;
12691 }
12692
12693 return false;
12694 }
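
// Note: usedLoc[0..stackTop-1] holds the return-address locations of the
// callers currently on the call stack; reusing one of them for a nested call
// would clobber an ancestor's return address, which is the conflict above.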
12695
//
// The routine traverses all BBs that can be reached from the entry of a subroutine
// (not traversing into nested subroutine calls). Mark retLoc[bb] = entryId (to
// associate bb with the subroutine entry). When two subroutines share code, we
// return the location of the subroutine that was previously traversed so that
// the two routines can then use the same location to save their return addresses.
//
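// Example (illustrative): if Sub1 was traversed first, a shared block B already
// has retLoc[B] == Sub1's entry id when Sub2 reaches it; the link-following
// below then makes Sub2's location point at Sub1's root, so both subroutines
// save their return addresses in the same place.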
unsigned GlobalRA::determineReturnAddrLoc(unsigned entryId, unsigned* retLoc, G4_BB* bb)
{
    auto& fg = kernel.fg;
    if (bb->isAlreadyTraversed(fg.getTraversalNum()))
        return retLoc[bb->getId()];
    bb->markTraversed(fg.getTraversalNum());

    if (retLoc[bb->getId()] != UNDEFINED_VAL)
        return retLoc[bb->getId()];
    else
    {
        retLoc[bb->getId()] = entryId;
        G4_INST* lastInst = bb->size() == 0 ? NULL : bb->back();

        if (lastInst && lastInst->isReturn())
        {
            if (lastInst->getPredicate() == NULL)
                return entryId;
            else
                return determineReturnAddrLoc(entryId, retLoc, bb->fallThroughBB());
        }
        else if (lastInst && lastInst->isCall()) // skip nested subroutine calls
        {
            return determineReturnAddrLoc(entryId, retLoc, bb->BBAfterCall());
        }
        unsigned sharedId = entryId;
        for (G4_BB *succ : bb->Succs)
        {
            unsigned loc = determineReturnAddrLoc(entryId, retLoc, succ);
            if (loc != entryId)
            {
                while (retLoc[loc] != loc) // find the root of subroutine loc
                    loc = retLoc[loc]; // follow the link to reach the root
                if (sharedId == entryId)
                {
                    sharedId = loc;
                }
                else if (sharedId != loc)
                {
                    //
                    // The current subroutine shares code with two other subroutines; we
                    // force all three of them to use the same location by linking them
                    // together.
                    //
                    retLoc[loc] = sharedId;
                }
            }
        }
        return sharedId;
    }
}

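// Assign a return-address location to every subroutine. A location is keyed by
// a BB id and is later mapped to the declare that holds the return address
// (see insertSaveAddr/insertRestoreAddr). Subroutines that share code, or that
// are targets of the same indirect call, must agree on a single location.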
void GlobalRA::assignLocForReturnAddr()
{
    auto& fg = kernel.fg;
    unsigned* retLoc = (unsigned*)builder.mem.alloc(fg.getNumBB() * sizeof(unsigned));
    //
    // a data structure for doing a quick map[id] ---> block
    //
    G4_BB** BBs = (G4_BB**)builder.mem.alloc(fg.getNumBB() * sizeof(G4_BB*));
    for (G4_BB *bb : fg)
    {
        unsigned i = bb->getId();
        retLoc[i] = UNDEFINED_VAL;
        BBs[i] = bb; // BBs are sorted by ID
    }

    //
    // First, run the original algorithm unchanged to mark retLoc
    //
    std::vector<G4_BB *> caller; // just to accelerate the algorithm later

    for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++)
    {
        G4_BB* bb = BBs[i];
        if (bb->isEndWithCall() == false)
        {
            continue;
        }

#ifdef _DEBUG
        G4_INST *last = bb->empty() ? NULL : bb->back();
        MUST_BE_TRUE(last, ERROR_FLOWGRAPH);
#endif

        caller.push_back(bb); // record the callers, just to accelerate the algorithm

        G4_BB* subEntry = bb->getCalleeInfo()->getInitBB();
        if (retLoc[subEntry->getId()] != UNDEFINED_VAL) // a loc has been assigned to the subroutine
        {
            // Need to setSubRetLoc if subEntry is part of another subroutine because,
            // in the final phase, we use SubRetLoc != UNDEFINED_VAL to indicate
            // that a block is an entry of a subroutine.
            setSubRetLoc(subEntry, retLoc[subEntry->getId()]);
        }
        else
        {
            fg.prepareTraversal();
            unsigned loc = determineReturnAddrLoc(subEntry->getId(), retLoc, subEntry);
            if (loc != subEntry->getId())
            {
                retLoc[subEntry->getId()] = loc;
            }
            setSubRetLoc(subEntry, loc);
            //
            // We do not merge indirect calls here, because doing so would create
            // additional (bb->getSubRetLoc() != bb->getId()) cases that defeat the
            // shared-code detection.
            //
        }

        // retBB is the exit basic block of the callee, i.e., the block with the
        // return statement at the end
        G4_BB* retBB = bb->getCalleeInfo()->getExitBB();

        if (retLoc[retBB->getId()] == UNDEFINED_VAL)
        {
            // The retBB block was unreachable, so the retLoc element corresponding
            // to that block was left undefined
            retLoc[retBB->getId()] = getSubRetLoc(subEntry);
        }
    }
#ifdef DEBUG_VERBOSE_ON
    DEBUG_MSG(std::endl << "Before merge indirect call: " << std::endl);
    for (unsigned i = 0; i < fg.getNumBB(); i++)
        if (retLoc[i] == UNDEFINED_VAL) {
            DEBUG_MSG("BB" << i << ": X ");
        }
        else {
            DEBUG_MSG("BB" << i << ": " << retLoc[i] << " ");
        }
    DEBUG_MSG(std::endl);
#endif

    //
    // This final phase is needed. Consider the following scenario: Sub2 shares
    // code with both Sub1 and Sub3. All three must use the same location to save
    // return addresses. If we traverse Sub1 and then Sub3, retLoc[Sub1] and
    // retLoc[Sub3] each point to their own roots. As we traverse Sub2, code
    // sharing is detected, and we need this phase to make sure that Sub1 and
    // Sub3 use the same location.
    //
    for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++)
    {
        G4_BB* bb = BBs[i];
        if (getSubRetLoc(bb) != UNDEFINED_VAL)
        {
            if (getSubRetLoc(bb) != bb->getId())
            {
                unsigned loc = bb->getId();
                while (retLoc[loc] != loc) // not root
                    loc = retLoc[loc]; // follow the link to reach the root
                setSubRetLoc(bb, loc); // rebind this entry to the shared root
            }
        }
    }

    //
    // Merge the retLoc in indirect call cases
    //
    for (G4_BB *bb : caller)
    {
        G4_INST *last = bb->empty() ? NULL : bb->back();
        MUST_BE_TRUE(last, ERROR_FLOWGRAPH);

        unsigned fallThroughId = bb->fallThroughBB() == NULL ? UNDEFINED_VAL : bb->fallThroughBB()->getId();
        if ((last && last->getPredicate() == NULL && bb->Succs.size() > 1) ||
            (last && last->getPredicate() != NULL && bb->Succs.size() > 2))
        {
            //
            // Merge all subroutines into the last one; using the last successor
            // (instead of the first) also handles the conditional call, whose
            // first successor is the fall-through BB.
            //
            unsigned masterEntryId = bb->Succs.back()->getId();
            //
            // find the root of the master subroutine
            //
            unsigned masterRetLoc = masterEntryId;
            while (retLoc[masterRetLoc] != masterRetLoc)
                masterRetLoc = retLoc[masterRetLoc];
            //
            // check the other subroutines reachable from this vertex
            //
            for (G4_BB *subBB : bb->Succs)
            {
                if (subBB->getId() != masterEntryId && subBB->getId() != fallThroughId)
                {
                    //
                    // find the root of the current subroutine
                    //
                    unsigned loc = subBB->getId();
                    while (retLoc[loc] != loc)
                        loc = retLoc[loc];
                    //
                    // Merge: make every retLoc entry with value loc point to masterRetLoc.
                    // Suppose indirect call X calls subroutines A and B, indirect call Y
                    // calls B and C, and indirect call Z calls C and D. Before the merge,
                    // A~D are assigned different return locations. If we process the
                    // callers in the order X-->Z-->Y and modified only the return
                    // locations of a single indirect call at a time, we would fail to
                    // merge the return locations of A~D.
                    //
                    if (loc != masterRetLoc)
                    {
                        for (unsigned i = 0; i < fg.getNumBB(); i++)
                            if (retLoc[i] == loc)
                                retLoc[i] = masterRetLoc;
                    }
                }
            }
        }
    }

#ifdef DEBUG_VERBOSE_ON
    DEBUG_MSG(std::endl << "After merge indirect call: " << std::endl);
    for (unsigned i = 0; i < fg.getNumBB(); i++)
        if (retLoc[i] == UNDEFINED_VAL) {
            DEBUG_MSG("BB" << i << ": X ");
        }
        else {
            DEBUG_MSG("BB" << i << ": " << retLoc[i] << " ");
        }
    DEBUG_MSG(std::endl << std::endl);
#endif

    //
    // Assign the ret loc to the subroutines first, and then check whether the
    // assignment is invalid (due to a cycle in the call graph).
    //
    for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++)
    {
        //
        // reset the return BB's retLoc
        //
        unsigned loc = i;
        if (retLoc[i] != UNDEFINED_VAL)
        {
            while (retLoc[loc] != loc)
                loc = retLoc[loc];
            retLoc[i] = loc;
            setSubRetLoc(BBs[i], retLoc[loc]);
        }
    }

    for (G4_BB *bb : caller)
    {
        //
        // set the caller BB's retLoc
        //
#ifdef _DEBUG
        G4_INST *last = bb->empty() ? NULL : bb->back();
        MUST_BE_TRUE(last, ERROR_FLOWGRAPH);
#endif
        G4_BB *subBB = bb->getCalleeInfo()->getInitBB();
        //
        // 1: Must use retLoc here, because some subBB is also the caller of another
        //    subroutine, so the entry loc in the BB may be changed in this step.
        // 2: In some cases, the caller BB is also an entry BB. In that case, the
        //    associated entry BB ID will be overwritten. However, this does not
        //    impact conflict detection or return-location assignment, since we only
        //    check the return BB and/or the caller BB in those two modules.
        //
        setSubRetLoc(bb, retLoc[subBB->getId()]);
    }

#ifdef _DEBUG
    for (unsigned i = 0; i < fg.getNumBB(); i++)
    {
        G4_BB* bb = BBs[i];
        if (getSubRetLoc(bb) != UNDEFINED_VAL)
        {
            if (!bb->empty() && bb->front()->isLabel())
            {
                DEBUG_VERBOSE(((G4_Label*)bb->front()->getSrc(0))->getLabel()
                    << " assigned location " << getSubRetLoc(bb) << std::endl);
            }
        }
    }
#endif

    //
    // finally, detect conflicts (cycles)
    //
    std::vector<unsigned> usedLoc(fg.getNumBB());
    unsigned stackTop = 0;
    for (G4_BB *bb : caller)
    {
        //
        // Must re-start the traversal from each caller; otherwise we would miss
        // some cyclic cases like TestRA_Call_1_1_3B, D, F, G, H
        //
        fg.prepareTraversal();

        usedLoc[stackTop] = getSubRetLoc(bb);

        G4_BB* subEntry = bb->Succs.back();

        if (isSubRetLocConflict(subEntry, usedLoc, stackTop + 1))
        {
            MUST_BE_TRUE(false,
                "ERROR: Failed to assign call-return variables due to a cycle in the call graph!");
        }
    }

    insertCallReturnVar();
}

void GlobalRA::insertCallReturnVar()
{
    for (auto bb : kernel.fg)
    {
        G4_INST *last = bb->empty() ? NULL : bb->back();
        if (last)
        {
            if (last->isCall())
            {
                insertSaveAddr(bb);
            }
            else if (last->isReturn())
            {
                // G4_BB_EXIT_TYPE is just a dummy BB, and the return will be the
                // last inst in each of its predecessors
                insertRestoreAddr(bb);
            }
        }
    }
}

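// The transformation below gives the call an explicit destination so that the
// return address is saved into the variable chosen for this block's return
// location. Illustrative result (name made up, mirroring the inline comment):
//     call (2)  RET__loc12<1>:ud  sub_label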
void GlobalRA::insertSaveAddr(G4_BB* bb)
{
    MUST_BE_TRUE(bb != NULL, ERROR_INTERNAL_ARGUMENT);
    MUST_BE_TRUE(getSubRetLoc(bb) != UNDEFINED_VAL,
        ERROR_FLOWGRAPH); // must have an assigned loc

    G4_INST *last = bb->back();
    MUST_BE_TRUE1(last->isCall(), last->getLineNo(),
        ERROR_FLOWGRAPH);
    if (last->getDst() == NULL)
    {
        unsigned loc = getSubRetLoc(bb);
        G4_Declare* dcl = getRetDecl(loc);

        last->setDest(builder.createDst(dcl->getRegVar(), 0, 0, 1, Type_UD)); // RET__loc12<1>:ud

        last->setExecSize(g4::SIMD2);
    }
}

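// Symmetrically, the return is given an explicit source that reads the saved
// address back from the same variable. Illustrative result (name made up):
//     ret (2)  RET__loc12<0;2,1>:ud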
void GlobalRA::insertRestoreAddr(G4_BB* bb)
{
    MUST_BE_TRUE(bb != NULL, ERROR_INTERNAL_ARGUMENT);

    G4_INST *last = bb->back();
    MUST_BE_TRUE1(last->isReturn(), last->getLineNo(),
        ERROR_FLOWGRAPH);
    if (last->getSrc(0) == NULL)
    {
        unsigned loc = getSubRetLoc(bb);
        G4_Declare* dcl = getRetDecl(loc);

        G4_SrcRegRegion* new_src = builder.createSrc(
            dcl->getRegVar(),
            0,
            0,
            builder.createRegionDesc(0, 2, 1),
            Type_UD);

        last->setSrc(new_src, 0);
        last->setDest(builder.createNullDst(Type_UD));

        last->setExecSize(g4::SIMD2);
    }
}

// This function returns the weight of the interference edge lr1--lr2,
// which is used for computing the degree of lr1.
//
// When there is no alignment restriction, we should use the normal weight,
// which is lr1_nreg + lr2_nreg - 1.
//
// Otherwise, we need to take into account additional space that may be
// required because of the alignment restriction. For example,
// if lr1 has even alignment and lr2 has no alignment restriction,
// we need to consider the following cases that would require the
// maximal available GRF space for successful allocation:
// 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg + 1)
// 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 3) lr1's size is even, lr2's size is odd and lr2's start position is odd,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 4) lr1's size is even, lr2's size is even and lr2's start position is odd,
//    the total space required would be (lr1_nreg + lr2_nreg + 1)
// The above logic can be simplified to the following formula:
//     lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2)
//
// If both lr1 and lr2 have an even alignment restriction,
// we need to consider the following cases that would require the
// maximal available GRF space for successful allocation:
// 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg + 1)
// 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 3) lr1's size is even, lr2's size is odd and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 4) lr1's size is even, lr2's size is even and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg - 1)
// The above logic can be simplified to the following formula:
//     lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2)
//
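// Worked example (not part of the original comment): with lr1_nreg = 3
// (even-aligned) and lr2_nreg = 3 (unrestricted), the first formula gives
// 3 + 3 + 1 - (6 % 2) = 7, matching case 1) above. With both even-aligned and
// lr1_nreg = lr2_nreg = 2, the second formula gives 2 + 2 - 1 + 0 + 0 = 3,
// matching case 4).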
unsigned GraphColor::edgeWeightGRF(const LiveRange* lr1, const LiveRange* lr2)
{
    bool lr1EvenAlign = gra.isEvenAligned(lr1->getDcl());
    bool lr2EvenAlign = gra.isEvenAligned(lr2->getDcl());
    unsigned lr1_nreg = lr1->getNumRegNeeded();
    unsigned lr2_nreg = lr2->getNumRegNeeded();

    if (!lr1EvenAlign)
    {
        return lr1_nreg + lr2_nreg - 1;
    }
    else if (!lr2EvenAlign)
    {
        unsigned sum = lr1_nreg + lr2_nreg;
        return sum + 1 - (sum % 2);
    }
    else // both lr1 and lr2 are even-aligned
    {
        return lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2);
    }
}

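// edgeWeightARF applies the same idea to flag and address registers, with
// Even_Word/Four_Word/Eight_Word sub-register alignments. Worked example (not
// from the original source): two Even_Word-aligned flag ranges with
// lr1_nreg = lr2_nreg = 2 hit the all-even special case below and yield
// 2 + 2 - 2 = 2.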
unsigned GraphColor::edgeWeightARF(const LiveRange* lr1, const LiveRange* lr2)
{
    if (lr1->getRegKind() == G4_FLAG)
    {
        G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
        G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
        unsigned lr1_nreg = lr1->getNumRegNeeded();
        unsigned lr2_nreg = lr2->getNumRegNeeded();

        if (lr1_align == Any)
        {
            return lr1_nreg + lr2_nreg - 1;
        }
        else if (lr1_align == Even_Word && lr2_align == Any)
        {
            return lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2);
        }
        else if (lr1_align == Even_Word && lr2_align == Even_Word)
        {
            if (lr1_nreg % 2 == 0 && lr2_nreg % 2 == 0)
            {
                return lr1_nreg + lr2_nreg - 2;
            }
            else
            {
                return lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2);
            }
        }
        else
        {
            MUST_BE_TRUE(false, "Found unsupported subRegAlignment in flag register allocation!");
            return 0;
        }
    }
    else if (lr1->getRegKind() == G4_ADDRESS)
    {
        G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
        G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
        unsigned lr1_nreg = lr1->getNumRegNeeded();
        unsigned lr2_nreg = lr2->getNumRegNeeded();

        if (lr1_align == Any)
        {
            return lr1_nreg + lr2_nreg - 1;
        }
        else if (lr1_align == Four_Word && lr2_align == Any)
        {
            return lr1_nreg + lr2_nreg + 3 - (lr1_nreg + lr2_nreg) % 4;
        }
        else if (lr1_align == Four_Word && lr2_align == Four_Word)
        {
            return lr1_nreg + lr2_nreg - 1 + (4 - lr1_nreg % 4) % 4 + (4 - lr2_nreg % 4) % 4;
        }
        else if (lr1_align == Eight_Word && lr2_align == Any)
        {
            return lr1_nreg + lr2_nreg + 7 - (lr1_nreg + lr2_nreg) % 8;
        }
        else if (lr1_align == Eight_Word && lr2_align == Four_Word)
        {
            if (((8 - lr1_nreg % 8) % 8) >= 4)
                return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 - 4;
            return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
                (4 - lr2_nreg % 4) % 4;
        }
        else if (lr1_align == Eight_Word && lr2_align == Eight_Word)
        {
            return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
                (8 - lr2_nreg % 8) % 8;
        }
        else
        {
            MUST_BE_TRUE(false, "Found unsupported subRegAlignment in address register allocation!");
            return 0;
        }
    }
    MUST_BE_TRUE(false, "Found unsupported ARF reg type in register allocation!");
    return 0;
}

void GlobalRA::fixSrc0IndirFcall()
{
    // Indirect calls look like:
    //     mov (1|NM) V10 0x123456:ud
    //     fcall (1) dst V10 <-- V10, which is src0, contains the %ip to jump to
    //
    // In this function, we want to set V10 to r125.0, which is the same as the
    // dst of the fcall as per the ABI. This way, when inserting save/restore
    // code around the fcall, no special checks are needed to handle V10.
    //
    // But this works only if V10 is a local. If it is not a local, we create a
    // mov that copies V10 into a new temp variable, and then we map this temp
    // variable to r125.0. Hopefully, V10 being global is a rare occurrence.
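    // Illustrative shape of the rewritten sequence for the non-local case
    // (TMP is a made-up name for the temp hardwired to r125.0 / Ret_IP):
    //     mov (1) TMP V10
    //     fcall (1) dst TMP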
    for (auto bb : kernel.fg)
    {
        if (bb->isEndWithFCall())
        {
            auto fcall = bb->back()->asCFInst();
            if (!fcall->getSrc(0) ||
                !fcall->getSrc(0)->isSrcRegRegion())
                continue;

            auto src0Rgn = fcall->getSrc(0)->asSrcRegRegion();
            auto src0Dcl = src0Rgn->getBase()->asRegVar()->getDeclare();
            auto src0TopDcl = src0Rgn->getTopDcl();

            if (src0Dcl != src0TopDcl ||
                !isBlockLocal(src0TopDcl) ||
                src0TopDcl->getNumElems() > 1)
            {
                // create a copy
                auto tmpDcl = kernel.fg.builder->createHardwiredDeclare(1, src0Rgn->getType(), kernel.getFPSPGRF(),
                    IR_Builder::SubRegs_Stackcall::Ret_IP);
                auto dst = kernel.fg.builder->createDst(tmpDcl->getRegVar(), src0Rgn->getType());
                auto src = kernel.fg.builder->duplicateOperand(src0Rgn);
                auto copy = kernel.fg.builder->createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
                auto iter = std::find_if(bb->begin(), bb->end(), [](G4_INST* inst) { return inst->isFCall(); });
                bb->insertBefore(iter, copy);
                auto newSrc = kernel.fg.builder->createSrc(tmpDcl->getRegVar(), 0, 0, kernel.fg.builder->getRegionScalar(),
                    src0Rgn->getType());
                fcall->setSrc(newSrc, 0);
            }
            else
            {
                src0TopDcl->getRegVar()->setPhyReg(fcall->getDst()->getBase()->asRegVar()->getPhyReg(),
                    fcall->getDst()->getBase()->asRegVar()->getPhyRegOff());
            }
        }
    }
}

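// The dump() overloads below are free functions, presumably so they can be
// called by name from a debugger (e.g. `call dump("V33", lrs, numVars)`;
// the variable name is illustrative).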
bool dump(const char* s, LiveRange** lrs, unsigned size)
{
    // Utility function to dump an lr by name.
    // Returns true if the lr name is found.
    std::string name = s;
    for (unsigned i = 0; i != size; i++)
    {
        auto lr = lrs[i];
        if (lr && name.compare(lr->getVar()->getName()) == 0)
        {
            lr->dump();
            return true;
        }
    }
    return false;
}

bool dump(const char* s, const G4_Kernel* kernel)
{
    // Utility function to dump the dcl for a given variable name.
    // Returns true if the variable is found.
    std::string name = s;
    for (auto dcl : kernel->Declares)
    {
        if (name.compare(dcl->getName()) == 0)
        {
            dcl->dump();
            return true;
        }
    }
    return false;
}

bool Interference::dumpIntf(const char* s) const
{
    // Utility function to dump the interferences of a variable, given its name.
    // Returns true if the variable is found.
    std::cout << "\n\n **** Interference Table ****\n";
    for (unsigned i = 0; i < maxId; i++)
    {
        std::string name = lrs[i]->getVar()->getName();
        if (name.compare(s) == 0)
        {
            std::cout << "(" << i << ") ";
            lrs[i]->dump();
            std::cout << "\n";
            for (unsigned j = 0; j < maxId; j++)
            {
                if (interfereBetween(i, j))
                {
                    std::cout << "\t";
                    lrs[j]->getVar()->emit(std::cout);
                }
            }
            std::cout << "\n";
            return true;
        }
    }
    return false;
}

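// Record a preferred start GRF for this live range; hints that would place the
// range past the last GRF are ignored.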
void LiveRange::setAllocHint(unsigned h)
{
    if ((h + dcl->getNumRows()) <= gra.kernel.getNumRegTotal())
        allocHint = h;
}

// sortedIntervals comes from augmentation.
// This can be invoked either post-RA, where phy regs are already assigned to
// dcls, or after assignColors with lrs and numLRs passed in, which makes this
// function use the temp allocations from lrs. Doesn't handle sub-routines yet.
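// Illustrative output line (not real data): the instruction text is padded or
// truncated to 80 columns, then each of the 128 GRFs is printed in groups of
// ten, '|' for busy and '*' for free:
//     mov (8) V33(0,0)<1>:d V32(0,0)<8;8,1>:d ...        |||||***** **********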
void RegChartDump::dumpRegChart(std::ostream& os, LiveRange** lrs, unsigned numLRs)
{
    constexpr unsigned N = 128;
    std::unordered_map<G4_INST*, std::bitset<N>> busyGRFPerInst;
    bool dumpHex = false;

    auto getPhyReg = [&](const G4_Declare* dcl)
    {
        auto preg = dcl->getRegVar()->getPhyReg();
        if (preg)
            return preg;

        for (unsigned i = 0; i != numLRs; i++)
        {
            const LiveRange* lr = lrs[i];
            if (lr->getDcl() == dcl)
            {
                preg = lr->getPhyReg();
                break;
            }
        }

        return preg;
    };

    for (auto dcl : sortedLiveIntervals)
    {
        if (dcl->getRegFile() != G4_RegFileKind::G4_GRF &&
            dcl->getRegFile() != G4_RegFileKind::G4_INPUT)
            continue;

        auto phyReg = getPhyReg(dcl);
        if (!phyReg)
            continue;

        if (!phyReg->isGreg())
            continue;

        auto GRFStart = phyReg->asGreg()->getRegNum();
        auto numRows = dcl->getNumRows();

        auto startInst = startEnd[dcl].first;
        auto endInst = startEnd[dcl].second;

        bool start = (dcl->getRegFile() == G4_RegFileKind::G4_INPUT);
        bool done = false;
        for (auto bb : gra.kernel.fg)
        {
            for (auto inst : *bb)
            {
                if (inst == startInst)
                {
                    start = true;
                    continue;
                }

                if (!start)
                    continue;

                for (unsigned i = GRFStart; i != (GRFStart + numRows); i++)
                {
                    busyGRFPerInst[inst].set(i, true);
                }

                if (inst == endInst ||
                    endInst == startInst)
                {
                    done = true;
                    break;
                }
            }

            if (done)
                break;
        }
    }

    // Now emit instructions with GRFs
    for (auto bb : gra.kernel.fg)
    {
        for (auto inst : *bb)
        {
            constexpr unsigned maxInstLen = 80;
            auto item = busyGRFPerInst[inst];
            std::stringstream ss;
            inst->emit(ss);
            auto len = ss.str().length();

            if (len <= maxInstLen)
            {
                os << ss.str();
                for (unsigned i = 0; i != maxInstLen - len; i++)
                    os << " ";
            }
            else
            {
                auto tmpStr = ss.str();
                auto limitedStr = tmpStr.substr(0, maxInstLen);
                os << limitedStr;
            }

            os << " ";

            if (!dumpHex)
            {
                // dump GRFs: | - busy, * - free
                for (unsigned i = 0; i != N; i++)
                {
                    // emit in groups of 10 GRFs
                    if (i > 0 && (i % 10) == 0)
                        os << " ";

                    if (item[i])
                        os << "|"; // busy
                    else
                        os << "*"; // free
                }
            }
            else
            {
                for (unsigned i = 0; i != N; i += sizeof(unsigned short) * 8)
                {
                    unsigned short busyGRFs = 0;
                    for (unsigned j = 0; j != sizeof(unsigned short) * 8; j++)
                    {
                        auto offset = i + j;
                        if (offset < N)
                        {
                            if (item[offset])
                                busyGRFs |= (1 << j);
                        }
                    }
                    // write to the caller-provided stream rather than stdout
                    os << "r" << i << ":" << std::hex << busyGRFs << std::dec << " ";
                }
            }
            os << std::endl;
        }
        os << std::endl;
    }
}

void RegChartDump::recordLiveIntervals(const std::vector<G4_Declare*>& dcls)
{
    sortedLiveIntervals = dcls;
    for (auto dcl : dcls)
    {
        auto start = gra.getStartInterval(dcl);
        auto end = gra.getEndInterval(dcl);
        startEnd.insert(std::make_pair(dcl, std::make_pair(start, end)));
    }
}
