1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "RegDeps.hpp"
10 #include "Traversals.hpp"
11 #include "BitSet.hpp"
12
13 #include <iterator>
14 #include <limits>
15
16 using namespace iga;
17
18 /**
 * RAW:                     R kill W    R-->live       explicit dependence
 * WAW: different pipelines W2 kill W1  W2-->live      explicit dependence
 * WAR: different pipelines W kill R    W-->live       explicit dependence
 * WAR: same pipeline       W kill R    W-->live       implicit dependence
 * RAR: same pipeline       R2 kill R1  R2-->live      no dependence
 * RAR: different pipelines             R1,R2-->live   no dependence
25 *
26 * Different pipeline
27 * send, math, control flow, long/short (type df)
28 *
29 * add (8) r10 r20 r30
30 * add (8) r11 r21 r22
31 * if (..)
32 * // if instruction doesn't count in if calculations, but it takes about 6
33 * // cycles to resolve for fall through can treat it as continue BB.
34 * // Only when this BB has one predecessor
35 * add r40 r10 r50 {@2}
36 * else
37 * add r60 r70 r80
38 * endif
39 * //Both control flows converge on this. Conservative analysis start with 1.
40 * //By the time jmp happens counter should be at 0 anyway.
41 * add r90 r100 r110 {@1}
42 * add r91 r101 r111 {@2}
43 *
44 *
45 * Types of Dependencies
46 * dst src0 src1
47 * grf ind grf // set distance to 1. If SBID list not empty insert test instruction. Optimization if SBID == 1 AND grf depends on it, set SBID, clear SBIDList
48 */
49
50 /**
51 Bucket - represents a GRF line
52 ID - sequential numbering of instructions. Resets at 0 with new BB
53 Implicit assumption. Various data structures have pointers to Dependencies.
54
55 For each BB scan instructions down
56 if new BB
57 reset buckets //can use bit mask
58 reset distanceTracker
59 reset ID
60 For each instruction
            Calculate dependency on srcs and destination
62 if currDistance < MAX_DIST
63 record distance = currDistance // for a case where we at new BB
64
65 For each dependency and bucket it touches look in to a bucket
66 if bucket not empty
67 find potential dependencies //bit mask intersects
68 for each dependency found
                        if appropriate (WAW, RAR, RAW) Dependence exists
70 Clear dependency bits from bucket dependency
71 if dep empty remove from bucket
72 if DistanceDependency //no out of order
73 if instDistance > (currDistance - depID)
74 //We found dependence closer
75 record distance = currDistance - depID //CurrDistance > depID AND min(currDist - depID, 1)
76 else //sbid
77 record SBID ID
78
79 if dependencyRecord NOT empty
80 Generate appropriate SWSB/test instruction
81 IF SBID
82 if all dependencies are clear
83 add SBID to free list
84 remove entry SBID -> dependencies
85 Remove MAX_DIST DEP from buckets
86 Add current instruction Dependencies to buckets
87 if instruction isVariableExecTime //send, math.idiv
                if freeSBIDList IS empty
89 pick one SBID
90 generate test instruction
91 move SBID to free list
92 clear dependency from bucket/sbidList
93
94 assign SBID from free list
95 if end of block AND SBID list NOT empty
96 generate test instructions
97 */
98
99 #include "SWSBSetter.hpp"
100 /*
101 WAW
102 explicit dependence
103 math.fc (except idiv) r10 ...
104 add r10 ....
105
106 add r10 ... //long: type DF/Q
107 add r10 ... //short:
108
109 WAW
110 no dependence
111 add r10 ...
112 add r10 ...
113
114
115 Math.sin r10 r20 r30
116 Math.cos r20 r40 r50
117 Not required - same pipe
118
119 Math.sin r20 r10 r30
120 Math.cos r20 r40 r50
121 Not required - same pipe
122
123 FPU_long r20 r10 r30
124 Math.sin r20 r40 r50
125 Explicit dep required as math can overtake FPU_long - since they are in different pipes.
126
127 RAW
128 add r10 ...
129 add r20 ...
130 add ... r20 ... {@1}
    add ... r10 {@3} <--- technically speaking this dependency is not necessary,
                          since they are in the same pipe and the previous instruction will stall,
                          so the last instruction's dependence is already cleared.
                          But in terms of runtime there is no impact, so it is not worth special handling
135
136 assuming two grfs are written/read
137 send r10
138 send r11
139
140 add (16) ... r10 ...
141 second send has dependency on first send
142 add has dependency on second send
143 if sends written 1 grf, and add still read two grfs it will have dependence on both sends
144
145 send r10 //set$1 writes r10/r11
146 add(8) r10 {$1.dst}
147 add(8) r11 {}
148
149
150 */
clearDepBuckets(DepSet & depMatch)151 void SWSBAnalyzer::clearDepBuckets(DepSet &depMatch)
152 {
153 for (auto bucketID : depMatch.getBuckets())
154 {
155 auto bucket = &m_buckets[bucketID];
156 auto numDepSets = bucket->getNumDependencies();
157 for (uint32_t i = 0; i < numDepSets; ++i)
158 {
159 DepSet* dep = bucket->getDepSet(i);
160 //See if anything matches for this GRF bucket.
161 //originally was checking for intersect but was removing extra dependence in case like this
162 /*
163 (W) and (1|M0) r0.0<1>:ud r0.0<0;1,0>:ud 0xFFFFBFFF:ud {}
164 (W) mov (16|M0) r25.0<1>:f 0:d
165 (W) mov (1|M0) r0.2<1>:ud 0x0:ud
166 (W) mov (1|M0) r0.2<1>:ud 0x0:ud
167 (W) and (1|M0) r0.0<1>:ud r0.0<0;1,0>:ud 0xFFFFF7FF:ud {}
168 mov (16|M0) r120.0<1>:ud r17.0<8;8,1>:ud
169 mov (16|M0) r122.0<1>:ud r19.0<8;8,1>:ud
170 mov (16|M0) r124.0<1>:ud r21.0<8;8,1>:ud
171 mov (16|M0) r126.0<1>:ud r23.0<8;8,1>:ud
172 (W) mov (16|M0) r118.0<1>:ud r0.0<8;8,1>:ud {}
173 */
174 //the r0 dependece was already cleared by second r0
175 //but when clearing from buckets it would find the second r0 and clear it by mistake
176 if (dep && depMatch.getInstGlobalID() == dep->getInstGlobalID() &&
177 (dep->getDepType() == depMatch.getDepType()))
178 {
179 bucket->clearDepSet(i);
180 }
181 }
182 }
183 depMatch.reset();
184 }
/**
 * Resolves the dependencies of one dependency set of the current instruction
 * (either its SRC set or its DST set) against the dependencies previously
 * recorded in the buckets that set touches.
 *
 * - For an in-order producer it records the minimum valid distance (and the
 *   distance pipe type) in `distanceDependency`.
 * - For an out-of-order producer it adds the producer's SBID to `activeSBID`
 *   (through setSbidDependency).
 * - Previous dependencies that are satisfied are cleared/removed from the buckets.
 *
 * The approach is bucket based. Each bucket is one GRF, so if an instruction
 * writes into more than one GRF then multiple buckets will have the dependency.
 *
 * @param currDep                   dependency set (read or write) of the current instruction
 * @param distanceDependency        in/out: distance part of the SWSB being built for currInst
 * @param currInst                  the instruction that owns currDep
 * @param activeSBID                in/out: SBIDs the current instruction depends on
 * @param needSyncForShootDownInst  out: set to false on entry; may be set to true
 *                                  by setSbidDependency
 */
void SWSBAnalyzer::calculateDependence(DepSet &currDep, SWSB &distanceDependency,
    const Instruction &currInst, std::vector<SBID>& activeSBID, bool &needSyncForShootDownInst)
{
    needSyncForShootDownInst = false;
    auto currDepType = currDep.getDepType();
    auto currDepPipe = currDep.getDepPipe();

    for (auto bucketID : currDep.getBuckets())
    {
        // iterates over Dependencies in a GRF bucket
        // Assumption: there shouldn't be more than 1-2
        Bucket* bucket = &m_buckets[bucketID];
        size_t numDepSets = bucket->getNumDependencies();
        for (uint32_t i = 0; i < numDepSets; ++i)
        {
            // walk the slots from the highest index down to 0
            uint32_t index = static_cast<uint32_t>(numDepSets -1 - i);
            auto dep = bucket->getDepSet(index);

            // *_ALWAYS_INTERFERE entries force a sync regardless of bit overlap
            if (dep && (dep->getDepType() == DEP_TYPE::WRITE_ALWAYS_INTERFERE ||
                dep->getDepType() == DEP_TYPE::READ_ALWAYS_INTERFERE))
            {
                // force to sync with dep
                if (dep->getDepClass() == DEP_CLASS::OUT_OF_ORDER)
                {
                    setSbidDependency(*dep, currInst, needSyncForShootDownInst, activeSBID);
                }
                else
                {
                    // Set to sync with all in-order-pipes. WRITE/READ_ALWAYS_INTERFERE
                    // could be used to mark arf dependency, which is required to be all pipes
                    // instead of dep's pipe only
                    distanceDependency.minDist = 1;
                    if (getNumOfDistPipe() == 1)
                        distanceDependency.distType = SWSB::DistType::REG_DIST;
                    else
                        distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                    bucket->clearDepSet(index);
                }
            }

            // See if anything matches for this GRF bucket.
            // NOTE(review): dep may have just been handled/cleared by the branch above;
            // presumably its bit set no longer intersects in that case -- confirm
            // against DepSet::reset().
            if (dep && dep->getBitSet().intersects(currDep.getBitSet()))
            {
                /*
                 * RAW:                     R kill W    R-->live       explicit dependence
                 * WAW: different pipelines W2 kill W1  W2-->live      explicit dependence
                 * WAR: different pipelines W kill R    W-->live       explicit dependence
                 * WAR: same pipeline       W kill R    W-->live       implicit dependence
                 * RAR: same pipeline       R2 kill R1  R2-->live      no dependence
                 * RAR: different pipelines             R1,R2-->live   no dependence
                 */
                // RAW:                     R kill W    R-->live       explicit dependence
                DEP_TYPE prevDepType = dep->getDepType();
                DEP_PIPE prevDepPipe = dep->getDepPipe();
                DEP_CLASS prevDepClass = dep->getDepClass();

                // Send with different SFID could write to different pipes
                bool sendInDiffPipe = false;
                if (dep->getInstruction()->getOpSpec().isSendFamily() &&
                    currDep.getInstruction()->getOpSpec().isSendFamily())
                {
                    sendInDiffPipe =
                        (dep->getInstruction()->getSendFc() !=
                         currDep.getInstruction()->getSendFc());
                    // for send in unknown pipe, always treated as different pipe
                    if (!sendInDiffPipe) {
                        sendInDiffPipe =
                            dep->getDepPipe() == DEP_PIPE::SEND_UNKNOWN ||
                            currDep.getDepPipe() == DEP_PIPE::SEND_UNKNOWN;
                    }
                }

                bool isRAW = currDepType == DEP_TYPE::READ &&
                             prevDepType == DEP_TYPE::WRITE;
                // WAW: different pipelines W2 kill W1  W2-->live      explicit dependence
                bool isWAW = (currDepType == DEP_TYPE::WRITE &&
                              prevDepType == DEP_TYPE::WRITE &&
                              (currDepPipe != prevDepPipe || sendInDiffPipe));
                // WAR: different pipelines W kill R    W-->live       explicit dependence
                bool isWAR = currDepType == DEP_TYPE::WRITE &&
                             prevDepType == DEP_TYPE::READ &&
                             (currDepPipe != prevDepPipe || sendInDiffPipe);
                // WAW against an out-of-order producer counts even within the same pipe
                bool isWAW_out_of_order
                           = (currDepType == DEP_TYPE::WRITE &&
                              prevDepType == DEP_TYPE::WRITE &&
                              prevDepClass == DEP_CLASS::OUT_OF_ORDER);

                // Special case handling for acc/flag dependency:
                // if the RAW dependency is on acc/flag and it's within the same pipe,
                // HW can handle it, so we don't need to set swsb
                if (isRAW && currDepPipe == prevDepPipe) {
                    // true if in_dep and currDep overlap inside the given register range
                    auto check_dep_reg = [&](DepSet* in_dep, uint32_t reg_start, uint32_t reg_len) {
                        return in_dep->getBitSet().intersects(currDep.getBitSet(),
                            reg_start, reg_len);
                    };
                    auto has_grf_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getGRF_START(), m_DB->getGRF_LEN());
                    };
                    auto has_arf_a_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_A_START(), m_DB->getARF_A_LEN());
                    };
                    auto has_acc_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_ACC_START(), m_DB->getARF_ACC_LEN());
                    };
                    auto has_flag_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_F_START(), m_DB->getARF_F_LEN());
                    };
                    auto has_sp_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_SPECIAL_START(), m_DB->getARF_SPECIAL_LEN());
                    };

                    // is acc dependency
                    if (has_acc_dep(dep)) {
                        // and no dependency on other registers
                        if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_flag_dep(dep) || has_sp_dep(dep)))
                            isRAW = false;
                    }
                    // is flag dependency
                    if (has_flag_dep(dep)) {
                        // and no dependency on other registers
                        if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_acc_dep(dep) || has_sp_dep(dep)))
                            isRAW = false;
                        // flag and acc only
                        if (has_acc_dep(dep))
                            if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_sp_dep(dep)))
                                isRAW = false;
                    }
                }

                if (isWAR ||
                    isWAW ||
                    isRAW ||
                    isWAW_out_of_order)
                {
                    // clearing previous dependence
                    if (dep->getBitSet().empty())
                    {
                        m_errorHandler.reportWarning(
                            currInst.getPC(),
                            "Dependency in bucket with no bits set");
                    }
                    // removing from bucket if there is nothing
                    // NOTE(review): the start offset hard-codes 32 bytes per bucket while the
                    // length comes from getGRF_BYTES_PER_REG() -- confirm the two agree on
                    // every supported platform.
                    if (!dep->getBitSet().testAny(bucketID * 32, m_DB->getGRF_BYTES_PER_REG()))
                    {
                        bucket->clearDepSet(index);
                    }
                    if (prevDepClass == DEP_CLASS::IN_ORDER)
                    {
                        if (getNumOfDistPipe() == 1) {
                            // FOR WAW if PREV is SHORT and curr is LONG then write will finish
                            // before current write, no need to set swsb
                            bool isWAWHazard = (prevDepPipe == DEP_PIPE::SHORT && currDepPipe == DEP_PIPE::LONG ||
                                                prevDepPipe == DEP_PIPE::SHORT && currDepPipe == DEP_PIPE::SHORT)
                                               && isWAW;
                            // require swsb for all the other kinds of dependency
                            if (!isWAWHazard)
                            {
                                // setting minimum distance
                                uint32_t newDistance = m_InstIdCounter.inOrder - dep->getInstIDs().inOrder;
                                distanceDependency.minDist =
                                    distanceDependency.minDist == 0 ?
                                    newDistance :
                                    std::min(distanceDependency.minDist, newDistance);
                                // clamp the distance to max distance
                                distanceDependency.minDist = std::min(distanceDependency.minDist, (uint32_t)MAX_VALID_DISTANCE);
                                distanceDependency.distType = SWSB::DistType::REG_DIST;
                            }
                        } else {
                            // For multiple in-order pipeline architecture, all cases should be considered
                            // The distance depends on the previous instruction's pipeline
                            uint32_t newDistance = 0;
                            SWSB::DistType newDepPipe = SWSB::DistType::NO_DIST;
                            switch (prevDepPipe) {
                            case DEP_PIPE::FLOAT:
                                newDistance = m_InstIdCounter.floatPipe - dep->getInstIDs().floatPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_FLOAT;
                                break;
                            case DEP_PIPE::INTEGER:
                                newDistance = m_InstIdCounter.intPipe - dep->getInstIDs().intPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_INT;
                                break;
                            case DEP_PIPE::LONG64:
                                newDistance = m_InstIdCounter.longPipe - dep->getInstIDs().longPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_LONG;
                                break;
                            case DEP_PIPE::MATH_INORDER:
                                newDistance = m_InstIdCounter.mathPipe - dep->getInstIDs().mathPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_MATH;
                                break;
                            default:
                                IGA_ASSERT(0, "Unsupported DEP_PIPE for in-order instructions");
                                break;
                            }

                            // the instruction already has dependency to others
                            if (distanceDependency.minDist) {
                                newDistance = std::min(distanceDependency.minDist, newDistance);
                                // if the type is REG_DIST_ALL or is the same as the new pipe type,
                                // then keep it. Otherwise update the swsb type
                                if ((distanceDependency.distType != newDepPipe) && (distanceDependency.distType != SWSB::DistType::REG_DIST_ALL)) {
                                    // get the pipe_type from opnd type
                                    auto op_pipe_type = [](Type op_type) {
                                        if (TypeIs64b(op_type))
                                            return SWSB::DistType::REG_DIST_LONG;
                                        if (TypeIsFloating(op_type))
                                            return SWSB::DistType::REG_DIST_FLOAT;
                                        return SWSB::DistType::REG_DIST_INT;
                                    };
                                    // check if the given pipe type is the same as one of the src types
                                    auto haveTypeInSrc = [&](SWSB::DistType swsb_type) {
                                        // HW restriction (WA): Cannot use @1 on XeHPC-XT, must explicitly set pipe type
                                        // A@1 or L@1, ... Always return false so that we won't use @1
                                        // Note that if there isn't this restriction, we should also update op_pipe_type
                                        // for FourDistPipeReduction mode that non-float-64-bit type should be in INT pipe
                                        if (m_swsbMode == SWSB_ENCODE_MODE::FourDistPipeReduction) {
                                            return false;
                                        }
                                        for (size_t i = 0; i < currInst.getSourceCount(); ++i) {
                                            if (op_pipe_type(currInst.getSource(i).getType()) == swsb_type)
                                                return true;
                                        }
                                        return false;
                                    };
                                    if ((distanceDependency.distType == SWSB::DistType::REG_DIST_MATH) ||
                                        (newDepPipe == SWSB::DistType::REG_DIST_MATH)) {
                                        // either current or prev dep is MATH, it's not possible to combine them to REG_DIST
                                        distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                                    } else if ((distanceDependency.distType != SWSB::DistType::REG_DIST)) {
                                        // check if both previous and current dep pipe can be satisfied by currInst src type
                                        if (haveTypeInSrc(distanceDependency.distType) && haveTypeInSrc(newDepPipe))
                                            distanceDependency.distType = SWSB::DistType::REG_DIST;
                                        else
                                            distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                                    } else {
                                        // if previous one is REG_DIST, set the type to REG_DIST_ALL if
                                        // current one cannot be satisfied by src type
                                        if (!haveTypeInSrc(newDepPipe))
                                            distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                                    }
                                }
                            } else {
                                // first distance dependency found for this instruction
                                distanceDependency.distType = newDepPipe;
                            }
                            assert(distanceDependency.distType != SWSB::DistType::NO_DIST);
                            // clamp the distance to max distance
                            distanceDependency.minDist = std::min(newDistance, (uint32_t)MAX_VALID_DISTANCE);
                        } // end of multi-dist-pipe handling (getNumOfDistPipe() != 1)
                        // clear this instruction's dependency since it is satisfied
                        clearDepBuckets(*dep);

                        // clear its companion because when an in-order instruction is synced, both its
                        // input and output dependency are satisfied. The only exception is that if it has
                        // read/write_always_interfere dependency, it should be preserved.
                        // The restriction is that:
                        // When certain Arch Registers (sr, cr, ce) are used,
                        // the very next instruction requires dependency to be set on all pipes {A@1}
                        // e.g.
                        //      mov (1|M0)  r104.0<1>:ud  sr0.1<0;1,0>:ud
                        //      cmp (16|M0) (ne)f0.0  null:ud  r104.0<0;1,0>:ub  r62.4<0;1,0>:uw
                        // A@1 is required for cmp instead of I@1
                        if (dep->getCompanion() != nullptr) {
                            // In the case that this DepSet is generated from math_wa_info, it won't have companion
                            if (dep->getCompanion()->getDepType() != DEP_TYPE::WRITE_ALWAYS_INTERFERE &&
                                dep->getCompanion()->getDepType() != DEP_TYPE::READ_ALWAYS_INTERFERE) {
                                clearDepBuckets(*dep->getCompanion());
                            }
                        }
                    } // end of if (prevDepClass == DEP_CLASS::IN_ORDER)
                    else if (prevDepClass == DEP_CLASS::OUT_OF_ORDER) // prev is out of order
                    {
                        setSbidDependency(*dep, currInst, needSyncForShootDownInst, activeSBID);
                    }
                    // for the instruction in "OTHER" DEP_CLASS, such as sync, we don't need
                    // to consider their dependency that is implied by hardware
                }
            }
        }
    }
}
475
setSbidDependency(DepSet & dep,const Instruction & currInst,bool & needSyncForShootDownInst,std::vector<SBID> & activeSBID)476 void SWSBAnalyzer::setSbidDependency(DepSet& dep, const Instruction& currInst,
477 bool& needSyncForShootDownInst, std::vector<SBID>& activeSBID)
478 {
479 /* For out of order we don't know how long it will finish
480 * so need to test for SBID.
481 * Instruction can depend on more then one SBID
482 * send r10
483 * send r20
484 * send r30
485 * ....
486 * add r10 r20 r30
487 * between different buckets and srcs/dst dependencies instruction can rely on multiple SBID
488 */
489 SBID depSBID = dep.getSBID();
490 if (depSBID.isFree)
491 {
492 m_errorHandler.reportError((int)dep.getInstGlobalID(), "SBID SHOULDN'T BE FREE!");
493 }
494 // clears all the buckets
495 clearDepBuckets(dep);
496
497 // In case of shooting down of this instruction, we need to add sync to preserve the swsb id sync,
498 // so that it's safe to clear the dep
499 if (currInst.hasPredication() ||
500 (currInst.getExecSize() != dep.getInstruction()->getExecSize()) ||
501 (currInst.getChannelOffset() != dep.getInstruction()->getChannelOffset()))
502 needSyncForShootDownInst = true;
503
504 // used to set read or write dependency
505 depSBID.dType = dep.getDepType();
506
507 // activeSBID stores all sbid that this inst has dependency on
508 // and it'll be processed in processActiveSBID
509 bool push_back = true;
510 // making sure there are no duplicates
511 for (auto& aSBID : activeSBID)
512 {
513 if (aSBID.sbid == depSBID.sbid)
514 {
515 //write takes longer then read
516 //so we only need to check on one.
517 //so this either sets a write or resets back to read
518 if (aSBID.dType == DEP_TYPE::READ)
519 {
520 aSBID.dType = depSBID.dType;
521 }
522 push_back = false;
523 break;
524 }
525 }
526 // adding to active SBID
527 // in Run function we will see how many this instruction relies on
528 // and generate approriate SWSB and if needed test instruction
529 // in that level also will add them back to free list
530 if (push_back)
531 {
532 activeSBID.push_back(depSBID);
533 }
534 }
535
insertSyncAllRdWr(InstList::iterator insertPoint,Block * bb)536 void SWSBAnalyzer::insertSyncAllRdWr(InstList::iterator insertPoint, Block *bb)
537 {
538 SWSB distanceDependency;
539 auto clearRD = m_kernel.createSyncAllRdInstruction(distanceDependency);
540 auto clearWR = m_kernel.createSyncAllWrInstruction(distanceDependency);
541
542 if (insertPoint == bb->getInstList().end())
543 {
544 bb->getInstList().push_back(clearRD);
545 bb->getInstList().push_back(clearWR);
546 }
547 else
548 {
549 bb->insertInstBefore(insertPoint, clearRD);
550 bb->insertInstBefore(insertPoint, clearWR);
551 }
552 }
553
554 //TODO this should also clear up grf dependency to handle this case:
555 /*
556 call (16|M0) r8.0:ud 32
557 sendc.rc (16|M0) null r118 null 0x0 0x140B1000 {} // wr:10h, rd:0, Render Target Write msc:16, to #0
558 (W) mov (1|M0) a0.0<1>:ud r7.0<0;1,0>:ud
559 sendc.rc (16|M0) null r100 null 0x0 0x140B1000 {} // wr:10h, rd:0, Render Target Write msc:16, to #0
560 sendc.rc (16|M0) null r118 null 0x0 0x140B1000 {} // wr:10h, rd:0, Render Target Write msc:16, to #0
561 (W) mov (16|M0) r118.0<1>:ud r6.0<8;8,1>:ud
562 (W) send.dc0 (16|M0) r38 r118 null 0x0 a0.0
563 ret (16|M0)
564
565 Right now mov will have false dependense on the first send.
566 */
clearSBIDDependence(InstList::iterator insertPoint,Instruction * lastInst,Block * bb)567 void SWSBAnalyzer::clearSBIDDependence(InstList::iterator insertPoint, Instruction *lastInst, Block *bb)
568 {
569 bool sbidInUse = false;
570 for (uint32_t i = 0; i < m_SBIDCount; ++i)
571 {
572 //there are still dependencies that might be used outside of this basic block
573 if (!m_freeSBIDList[i].isFree)
574 {
575 sbidInUse = true;
576 }
577 m_freeSBIDList[i].reset();
578 }
579
580 // if last instruction in basic block is EOT no need to generate flushes
581 // hardware will take care of it
582 if (lastInst && lastInst->getOpSpec().isSendFamily() && lastInst->hasInstOpt(InstOpt::EOT))
583 {
584 sbidInUse = false;
585 }
586
587 // platform check is mainly for testing purposes
588 if (sbidInUse)
589 {
590 insertSyncAllRdWr(insertPoint, bb);
591 }
592 }
593
594 // Keeping track of dependencies that need to be cleared because they are no longer relevant
595 // right now each BB ends with control flow instruction, and we reset at each BB
clearBuckets(DepSet * input,DepSet * output)596 void SWSBAnalyzer::clearBuckets(DepSet* input, DepSet* output) {
597 if (input->getDepClass() != DEP_CLASS::IN_ORDER)
598 return;
599
600 if (m_initPoint) {
601 m_distanceTracker.emplace_back(input, output);
602 m_initPoint = false;
603
604 }
605 else {
606 // add DepSet to m_distanceTracker
607 m_distanceTracker.emplace_back(input, output);
608
609 auto get_depset_id = [&](DEP_PIPE pipe_type, DepSet& dep_set) {
610 if (getNumOfDistPipe() == 1)
611 return dep_set.getInstIDs().inOrder;
612 switch(pipe_type) {
613 case DEP_PIPE::FLOAT:
614 return dep_set.getInstIDs().floatPipe;
615 case DEP_PIPE::INTEGER:
616 return dep_set.getInstIDs().intPipe;
617 case DEP_PIPE::LONG64:
618 return dep_set.getInstIDs().longPipe;
619 case DEP_PIPE::MATH_INORDER:
620 return dep_set.getInstIDs().mathPipe;
621 default:
622 IGA_ASSERT(0, "SWSB: unhandled in-order DEP_PIPE for XeHP+ encoding");
623 break;
624 }
625 return (uint32_t)0;
626 };
627
628 auto get_latency = [&](DEP_PIPE pipe_type) {
629 if (pipe_type == DEP_PIPE::LONG64)
630 return m_LatencyLong64Pipe;
631 else if (pipe_type == DEP_PIPE::MATH_INORDER)
632 return m_LatencyInOrderMath;
633 return m_LatencyInOrderPipe;
634 };
635
636 DEP_PIPE new_pipe = input->getDepPipe();
637 // max B2B latency of thie pipe
638 size_t max_dis = get_latency(new_pipe);
639 // Remove nodes from the Tracker if the latency is already satified
640 m_distanceTracker.remove_if(
641 [=](const distanceTrackerNode& node) {
642 // bypass nodes those are not belong to the same pipe
643 if (node.input->getDepPipe() != new_pipe)
644 return false;
645
646 // if the distance >= max_latency, clear buckets for corresponding
647 // input and output Dependency
648 size_t new_id = get_depset_id(new_pipe, *input);
649 if ((new_id - get_depset_id(new_pipe, *node.input)) >= max_dis) {
650 clearDepBuckets(*node.input);
651 clearDepBuckets(*node.output);
652 return true;
653 }
654 return false;
655 }
656 );
657 }
658 }
659
processActiveSBID(SWSB & distanceDependency,const DepSet * input,Block * bb,InstList::iterator instIter,std::vector<SBID> & activeSBID)660 void SWSBAnalyzer::processActiveSBID(SWSB &distanceDependency, const DepSet* input,
661 Block *bb, InstList::iterator instIter, std::vector<SBID>& activeSBID)
662 {
663 // If instruction depends on one or more SBIDS, first one goes in to SWSB field
664 // for rest we generate wait instructions.
665 for (auto aSBID : activeSBID)
666 {
667 // Could be we had operation depending on the write
668 /*
669 * This case also gets triggered when we have send in BB and dependence in another BB
670 * L0:
671 * call (16|M0) r8.0 L64
672 * L16:
673 * sendc.rc (16|M0) null r118 null 0x0 0x140B1000 {$0} // wr:10h, rd:0, Render Target Write msc:16, to #0
674 * L64:
675 * (W) mov (16|M0) r118.0<1>:ud r6.0<8;8,1>:ud
676 * (W) send.dc0 (16|M0) r38 r118 null 0x0 a0.0 {@1, $0}
677 * ret (16|M0) r8.0 {@3}
678 * After first BB in which sendc.rc ends we clear all SBID and generate sync instructions
679 * On mov it detects dependense, but all SBID are freed.
680 */
681 if (m_freeSBIDList[aSBID.sbid].isFree)
682 {
683 continue;
684 }
685
686 SWSB::TokenType tType = SWSB::TokenType::NOTOKEN;
687 if (aSBID.dType == DEP_TYPE::READ ||
688 aSBID.dType == DEP_TYPE::READ_ALWAYS_INTERFERE)
689 {
690 tType = SWSB::TokenType::SRC;
691 }
692 else
693 {
694 tType = SWSB::TokenType::DST;
695 //if SBID is cleared add it back to free pool
696 //write is last thing. So if instruction depends on it we know read is done
697 //but not vice versa
698 m_freeSBIDList[aSBID.sbid].reset();
699 // clean up the dependency
700 assert(m_IdToDepSetMap.find(aSBID.sbid) != m_IdToDepSetMap.end());
701 assert(m_IdToDepSetMap[aSBID.sbid].first->getDepClass() == DEP_CLASS::OUT_OF_ORDER);
702 clearDepBuckets(*m_IdToDepSetMap[aSBID.sbid].first);
703 clearDepBuckets(*m_IdToDepSetMap[aSBID.sbid].second);
704 }
705
706 // Setting first SBID as part of instruction
707 // If this instruction depends on more SBID, generate sync for the extra ids
708 // TODO: Is it safe to clear SBID here?
709 if (distanceDependency.tokenType == SWSB::TokenType::NOTOKEN)
710 {
711 distanceDependency.tokenType = tType;
712 distanceDependency.sbid = aSBID.sbid;
713 } else {
714 // add sync for the id
715 SWSB sync_swsb(SWSB::DistType::NO_DIST, tType, 0, aSBID.sbid);
716 auto nopInst = m_kernel.createSyncNopInstruction(sync_swsb);
717 bb->insertInstBefore(instIter, nopInst);
718 }
719 }
720
721 // verify if the combination of token and dist is valid, if not, move the
722 // token dependency out and add a sync for it
723 if (!distanceDependency.verify(m_swsbMode, input->getInstruction()->getSWSBInstType(m_swsbMode))) {
724 // add sync for the id
725 SWSB sync_swsb(SWSB::DistType::NO_DIST, distanceDependency.tokenType, 0,
726 distanceDependency.sbid);
727 auto nopInst = m_kernel.createSyncNopInstruction(sync_swsb);
728 bb->insertInstBefore(instIter, nopInst);
729 distanceDependency.tokenType = SWSB::TokenType::NOTOKEN;
730 distanceDependency.sbid = 0;
731 }
732 assert(distanceDependency.verify(m_swsbMode, input->getInstruction()->getSWSBInstType(m_swsbMode)));
733 }
734
getNumOfDistPipe()735 uint32_t SWSBAnalyzer::getNumOfDistPipe()
736 {
737 return getNumOfDistPipe(m_swsbMode);
738 }
739
getNumOfDistPipe(SWSB_ENCODE_MODE mode)740 uint32_t SWSBAnalyzer::getNumOfDistPipe(SWSB_ENCODE_MODE mode)
741 {
742 switch(mode) {
743 case SWSB_ENCODE_MODE::SingleDistPipe:
744 return 1;
745 case SWSB_ENCODE_MODE::ThreeDistPipe:
746 return 3;
747 case SWSB_ENCODE_MODE::FourDistPipe:
748 case SWSB_ENCODE_MODE::FourDistPipeReduction:
749 return 4;
750 default:
751 break;
752 }
753 return 0;
754 }
755
advanceInorderInstCounter(DEP_PIPE dep_pipe)756 void SWSBAnalyzer::advanceInorderInstCounter(DEP_PIPE dep_pipe)
757 {
758 ++m_InstIdCounter.inOrder;
759 if (getNumOfDistPipe() == 1)
760 return;
761
762 switch (dep_pipe) {
763 case DEP_PIPE::FLOAT:
764 ++m_InstIdCounter.floatPipe;
765 break;
766 case DEP_PIPE::INTEGER:
767 ++m_InstIdCounter.intPipe;
768 break;
769 case DEP_PIPE::LONG64:
770 ++m_InstIdCounter.longPipe;
771 break;
772 case DEP_PIPE::MATH_INORDER:
773 ++m_InstIdCounter.mathPipe;
774 break;
775 default:
776 IGA_ASSERT(0, "unhandled in-order DEP_PIPE for XE_HP encoding");
777 break;
778 }
779 }
780
addRMWDependencyIfReqruied(DepSet & input,DepSet & output)781 void SWSBAnalyzer::addRMWDependencyIfReqruied(DepSet& input, DepSet& output) {
782 const Instruction* inst = input.getInstruction();
783 // return if the instruction has no dst, or the dst is not GRF or not byte type
784 const Operand& dst = inst->getDestination();
785 if (dst.getKind() != Operand::Kind::DIRECT)
786 return;
787
788 if (dst.getDirRegName() != RegName::GRF_R)
789 return;
790
791 if (TypeSizeInBitsWithDefault(dst.getType(), 32) != 8)
792 return;
793
794 // When there is RMW behavior, the instruction will read the Word first,
795 // modify the byte value in it and then write back the entire Word.
796 // we assume the instruction will read/write the entire register to simplify
797 // the logic
798
799 // add the entire grf of the dst register into input and output DepSet
800 // All registers being touched are added into Bucket. We can get the touched grf
801 // number from added bucket index
802 const std::vector<size_t>& out_buk = output.getBuckets();
803 for (auto i : out_buk) {
804 // we only need grf bucket
805 if (i >= m_DB->getBucketStart(RegName::ARF_A))
806 continue;
807 input.addGrf(i);
808 input.addToBucket((uint32_t)i);
809 output.addGrf(i);
810 }
811 }
812
addSWSBToInst(Instruction & inst,const SWSB & swsb,Block & block,InstListIterator inst_it)813 void SWSBAnalyzer::addSWSBToInst(Instruction& inst,
814 const SWSB& swsb,
815 Block& block,
816 InstListIterator inst_it)
817 {
818 SWSB new_swsb(inst.getSWSB());
819 // handling distance
820 if (swsb.hasDist()) {
821 if (!inst.getSWSB().hasDist()) {
822 new_swsb.distType = swsb.distType;
823 new_swsb.minDist = swsb.minDist;
824 } else {
825 // for single dist pipe platform, distType must be REG_DIST, so won't
826 // be set to REG_DIST_ALL
827 new_swsb.distType = (inst.getSWSB().distType == swsb.distType)?
828 swsb.distType : SWSB::DistType::REG_DIST_ALL;
829 new_swsb.minDist = std::min(inst.getSWSB().minDist, swsb.minDist);
830 }
831 }
832
833 // handling token
834 if (swsb.hasToken()) {
835 if (!inst.getSWSB().hasToken()) {
836 new_swsb.tokenType = swsb.tokenType;
837 new_swsb.sbid = swsb.sbid;
838 } else {
839 // if both has id, and are different, then insert a sync to carry
840 // the new one, otherwise do nothing
841 if ((inst.getSWSB().tokenType != swsb.tokenType) ||
842 (inst.getSWSB().sbid != swsb.sbid)) {
843 SWSB tmp_swsb(SWSB::DistType::NO_DIST, swsb.tokenType,
844 0, swsb.sbid);
845 Instruction* sync_inst = m_kernel.createSyncNopInstruction(tmp_swsb);
846 block.insertInstBefore(inst_it, sync_inst);
847 }
848 }
849 }
850
851 // check if the new swsb combination is valid, if not, move the dist out to a sync
852 // FIXME: move the dist out here to let the sbid set on the instruction could have better
853 // readability, but a potential issue is that A@1 is required to be set on the instruction having
854 // architecture read/write. This case A@1 will be moved out from the instruction
855 if (!new_swsb.verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode))) {
856 SWSB tmp_swsb(swsb.distType, SWSB::TokenType::NOTOKEN,
857 swsb.minDist, 0);
858 Instruction* sync_inst = m_kernel.createSyncNopInstruction(tmp_swsb);
859 block.insertInstBefore(inst_it, sync_inst);
860
861 new_swsb.distType = SWSB::DistType::NO_DIST;
862 new_swsb.minDist = 0;
863 }
864
865 inst.setSWSB(new_swsb);
866 IGA_ASSERT(inst.getSWSB().verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode)),
867 "Invalid swsb dist/token combination after merge");
868 }
869
isSyncNop(const Instruction & i)870 static bool isSyncNop(const Instruction &i) {
871 return i.is(Op::SYNC) && i.getSyncFc() == SyncFC::NOP;
872 };
873
postProcess()874 void SWSBAnalyzer::postProcess()
875 {
876 // revisit all instructions
877 for (Block* bb : m_kernel.getBlockList())
878 {
879 InstList& instList = bb->getInstList();
880 for (auto inst_it = instList.begin(); inst_it != instList.end(); ++inst_it)
881 {
882 Instruction* inst = *inst_it;
883 // move all swsb set on the second instruction to the first for
884 // "instruction combined" case on byte type dst. e.g.
885 // (W) mov (32|M0) r13.0<2>:ub r11.0<1;1,0>:uw {Atomic}
886 // (W) mov (32|M0) r13.1<2>:ub r10.0<1;1,0>:uw
887 if (m_kernel.getModel().hasReadModifiedWriteOnByteDst() &&
888 inst->hasInstOpt(InstOpt::ATOMIC) &&
889 !inst->getOpSpec().isDpasFamily() &&
890 !inst->getOpSpec().isSendOrSendsFamily() &&
891 inst->getDestination().getDirRegName() == RegName::GRF_R &&
892 TypeSizeInBitsWithDefault(inst->getDestination().getType(), 32) == 8)
893 {
894 auto next_it = inst_it;
895 ++next_it;
896 assert(next_it != instList.end());
897 Instruction* next_inst = *next_it;
898
899 // in case the next instructions have sync carrying its swsb, move
900 // sync to before current instruction
901 // - Make sure current inst is not the last inst other than sync
902 InstList sync_insts;
903 while (next_inst->is(Op::SYNC)) {
904 sync_insts.push_back(next_inst);
905 ++next_it;
906 if (next_it == instList.end())
907 break;
908 next_inst = *next_it;
909 }
910
911 if (next_it == instList.end()) {
912 // An unexpected instruction with {Atomic} set but has no following
913 // instruction that can be combined with it
914 assert(next_it != instList.end());
915 continue;
916 }
917
918 // - move sync to before current inst
919 if (!sync_insts.empty()) {
920 auto remove_start = inst_it;
921 ++remove_start;
922 instList.erase(remove_start, next_it);
923 instList.insert(inst_it, sync_insts.begin(), sync_insts.end());
924 }
925
926 // the following instruction must not have Atomic set, or we do not
927 // know what should do
928 IGA_ASSERT((!next_inst->hasInstOpt(InstOpt::ATOMIC)),
929 "Atomic followed by Atomic on fixed latency instructions");
930
931 SWSB next_swsb = next_inst->getSWSB();
932 if (next_swsb.hasSWSB()) {
933 addSWSBToInst(*inst, next_swsb, *bb, inst_it);
934 next_inst->setSWSB(SWSB());
935 }
936 }
937
938 }
939 }
940 // revisit all instructions to remove redundant sync.nop
941 // sync.nop carry the sbid the same as the sbid set on the following instruction can be
942 // removed since it'll automatically be sync-ed when sbid is reused. For example:
943 // sync.nop null {$0.dst} // can be removed
944 // math.exp(8|M0) r12.0<1>:f r10.0<8;8,1>:f {$0}
945 for (Block* bb : m_kernel.getBlockList())
946 {
947 InstList& instList = bb->getInstList();
948 if (instList.empty())
949 continue;
950 auto inst_it = instList.begin();
951 // skip the first instruction, which must not be sync
952
953 ++inst_it;
954 for (; inst_it != instList.end(); ++inst_it)
955 {
956 Instruction* inst = *inst_it;
957 if (isSyncNop(*inst))
958 continue;
959 SWSB cur_swsb = inst->getSWSB();
960 if (cur_swsb.hasToken() && (cur_swsb.tokenType == SWSB::TokenType::SET)) {
961 // iterate through the previous sync
962 auto sync_it = inst_it;
963 --sync_it;
964 while (sync_it != instList.begin()) {
965 Instruction* sync_inst = *sync_it;
966 if (!isSyncNop(*sync_inst))
967 break;
968 SWSB sync_swsb = sync_inst->getSWSB();
969 // if the sync has sbid set, it could be the reserved sbid for shoot down
970 // instructions, we should keep it.
971 if (sync_swsb.hasToken() && sync_swsb.tokenType != SWSB::TokenType::SET &&
972 sync_swsb.sbid == cur_swsb.sbid) {
973 // clean the swsb so that we can remove this instruction later
974 sync_inst->setSWSB(SWSB());
975 }
976 --sync_it;
977 }
978 }
979 }
980 // remove the redundant sync.nop (sync.nop with no swsb)
981 instList.remove_if([](const Instruction* inst) {
982 return isSyncNop(*inst) && !inst->getSWSB().hasSWSB();
983 });
984 }
985 }
986
assignSBID(DepSet * input,DepSet * output,Instruction & inst,SWSB & distanceDependency,InstList::iterator insertPoint,Block * curBB,bool needSyncForShootDown)987 SBID& SWSBAnalyzer::assignSBID(DepSet* input, DepSet* output, Instruction& inst, SWSB& distanceDependency,
988 InstList::iterator insertPoint, Block *curBB, bool needSyncForShootDown)
989 {
990 bool foundFree = false;
991 SBID *sbidFree = nullptr;
992 for (uint32_t i = 0; i < m_SBIDCount; ++i)
993 {
994 if (m_freeSBIDList[i].isFree)
995 {
996 foundFree = true;
997 sbidFree = &m_freeSBIDList[i];
998 m_freeSBIDList[i].sbid = i;
999 break;
1000 }
1001 }
1002 // no free SBID.
1003 if (!foundFree)
1004 {
1005 unsigned int index = (m_SBIDRRCounter++) % m_SBIDCount;
1006
1007 // While swsb id being reuse, the dependency will automatically resolved by hardware,
1008 // so cleanup the dependency bucket for instruction that previously used this id
1009 assert(m_IdToDepSetMap.find(index) != m_IdToDepSetMap.end());
1010 assert(m_IdToDepSetMap[index].first->getDepClass() == DEP_CLASS::OUT_OF_ORDER);
1011 clearDepBuckets(*m_IdToDepSetMap[index].first);
1012 clearDepBuckets(*m_IdToDepSetMap[index].second);
1013
1014 m_freeSBIDList[index].reset();
1015 sbidFree = &m_freeSBIDList[index];
1016 sbidFree->sbid = index;
1017 }
1018 sbidFree->isFree = false;
1019 input->setSBID(*sbidFree);
1020 output->setSBID(*sbidFree);
1021 if (m_IdToDepSetMap.find(sbidFree->sbid) != m_IdToDepSetMap.end())
1022 m_IdToDepSetMap.erase(sbidFree->sbid);
1023 m_IdToDepSetMap.emplace(sbidFree->sbid, std::make_pair(input, output));
1024
1025 // adding the set for this SBID
1026 // if the swsb has the token set already, move it out to a sync
1027 if (distanceDependency.tokenType != SWSB::TokenType::NOTOKEN) {
1028 SWSB tDep(SWSB::DistType::NO_DIST, distanceDependency.tokenType,
1029 0, distanceDependency.sbid);
1030 Instruction* tInst = m_kernel.createSyncNopInstruction(tDep);
1031 curBB->insertInstBefore(insertPoint, tInst);
1032 }
1033 // set the sbid
1034 distanceDependency.tokenType = SWSB::TokenType::SET;
1035 distanceDependency.sbid = sbidFree->sbid;
1036
1037 // verify if the token and dist combination is valid, if not, move the dist out to a sync
1038 // FIXME: move the dist out here to let the sbid set on the instruction could have better readability
1039 // but a potential issue is that A@1 is required to be set on the instruction having
1040 // architecture read/write. This case A@1 will be moved out from the instruction
1041 if (!distanceDependency.verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode))) {
1042 SWSB tDep(distanceDependency.distType, SWSB::TokenType::NOTOKEN,
1043 distanceDependency.minDist, 0);
1044 Instruction* tInst = m_kernel.createSyncNopInstruction(tDep);
1045 curBB->insertInstBefore(insertPoint, tInst);
1046 distanceDependency.distType = SWSB::DistType::NO_DIST;
1047 distanceDependency.minDist = 0;
1048 }
1049 assert(distanceDependency.verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode)));
1050
1051 // add a sync to preserve the token for possibly shooting down instruction
1052 if (needSyncForShootDown) {
1053 SWSB tDep(SWSB::DistType::NO_DIST, distanceDependency.tokenType,
1054 0, distanceDependency.sbid);
1055 Instruction* tInst = m_kernel.createSyncNopInstruction(tDep);
1056 curBB->insertInstBefore(insertPoint, tInst);
1057 }
1058
1059 assert(sbidFree != nullptr);
1060 return *sbidFree;
1061 }
1062
run()1063 void SWSBAnalyzer::run()
1064 {
1065 m_initPoint = true;
1066 m_distanceTracker.clear();
1067
1068 for (uint32_t i = 0; i < MAX_GRF_BUCKETS; ++i)
1069 {
1070 m_buckets[i].clearDependency();
1071 }
1072
1073 // init in order pipe id counters
1074 m_InstIdCounter.inOrder = 1;
1075 m_InstIdCounter.floatPipe = 1;
1076 m_InstIdCounter.intPipe = 1;
1077 m_InstIdCounter.longPipe = 1;
1078 m_InstIdCounter.mathPipe = 1;
1079
1080 // init the math WA struct
1081 // When there is a math instruction, when the following instruction has different
1082 // predication to the math, should assume the math taking the entire GRF in it's
1083 // dst no matter the access region and channels are.
1084 struct MathWAInfo {
1085 bool previous_is_math = false;
1086 DepSet* dep_set = nullptr;
1087 // a special id to identify this DepSet when trying to clean it from buckets
1088 const InstIDs math_id = {std::numeric_limits<uint32_t>::max(), 0};
1089 Instruction* math_inst = nullptr;
1090 SBID math_sbid = {0, true, DEP_TYPE::NONE};
1091
1092 void reset() {
1093 previous_is_math = false;
1094 dep_set = nullptr;
1095 math_inst = nullptr;
1096 math_sbid = {0, true, DEP_TYPE::NONE};
1097 }
1098 } math_wa_info;
1099
1100 Instruction* inst = nullptr;
1101 Block * lastBB = nullptr;
1102 for (auto bb : m_kernel.getBlockList())
1103 {
1104 bool blockEndsWithNonBranchInst = false;
1105 // resetting things for each bb
1106 lastBB = bb;
1107 InstList& instList = bb->getInstList(); // Don't use auto for over loaded return which has const...
1108 const auto instListEnd = instList.end();
1109 for (auto instIter = instList.begin(); instIter != instListEnd; ++instIter)
1110 {
1111 m_InstIdCounter.global++;
1112 inst = *instIter;
1113 DepSet* input = nullptr;
1114 DepSet* output = nullptr;
1115 size_t dpas_cnt_in_macro = 0;
1116
1117 if (math_wa_info.math_inst != nullptr)
1118 math_wa_info.previous_is_math = true;
1119 if (inst->getOpSpec().is(Op::MATH)) {
1120 math_wa_info.math_inst = inst;
1121
1122 // if the math following a math, we only care about the last math
1123 math_wa_info.previous_is_math = false;
1124 }
1125
1126 // recored the first instruction of a dpas macro, in case that inserting instructions (e.g. sync)
1127 // before the macro, those instructions have to be insert before first_inst_in_dpas_macro
1128 InstListIterator first_inst_in_dpas_macro = instList.end();
1129 if (inst->getOpSpec().isDpasFamily()) {
1130 std::pair<DepSet*, DepSet*> dep_set_pair =
1131 m_DB->createDPASSrcDstDepSet(
1132 instList, instIter, m_InstIdCounter, dpas_cnt_in_macro, m_swsbMode);
1133 input = dep_set_pair.first;
1134 output = dep_set_pair.second;
1135
1136 first_inst_in_dpas_macro = instIter;
1137 // bypass dpas insturctions in the macro, the last dpas represents the macro
1138 for (size_t i = 0; i < dpas_cnt_in_macro - 1; ++i) {
1139 ++instIter;
1140 }
1141 inst = *instIter;
1142 } else {
1143 input = m_DB->createSrcDepSet(*inst, m_InstIdCounter, m_swsbMode);
1144 output = m_DB->createDstDepSet(*inst, m_InstIdCounter, m_swsbMode);
1145 }
1146 input->setCompanion(output);
1147 output->setCompanion(input);
1148
1149 // XeHPC+ features
1150 if (m_kernel.getModel().hasReadModifiedWriteOnByteDst())
1151 addRMWDependencyIfReqruied(*input, *output);
1152
1153 SWSB distanceDependency;
1154
1155 // Either source or destination are indirect, or there are SR access,
1156 // We don't know what registers are being accessed
1157 // Need to flush all the sbids and set distance to 1
1158 if (input->hasIndirect() || output->hasIndirect() ||
1159 input->hasSR() || output->hasSR())
1160 {
1161 // clear out-of-order dependency, insert sync.allrd and sync.allwr
1162 // if there are un-resolved sbid dependecny
1163 // if this instruction itself is an out-of-order instruction, insert
1164 // sync.all anyway.
1165 InstListIterator insert_point = instIter;
1166 if (first_inst_in_dpas_macro != instList.end())
1167 insert_point = first_inst_in_dpas_macro;
1168 if (input->getDepClass() == DEP_CLASS::OUT_OF_ORDER)
1169 insertSyncAllRdWr(insert_point, bb);
1170 else
1171 clearSBIDDependence(insert_point, inst, bb);
1172
1173 // clear in-order dependency
1174 clearBuckets(input, output);
1175
1176 // will add direct accesses to buckets
1177 // adding dependencies to buckets
1178 for (auto bucketID : input->getBuckets())
1179 {
1180 m_buckets[bucketID].addDepSet(input);
1181 }
1182 for (auto bucketID : output->getBuckets())
1183 {
1184 m_buckets[bucketID].addDepSet(output);
1185 }
1186
1187 // set to check all dist pipes
1188 if (getNumOfDistPipe() == 1)
1189 distanceDependency.distType = SWSB::DistType::REG_DIST;
1190 else
1191 distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
1192
1193 distanceDependency.minDist = 1;
1194 // input and output must have the same dep class and in the same pipe
1195 // so check the input only to add the instCounter
1196 // FIXME: is it possilbe that a instruction has output and no input?
1197 if (input->getDepClass() == DEP_CLASS::IN_ORDER)
1198 advanceInorderInstCounter(input->getDepPipe());
1199
1200 // if this is an out-of-order instruction, we still need to assign an sbid for it
1201 if (output->getDepClass() == DEP_CLASS::OUT_OF_ORDER)
1202 assignSBID(input, output, *inst, distanceDependency, insert_point, bb, false);
1203
1204 inst->setSWSB(distanceDependency);
1205 // clean up math_wa_info, this instruction force to sync all, no need to consider
1206 // math wa
1207 if (math_wa_info.previous_is_math) {
1208 math_wa_info.reset();
1209 }
1210 // early out, no need to calculateDependenc that all dependencies are resolved.
1211 continue;
1212 } // end indirect access handling
1213
1214 if (math_wa_info.previous_is_math) {
1215 // math WA affect the instruction right after the math, and with different predication
1216 // Add the WA math dst region to Buckets
1217 if (math_wa_info.math_inst->getPredication().function != inst->getPredication().function) {
1218 math_wa_info.dep_set =
1219 m_DB->createMathDstWADepSet(*math_wa_info.math_inst, math_wa_info.math_id, m_swsbMode);
1220 math_wa_info.dep_set->setSBID(math_wa_info.math_sbid);
1221 for (auto bucketID : math_wa_info.dep_set->getBuckets())
1222 {
1223 IGA_ASSERT(bucketID < m_DB->getTOTAL_BUCKETS(), "buckedID out of range");
1224 m_buckets[bucketID].addDepSet(math_wa_info.dep_set);
1225 }
1226 }
1227 }
1228
1229 std::vector<SBID> activeSBID;
1230 bool needSyncForShootDown = false;
1231 // Calculates dependence between this instruction dependencies and previous ones.
1232 calculateDependence(*input, distanceDependency, *inst, activeSBID, needSyncForShootDown);
1233 calculateDependence(*output, distanceDependency, *inst, activeSBID, needSyncForShootDown);
1234
1235 // clean up math_wa_info
1236 if (math_wa_info.previous_is_math) {
1237 if (math_wa_info.dep_set != nullptr)
1238 clearDepBuckets(*math_wa_info.dep_set);
1239 math_wa_info.reset();
1240 }
1241
1242 if (first_inst_in_dpas_macro != instList.end())
1243 processActiveSBID(distanceDependency, input, bb, first_inst_in_dpas_macro, activeSBID);
1244 else
1245 processActiveSBID(distanceDependency, input, bb, instIter, activeSBID);
1246
1247 // Need to set SBID
1248 if (output->getDepClass() == DEP_CLASS::OUT_OF_ORDER &&
1249 !(inst->getOpSpec().isSendFamily() && inst->hasInstOpt(InstOpt::EOT)))
1250 {
1251 InstList::iterator insertPoint = instIter;
1252 if (first_inst_in_dpas_macro != instList.end())
1253 insertPoint = first_inst_in_dpas_macro;
1254 SBID& assigned_id = assignSBID(input, output, *inst, distanceDependency,
1255 insertPoint, bb, needSyncForShootDown);
1256
1257 // record the sbid if it's math, for use of math wa
1258 if (inst->getOpSpec().is(Op::MATH)) {
1259 math_wa_info.math_sbid = assigned_id;
1260 }
1261 }
1262
1263 clearBuckets(input, output);
1264
1265 /*
1266 * Handling the case where everything is in one bb, and send with EOT is in the middle of instruction stream
1267 * call (16|M0) r8.0:ud 32
1268 * sendc.rc (16|M0) null r118 null 0x0 0x140B1000 {EOT} // wr:10h, rd:0, Render Target Write msc:16, to #0
1269 * ...
1270 * ret (16|M0) r8.0
1271 */
1272 if (!(inst->getOpSpec().isSendFamily() && inst->hasInstOpt(InstOpt::EOT)))
1273 {
1274 //adding dependencies to buckets
1275 for (auto bucketID : input->getBuckets())
1276 {
1277 // We want to check dependncy of regular instructions against
1278 // WRITE_ALWAYS_INTERFERE without adding them themselves
1279 if (bucketID == m_DB->getBucketStart(RegName::ARF_CR) &&
1280 input->getDepType() != DEP_TYPE::WRITE_ALWAYS_INTERFERE &&
1281 input->getDepType() != DEP_TYPE::READ_ALWAYS_INTERFERE)
1282 {
1283 continue;
1284 }
1285 m_buckets[bucketID].addDepSet(input);
1286 }
1287 for (auto bucketID : output->getBuckets())
1288 {
1289 IGA_ASSERT(bucketID < m_DB->getTOTAL_BUCKETS(),
1290 "buckedID out of range");
1291 // We want to check dependncy of regular instructions against
1292 // WRITE_ALWAYS_INTERFERE without adding them themselves
1293 if (bucketID == m_DB->getBucketStart(RegName::ARF_CR) &&
1294 output->getDepType() != DEP_TYPE::WRITE_ALWAYS_INTERFERE &&
1295 output->getDepType() != DEP_TYPE::READ_ALWAYS_INTERFERE)
1296 {
1297 continue;
1298 }
1299 m_buckets[bucketID].addDepSet(output);
1300 }
1301 }
1302
1303 if (input->getDepClass() == DEP_CLASS::IN_ORDER)
1304 {
1305 advanceInorderInstCounter(input->getDepPipe());
1306 }
1307
1308 // for dpas block, set the distance at the first inst in the block, and set the
1309 // swsb id at the last inst in the block.
1310 if ((first_inst_in_dpas_macro != instList.end()) && (*first_inst_in_dpas_macro != inst)) {
1311 (*first_inst_in_dpas_macro)->setSWSB(
1312 SWSB(distanceDependency.distType, SWSB::TokenType::NOTOKEN, distanceDependency.minDist, 0));
1313 inst->setSWSB(
1314 SWSB(SWSB::DistType::NO_DIST, distanceDependency.tokenType, 0, distanceDependency.sbid));
1315 } else {
1316 // if the input SWSB is a special token, preserve it and insert a sync before to carry the dependency info
1317 // Note that dpas must not have special token so we only do this check for non-dpas here
1318 if (inst->getSWSB().hasSpecialToken()) {
1319 if (distanceDependency.hasSWSB()) {
1320 Instruction* syncInst = m_kernel.createSyncNopInstruction(distanceDependency);
1321 bb->insertInstBefore(instIter, syncInst);
1322 }
1323 } else {
1324 inst->setSWSB(distanceDependency);
1325 }
1326 }
1327 assert(distanceDependency.verify(m_swsbMode, inst->getSWSBInstType(m_swsbMode)));
1328
1329 if (inst->isBranching())
1330 {
1331 //TODO: konrad : this is somewhat conservative, some
1332 //branch instructions might not need sync (join)
1333 blockEndsWithNonBranchInst = false;
1334 clearSBIDDependence(instIter, inst, bb);
1335 continue;
1336 }
1337 else
1338 {
1339 blockEndsWithNonBranchInst = true;
1340 }
1341 } //iterate on instr
1342 // clear read
1343 // clear write
1344 if (blockEndsWithNonBranchInst) {
1345 clearSBIDDependence(instList.end(), inst, bb);
1346 }
1347 } //iterate on basic block
1348
1349 // this code is for FC composite
1350 // if last instruction is not EOT we will insert flush instructions
1351 // and stall the pipeline since we do not do global analysis
1352 if (inst &&
1353 ((inst->getOpSpec().isSendFamily() &&
1354 !inst->getInstOpts().contains(InstOpt::EOT)) || !inst->getOpSpec().isSendFamily()))
1355 {
1356 SWSB swsb;
1357 if (getNumOfDistPipe() == 1)
1358 swsb.distType = SWSB::DistType::REG_DIST;
1359 else
1360 swsb.distType = SWSB::DistType::REG_DIST_ALL;
1361 swsb.minDist = 1;
1362 Instruction *syncInst = m_kernel.createSyncNopInstruction(swsb);
1363 lastBB->getInstList().push_back(syncInst);
1364 }
1365
1366 postProcess();
1367 return;
1368 }
1369