1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "RegDeps.hpp"
10 #include "Traversals.hpp"
11 #include "BitSet.hpp"
12 
13 #include <iterator>
14 #include <limits>
15 
16 using namespace iga;
17 
18 /**
 * RAW:                     R kill W    R-->live       explicit dependence
 * WAW: different pipelines W2 kill W1  W2-->live      explicit dependence
 * WAR: different pipelines W kill R    W-->live       explicit dependence
 * WAR: same pipeline       W kill R    W-->live       implicit dependence
 * RAR: same pipeline       R2 kill R1  R2-->live      no dependence
 * RAR: different pipelines             R1,R2-->live   no dependence
25  *
26  * Different pipeline
27  * send, math, control flow, long/short (type df)
28  *
29  * add (8) r10 r20 r30
30  * add (8) r11 r21 r22
31  * if (..)
32  *   // if instruction doesn't count in if calculations, but it takes about 6
33  *   // cycles to resolve for fall through can treat it as continue BB.
34  *   // Only when this BB has one predecessor
35  *   add r40 r10 r50 {@2}
36  * else
37  *   add r60 r70 r80
38  * endif
39  * //Both control flows converge on this. Conservative analysis start with 1.
40  * //By the time jmp happens counter should be at 0 anyway.
41  * add r90 r100 r110 {@1}
42  * add r91 r101 r111 {@2}
43  *
44  *
45  * Types of Dependencies
46  * dst   src0    src1
47  * grf   ind     grf // set distance to 1. If SBID list not empty insert test instruction. Optimization if SBID == 1 AND grf depends on it, set SBID, clear SBIDList
48  */
49 
50 /**
51     Bucket - represents a GRF line
52     ID - sequential numbering of instructions. Resets at 0 with new BB
53     Implicit assumption. Various data structures have pointers to Dependencies.
54 
55     For each BB scan instructions down
56         if new BB
57             reset buckets //can use bit mask
58             reset distanceTracker
59             reset ID
60         For each instruction
            Calculate dependency on srcs and destination
62             if currDistance < MAX_DIST
63                 record distance = currDistance // for a case where we at new BB
64 
65             For each dependency and bucket it touches look in to a bucket
66                 if bucket not empty
67                     find potential dependencies //bit mask intersects
68                     for each dependency found
                    if appropriate (WAW, WAR, RAW) Dependence exists
70                         Clear dependency bits from bucket dependency
71                         if dep empty remove from bucket
72                         if DistanceDependency //no out of order
73                             if instDistance > (currDistance - depID)
74                                 //We found dependence closer
75                                 record distance = currDistance - depID //CurrDistance > depID AND min(currDist - depID, 1)
76                         else //sbid
77                             record SBID ID
78 
79             if dependencyRecord NOT empty
80                 Generate appropriate SWSB/test instruction
81                 IF SBID
82                     if all dependencies are clear
83                         add SBID to free list
84                         remove entry SBID -> dependencies
85             Remove MAX_DIST DEP from buckets
86             Add current instruction Dependencies to buckets
87             if instruction isVariableExecTime //send, math.idiv
                if freeSBIDList IS empty
89                     pick one SBID
90                     generate test instruction
91                     move SBID to free list
92                     clear dependency from bucket/sbidList
93 
94                 assign SBID from free list
95         if end of block AND SBID list NOT empty
96             generate test instructions
97 */
98 
99 #include "SWSBSetter.hpp"
100 /*
101 WAW
102 explicit dependence
103 math.fc (except idiv) r10 ...
104 add r10 ....
105 
106 add r10 ... //long: type DF/Q
107 add r10 ... //short:
108 
109 WAW
110 no dependence
111 add r10 ...
112 add r10 ...
113 
114 
115 Math.sin   r10 r20 r30
116 Math.cos  r20 r40 r50
117 Not required - same pipe
118 
119 Math.sin   r20 r10 r30
120 Math.cos  r20 r40 r50
121 Not required - same pipe
122 
123 FPU_long   r20 r10 r30
124 Math.sin    r20 r40 r50
125 Explicit dep required as math can overtake FPU_long - since they are in different pipes.
126 
127 RAW
128 add r10 ...
129 add r20 ...
130 add ... r20 ... {@1}
add ... r10  {@3} <--- technically speaking this dependency is not necessary
132                        since they are in same pipe and previous instruction will stall
133                        so last instruction dependence is cleared.
134                        But in terms of runtime there is no impact so not worth special handling
135 
136 assuming two grfs are written/read
137 send r10
138 send r11
139 
140 add (16) ... r10 ...
141 second send has dependency on first send
142 add has dependency on second send
143 if sends written 1 grf, and add still read two grfs it will have dependence on both sends
144 
145 send r10 //set$1 writes r10/r11
146 add(8) r10 {$1.dst}
147 add(8) r11 {}
148 
149 
150 */
clearDepBuckets(DepSet & depMatch)151 void SWSBAnalyzer::clearDepBuckets(DepSet &depMatch)
152 {
153     for (auto bucketID : depMatch.getBuckets())
154     {
155         auto bucket = &m_buckets[bucketID];
156         auto numDepSets = bucket->getNumDependencies();
157         for (uint32_t i = 0; i < numDepSets; ++i)
158         {
159             DepSet* dep = bucket->getDepSet(i);
160             //See if anything matches for this GRF bucket.
161             //originally was checking for intersect but was removing extra dependence in case like this
162             /*
163                 (W)      and (1|M0)               r0.0<1>:ud    r0.0<0;1,0>:ud    0xFFFFBFFF:ud    {}
164                 (W)      mov (16|M0)              r25.0<1>:f    0:d
165                 (W)      mov (1|M0)               r0.2<1>:ud    0x0:ud
166                 (W)      mov (1|M0)               r0.2<1>:ud    0x0:ud
167                 (W)      and (1|M0)               r0.0<1>:ud    r0.0<0;1,0>:ud    0xFFFFF7FF:ud    {}
168                          mov (16|M0)              r120.0<1>:ud  r17.0<8;8,1>:ud
169                          mov (16|M0)              r122.0<1>:ud  r19.0<8;8,1>:ud
170                          mov (16|M0)              r124.0<1>:ud  r21.0<8;8,1>:ud
171                          mov (16|M0)              r126.0<1>:ud  r23.0<8;8,1>:ud
172                 (W)      mov (16|M0)              r118.0<1>:ud  r0.0<8;8,1>:ud                   {}
173             */
174             //the r0 dependece was already cleared by second r0
175             //but when clearing from buckets it would find the second r0 and clear it by mistake
176             if (dep && depMatch.getInstGlobalID() == dep->getInstGlobalID() &&
177                    (dep->getDepType() == depMatch.getDepType()))
178             {
179                 bucket->clearDepSet(i);
180             }
181         }
182     }
183     depMatch.reset();
184 }
/**
 * Checks one dependency set of the current instruction (either its SRC or its
 * DST set) against the previous dependencies still recorded in the buckets
 * that set touches.
 *
 * For every previous dependency that conflicts with currDep this function
 *   - records the minimum valid distance and the distance (pipe) type in
 *     distanceDependency when the previous dependency is IN_ORDER,
 *   - adds the previous dependency's SBID to activeSBID (through
 *     setSbidDependency) when the previous dependency is OUT_OF_ORDER,
 *   - clears and removes the previous dependency from the buckets.
 *
 * The approach is bucket based.
 * Each bucket is one GRF.
 * So if an instruction writes into more than one GRF then multiple buckets
 * will have the dependency.
 *
 * @param currDep                  SRC or DST dependency set of currInst
 * @param distanceDependency       in/out: accumulated distance requirement;
 *                                 minDist == 0 is treated as "nothing recorded yet"
 * @param currInst                 the instruction being analyzed
 * @param activeSBID               in/out: SBIDs that currInst depends on
 * @param needSyncForShootDownInst out: set to false on entry; may be set to
 *                                 true by setSbidDependency
 */
void SWSBAnalyzer::calculateDependence(DepSet &currDep, SWSB &distanceDependency,
    const Instruction &currInst, std::vector<SBID>& activeSBID, bool &needSyncForShootDownInst)
{
    needSyncForShootDownInst = false;
    auto currDepType = currDep.getDepType();
    auto currDepPipe = currDep.getDepPipe();

    for (auto bucketID : currDep.getBuckets())
    {
        // iterates over Dependencies in a GRF bucket
        // Assumption: there shouldn't be more than 1-2
        Bucket* bucket = &m_buckets[bucketID];
        size_t numDepSets = bucket->getNumDependencies();
        for (uint32_t i = 0; i < numDepSets; ++i)
        {
            // visit the slots from the highest index down to 0
            uint32_t index = static_cast<uint32_t>(numDepSets -1 - i);
            auto dep = bucket->getDepSet(index);

            // *_ALWAYS_INTERFERE entries are handled before (and independent
            // of) the bit-overlap test below
            if (dep && (dep->getDepType() == DEP_TYPE::WRITE_ALWAYS_INTERFERE ||
                        dep->getDepType() == DEP_TYPE::READ_ALWAYS_INTERFERE))
            {
                // force to sync with dep
                if (dep->getDepClass() == DEP_CLASS::OUT_OF_ORDER)
                {
                    setSbidDependency(*dep, currInst, needSyncForShootDownInst, activeSBID);
                }
                else
                {
                    // Set to sync with all in-order-pipes. WRITE/READ_ALWAYS_INTERFERE
                    // could be used to mark arf dependency, which is required to be all pipes
                    // instead of dep's pipe only
                    distanceDependency.minDist = 1;
                    if (getNumOfDistPipe() == 1)
                        distanceDependency.distType = SWSB::DistType::REG_DIST;
                    else
                        distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                    bucket->clearDepSet(index);
                }
            }
            // NOTE(review): `dep` is still non-null here even when it was just
            // handled above, so it also goes through the overlap test below.
            // Presumably harmless because the interfere types match none of
            // the RAW/WAW/WAR predicates - confirm.

            //See if anything matches for this GRF bucket.
            if (dep && dep->getBitSet().intersects(currDep.getBitSet()))
            {
                /*
                 * RAW:                     R kill W    R-->live       explicit dependence
                 * WAW: different pipelines W2 kill W1  W2-->live      explicit dependence
                 * WAR: different pipelines W kill R    W-->live       explicit dependence
                 * WAR: same pipeline       W kill R    W-->live       implicit dependence
                 * RAR: same pipeline       R2 kill R1  R2-->live      no dependence
                 * RAR: different pipelines             R1,R2-->live   no dependence
                 */
                //RAW:                     R kill W    R-->live       explicit dependence
                DEP_TYPE prevDepType = dep->getDepType();
                DEP_PIPE prevDepPipe = dep->getDepPipe();
                DEP_CLASS prevDepClass = dep->getDepClass();

                // Send with different SFID could write to different pipes
                bool sendInDiffPipe = false;
                if (dep->getInstruction()->getOpSpec().isSendFamily() &&
                    currDep.getInstruction()->getOpSpec().isSendFamily())
                {
                    sendInDiffPipe =
                        (dep->getInstruction()->getSendFc() !=
                         currDep.getInstruction()->getSendFc());
                    // for send in unknown pipe, always treated as different pipe
                    if (!sendInDiffPipe) {
                        sendInDiffPipe =
                            dep->getDepPipe() == DEP_PIPE::SEND_UNKNOWN ||
                            currDep.getDepPipe() == DEP_PIPE::SEND_UNKNOWN;
                    }
                }

                bool isRAW = currDepType == DEP_TYPE::READ &&
                             prevDepType == DEP_TYPE::WRITE;
                //WAW: different pipelines W2 kill W1  W2-->live      explicit dependence
                bool isWAW = (currDepType == DEP_TYPE::WRITE &&
                              prevDepType == DEP_TYPE::WRITE &&
                     (currDepPipe != prevDepPipe || sendInDiffPipe));
                //WAR: different pipelines W kill R    W-->live       explicit dependence
                bool isWAR = currDepType == DEP_TYPE::WRITE &&
                             prevDepType == DEP_TYPE::READ  &&
                             (currDepPipe != prevDepPipe || sendInDiffPipe);
                // WAW against an out-of-order write is flagged without any
                // pipe comparison
                bool isWAW_out_of_order
                           = (currDepType == DEP_TYPE::WRITE &&
                              prevDepType == DEP_TYPE::WRITE &&
                              prevDepClass == DEP_CLASS::OUT_OF_ORDER);

                // Special case handling for acc/flag dependency:
                // if the RAW dependency is on acc/flag only and it's within the same pipe,
                // HW can handle it so that we don't need to set swsb
                if (isRAW && currDepPipe == prevDepPipe) {
                    // Each helper asks whether in_dep and currDep overlap inside one
                    // register range obtained from m_DB (presumably the three-argument
                    // intersects() restricts the test to [reg_start, reg_start + reg_len)
                    // - confirm against BitSet).
                    auto check_dep_reg = [&](DepSet* in_dep, uint32_t reg_start, uint32_t reg_len) {
                        return in_dep->getBitSet().intersects(currDep.getBitSet(),
                            reg_start, reg_len);
                    };
                    auto has_grf_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getGRF_START(), m_DB->getGRF_LEN());
                    };
                    auto has_arf_a_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_A_START(), m_DB->getARF_A_LEN());
                    };
                    auto has_acc_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_ACC_START(), m_DB->getARF_ACC_LEN());
                    };
                    auto has_flag_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_F_START(), m_DB->getARF_F_LEN());
                    };
                    auto has_sp_dep = [&](DepSet* in_dep) {
                        return check_dep_reg(in_dep, m_DB->getARF_SPECIAL_START(), m_DB->getARF_SPECIAL_LEN());
                    };

                    // is acc dependency
                    if (has_acc_dep(dep)) {
                        // and no dependency on other registers
                        if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_flag_dep(dep) || has_sp_dep(dep)))
                            isRAW = false;
                    }
                    // is flag dependency
                    if (has_flag_dep(dep)) {
                        // and no dependency on other registers
                        if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_acc_dep(dep) || has_sp_dep(dep)))
                            isRAW = false;
                        // flag and acc only
                        if (has_acc_dep(dep))
                            if (!(has_grf_dep(dep) || has_arf_a_dep(dep) || has_sp_dep(dep)))
                                isRAW = false;
                    }
                }

                if (isWAR ||
                    isWAW ||
                    isRAW ||
                    isWAW_out_of_order)
                {
                    // clearing previous dependence
                    if (dep->getBitSet().empty())
                    {
                        m_errorHandler.reportWarning(
                            currInst.getPC(),
                            "Dependency in bucket with no bits set");
                    }
                    // removing from bucket if there is nothing
                    // NOTE(review): the start offset uses a hard-coded 32 while the
                    // length uses getGRF_BYTES_PER_REG(); confirm this is intended
                    // when the two differ.
                    if (!dep->getBitSet().testAny(bucketID * 32, m_DB->getGRF_BYTES_PER_REG()))
                    {
                        bucket->clearDepSet(index);
                    }
                    if (prevDepClass == DEP_CLASS::IN_ORDER)
                    {
                        if (getNumOfDistPipe() == 1) {
                            // FOR WAW if PREV is SHORT and curr is LONG then write will finish
                            // before current write, no need to set swsb
                            // (despite its name, isWAWHazard == true is the case where
                            // NO swsb is set: prev is SHORT, curr is LONG or SHORT, and
                            // the dependency is a WAW)
                            bool isWAWHazard = (prevDepPipe == DEP_PIPE::SHORT && currDepPipe == DEP_PIPE::LONG ||
                                                prevDepPipe == DEP_PIPE::SHORT && currDepPipe == DEP_PIPE::SHORT)
                                               && isWAW;
                            // require swsb for all the other kinds of dependency
                            if (!isWAWHazard)
                            {
                                // setting minimum distance
                                // (minDist == 0 means no distance has been recorded yet)
                                uint32_t newDistance = m_InstIdCounter.inOrder - dep->getInstIDs().inOrder;
                                distanceDependency.minDist =
                                    distanceDependency.minDist == 0 ?
                                    newDistance :
                                    std::min(distanceDependency.minDist, newDistance);
                                // clamp the distance to max distance
                                distanceDependency.minDist = std::min(distanceDependency.minDist, (uint32_t)MAX_VALID_DISTANCE);
                                distanceDependency.distType = SWSB::DistType::REG_DIST;
                            }
                        } else {
                            // For multiple in-order pipeline architecture, all cases should be considered
                            // The distance depends on the previous instruction's pipeline:
                            // it is measured with that pipe's own instruction counter
                            uint32_t newDistance = 0;
                            SWSB::DistType newDepPipe = SWSB::DistType::NO_DIST;
                            switch (prevDepPipe) {
                            case DEP_PIPE::FLOAT:
                                newDistance = m_InstIdCounter.floatPipe - dep->getInstIDs().floatPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_FLOAT;
                                break;
                            case DEP_PIPE::INTEGER:
                                newDistance = m_InstIdCounter.intPipe - dep->getInstIDs().intPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_INT;
                                break;
                            case DEP_PIPE::LONG64:
                                newDistance = m_InstIdCounter.longPipe - dep->getInstIDs().longPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_LONG;
                                break;
                           case DEP_PIPE::MATH_INORDER:
                                newDistance = m_InstIdCounter.mathPipe - dep->getInstIDs().mathPipe;
                                newDepPipe = SWSB::DistType::REG_DIST_MATH;
                                break;
                            default:
                                IGA_ASSERT(0, "Unsupported DEP_PIPE for in-order instructions");
                                break;
                            }

                            // the instruction already has dependency to others
                            if (distanceDependency.minDist) {
                                newDistance = std::min(distanceDependency.minDist, newDistance);
                                // if the type is REG_DIST_ALL or is the same as the new pipe type,
                                // then keep it. Otherwise update the swsb type
                                if ((distanceDependency.distType != newDepPipe) && (distanceDependency.distType != SWSB::DistType::REG_DIST_ALL)) {
                                    // get the pipe_type from opnd type
                                    auto op_pipe_type = [](Type op_type) {
                                        if (TypeIs64b(op_type))
                                            return SWSB::DistType::REG_DIST_LONG;
                                        if (TypeIsFloating(op_type))
                                            return SWSB::DistType::REG_DIST_FLOAT;
                                        return SWSB::DistType::REG_DIST_INT;
                                    };
                                    // check if the given pipe type is the same as one of the src types
                                    auto haveTypeInSrc = [&](SWSB::DistType swsb_type) {
                                        // HW restriction (WA): Cannot use @1 on XeHPC-XT, must explicitly set pipe type
                                        // A@1 or L@1, ... Always return false so that we won't use @1
                                        // Note that if there isn't this restriction, we should also update op_pipe_type
                                        // for FourDistPipeReduction mode that non-float-64-bit type should be in INT pipe
                                        if (m_swsbMode == SWSB_ENCODE_MODE::FourDistPipeReduction) {
                                            return false;
                                        }
                                        for (size_t i = 0; i < currInst.getSourceCount(); ++i) {
                                            if (op_pipe_type(currInst.getSource(i).getType()) == swsb_type)
                                                return true;
                                        }
                                        return false;
                                    };
                                    if ((distanceDependency.distType == SWSB::DistType::REG_DIST_MATH) ||
                                        (newDepPipe == SWSB::DistType::REG_DIST_MATH)) {
                                        // either current or prev dep is MATH, it's not possible to combine them to REG_DIST
                                        distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                                    } else if ((distanceDependency.distType != SWSB::DistType::REG_DIST)) {
                                        // check if both previous and current dep pipe can be satisfied by currInst src type
                                        if (haveTypeInSrc(distanceDependency.distType) && haveTypeInSrc(newDepPipe))
                                            distanceDependency.distType = SWSB::DistType::REG_DIST;
                                        else
                                            distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                                    } else {
                                        // if previous one is REG_DIST, set the type to REG_DIST_ALL if
                                        // current one cannot be satisfied by src type
                                        if (!haveTypeInSrc(newDepPipe))
                                            distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
                                    }
                                }
                            } else {
                                // first distance dependency found for this instruction
                                distanceDependency.distType = newDepPipe;
                            }
                            assert(distanceDependency.distType != SWSB::DistType::NO_DIST);
                            // clamp the distance to max distance
                            distanceDependency.minDist = std::min(newDistance, (uint32_t)MAX_VALID_DISTANCE);
                        } // end of else branch of (getNumOfDistPipe() == 1)
                        // clear this instruction's dependency since it is satisfied
                        clearDepBuckets(*dep);

                        // clear its companion because when an in-order instruction is synced, both its
                        // input and output dependency are satisfied. The only exception is that if it has
                        // read/write_always_interfere dependency, it should be preserved.
                        // The restriction is that:
                        // When certain Arch Registers (sr, cr, ce) are used,
                        // the very next instruction requires dependency to be set on all pipes {A@1}
                        // e.g.
                        //      mov (1|M0)               r104.0<1>:ud  sr0.1<0;1,0>:ud
                        //      cmp(16 | M0)   (ne)f0.0   null:ud    r104.0<0; 1, 0> : ub   r62.4<0; 1, 0> : uw
                        // A@1 is required for cmp instead of I@1
                        if (dep->getCompanion() != nullptr) {
                            // In the case that this DepSet is generated from math_wa_info, it won't have companion
                            if (dep->getCompanion()->getDepType() != DEP_TYPE::WRITE_ALWAYS_INTERFERE &&
                                dep->getCompanion()->getDepType() != DEP_TYPE::READ_ALWAYS_INTERFERE) {
                                clearDepBuckets(*dep->getCompanion());
                            }
                        }
                    } // end of if (prevDepClass == DEP_CLASS::IN_ORDER)
                    else if (prevDepClass == DEP_CLASS::OUT_OF_ORDER) // prev is out of order
                    {
                        setSbidDependency(*dep, currInst, needSyncForShootDownInst, activeSBID);
                    }
                    // for the instruction in "OTHER" DEP_CLASS, such as sync, we don't need
                    // to consider their dependency that is implied by hardware
                }
            }
        }
    }
}
475 
setSbidDependency(DepSet & dep,const Instruction & currInst,bool & needSyncForShootDownInst,std::vector<SBID> & activeSBID)476 void SWSBAnalyzer::setSbidDependency(DepSet& dep, const Instruction& currInst,
477     bool& needSyncForShootDownInst, std::vector<SBID>& activeSBID)
478 {
479     /* For out of order we don't know how long it will finish
480     * so need to test for SBID.
481     * Instruction can depend on more then one SBID
482     * send r10
483     * send r20
484     * send r30
485     * ....
486     * add r10 r20 r30
487     * between different buckets and srcs/dst dependencies instruction can rely on multiple SBID
488     */
489     SBID depSBID = dep.getSBID();
490     if (depSBID.isFree)
491     {
492         m_errorHandler.reportError((int)dep.getInstGlobalID(), "SBID SHOULDN'T BE FREE!");
493     }
494     // clears all the buckets
495     clearDepBuckets(dep);
496 
497     // In case of shooting down of this instruction, we need to add sync to preserve the swsb id sync,
498     // so that it's safe to clear the dep
499     if (currInst.hasPredication() ||
500         (currInst.getExecSize() != dep.getInstruction()->getExecSize()) ||
501         (currInst.getChannelOffset() != dep.getInstruction()->getChannelOffset()))
502         needSyncForShootDownInst = true;
503 
504     // used to set read or write dependency
505     depSBID.dType = dep.getDepType();
506 
507     // activeSBID stores all sbid that this inst has dependency on
508     // and it'll be processed in processActiveSBID
509     bool push_back = true;
510     // making sure there are no duplicates
511     for (auto& aSBID : activeSBID)
512     {
513         if (aSBID.sbid == depSBID.sbid)
514         {
515             //write takes longer then read
516             //so we only need to check on one.
517             //so this either sets a write or resets back to read
518             if (aSBID.dType == DEP_TYPE::READ)
519             {
520                 aSBID.dType = depSBID.dType;
521             }
522             push_back = false;
523             break;
524         }
525     }
526     // adding to active SBID
527     // in Run function we will see how many this instruction relies on
528     // and generate approriate SWSB and if needed test instruction
529     // in that level also will add them back to free list
530     if (push_back)
531     {
532         activeSBID.push_back(depSBID);
533     }
534 }
535 
insertSyncAllRdWr(InstList::iterator insertPoint,Block * bb)536 void SWSBAnalyzer::insertSyncAllRdWr(InstList::iterator insertPoint, Block *bb)
537 {
538     SWSB distanceDependency;
539     auto clearRD = m_kernel.createSyncAllRdInstruction(distanceDependency);
540     auto clearWR = m_kernel.createSyncAllWrInstruction(distanceDependency);
541 
542     if (insertPoint == bb->getInstList().end())
543     {
544         bb->getInstList().push_back(clearRD);
545         bb->getInstList().push_back(clearWR);
546     }
547     else
548     {
549         bb->insertInstBefore(insertPoint, clearRD);
550         bb->insertInstBefore(insertPoint, clearWR);
551     }
552 }
553 
554 //TODO this should also clear up grf dependency to handle this case:
555 /*
556 call (16|M0)             r8.0:ud          32
557 sendc.rc (16|M0)         null     r118  null  0x0         0x140B1000 {} //   wr:10h, rd:0, Render Target Write msc:16, to #0
558 (W)      mov (1|M0)               a0.0<1>:ud    r7.0<0;1,0>:ud
559 sendc.rc (16|M0)         null     r100  null  0x0         0x140B1000 {} //   wr:10h, rd:0, Render Target Write msc:16, to #0
560 sendc.rc (16|M0)         null     r118  null  0x0         0x140B1000 {} //   wr:10h, rd:0, Render Target Write msc:16, to #0
561 (W)      mov (16|M0)               r118.0<1>:ud  r6.0<8;8,1>:ud
562 (W)      send.dc0 (16|M0)         r38       r118  null  0x0         a0.0
563 ret (16|M0)
564 
Right now mov will have a false dependence on the first send.
566 */
clearSBIDDependence(InstList::iterator insertPoint,Instruction * lastInst,Block * bb)567 void SWSBAnalyzer::clearSBIDDependence(InstList::iterator insertPoint, Instruction *lastInst, Block *bb)
568 {
569     bool sbidInUse = false;
570     for (uint32_t i = 0; i < m_SBIDCount; ++i)
571     {
572         //there are still dependencies that might be used outside of this basic block
573         if (!m_freeSBIDList[i].isFree)
574         {
575             sbidInUse = true;
576         }
577         m_freeSBIDList[i].reset();
578     }
579 
580     // if last instruction in basic block is EOT no need to generate flushes
581     // hardware will take care of it
582     if (lastInst && lastInst->getOpSpec().isSendFamily() && lastInst->hasInstOpt(InstOpt::EOT))
583     {
584         sbidInUse = false;
585     }
586 
587     // platform check is mainly for testing purposes
588     if (sbidInUse)
589     {
590         insertSyncAllRdWr(insertPoint, bb);
591     }
592 }
593 
594 // Keeping track of dependencies that need to be cleared because they are no longer relevant
595 // right now each BB ends with control flow instruction, and we reset at each BB
clearBuckets(DepSet * input,DepSet * output)596 void SWSBAnalyzer::clearBuckets(DepSet* input, DepSet* output) {
597     if (input->getDepClass() != DEP_CLASS::IN_ORDER)
598         return;
599 
600     if (m_initPoint) {
601         m_distanceTracker.emplace_back(input, output);
602         m_initPoint = false;
603 
604     }
605     else {
606         // add DepSet to m_distanceTracker
607         m_distanceTracker.emplace_back(input, output);
608 
609         auto get_depset_id = [&](DEP_PIPE pipe_type, DepSet& dep_set) {
610             if (getNumOfDistPipe() == 1)
611                 return dep_set.getInstIDs().inOrder;
612             switch(pipe_type) {
613             case DEP_PIPE::FLOAT:
614                 return dep_set.getInstIDs().floatPipe;
615             case DEP_PIPE::INTEGER:
616                 return dep_set.getInstIDs().intPipe;
617             case DEP_PIPE::LONG64:
618                 return dep_set.getInstIDs().longPipe;
619             case DEP_PIPE::MATH_INORDER:
620                 return dep_set.getInstIDs().mathPipe;
621             default:
622                 IGA_ASSERT(0, "SWSB: unhandled in-order DEP_PIPE for XeHP+ encoding");
623                 break;
624             }
625             return (uint32_t)0;
626         };
627 
628         auto get_latency = [&](DEP_PIPE pipe_type) {
629             if (pipe_type == DEP_PIPE::LONG64)
630                 return m_LatencyLong64Pipe;
631             else if (pipe_type == DEP_PIPE::MATH_INORDER)
632                 return m_LatencyInOrderMath;
633             return m_LatencyInOrderPipe;
634         };
635 
636         DEP_PIPE new_pipe = input->getDepPipe();
637         // max B2B latency of thie pipe
638         size_t max_dis = get_latency(new_pipe);
639         // Remove nodes from the Tracker if the latency is already satified
640         m_distanceTracker.remove_if(
641             [=](const distanceTrackerNode& node) {
642                 // bypass nodes those are not belong to the same pipe
643                 if (node.input->getDepPipe() != new_pipe)
644                     return false;
645 
646                 // if the distance >= max_latency, clear buckets for corresponding
647                 // input and output Dependency
648                 size_t new_id = get_depset_id(new_pipe, *input);
649                 if ((new_id - get_depset_id(new_pipe, *node.input)) >= max_dis) {
650                     clearDepBuckets(*node.input);
651                     clearDepBuckets(*node.output);
652                     return true;
653                 }
654                 return false;
655             }
656         );
657     }
658 }
659 
processActiveSBID(SWSB & distanceDependency,const DepSet * input,Block * bb,InstList::iterator instIter,std::vector<SBID> & activeSBID)660 void SWSBAnalyzer::processActiveSBID(SWSB &distanceDependency, const DepSet* input,
661     Block *bb, InstList::iterator instIter, std::vector<SBID>& activeSBID)
662 {
663     // If instruction depends on one or more SBIDS, first one goes in to SWSB field
664     // for rest we generate wait instructions.
665     for (auto aSBID : activeSBID)
666     {
667         // Could be we had operation depending on the write
668         /*
669         *   This case also gets triggered when we have send in BB and dependence in another BB
670         *   L0:
671         *   call (16|M0)             r8.0          L64
672         *   L16:
673         *   sendc.rc (16|M0)         null     r118  null  0x0         0x140B1000 {$0} //   wr:10h, rd:0, Render Target Write msc:16, to #0
674         *   L64:
675         *   (W)      mov (16|M0)              r118.0<1>:ud  r6.0<8;8,1>:ud
676         *   (W)      send.dc0 (16|M0)         r38      r118  null  0x0         a0.0       {@1, $0}
677         *   ret (16|M0)                          r8.0                             {@3}
678         *   After first BB in which sendc.rc ends we clear all SBID and generate sync instructions
679         *   On mov it detects dependense, but all SBID are freed.
680         */
681         if (m_freeSBIDList[aSBID.sbid].isFree)
682         {
683             continue;
684         }
685 
686         SWSB::TokenType tType = SWSB::TokenType::NOTOKEN;
687         if (aSBID.dType == DEP_TYPE::READ ||
688             aSBID.dType == DEP_TYPE::READ_ALWAYS_INTERFERE)
689         {
690             tType = SWSB::TokenType::SRC;
691         }
692         else
693         {
694             tType = SWSB::TokenType::DST;
695             //if SBID is cleared add it back to free pool
696             //write is last thing. So if instruction depends on it we know read is done
697             //but not vice versa
698             m_freeSBIDList[aSBID.sbid].reset();
699             // clean up the dependency
700             assert(m_IdToDepSetMap.find(aSBID.sbid) != m_IdToDepSetMap.end());
701             assert(m_IdToDepSetMap[aSBID.sbid].first->getDepClass() == DEP_CLASS::OUT_OF_ORDER);
702             clearDepBuckets(*m_IdToDepSetMap[aSBID.sbid].first);
703             clearDepBuckets(*m_IdToDepSetMap[aSBID.sbid].second);
704         }
705 
706         // Setting first SBID as part of instruction
707         // If this instruction depends on more SBID, generate sync for the extra ids
708         // TODO: Is it safe to clear SBID here?
709         if (distanceDependency.tokenType == SWSB::TokenType::NOTOKEN)
710         {
711             distanceDependency.tokenType = tType;
712             distanceDependency.sbid = aSBID.sbid;
713         } else {
714             // add sync for the id
715             SWSB sync_swsb(SWSB::DistType::NO_DIST, tType, 0, aSBID.sbid);
716             auto nopInst = m_kernel.createSyncNopInstruction(sync_swsb);
717             bb->insertInstBefore(instIter, nopInst);
718         }
719     }
720 
721     // verify if the combination of token and dist is valid, if not, move the
722     // token dependency out and add a sync for it
723     if (!distanceDependency.verify(m_swsbMode, input->getInstruction()->getSWSBInstType(m_swsbMode))) {
724         // add sync for the id
725         SWSB sync_swsb(SWSB::DistType::NO_DIST, distanceDependency.tokenType, 0,
726                         distanceDependency.sbid);
727         auto nopInst = m_kernel.createSyncNopInstruction(sync_swsb);
728         bb->insertInstBefore(instIter, nopInst);
729         distanceDependency.tokenType = SWSB::TokenType::NOTOKEN;
730         distanceDependency.sbid = 0;
731     }
732     assert(distanceDependency.verify(m_swsbMode, input->getInstruction()->getSWSBInstType(m_swsbMode)));
733 }
734 
getNumOfDistPipe()735 uint32_t SWSBAnalyzer::getNumOfDistPipe()
736 {
737     return getNumOfDistPipe(m_swsbMode);
738 }
739 
getNumOfDistPipe(SWSB_ENCODE_MODE mode)740 uint32_t SWSBAnalyzer::getNumOfDistPipe(SWSB_ENCODE_MODE mode)
741 {
742     switch(mode) {
743     case SWSB_ENCODE_MODE::SingleDistPipe:
744         return 1;
745     case SWSB_ENCODE_MODE::ThreeDistPipe:
746         return 3;
747     case SWSB_ENCODE_MODE::FourDistPipe:
748     case SWSB_ENCODE_MODE::FourDistPipeReduction:
749         return 4;
750     default:
751         break;
752     }
753     return 0;
754 }
755 
advanceInorderInstCounter(DEP_PIPE dep_pipe)756 void SWSBAnalyzer::advanceInorderInstCounter(DEP_PIPE dep_pipe)
757 {
758     ++m_InstIdCounter.inOrder;
759     if (getNumOfDistPipe() == 1)
760         return;
761 
762     switch (dep_pipe) {
763     case DEP_PIPE::FLOAT:
764         ++m_InstIdCounter.floatPipe;
765         break;
766     case DEP_PIPE::INTEGER:
767         ++m_InstIdCounter.intPipe;
768         break;
769     case DEP_PIPE::LONG64:
770         ++m_InstIdCounter.longPipe;
771         break;
772     case DEP_PIPE::MATH_INORDER:
773         ++m_InstIdCounter.mathPipe;
774         break;
775     default:
776         IGA_ASSERT(0, "unhandled in-order DEP_PIPE for XE_HP encoding");
777         break;
778     }
779 }
780 
addRMWDependencyIfReqruied(DepSet & input,DepSet & output)781 void SWSBAnalyzer::addRMWDependencyIfReqruied(DepSet& input, DepSet& output) {
782     const Instruction* inst = input.getInstruction();
783     // return if the instruction has no dst, or the dst is not GRF or not byte type
784     const Operand& dst = inst->getDestination();
785     if (dst.getKind() != Operand::Kind::DIRECT)
786         return;
787 
788     if (dst.getDirRegName() != RegName::GRF_R)
789         return;
790 
791     if (TypeSizeInBitsWithDefault(dst.getType(), 32) != 8)
792         return;
793 
794     // When there is RMW behavior, the instruction will read the Word first,
795     // modify the byte value in it and then write back the entire Word.
796     // we assume the instruction will read/write the entire register to simplify
797     // the logic
798 
799     // add the entire grf of the dst register into input and output DepSet
800     // All registers being touched are added into Bucket. We can get the touched grf
801     // number from added bucket index
802     const std::vector<size_t>& out_buk = output.getBuckets();
803     for (auto i : out_buk) {
804         // we only need grf bucket
805         if (i >= m_DB->getBucketStart(RegName::ARF_A))
806             continue;
807         input.addGrf(i);
808         input.addToBucket((uint32_t)i);
809         output.addGrf(i);
810     }
811 }
812 
addSWSBToInst(Instruction & inst,const SWSB & swsb,Block & block,InstListIterator inst_it)813 void SWSBAnalyzer::addSWSBToInst(Instruction& inst,
814                                  const SWSB& swsb,
815                                  Block& block,
816                                  InstListIterator inst_it)
817 {
818     SWSB new_swsb(inst.getSWSB());
819     // handling distance
820     if (swsb.hasDist()) {
821         if (!inst.getSWSB().hasDist()) {
822             new_swsb.distType = swsb.distType;
823             new_swsb.minDist = swsb.minDist;
824         } else {
825             // for single dist pipe platform, distType must be REG_DIST, so won't
826             // be set to REG_DIST_ALL
827             new_swsb.distType = (inst.getSWSB().distType == swsb.distType)?
828                                 swsb.distType : SWSB::DistType::REG_DIST_ALL;
829             new_swsb.minDist = std::min(inst.getSWSB().minDist, swsb.minDist);
830         }
831     }
832 
833     // handling token
834     if (swsb.hasToken()) {
835         if (!inst.getSWSB().hasToken()) {
836             new_swsb.tokenType = swsb.tokenType;
837             new_swsb.sbid = swsb.sbid;
838         } else {
839             // if both has id, and are different, then insert a sync to carry
840             // the new one, otherwise do nothing
841             if ((inst.getSWSB().tokenType != swsb.tokenType) ||
842                 (inst.getSWSB().sbid != swsb.sbid)) {
843                 SWSB tmp_swsb(SWSB::DistType::NO_DIST, swsb.tokenType,
844                               0, swsb.sbid);
845                 Instruction* sync_inst = m_kernel.createSyncNopInstruction(tmp_swsb);
846                 block.insertInstBefore(inst_it, sync_inst);
847             }
848         }
849     }
850 
851     // check if the new swsb combination is valid, if not, move the dist out to a sync
852     // FIXME: move the dist out here to let the sbid set on the instruction could have better
853     // readability, but a potential issue is that A@1 is required to be set on the instruction having
854     // architecture read/write. This case A@1 will be moved out from the instruction
855     if (!new_swsb.verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode))) {
856         SWSB tmp_swsb(swsb.distType, SWSB::TokenType::NOTOKEN,
857                       swsb.minDist, 0);
858         Instruction* sync_inst = m_kernel.createSyncNopInstruction(tmp_swsb);
859         block.insertInstBefore(inst_it, sync_inst);
860 
861         new_swsb.distType = SWSB::DistType::NO_DIST;
862         new_swsb.minDist = 0;
863     }
864 
865     inst.setSWSB(new_swsb);
866     IGA_ASSERT(inst.getSWSB().verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode)),
867                "Invalid swsb dist/token combination after merge");
868 }
869 
isSyncNop(const Instruction & i)870 static bool isSyncNop(const Instruction &i) {
871     return i.is(Op::SYNC) && i.getSyncFc() == SyncFC::NOP;
872 };
873 
postProcess()874 void SWSBAnalyzer::postProcess()
875 {
876     // revisit all instructions
877     for (Block* bb : m_kernel.getBlockList())
878     {
879         InstList& instList = bb->getInstList();
880         for (auto inst_it = instList.begin(); inst_it != instList.end(); ++inst_it)
881         {
882             Instruction* inst = *inst_it;
883             // move all swsb set on the second instruction to the first for
884             // "instruction combined" case on byte type dst. e.g.
885             //      (W) mov (32|M0)  r13.0<2>:ub   r11.0<1;1,0>:uw   {Atomic}
886             //      (W) mov (32|M0)  r13.1<2>:ub   r10.0<1;1,0>:uw
887             if (m_kernel.getModel().hasReadModifiedWriteOnByteDst() &&
888                 inst->hasInstOpt(InstOpt::ATOMIC) &&
889                 !inst->getOpSpec().isDpasFamily() &&
890                 !inst->getOpSpec().isSendOrSendsFamily() &&
891                 inst->getDestination().getDirRegName() == RegName::GRF_R &&
892                 TypeSizeInBitsWithDefault(inst->getDestination().getType(), 32) == 8)
893             {
894                 auto next_it = inst_it;
895                 ++next_it;
896                 assert(next_it != instList.end());
897                 Instruction* next_inst = *next_it;
898 
899                 // in case the next instructions have sync carrying its swsb, move
900                 // sync to before current instruction
901                 // - Make sure current inst is not the last inst other than sync
902                 InstList sync_insts;
903                 while (next_inst->is(Op::SYNC)) {
904                     sync_insts.push_back(next_inst);
905                     ++next_it;
906                     if (next_it == instList.end())
907                         break;
908                     next_inst = *next_it;
909                 }
910 
911                 if (next_it == instList.end()) {
912                     // An unexpected instruction with {Atomic} set but has no following
913                     // instruction that can be combined with it
914                     assert(next_it != instList.end());
915                     continue;
916                 }
917 
918                 // - move sync to before current inst
919                 if (!sync_insts.empty()) {
920                     auto remove_start = inst_it;
921                     ++remove_start;
922                     instList.erase(remove_start, next_it);
923                     instList.insert(inst_it, sync_insts.begin(), sync_insts.end());
924                 }
925 
926                 // the following instruction must not have Atomic set, or we do not
927                 // know what should do
928                 IGA_ASSERT((!next_inst->hasInstOpt(InstOpt::ATOMIC)),
929                     "Atomic followed by Atomic on fixed latency instructions");
930 
931                 SWSB next_swsb = next_inst->getSWSB();
932                 if (next_swsb.hasSWSB()) {
933                     addSWSBToInst(*inst, next_swsb, *bb, inst_it);
934                     next_inst->setSWSB(SWSB());
935                 }
936             }
937 
938         }
939     }
940     // revisit all instructions to remove redundant sync.nop
941     // sync.nop carry the sbid the same as the sbid set on the following instruction can be
942     // removed since it'll automatically be sync-ed when sbid is reused. For example:
943     // sync.nop        null                       {$0.dst} // can be removed
944     // math.exp(8|M0)  r12.0<1>:f  r10.0<8;8,1>:f {$0}
945     for (Block* bb : m_kernel.getBlockList())
946     {
947         InstList& instList = bb->getInstList();
948         if (instList.empty())
949             continue;
950         auto inst_it = instList.begin();
951         // skip the first instruction, which must not be sync
952 
953         ++inst_it;
954         for (; inst_it != instList.end(); ++inst_it)
955         {
956             Instruction* inst = *inst_it;
957             if (isSyncNop(*inst))
958                 continue;
959             SWSB cur_swsb = inst->getSWSB();
960             if (cur_swsb.hasToken() && (cur_swsb.tokenType == SWSB::TokenType::SET)) {
961                 // iterate through the previous sync
962                 auto sync_it = inst_it;
963                 --sync_it;
964                 while (sync_it != instList.begin()) {
965                     Instruction* sync_inst = *sync_it;
966                     if (!isSyncNop(*sync_inst))
967                         break;
968                     SWSB sync_swsb = sync_inst->getSWSB();
969                     // if the sync has sbid set, it could be the reserved sbid for shoot down
970                     // instructions, we should keep it.
971                     if (sync_swsb.hasToken() && sync_swsb.tokenType != SWSB::TokenType::SET &&
972                         sync_swsb.sbid == cur_swsb.sbid) {
973                         // clean the swsb so that we can remove this instruction later
974                         sync_inst->setSWSB(SWSB());
975                     }
976                     --sync_it;
977                 }
978             }
979         }
980         // remove the redundant sync.nop (sync.nop with no swsb)
981         instList.remove_if([](const Instruction* inst) {
982             return isSyncNop(*inst) && !inst->getSWSB().hasSWSB();
983         });
984     }
985 }
986 
assignSBID(DepSet * input,DepSet * output,Instruction & inst,SWSB & distanceDependency,InstList::iterator insertPoint,Block * curBB,bool needSyncForShootDown)987 SBID& SWSBAnalyzer::assignSBID(DepSet* input, DepSet* output, Instruction& inst, SWSB& distanceDependency,
988     InstList::iterator insertPoint, Block *curBB, bool needSyncForShootDown)
989 {
990     bool foundFree = false;
991     SBID *sbidFree = nullptr;
992     for (uint32_t i = 0; i < m_SBIDCount; ++i)
993     {
994         if (m_freeSBIDList[i].isFree)
995         {
996             foundFree = true;
997             sbidFree = &m_freeSBIDList[i];
998             m_freeSBIDList[i].sbid = i;
999             break;
1000         }
1001     }
1002     // no free SBID.
1003     if (!foundFree)
1004     {
1005         unsigned int index = (m_SBIDRRCounter++) % m_SBIDCount;
1006 
1007         // While swsb id being reuse, the dependency will automatically resolved by hardware,
1008         // so cleanup the dependency bucket for instruction that previously used this id
1009         assert(m_IdToDepSetMap.find(index) != m_IdToDepSetMap.end());
1010         assert(m_IdToDepSetMap[index].first->getDepClass() == DEP_CLASS::OUT_OF_ORDER);
1011         clearDepBuckets(*m_IdToDepSetMap[index].first);
1012         clearDepBuckets(*m_IdToDepSetMap[index].second);
1013 
1014         m_freeSBIDList[index].reset();
1015         sbidFree = &m_freeSBIDList[index];
1016         sbidFree->sbid = index;
1017     }
1018     sbidFree->isFree = false;
1019     input->setSBID(*sbidFree);
1020     output->setSBID(*sbidFree);
1021     if (m_IdToDepSetMap.find(sbidFree->sbid) != m_IdToDepSetMap.end())
1022         m_IdToDepSetMap.erase(sbidFree->sbid);
1023     m_IdToDepSetMap.emplace(sbidFree->sbid, std::make_pair(input, output));
1024 
1025     // adding the set for this SBID
1026     // if the swsb has the token set already, move it out to a sync
1027     if (distanceDependency.tokenType != SWSB::TokenType::NOTOKEN) {
1028         SWSB tDep(SWSB::DistType::NO_DIST, distanceDependency.tokenType,
1029             0, distanceDependency.sbid);
1030         Instruction* tInst = m_kernel.createSyncNopInstruction(tDep);
1031         curBB->insertInstBefore(insertPoint, tInst);
1032     }
1033     // set the sbid
1034     distanceDependency.tokenType = SWSB::TokenType::SET;
1035     distanceDependency.sbid = sbidFree->sbid;
1036 
1037     // verify if the token and dist combination is valid, if not, move the dist out to a sync
1038     // FIXME: move the dist out here to let the sbid set on the instruction could have better readability
1039     // but a potential issue is that A@1 is required to be set on the instruction having
1040     // architecture read/write. This case A@1 will be moved out from the instruction
1041     if (!distanceDependency.verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode))) {
1042         SWSB tDep(distanceDependency.distType, SWSB::TokenType::NOTOKEN,
1043             distanceDependency.minDist, 0);
1044         Instruction* tInst = m_kernel.createSyncNopInstruction(tDep);
1045         curBB->insertInstBefore(insertPoint, tInst);
1046         distanceDependency.distType = SWSB::DistType::NO_DIST;
1047         distanceDependency.minDist = 0;
1048     }
1049     assert(distanceDependency.verify(m_swsbMode, inst.getSWSBInstType(m_swsbMode)));
1050 
1051     // add a sync to preserve the token for possibly shooting down instruction
1052     if (needSyncForShootDown) {
1053         SWSB tDep(SWSB::DistType::NO_DIST, distanceDependency.tokenType,
1054             0, distanceDependency.sbid);
1055         Instruction* tInst = m_kernel.createSyncNopInstruction(tDep);
1056         curBB->insertInstBefore(insertPoint, tInst);
1057     }
1058 
1059     assert(sbidFree != nullptr);
1060     return *sbidFree;
1061 }
1062 
run()1063 void SWSBAnalyzer::run()
1064 {
1065     m_initPoint = true;
1066     m_distanceTracker.clear();
1067 
1068     for (uint32_t i = 0; i < MAX_GRF_BUCKETS; ++i)
1069     {
1070         m_buckets[i].clearDependency();
1071     }
1072 
1073     // init in order pipe id counters
1074     m_InstIdCounter.inOrder = 1;
1075     m_InstIdCounter.floatPipe = 1;
1076     m_InstIdCounter.intPipe = 1;
1077     m_InstIdCounter.longPipe = 1;
1078     m_InstIdCounter.mathPipe = 1;
1079 
1080     // init the math WA struct
1081     // When there is a math instruction, when the following instruction has different
1082     // predication to the math, should assume the math taking the entire GRF in it's
1083     // dst no matter the access region and channels are.
1084     struct MathWAInfo {
1085         bool previous_is_math = false;
1086         DepSet* dep_set = nullptr;
1087         // a special id to identify this DepSet when trying to clean it from buckets
1088         const InstIDs math_id = {std::numeric_limits<uint32_t>::max(), 0};
1089         Instruction* math_inst = nullptr;
1090         SBID math_sbid = {0, true, DEP_TYPE::NONE};
1091 
1092         void reset() {
1093             previous_is_math = false;
1094             dep_set = nullptr;
1095             math_inst = nullptr;
1096             math_sbid = {0, true, DEP_TYPE::NONE};
1097         }
1098     } math_wa_info;
1099 
1100     Instruction* inst = nullptr;
1101     Block * lastBB = nullptr;
1102     for (auto bb : m_kernel.getBlockList())
1103     {
1104         bool blockEndsWithNonBranchInst = false;
1105         // resetting things for each bb
1106         lastBB = bb;
1107         InstList& instList  = bb->getInstList(); // Don't use auto for over loaded return which has const...
1108         const auto instListEnd    = instList.end();
1109         for (auto instIter = instList.begin(); instIter != instListEnd; ++instIter)
1110         {
1111             m_InstIdCounter.global++;
1112             inst = *instIter;
1113             DepSet* input = nullptr;
1114             DepSet* output = nullptr;
1115             size_t dpas_cnt_in_macro = 0;
1116 
1117             if (math_wa_info.math_inst != nullptr)
1118                 math_wa_info.previous_is_math = true;
1119             if (inst->getOpSpec().is(Op::MATH)) {
1120                 math_wa_info.math_inst = inst;
1121 
1122                 // if the math following a math, we only care about the last math
1123                 math_wa_info.previous_is_math = false;
1124             }
1125 
1126             // recored the first instruction of a dpas macro, in case that inserting instructions (e.g. sync)
1127             // before the macro, those instructions have to be insert before first_inst_in_dpas_macro
1128             InstListIterator first_inst_in_dpas_macro = instList.end();
1129             if (inst->getOpSpec().isDpasFamily()) {
1130                 std::pair<DepSet*, DepSet*> dep_set_pair =
1131                     m_DB->createDPASSrcDstDepSet(
1132                         instList, instIter, m_InstIdCounter, dpas_cnt_in_macro, m_swsbMode);
1133                 input = dep_set_pair.first;
1134                 output = dep_set_pair.second;
1135 
1136                 first_inst_in_dpas_macro = instIter;
1137                 // bypass dpas insturctions in the macro, the last dpas represents the macro
1138                 for (size_t i = 0; i < dpas_cnt_in_macro - 1; ++i) {
1139                     ++instIter;
1140                 }
1141                 inst = *instIter;
1142             } else {
1143                 input = m_DB->createSrcDepSet(*inst, m_InstIdCounter, m_swsbMode);
1144                 output = m_DB->createDstDepSet(*inst, m_InstIdCounter, m_swsbMode);
1145             }
1146             input->setCompanion(output);
1147             output->setCompanion(input);
1148 
1149             // XeHPC+ features
1150             if (m_kernel.getModel().hasReadModifiedWriteOnByteDst())
1151                 addRMWDependencyIfReqruied(*input, *output);
1152 
1153             SWSB distanceDependency;
1154 
1155             // Either source or destination are indirect, or there are SR access,
1156             // We don't know what registers are being accessed
1157             // Need to flush all the sbids and set distance to 1
1158             if (input->hasIndirect() || output->hasIndirect() ||
1159                 input->hasSR() || output->hasSR())
1160             {
1161                 // clear out-of-order dependency, insert sync.allrd and sync.allwr
1162                 // if there are un-resolved sbid dependecny
1163                 // if this instruction itself is an out-of-order instruction, insert
1164                 // sync.all anyway.
1165                 InstListIterator insert_point = instIter;
1166                 if (first_inst_in_dpas_macro != instList.end())
1167                     insert_point = first_inst_in_dpas_macro;
1168                 if (input->getDepClass() == DEP_CLASS::OUT_OF_ORDER)
1169                     insertSyncAllRdWr(insert_point, bb);
1170                 else
1171                     clearSBIDDependence(insert_point, inst, bb);
1172 
1173                 // clear in-order dependency
1174                 clearBuckets(input, output);
1175 
1176                 // will add direct accesses to buckets
1177                 // adding dependencies to buckets
1178                 for (auto bucketID : input->getBuckets())
1179                 {
1180                     m_buckets[bucketID].addDepSet(input);
1181                 }
1182                 for (auto bucketID : output->getBuckets())
1183                 {
1184                     m_buckets[bucketID].addDepSet(output);
1185                 }
1186 
1187                 // set to check all dist pipes
1188                 if (getNumOfDistPipe() == 1)
1189                     distanceDependency.distType = SWSB::DistType::REG_DIST;
1190                 else
1191                     distanceDependency.distType = SWSB::DistType::REG_DIST_ALL;
1192 
1193                 distanceDependency.minDist = 1;
1194                 // input and output must have the same dep class and in the same pipe
1195                 // so check the input only to add the instCounter
1196                 // FIXME: is it possilbe that a instruction has output and no input?
1197                 if (input->getDepClass() == DEP_CLASS::IN_ORDER)
1198                     advanceInorderInstCounter(input->getDepPipe());
1199 
1200                 // if this is an out-of-order instruction, we still need to assign an sbid for it
1201                 if (output->getDepClass() == DEP_CLASS::OUT_OF_ORDER)
1202                     assignSBID(input, output, *inst, distanceDependency, insert_point, bb, false);
1203 
1204                 inst->setSWSB(distanceDependency);
1205                 // clean up math_wa_info, this instruction force to sync all, no need to consider
1206                 // math wa
1207                 if (math_wa_info.previous_is_math) {
1208                     math_wa_info.reset();
1209                 }
1210                 // early out, no need to calculateDependenc that all dependencies are resolved.
1211                 continue;
1212             } // end indirect access handling
1213 
1214             if (math_wa_info.previous_is_math) {
1215                 // math WA affect the instruction right after the math, and with different predication
1216                 // Add the WA math dst region to Buckets
1217                 if (math_wa_info.math_inst->getPredication().function != inst->getPredication().function) {
1218                     math_wa_info.dep_set =
1219                         m_DB->createMathDstWADepSet(*math_wa_info.math_inst, math_wa_info.math_id, m_swsbMode);
1220                     math_wa_info.dep_set->setSBID(math_wa_info.math_sbid);
1221                     for (auto bucketID : math_wa_info.dep_set->getBuckets())
1222                     {
1223                         IGA_ASSERT(bucketID < m_DB->getTOTAL_BUCKETS(), "buckedID out of range");
1224                         m_buckets[bucketID].addDepSet(math_wa_info.dep_set);
1225                     }
1226                 }
1227             }
1228 
1229             std::vector<SBID> activeSBID;
1230             bool needSyncForShootDown = false;
1231             // Calculates dependence between this instruction dependencies and previous ones.
1232             calculateDependence(*input, distanceDependency, *inst, activeSBID, needSyncForShootDown);
1233             calculateDependence(*output, distanceDependency, *inst, activeSBID, needSyncForShootDown);
1234 
1235             // clean up math_wa_info
1236             if (math_wa_info.previous_is_math) {
1237                 if (math_wa_info.dep_set != nullptr)
1238                     clearDepBuckets(*math_wa_info.dep_set);
1239                 math_wa_info.reset();
1240             }
1241 
1242             if (first_inst_in_dpas_macro != instList.end())
1243                 processActiveSBID(distanceDependency, input, bb, first_inst_in_dpas_macro, activeSBID);
1244             else
1245                 processActiveSBID(distanceDependency, input, bb, instIter, activeSBID);
1246 
1247             // Need to set SBID
1248             if (output->getDepClass() == DEP_CLASS::OUT_OF_ORDER &&
1249                 !(inst->getOpSpec().isSendFamily() && inst->hasInstOpt(InstOpt::EOT)))
1250             {
1251                 InstList::iterator insertPoint = instIter;
1252                 if (first_inst_in_dpas_macro != instList.end())
1253                     insertPoint = first_inst_in_dpas_macro;
1254                 SBID& assigned_id = assignSBID(input, output, *inst, distanceDependency,
1255                     insertPoint, bb, needSyncForShootDown);
1256 
1257                 // record the sbid if it's math, for use of math wa
1258                 if (inst->getOpSpec().is(Op::MATH)) {
1259                     math_wa_info.math_sbid = assigned_id;
1260                 }
1261             }
1262 
1263             clearBuckets(input, output);
1264 
1265             /*
1266              * Handling the case where everything is in one bb, and send with EOT is in the middle of instruction stream
1267              *           call (16|M0)             r8.0:ud          32
1268              *           sendc.rc (16|M0)         null     r118  null  0x0         0x140B1000 {EOT} //   wr:10h, rd:0, Render Target Write msc:16, to #0
1269              *           ...
1270              *           ret (16|M0)                          r8.0
1271              */
1272             if (!(inst->getOpSpec().isSendFamily() && inst->hasInstOpt(InstOpt::EOT)))
1273             {
1274                 //adding dependencies to buckets
1275                 for (auto bucketID : input->getBuckets())
1276                 {
1277                     // We want to check dependncy of regular instructions against
1278                     // WRITE_ALWAYS_INTERFERE without adding them themselves
1279                     if (bucketID == m_DB->getBucketStart(RegName::ARF_CR) &&
1280                         input->getDepType() != DEP_TYPE::WRITE_ALWAYS_INTERFERE &&
1281                         input->getDepType() != DEP_TYPE::READ_ALWAYS_INTERFERE)
1282                     {
1283                         continue;
1284                     }
1285                     m_buckets[bucketID].addDepSet(input);
1286                 }
1287                 for (auto bucketID : output->getBuckets())
1288                 {
1289                     IGA_ASSERT(bucketID < m_DB->getTOTAL_BUCKETS(),
1290                         "buckedID out of range");
1291                     // We want to check the dependency of regular instructions against
1292                     // the *_ALWAYS_INTERFERE sets in the ARF_CR bucket without adding the
1292                     // regular instructions to that bucket themselves
1293                     if (bucketID == m_DB->getBucketStart(RegName::ARF_CR) &&
1294                         output->getDepType() != DEP_TYPE::WRITE_ALWAYS_INTERFERE &&
1295                         output->getDepType() != DEP_TYPE::READ_ALWAYS_INTERFERE)
1296                     {
1297                         continue;
1298                     }
1299                     m_buckets[bucketID].addDepSet(output);
1300                 }
1301             }
1302 
1303             if (input->getDepClass() == DEP_CLASS::IN_ORDER)
1304             {
1305                 advanceInorderInstCounter(input->getDepPipe());
1306             }
1307 
1308             // for dpas block, set the distance at the first inst in the block, and set the
1309             // swsb id at the last inst in the block.
1310             if ((first_inst_in_dpas_macro != instList.end()) && (*first_inst_in_dpas_macro != inst)) {
1311                 (*first_inst_in_dpas_macro)->setSWSB(
1312                     SWSB(distanceDependency.distType, SWSB::TokenType::NOTOKEN, distanceDependency.minDist, 0));
1313                 inst->setSWSB(
1314                     SWSB(SWSB::DistType::NO_DIST, distanceDependency.tokenType, 0, distanceDependency.sbid));
1315             } else {
1316                 // if the input SWSB is a special token, preserve it and insert a sync before to carry the dependency info
1317                 // Note that dpas must not have special token so we only do this check for non-dpas here
1318                 if (inst->getSWSB().hasSpecialToken()) {
1319                     if (distanceDependency.hasSWSB()) {
1320                         Instruction* syncInst = m_kernel.createSyncNopInstruction(distanceDependency);
1321                         bb->insertInstBefore(instIter, syncInst);
1322                     }
1323                 } else {
1324                     inst->setSWSB(distanceDependency);
1325                 }
1326             }
1327             assert(distanceDependency.verify(m_swsbMode, inst->getSWSBInstType(m_swsbMode)));
1328 
1329             if (inst->isBranching())
1330             {
1331                 //TODO: konrad : this is somewhat conservative, some
1332                 //branch instructions might not need sync (join)
1333                 blockEndsWithNonBranchInst = false;
1334                 clearSBIDDependence(instIter, inst, bb);
1335                 continue;
1336             }
1337             else
1338             {
1339                 blockEndsWithNonBranchInst = true;
1340             }
1341         } //iterate on instr
1342         //          clear read
1343         //          clear write
1344         if (blockEndsWithNonBranchInst) {
1345             clearSBIDDependence(instList.end(), inst, bb);
1346         }
1347     } //iterate on basic block
1348 
1349     // this code is for FC composite
1350     // if last instruction is not EOT we will insert flush instructions
1351     // and stall the pipeline since we do not do global analysis
1352     if (inst &&
1353         ((inst->getOpSpec().isSendFamily() &&
1354             !inst->getInstOpts().contains(InstOpt::EOT)) || !inst->getOpSpec().isSendFamily()))
1355     {
1356         SWSB swsb;
1357         if (getNumOfDistPipe() == 1)
1358             swsb.distType = SWSB::DistType::REG_DIST;
1359         else
1360             swsb.distType = SWSB::DistType::REG_DIST_ALL;
1361         swsb.minDist = 1;
1362         Instruction *syncInst = m_kernel.createSyncNopInstruction(swsb);
1363         lastBB->getInstList().push_back(syncInst);
1364     }
1365 
1366     postProcess();
1367     return;
1368 }
1369