1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "IGC/common/StringMacros.hpp"
10 #include "visa_igc_common_header.h"
11 #include "Common_ISA.h"
12 #include "Common_ISA_util.h"
13 #include "Common_ISA_framework.h"
14 #include "JitterDataStruct.h"
15 #include "VISAKernel.h"
16 #include "G4_IR.hpp"
17 #include "BuildIR.h"
18 #include "BinaryEncodingIGA.h"
19 
20 #include <iomanip>
21 
22 using namespace vISA;
23 
// Printable form of each source modifier, indexed by the G4_SrcModifier
// enum value (valid indices are < Mod_src_undef).
static const char* const SrcModifierStr[Mod_src_undef] =
{
    "-",       // Mod_Minus
    "(abs)",   // Mod_Abs
    "-(abs)",  // Mod_Minus_Abs
    "-"        // Mod_Not (print as -)
};
31 
// Mapping from instruction-option bit to its printable name.
// The table is terminated by the InstOpt_END sentinel entry.
static const G4_InstOptInfo InstOptInfo[] =
{
    {InstOpt_Align16, "Align16"},
    {InstOpt_M0, "M0"},
    {InstOpt_M4, "M4"},
    {InstOpt_M8, "M8"},
    {InstOpt_M12, "M12"},
    {InstOpt_M16, "M16"},
    {InstOpt_M20, "M20"},
    {InstOpt_M24, "M24"},
    {InstOpt_M28, "M28"},
    {InstOpt_Switch, "Switch"},
    {InstOpt_Atomic, "Atomic"},
    {InstOpt_NoDDChk, "NoDDChk"},
    {InstOpt_NoDDClr, "NoDDClr"},
    {InstOpt_WriteEnable, "NoMask"},
    {InstOpt_BreakPoint, "BreakPoint"},
    {InstOpt_EOT, "EOT"},
    {InstOpt_AccWrCtrl, "AccWrEn"},
    {InstOpt_Compacted, "Compacted"},
    {InstOpt_NoCompact, "NoCompact"},
    {InstOpt_NoSrcDepSet, "NoSrcDepSet"},
    {InstOpt_NoPreempt, "NoPreempt"},
    {InstOpt_Serialize, "Serialize"},
    {InstOpt_END, "END"}
};
58 
// Expand each HANDLE_INST / HANDLE_NAME_INST entry from G4Instruction.h
// into a G4_Inst_Info row: {opcode, name, #srcs, #dsts, type, platform, attrs}.
// HANDLE_INST derives the printable name from the opcode token itself;
// HANDLE_NAME_INST supplies an explicit name string.
#define HANDLE_INST(op, nsrc, ndst, type, plat, attr) \
    { G4_##op, #op, nsrc, ndst, type, plat, attr },


#define HANDLE_NAME_INST(op, name, nsrc, ndst, type, plat, attr) \
    { G4_##op, name, nsrc, ndst, type, plat, attr },

const G4_Inst_Info G4_Inst_Table[] = {
#include "G4Instruction.h"
};
69 
70 
getChannelEnableStr(ChannelEnable channel)71 static const char* getChannelEnableStr(ChannelEnable channel)
72 {
73     switch (channel)
74     {
75     case NoChannelEnable:
76         return "";
77     case ChannelEnable_X:
78         return "x";
79     case ChannelEnable_Y:
80         return "y";
81     case ChannelEnable_XY:
82         return "xy";
83     case ChannelEnable_Z:
84         return "z";
85     case ChannelEnable_W:
86         return "w";
87     case ChannelEnable_ZW:
88         return "zw";
89     case ChannelEnable_XYZW:
90         return "xyzw";
91     default:
92         MUST_BE_TRUE(false, "unsupported channel enable");
93         return "";
94     }
95 }
96 
97 //global functions
// Round n down to the largest power of two that is <= n.
// Returns 0 when n == 0 (no power of two fits).
//
// The accumulator is deliberately wider than uint8_t: with a uint8_t
// accumulator, `i <<= 1` wraps to 0 once i reaches 128, so for any
// n >= 128 the loop condition `n >= i` would stay true forever.
uint8_t roundDownPow2(uint8_t n)
{
    unsigned int i = 1;
    while (n >= i)
    {
        i <<= 1;
    }
    return (uint8_t)(i >> 1);
}
104 
105 /* Return the base rank for the input type ignoring the signed/unsigned
106  * aspect of types.
107  *    - Types of higher precision have higher ranks.
108  *    - Floating types have higher precision than all integer types.
109  */
Operand_Type_Base_Rank(G4_Type type)110 static short Operand_Type_Base_Rank(G4_Type type)
111 {
112     short type_size = (short)TypeSize(type);
113     short type_rank = type_size * 2;
114 
115     if (type == Type_V || type == Type_UV)
116     {
117         type_rank = (short)TypeSize(Type_W);
118     }
119     else if (type == Type_VF)
120     {
121         type_rank = (short)TypeSize(Type_F);
122     }
123     else if (IS_TYPE_FLOAT_ALL(type))
124     {
125         type_rank += 2;
126     }
127 
128     return type_rank;
129 }
130 
131 /* Return the rank for the input type.
132  *    - Types of higher precision have higher ranks.
133  *    - Floating types have higher precision than all integer types.
134  *    - Unsigned types have a higher rank than a signed type with the same
135  *      precision.
136  */
Operand_Type_Rank(G4_Type type)137 static short Operand_Type_Rank(G4_Type type)
138 {
139     short type_rank = Operand_Type_Base_Rank(type);
140 
141     switch (type) {
142     case Type_UB:
143     case Type_UW:
144     case Type_UD: {
145         type_rank++;
146         break;
147     }
148     default: {
149         // No nothing.
150         break;
151     }
152     }
153 
154     return type_rank;
155 }
156 
157 // check if type1 can be represented by type2
Is_Type_Included(G4_Type type1,G4_Type type2,const IR_Builder & builder)158 static bool Is_Type_Included(G4_Type type1, G4_Type type2, const IR_Builder& builder)
159 {
160     if (type1 == type2)
161     {
162         return true;
163     }
164 
165     // Float and Int types are never subtype of each other
166     if (IS_TYPE_FLOAT_ALL(type1) ^ IS_TYPE_FLOAT_ALL(type2))
167     {
168         return false;
169     }
170     if (type1 == Type_F && type2 == builder.getMixModeType() &&
171         builder.getPlatform() > GENX_BDW && builder.getOption(vISA_enableUnsafeCP_DF))
172     {
173         return true;
174     }
175 
176     if (Operand_Type_Rank(type1) < Operand_Type_Rank(type2))
177     {
178         if ((IS_UNSIGNED_INT(type1) || type1 == Type_UV) &&
179             (IS_UNSIGNED_INT(type2) || type2 == Type_UV))
180         {
181             return true;
182         }
183         else if ((IS_SIGNED_INT(type1) || type1 == Type_V) &&
184             (IS_SIGNED_INT(type2) || type2 == Type_V))
185         {
186             return true;
187         }
188         else if ((type1 == Type_UB || type1 == Type_UW || type1 == Type_UV) && IS_TYPE_INT(type2))
189         {
190             return true;
191         }
192         else if (builder.hasMixMode() && type1 == builder.getMixModeType() && type2 == Type_F)
193         {
194             return true;
195         }
196     }
197     return false;
198 }
199 
resetRightBound(G4_Operand * opnd)200 static void resetRightBound(G4_Operand* opnd)
201 {
202     if (opnd) {
203         opnd->unsetRightBound();
204     }
205 }
206 
associateOpndWithInst(G4_Operand * opnd,G4_INST * inst)207 static void associateOpndWithInst(G4_Operand* opnd, G4_INST* inst)
208 {
209     if (opnd) {
210         opnd->setInst(inst);
211     }
212 }
213 
// Primary G4_INST constructor. Stores the opcode, operands, and execution
// controls, invalidates each operand's cached right bound (bounds are
// opcode/exec-size dependent and recomputed lazily), and back-links every
// operand to this instruction.
G4_INST::G4_INST(
    const IR_Builder& irb,
    G4_Predicate* prd,
    G4_opcode o,
    G4_CondMod* m,
    G4_Sat s,
    G4_ExecSize size,
    G4_DstRegRegion* d,
    G4_Operand* s0,
    G4_Operand* s1,
    G4_Operand* s2,
    G4_Operand* s3,
    G4_InstOpts opt) :
    op(o), dst(d), predicate(prd), mod(m), option(opt),
    useInstList(irb.getAllocator()),
    defInstList(irb.getAllocator()),
    localId(0),
    srcCISAoff(UndefinedCisaOffset),
    sat(s ? 1 : 0),
    evenlySplitInst(false),
    execSize(size),
    bin(nullptr),
    builder(irb)
{
    srcs[0] = s0;
    srcs[1] = s1;
    srcs[2] = s2;
    srcs[3] = s3;

    dead = false;
    skipPostRA = false;
    implAccSrc = nullptr;
    implAccDst = nullptr;

    // dst/src bounds are recomputed on demand; predicate and condMod
    // bounds are computed eagerly here.
    resetRightBound(dst);
    resetRightBound(s0);
    resetRightBound(s1);
    resetRightBound(s2);
    resetRightBound(s3);
    computeRightBound(predicate);
    computeRightBound(mod);

    // Make every operand point back at its owning instruction.
    associateOpndWithInst(dst, this);
    associateOpndWithInst(s0, this);
    associateOpndWithInst(s1, this);
    associateOpndWithInst(s2, this);
    associateOpndWithInst(s3, this);
    associateOpndWithInst(predicate, this);
    associateOpndWithInst(mod, this);
}
264 
// Construct a (non-split) send: one payload source plus the message
// descriptor operand. The descriptor object also records the exec size.
G4_InstSend::G4_InstSend(
    const IR_Builder& builder,
    G4_Predicate* prd,
    G4_opcode o,
    G4_ExecSize size,
    G4_DstRegRegion* dst,
    G4_SrcRegRegion* payload,
    G4_Operand* desc,
    G4_InstOpts opt,
    G4_SendDesc* md) :
    G4_INST(builder, prd, o, nullptr, g4::NOSAT, size, dst, payload, desc, opt),
    msgDesc(md)
{
    // NOTE(review): md is dereferenced unconditionally — callers are
    // assumed to always pass a non-null descriptor.
    md->setExecSize(size);
}
280 
// Construct a split send: two payload sources (payload/src1) plus the
// message descriptor and extended descriptor operands; the extended
// descriptor is stored as src3.
G4_InstSend::G4_InstSend(
    const IR_Builder& builder,
    G4_Predicate* prd,
    G4_opcode o,
    G4_ExecSize size,
    G4_DstRegRegion* dst,
    G4_SrcRegRegion* payload,
    G4_SrcRegRegion* src1,
    G4_Operand* desc,
    G4_Operand* extDesc,
    G4_InstOpts opt,
    G4_SendDesc* md) :
    G4_INST(builder, prd, o, nullptr, g4::NOSAT, size, dst, payload, src1, desc, opt),
    msgDesc(md)
{
    setSrc(extDesc, 3);
    md->setExecSize(size);
}
299 
setOpcode(G4_opcode opcd)300 void G4_INST::setOpcode(G4_opcode opcd)
301 {
302     MUST_BE_TRUE(opcd < G4_NUM_OPCODE &&
303         (G4_Inst_Table[op].instType == G4_Inst_Table[opcd].instType ||
304         G4_Inst_Table[opcd].instType == InstTypeMov ||
305         (
306         (G4_Inst_Table[op].instType == InstTypeMov ||
307         G4_Inst_Table[op].instType == InstTypeArith ||
308         G4_Inst_Table[op].instType == InstTypeLogic ||
309         G4_Inst_Table[op].instType == InstTypePseudoLogic ||
310         G4_Inst_Table[op].instType == InstTypeVector) &&
311 
312         (G4_Inst_Table[opcd].instType == InstTypeMov ||
313         G4_Inst_Table[opcd].instType == InstTypeArith ||
314         G4_Inst_Table[opcd].instType == InstTypeLogic ||
315         G4_Inst_Table[opcd].instType == InstTypeVector)
316        ) ||
317         opcd == G4_label),
318         "setOpcode would change the intruction class, which is illegal.");
319 
320     bool resetBounds = false;
321 
322     if (op != opcd)
323     {
324         resetBounds = true;
325     }
326 
327     op = opcd;
328 
329     if (resetBounds)
330     {
331         resetRightBound(dst);
332         resetRightBound(srcs[0]);
333         resetRightBound(srcs[1]);
334         resetRightBound(srcs[2]);
335         resetRightBound(predicate);
336         resetRightBound(mod);
337         resetRightBound(implAccDst);
338         resetRightBound(implAccSrc);
339     }
340 }
341 
setExecSize(G4_ExecSize s)342 void G4_INST::setExecSize(G4_ExecSize s)
343 {
344     bool resetBounds = false;
345 
346     if (execSize != s)
347     {
348         resetBounds = true;
349     }
350 
351     execSize = s;
352 
353     if (resetBounds)
354     {
355         resetRightBound(dst);
356         resetRightBound(srcs[0]);
357         resetRightBound(srcs[1]);
358         resetRightBound(srcs[2]);
359         resetRightBound(predicate);
360         resetRightBound(mod);
361         resetRightBound(implAccDst);
362         resetRightBound(implAccSrc);
363     }
364 }
365 
//
// Compute this instruction's execution type: starting from Type_W, widen
// to the widest source type seen (D/Q/float), with per-opcode exceptions
// for int-divide math, fcvt, and srnd, plus HF<->int fixups against dst.
// We assume no mixed int and float source type, but mixed HF and F is ok
//
G4_Type G4_INST::getExecType() const
{
    G4_Type execType = Type_W;

    // special handling for int divide, as it supports D/UD sources only, while
    // vISA DIV allows B/W types
    // FIXME: if there are more instructions like this, we may need to reorder fixDstAlignment()
    // so that it happens after all sources are fixed and we can get the correct execution type
    if (isMath() && asMathInst()->isMathIntDiv())
    {
        return Type_D;
    }

    if (opcode() == G4_fcvt)
    {
        // fcvt : cvt b/w standard type and other special float type.
        //        execution type is the standard type.
        G4_Type srcTy = srcs[0]->getType();
        if (IS_TYPE_FLOAT_ALL(srcTy))
        {
            return srcTy;
        }
        // If src isn't standard float type, dst must be!
        return dst->getType();
    }
    if (opcode() == G4_srnd)
    {
        // srnd: src0 is either hf or f
        return srcs[0]->getType();
    }
    // Widen execType to the largest source type encountered.
    for (unsigned i = 0; i < G4_MAX_SRCS; i++)
    {
        G4_Operand* src = getSrc(i);
        if (src != NULL)
        {
            G4_Type srcType = src->getType();
            if (TypeSize(srcType) >= TypeSize(execType))
            {
                if (IS_DTYPE(srcType))
                {
                    execType = Type_D;
                }
                else if (IS_QTYPE(srcType))
                {
                    execType = Type_Q;
                }
                else if (IS_TYPE_FLOAT_ALL(srcType))
                {
                    execType = srcType;
                }
            }
        }
    }

    // int <-> HF conversion requires exec type to be dword
    // we don't consider Q<->HF since there are special checks in fixMov() for them
    if (dst)
    {
        G4_Type dstType = dst->getType();
        if (IS_HFTYPE(dstType) && (IS_TYPE_INT(execType) && !IS_QTYPE(execType)))
        {
            execType = Type_D;
        }
        else if (IS_HFTYPE(execType) && (IS_TYPE_INT(dstType) && !IS_QTYPE(dstType)))
        {
            execType = Type_F;
        }
    }

    return execType;
}
440 
// Variant of getExecType where the packed-vector immediate types V/UV/VF
// are preserved (or mapped to F) rather than folded into W/F sizes.
// FIXME: Why do we need two functions???
G4_Type G4_INST::getExecType2() const
{
    G4_Type execType = Type_W;

    // special handling for int divide, as it supports D/UD sources only, while
    // vISA DIV allows B/W types
    if (isMath() && asMathInst()->isMathIntDiv())
    {
        return Type_D;
    }

    // Walk the sources in order; several branches break out early once a
    // winning type is found, so the branch order below is significant.
    for (unsigned i = 0; i < G4_MAX_SRCS; i++)
    {
        G4_Operand* src = getSrc(i);
        if (src == NULL)
        {
            continue;
        }
        G4_Type srcType = srcs[i]->getType();
        if (builder.hasBFMixMode() && srcType == Type_BF)
        {
            // BF in mix mode executes as F.
            execType = Type_F;
        }
        else if (isLowPrecisionFloatTy(srcType) &&
            TypeSize(srcType) >= TypeSize(execType))
        {
            execType = srcType;
            break;
        }
        else if (srcType == Type_V)
        {
            execType = Type_V;
            break;
        }
        else if (srcType == Type_UV)
        {
            execType = Type_UV;
            break;
        }
        else if (IS_DFTYPE(srcType) && !IS_DFTYPE(execType))
        {
            execType = src->getType();
            break;
        }
        else if ((IS_FTYPE(srcType) || srcType == Type_VF) &&
            !IS_DFTYPE(execType) && !IS_FTYPE(execType))
        {
            execType = Type_F;
        }
        else if (IS_DTYPE(srcType) &&
            TypeSize(srcType) >= TypeSize(execType) &&
            !IS_DFTYPE(execType) && !IS_FTYPE(execType))
        {
            execType = Type_D;
        }
        else if (IS_QTYPE(srcType) &&
            TypeSize(srcType) >= TypeSize(execType) &&
            !IS_DFTYPE(execType) && !IS_FTYPE(execType))
        {
            execType = Type_Q;
        }
    }

    // int <-> HF conversion requires exec type to be dword
    // we don't consider Q<->HF since there are special checks in fixMov() for them
    if (dst)
    {
        G4_Type dstType = dst->getType();
        if (IS_HFTYPE(dstType) && (IS_TYPE_INT(execType) && !IS_QTYPE(execType)))
        {
            execType = Type_D;
        }
        else if (IS_HFTYPE(execType) && (IS_TYPE_INT(dstType) && !IS_QTYPE(dstType)))
        {
            execType = Type_F;
        }
    }

    return execType;
}
523 
getMaskOffset() const524 uint16_t G4_INST::getMaskOffset() const
525 {
526     unsigned maskOption = (getOption() & InstOpt_QuarterMasks);
527 
528     if (!builder.hasNibCtrl())
529     {
530         assert(maskOption != InstOpt_M4 && maskOption != InstOpt_M12 && maskOption != InstOpt_M20 &&
531             maskOption != InstOpt_M28 && "nibCtrl is not supported on this platform");
532     }
533 
534     switch (maskOption)
535     {
536     case InstOpt_NoOpt:
537         return 0;
538     case InstOpt_M0:
539         return 0;
540     case InstOpt_M4:
541         return 4;
542     case InstOpt_M8:
543         return 8;
544     case InstOpt_M12:
545         return 12;
546     case InstOpt_M16:
547         return 16;
548     case InstOpt_M20:
549         return 20;
550     case InstOpt_M24:
551         return 24;
552     case InstOpt_M28:
553         return 28;
554     default:
555         MUST_BE_TRUE(0, "Incorrect instruction execution mask");
556         return 0;
557     }
558 }
559 
setMetadata(const std::string & key,MDNode * value)560 void G4_INST::setMetadata(const std::string& key, MDNode* value)
561 {
562     if (!MD)
563     {
564         MD = const_cast<IR_Builder&>(builder).allocateMD();
565     }
566     MD->setMetadata(key, value);
567 }
568 
setComments(const std::string & str)569 void G4_INST::setComments(const std::string& str)
570 {
571     // we create a new MDNode the assumption is that comment should be unique and there is no opportunity for sharing
572     auto node = const_cast<IR_Builder&>(builder).allocateMDString(str);
573     setMetadata(Metadata::InstComment, node);
574 }
575 
addComment(const std::string & comment)576 void G4_INST::addComment(const std::string& comment) {
577     std::string comments = getComments();
578     if (!comments.empty()) { // add a separator
579         comments += "; ";
580     }
581     comments += comment;
582     setComments(comments);
583 }
584 
setTokenLoc(unsigned short token,unsigned globalID)585 void G4_INST::setTokenLoc(unsigned short token, unsigned globalID)
586 {
587     if (!builder.getOption(vISA_SBIDDepLoc))
588     {
589         return;
590     }
591     auto tokenLoc = getMetadata(Metadata::TokenLoc);
592     if (!tokenLoc)
593     {
594         auto node = const_cast<IR_Builder&>(builder).allocateMDTokenLocation(token, globalID);
595         setMetadata(Metadata::TokenLoc, node);
596     }
597     else
598     {
599         MDTokenLocation* tokenL = tokenLoc->asMDTokenLocation();
600         tokenL->addTokenLocation(token, globalID);
601     }
602 }
603 
604 //
605 // remove all references to this inst in other inst's use_list
606 // this is used when we want to delete this instruction
removeAllDefs()607 void G4_INST::removeAllDefs()
608 {
609     for (auto&& item : defInstList)
610     {
611         G4_INST *def = item.first;
612         def->useInstList.remove_if(
613             [&](USE_DEF_NODE node) { return node.first == this; });
614     }
615     defInstList.clear();
616 }
617 
removeAllUses()618 void G4_INST::removeAllUses()
619 {
620     for (auto&& item : useInstList)
621     {
622         G4_INST *user = item.first;
623         user->defInstList.remove_if(
624             [&](USE_DEF_NODE node) { return node.first == this; });
625     }
626     useInstList.clear();
627 }
628 
629 //
630 // remove def/use for opndNum, which must be a source
631 // (i.e., not Opnd_dst/Opnd_condMod/Opnd_implAccDst)
removeDefUse(Gen4_Operand_Number opndNum)632 void G4_INST::removeDefUse(Gen4_Operand_Number opndNum)
633 {
634     DEF_EDGE_LIST_ITER iter = defInstList.begin();
635     while (iter != defInstList.end())
636     {
637         if ((*iter).second == opndNum)
638         {
639             auto defInst = (*iter).first;
640             defInst->useInstList.remove_if(
641                 [&](USE_DEF_NODE node) { return node.first == this && node.second == opndNum; });
642             DEF_EDGE_LIST_ITER curr_iter = iter++;
643             defInstList.erase(curr_iter);
644         }
645         else
646         {
647             ++iter;
648         }
649     }
650 }
651 
getOperand(Gen4_Operand_Number opnd_num) const652 const G4_Operand* G4_INST::getOperand(Gen4_Operand_Number opnd_num) const
653 {
654     switch (opnd_num) {
655     case Opnd_dst: return (G4_Operand*) dst;
656     case Opnd_src0: return srcs[0];
657     case Opnd_src1: return srcs[1];
658     case Opnd_src2: return srcs[2];
659     case Opnd_src3: return srcs[3];
660     case Opnd_pred: return (G4_Operand*)predicate;
661     case Opnd_condMod: return (G4_Operand*)mod;
662     case Opnd_implAccSrc: return implAccSrc;
663     case Opnd_implAccDst: return (G4_Operand*) implAccDst;
664     default:
665         MUST_BE_TRUE(0, "Operand number is out of range.");
666         break;
667     }
668     return NULL;
669 }
670 
eraseUse(USE_EDGE_LIST_ITER iter)671 USE_EDGE_LIST_ITER G4_INST::eraseUse(USE_EDGE_LIST_ITER iter)
672 {
673     G4_INST *useInst = iter->first;
674     useInst->defInstList.remove_if(
675         [&](USE_DEF_NODE node) { return node.first == this && node.second == iter->second; });
676     return useInstList.erase(iter);
677 }
678 
// Transfer definitions used in this[opndNum1] to definitions used in
// inst2[opndNum2] and update definitions's def-use chain accordingly.
void G4_INST::transferDef(G4_INST *inst2, Gen4_Operand_Number opndNum1, Gen4_Operand_Number opndNum2)
{
    DEF_EDGE_LIST_ITER iter = defInstList.begin();
    while (iter != defInstList.end())
    {
        auto defInst = (*iter).first;
        if ((*iter).second == opndNum1)
        {
            // Re-point the def edge at inst2[opndNum2] on both sides of
            // the chain, then drop it from this instruction.
            // gcc 5.0 doesn't like emplace_back for some reason
            inst2->defInstList.push_back(USE_DEF_NODE(defInst, opndNum2));
            defInst->useInstList.remove_if(
                [&](USE_DEF_NODE node) { return node.second == opndNum1 && node.first == this; });
            defInst->useInstList.push_back(USE_DEF_NODE(inst2, opndNum2));
            DEF_EDGE_LIST_ITER curr_iter = iter++;
            defInstList.erase(curr_iter);

            //Remove the redundant d/u node.
            //Due to the instruction optimization, such as merge scalars, redundant d/u info may be generated.
            //Such as the case:
            //(W) shl (1) V3429(0,0)<1>:d V3380(0,0)<0;1,0>:d 0x17:w
            //(W) shl (1) V3430(0,0)<1>:d V3381(0,0)<0;1,0>:d 0x17:w
            //(W) add (1) V3432(0,0)<1>:d 0x43800000:d -V3429(0,0)<0;1,0>:d
            //(W) add (1) V3433(0,0)<1>:d 0x43800000:d -V3430(0,0)<0;1,0>:d
            //==>
            //(W) shl (2) Merged138(0,0)<1>:d Merged139(0,0)<1;1,0>:d 0x17:w
            //(W) add (2) Merged140(0,0)<1>:d 0x43800000:d -Merged138(0,0)<1;1,0>:d
            inst2->defInstList.sort();
            inst2->defInstList.unique();
            defInst->useInstList.sort();
            defInst->useInstList.unique();
        }
        else
        {
            ++iter;
        }
    }
}
718 
719 // This copies, from this definition's source opndNum1, all of its defintions to
720 // inst2's source opndNum2. This is used for example by copy propagation to copy
721 // the def-use link of the move to the use instruction.
722 //
723 // If 'checked' is true, then this only copies those effective defs to inst2.
724 //
copyDef(G4_INST * inst2,Gen4_Operand_Number opndNum1,Gen4_Operand_Number opndNum2,bool checked)725 void G4_INST::copyDef(
726     G4_INST *inst2,
727     Gen4_Operand_Number opndNum1,
728     Gen4_Operand_Number opndNum2,
729     bool checked)
730 {
731     for (auto I = def_begin(); I != def_end(); ++I)
732     {
733         if (I->second == opndNum1)
734         {
735             // If checked is enabled, then compare inst2[opndNum] with this
736             // definition. Skip if this is not an effective use.
737             if (checked)
738             {
739                 G4_Operand *use = inst2->getOperand(opndNum2);
740                 ASSERT_USER(use, "null operand unexpected");
741                 G4_Operand *dst = I->first->getOperand(Opnd_dst);
742                 G4_Operand *condMod = I->first->getOperand(Opnd_condMod);
743                 if ((dst && use->compareOperand(dst) != Rel_disjoint) ||
744                     (condMod && use->compareOperand(condMod) != Rel_disjoint))
745                 {
746                     // OK
747                 }
748                 else
749                 {
750                     // Skip to the next def.
751                     continue;
752                 }
753             }
754             I->first->addDefUse(inst2, opndNum2);
755         }
756     }
757     inst2->defInstList.unique();
758 }
759 
760 /// Copy this instruction's defs to inst2.
copyDefsTo(G4_INST * inst2,bool checked)761 void G4_INST::copyDefsTo(G4_INST *inst2, bool checked)
762 {
763     if (this == inst2)
764         return;
765 
766     for (auto I = def_begin(), E = def_end(); I != E; ++I)
767     {
768         G4_Operand *use = inst2->getOperand(I->second);
769         // Copy when the corresponding use operand is not null.
770         if (!use)
771            continue;
772 
773         if (checked)
774         {
775             G4_Operand *dst = I->first->getOperand(Opnd_dst);
776             G4_Operand *condMod = I->first->getOperand(Opnd_condMod);
777             G4_Operand* implicitAccDef = I->first->getImplAccDst();
778             if ((dst && use->compareOperand(dst) != Rel_disjoint) ||
779                 (condMod && use->compareOperand(condMod) != Rel_disjoint) ||
780                 (implicitAccDef && use->compareOperand(implicitAccDef) != Rel_disjoint))
781             {
782                 // OK
783             }
784             else
785             {
786                 // Skip to the next def.
787                 continue;
788             }
789         }
790 
791         // inst2[I->second] is defined by I->first.
792         I->first->addDefUse(inst2, I->second);
793     }
794 }
795 
796 /// Copy this instruction's uses to inst2.
copyUsesTo(G4_INST * inst2,bool checked)797 void G4_INST::copyUsesTo(G4_INST *inst2, bool checked)
798 {
799     if (this == inst2)
800         return;
801 
802     for (auto I = use_begin(), E = use_end(); I != E; ++I)
803     {
804         if (checked)
805         {
806             G4_Operand *use = I->first->getOperand(I->second);
807             ASSERT_USER(use, "null operand unexpected");
808 
809             G4_Operand *dst = inst2->getOperand(Opnd_dst);
810             G4_Operand *condMod = inst2->getOperand(Opnd_condMod);
811             G4_Operand *implicitAccDef = inst2->getImplAccDst();
812             if ((dst && use->compareOperand(dst) != Rel_disjoint) ||
813                 (condMod && use->compareOperand(condMod) != Rel_disjoint) ||
814                 (implicitAccDef && use->compareOperand(implicitAccDef) != Rel_disjoint))
815             {
816                 // OK
817             }
818             else
819             {
820                 // Skip to the next use.
821                 continue;
822             }
823         }
824 
825         // I->first[I->second] is defined by inst2.
826         inst2->addDefUse(I->first, I->second);
827     }
828 }
829 
830 // This transfers this instructions' useInstList to inst2's,
831 // and update each use's defInstList to point to inst2.
832 // this instruction's use is destroyed in the process.
833 // if keepExisting is true, it will preserve inst2's existing uses.
transferUse(G4_INST * inst2,bool keepExisting)834 void G4_INST::transferUse(G4_INST *inst2, bool keepExisting)
835 {
836     if (this == inst2)
837     {
838         return;
839     }
840 
841     if (!keepExisting)
842     {
843         inst2->removeAllUses();
844     }
845 
846     copyUsesTo(inst2, false);
847     removeAllUses();
848 }
849 
850 //
851 // remove all references of this inst in other inst's def list
852 // this is used when we want to delete this instruction
removeUseOfInst()853 void G4_INST::removeUseOfInst()
854 {
855     for (auto&& node : defInstList)
856     {
857         auto defInst = node.first;
858         defInst->useInstList.remove_if(
859             [&](USE_DEF_NODE node) { return node.first == this;});
860     }
861 }
862 
863 // remove the faked def-instructions in def list, which is resulted from instruction spliting
trimDefInstList()864 void G4_INST::trimDefInstList()
865 {
866     // trim def list
867     DEF_EDGE_LIST_ITER iter = defInstList.begin();
868     // since ACC is only exposed in ARCTAN intrinsic translation, there is no instruction split with ACC
869     while (iter != defInstList.end())
870     {
871         G4_Operand *src = getOperand((*iter).second);
872 
873         if (src == nullptr)
874         {
875             // it's possible the source is entirely gone (e.g., predicate removed)
876             iter = defInstList.erase(iter);
877             continue;
878         }
879         G4_CmpRelation rel = Rel_undef;
880         if (src->isFlag())
881         {
882             if ((*iter).first->getCondMod())
883             {
884                 rel = src->compareOperand((*iter).first->getCondMod());
885             }
886             else if ((*iter).first->getDst())
887             {
888                 if ((*iter).first->hasNULLDst())
889                 {
890                     rel = Rel_disjoint;
891                 }
892                 else
893                 {
894                     rel = src->compareOperand((*iter).first->getDst());
895                 }
896             }
897         }
898         else
899         {
900             rel = src->compareOperand((*iter).first->getDst());
901         }
902 
903         if (rel == Rel_disjoint)
904         {
905             // remove this def-use
906             // assumption: no duplicate def-use info
907             USE_EDGE_LIST_ITER useIter = (*iter).first->useInstList.begin();
908             while (useIter != (*iter).first->useInstList.end())
909             {
910                 if ((*useIter).first == this && (*useIter).second == Opnd_src2)
911                 {
912                     (*iter).first->useInstList.erase(useIter);
913                     break;
914                 }
915                 useIter++;
916             }
917             DEF_EDGE_LIST_ITER tmpIter = iter;
918             iter++;
919             defInstList.erase(tmpIter);
920             continue;
921         }
922         iter++;
923     }
924 }
925 
isDFInstruction() const926 bool G4_INST::isDFInstruction() const
927 {
928     G4_Operand* dst = getDst();
929     if (dst && (dst->getType() == Type_DF))
930     {
931         return true;
932     }
933     for (int i = 0; i < getNumSrc(); i++)
934     {
935         G4_Operand* src = getSrc(i);
936         if (src && (src->getType() == Type_DF))
937         {
938             return true;
939         }
940     }
941     return false;
942 }
943 
isMathPipeInst() const944 bool G4_INST::isMathPipeInst() const
945 {
946     if (isMath())
947     {
948         return true;
949     }
950 
951 
952     return false;
953 }
954 
distanceHonourInstruction() const955 bool G4_INST::distanceHonourInstruction() const
956 {
957     if (isSend() || op == G4_nop || isWait() || isDpas())
958     {
959         return false;
960     }
961     if (isMathPipeInst())
962     {
963         if (builder.getPlatform() >= GENX_PVC)
964         {
965             return true;
966         }
967         return false;
968     }
969     return true;
970 }
971 
tokenHonourInstruction() const972 bool G4_INST::tokenHonourInstruction() const
973 {
974     if (isSend() || isDpas())
975     {
976         return true;
977     }
978     else
979     {
980         if (isMathPipeInst())
981         {
982             if (builder.getPlatform() >= GENX_PVC)
983             {
984                 return false;
985             }
986             return true;
987         }
988         return false;
989     }
990 }
991 
hasNoPipe()992 bool G4_INST::hasNoPipe()
993 {
994     if (op == G4_wait || op == G4_halt || op == G4_nop)
995     {
996         return true;
997     }
998     // PVC only
999     if (op == G4_sync_fence)
1000     {
1001         return true;
1002     }
1003     return false;
1004 }
1005 
1006 
isLongPipeType(G4_Type type) const1007 bool G4_INST::isLongPipeType(G4_Type type) const
1008 {
1009     if (builder.hasPartialInt64Support())
1010     {
1011         return type == Type_DF;
1012     }
1013     return IS_TYPE_LONG(type);
1014 }
1015 
isIntegerPipeType(G4_Type type) const1016 bool G4_INST::isIntegerPipeType(G4_Type type) const
1017 {
1018     if (IS_TYPE_INTEGER(type))
1019     {
1020         return true;
1021     }
1022 
1023     if (builder.hasPartialInt64Support())
1024     {
1025         return type == Type_UQ || type == Type_Q;
1026     }
1027 
1028     return false;
1029 }
1030 
isJEUPipeInstructionXe() const1031 bool G4_INST::isJEUPipeInstructionXe() const
1032 {
1033     if (op == G4_jmpi ||
1034         op == G4_if ||
1035         op == G4_else ||
1036         op == G4_endif ||
1037         op == G4_break ||
1038         op == G4_join ||
1039         op == G4_cont ||
1040         op == G4_while ||
1041         op == G4_brc ||
1042         op == G4_brd ||
1043         op == G4_goto ||
1044         op == G4_call ||
1045         op == G4_return)
1046     {
1047         return true;
1048     }
1049     return false;
1050 }
1051 
1052 
isLongPipeInstructionXe() const1053 bool G4_INST::isLongPipeInstructionXe() const
1054 {
1055     if (isJEUPipeInstructionXe())
1056     {
1057         return false;
1058     }
1059 
1060     if (!distanceHonourInstruction())
1061     {
1062         return false;
1063     }
1064 
1065     if (builder.hasFixedCycleMathPipeline() &&
1066         isMath())
1067     {
1068         return false;
1069     }
1070 
1071 
1072     const G4_Operand* dst = getDst();
1073     if (dst && isLongPipeType(dst->getType()))
1074     {
1075         return true;
1076     }
1077 
1078     if (!builder.hasPartialInt64Support())
1079     {
1080         for (int i = 0; i < G4_MAX_SRCS; i++)
1081         {
1082             const G4_Operand* src = getSrc(i);
1083             if (src && isLongPipeType(src->getType()))
1084             {
1085                 return true;
1086             }
1087         }
1088     }
1089 
1090     return false;
1091 }
1092 
isIntegerPipeInstructionXe() const1093 bool G4_INST::isIntegerPipeInstructionXe() const
1094 {
1095     if (isJEUPipeInstructionXe())
1096     {
1097         return true;
1098     }
1099 
1100     if (!distanceHonourInstruction())
1101     {
1102         return false;
1103     }
1104 
1105     if (isLongPipeInstructionXe())
1106     {
1107         return false;
1108     }
1109 
1110 
1111     if (builder.hasFixedCycleMathPipeline() &&
1112         isMath())
1113     {
1114         return false;
1115     }
1116     if (op == G4_fcvt)
1117     {
1118         return false;
1119     }
1120     if (op == G4_srnd)
1121     {
1122         return false;
1123     }
1124 
1125     G4_Operand* dst = getDst();
1126     if (dst && isIntegerPipeType(dst->getType()))
1127     {
1128         return true;
1129     }
1130 
1131     if (builder.hasQ2FInIntegerPipe() && dst->getType() == Type_F)
1132     {
1133         const G4_Operand* src = getSrc(0);
1134         if (src && (src->getType() == Type_Q || src->getType() == Type_UQ))
1135         {
1136             return true;
1137         }
1138     }
1139 
1140     if (!dst)
1141     {
1142         const G4_Operand* src = getSrc(0);
1143         if (src && isIntegerPipeType(src->getType()))
1144         {
1145             return true;
1146         }
1147     }
1148 
1149     return false;
1150 }
1151 
isFloatPipeInstructionXe() const1152 bool G4_INST::isFloatPipeInstructionXe() const
1153 {
1154     if (isJEUPipeInstructionXe())
1155     {
1156         return false;
1157     }
1158 
1159     if (!distanceHonourInstruction())
1160     {
1161         return false;
1162     }
1163 
1164 
1165     if (isLongPipeInstructionXe())
1166     {
1167         return false;
1168     }
1169 
1170     if (builder.hasFixedCycleMathPipeline() &&
1171         isMath())
1172     {
1173         return false;
1174     }
1175     if (opcode() == G4_fcvt)
1176     {
1177         return true;
1178     }
1179     if (opcode() == G4_srnd)
1180     {
1181         return true;
1182     }
1183 
1184     const G4_Operand* dst = getDst();
1185     if (dst &&
1186         (dst->getType() == Type_F ||
1187             dst->getType() == Type_HF ||
1188             dst->getType() == Type_BF))
1189     {
1190         if (builder.hasQ2FInIntegerPipe() && dst->getType() == Type_F)
1191         {
1192             const G4_Operand* src = getSrc(0);
1193             if (src && (src->getType() == Type_Q || src->getType() == Type_UQ))
1194             {
1195                 return false;
1196             }
1197         }
1198         return true;
1199     }
1200 
1201     if (!dst)
1202     {
1203         const G4_Operand* src = getSrc(0);
1204         if (src &&
1205             (src->getType() == Type_F ||
1206                 src->getType() == Type_HF ||
1207                 src->getType() == Type_BF))
1208         {
1209             return true;
1210         }
1211     }
1212 
1213     return false;
1214 }
1215 
getDataTypePipeXe(G4_Type type)1216 SB_INST_PIPE G4_INST::getDataTypePipeXe(G4_Type type)
1217 {
1218     switch (type)
1219     {
1220     case Type_UB:
1221     case Type_B:
1222     case Type_UW:
1223     case Type_W:
1224     case Type_UD:
1225     case Type_D:
1226     case Type_UV:
1227     case Type_V:
1228         return PIPE_INT;
1229 
1230     case Type_Q:
1231     case Type_UQ:
1232         if (builder.hasPartialInt64Support())
1233         {
1234             return PIPE_INT;
1235         }
1236         return PIPE_LONG;
1237 
1238     case Type_DF:
1239         return PIPE_LONG;
1240 
1241     case Type_HF:
1242     case Type_F:
1243     case Type_VF:
1244     case Type_NF:
1245     case Type_BF:
1246         return PIPE_FLOAT;
1247 
1248     default:
1249         return PIPE_NONE;
1250     }
1251 
1252     return PIPE_NONE;
1253 }
1254 
getInstructionPipeXe()1255 SB_INST_PIPE G4_INST::getInstructionPipeXe()
1256 {
1257 
1258     if (isLongPipeInstructionXe())
1259     {
1260         return PIPE_LONG;
1261     }
1262 
1263     if (isIntegerPipeInstructionXe())
1264     {
1265         return PIPE_INT;
1266     }
1267 
1268     if (isFloatPipeInstructionXe())
1269     {
1270         return PIPE_FLOAT;
1271     }
1272 
1273     if (builder.hasFixedCycleMathPipeline() &&
1274         isMath())
1275     {
1276         return PIPE_MATH;
1277     }
1278 
1279     if (tokenHonourInstruction())
1280     {
1281         if (isDpas())
1282         {
1283             return PIPE_DPAS;
1284         }
1285         if (isMathPipeInst())
1286         {
1287             return PIPE_MATH;
1288         }
1289         if (isSend())
1290         {
1291             return PIPE_SEND;
1292         }
1293 
1294         ASSERT_USER(0, "Wrong token pipe instruction!");
1295     }
1296 
1297     ASSERT_USER(hasNoPipe(), "No pipe instruction");
1298     return PIPE_NONE;
1299 }
1300 
// Formats 't' as an uppercase hex string (no "0x" prefix), zero-padded to
// at least 'cols' digits.
//
// Byte-sized types are widened before insertion so they print as numbers
// rather than characters. The value is masked to its unsigned byte value
// first, so a negative signed char prints as e.g. "FF" instead of the
// sign-extended "FFFFFFFF" (the previous behavior).
template <typename T>
static std::string fmtHexBody(T t, int cols = 0)
{
    std::stringstream ss;
    ss << std::hex << std::setw(cols) << std::uppercase << std::setfill('0');
    if (sizeof(t) == 1) // char/unsigned char: print the byte value as an int
        ss << (static_cast<int>(t) & 0xFF);
    else
        ss << t;
    return ss.str();
}
1313 
// Formats 't' as uppercase hex with a "0x" prefix, zero-padded to at
// least 'cols' digits.
template <typename T>
static std::string fmtHex(T t, int cols = 0)
{
    return "0x" + fmtHexBody(t, cols);
}
1321 
1322 
#ifdef _DEBUG
// Debug-only helper: pretty-prints one def-use edge to 'os' -- the
// defining instruction, the using instruction, and the specific operand
// of the user (at position 'pos') that the def feeds.
static void printDefUseImpl(
    std::ostream &os, G4_INST *def, G4_INST *use, Gen4_Operand_Number pos)
{
    os << "\n  def: ";
    def->emit(os);
    os << "\n user: ";
    use->emit(os);
    os << "\n opnd: ";
    use->getOperand(pos)->emit(os);
    os << "\n";
}
#endif
1336 
dumpDefUse(std::ostream & os)1337 void G4_INST::dumpDefUse(std::ostream &os)
1338 {
1339 #if _DEBUG
1340     std::cerr << "\n------------ defs ------------\n";
1341     for (auto&& UD : defInstList)
1342     {
1343         printDefUseImpl(std::cerr, UD.first, this, UD.second);
1344     }
1345     std::cerr << "\n------------ uses ------------\n";
1346     for (auto&& DU : useInstList)
1347     {
1348         printDefUseImpl(std::cerr, this, DU.first, DU.second);
1349     }
1350 #endif
1351 }
1352 
1353 namespace {
1354     // Customized def-use iterator comparison. Do not compare itself
1355     // but the content it is pointing to.
1356     struct def_less
1357     {
operator ()__anon352c3f950711::def_less1358         bool operator()(DEF_EDGE_LIST_ITER a, DEF_EDGE_LIST_ITER b) const
1359         {
1360             if (a->first < b->first)
1361             {
1362                 return true;
1363             }
1364             else if ((a->first == b->first) && (a->second < b->second))
1365             {
1366                 return true;
1367             }
1368             return false;
1369         }
1370     };
1371 }
1372 
getSingleDef(Gen4_Operand_Number opndNum,bool MakeUnique)1373 G4_INST *G4_INST::getSingleDef(Gen4_Operand_Number opndNum, bool MakeUnique)
1374 {
1375     if (MakeUnique)
1376     {
1377         std::set<DEF_EDGE_LIST_ITER, def_less> found;
1378         for (auto I = def_begin(); I != def_end(); /* empty */)
1379         {
1380             if (!found.insert(I).second)
1381             {
1382                 I = defInstList.erase(I);
1383             }
1384             else
1385             {
1386                 ++I;
1387             }
1388         }
1389     }
1390 
1391     G4_INST *def = 0;
1392     unsigned def_count = 0;
1393     for (auto I = def_begin(), E = def_end(); I != E; ++I)
1394     {
1395         if (I->second == opndNum)
1396         {
1397             if (++def_count > 1) return 0;
1398             def = I->first;
1399         }
1400     }
1401 
1402     return def;
1403 }
1404 
1405 // add def-use between this instruction <--> inst[srcPos]
1406 // Note that this function does not check for duplicates
addDefUse(G4_INST * inst,Gen4_Operand_Number srcPos)1407 void G4_INST::addDefUse(G4_INST* inst, Gen4_Operand_Number srcPos)
1408 {
1409     MUST_BE_TRUE(srcPos == Opnd_dst ||
1410         srcPos == Opnd_src0 || srcPos == Opnd_src1 ||
1411         srcPos == Opnd_src2 || srcPos == Opnd_src3 ||
1412         srcPos == Opnd_src4 || srcPos == Opnd_src5 ||
1413         srcPos == Opnd_src6 || srcPos == Opnd_src7 ||
1414         srcPos == Opnd_pred ||
1415         srcPos == Opnd_implAccSrc, "unexpected operand number");
1416     useInstList.emplace_back(inst, srcPos);
1417     inst->defInstList.emplace_back(this, srcPos);
1418 }
1419 
// exchange def/use info of src0 and src1 after they are swapped.
void G4_INST::swapDefUse(Gen4_Operand_Number srcIxA, Gen4_Operand_Number srcIxB)
{
    DEF_EDGE_LIST_ITER iter = defInstList.begin();
    // To avoid redundant define and use items: each defining instruction's
    // use list is fixed up exactly once (see handledDefInst check below).
    INST_LIST handledDefInst;

    // since ACC is only exposed in ARCTAN intrinsic translation, there is no instruction split with ACC
    while (iter != defInstList.end())
    {
        // Swap the operand position recorded on this def edge.
        if ((*iter).second == srcIxB)
        {
            (*iter).second = srcIxA;
        }
        else if ((*iter).second == srcIxA)
        {
            (*iter).second = srcIxB;
        }
        else
        {
            // Edge is for an unrelated operand position; leave it alone.
            iter++;
            continue;
        }
        // The inner loop below swaps every matching use edge of a def inst
        // in one pass, so skip def insts that were already processed --
        // re-processing would swap their edges back.
        if (std::find(handledDefInst.begin(), handledDefInst.end(), (*iter).first) != handledDefInst.end())
        {
            iter++;
            continue;
        }
        handledDefInst.push_back((*iter).first);
        // change uselist of def inst
        USE_EDGE_LIST_ITER useIter = (*iter).first->useInstList.begin();
        for (; useIter != (*iter).first->useInstList.end(); useIter++)
        {
            if ((*useIter).first == this)
            {
                if ((*useIter).second == srcIxB)
                {
                    (*useIter).second = srcIxA;
                }
                else if ((*useIter).second == srcIxA)
                {
                    (*useIter).second = srcIxB;
                }
            }
        }
        iter++;
    }
}
1468 
// returns true if inst is a commutable binary instruction and its two sources can be swapped
bool G4_INST::canSwapSource() const
{
    if (getNumSrc() != 2)
    {
        return false;
    }

    if (!INST_COMMUTATIVE(opcode()))
    {
        return false;
    }

    G4_Operand* src0 = getSrc(0);
    G4_Operand* src1 = getSrc(1);
    // src1 restrictions: no ARF, no VXH.
    // (src0 is the operand checked here because after the swap it will
    // occupy the src1 slot.)
    if (src0->isSrcRegRegion())
    {
        G4_SrcRegRegion* src0Region = src0->asSrcRegRegion();
        if (src0Region->isAreg() || src0Region->getRegion()->isRegionWH())
        {
            return false;
        }
    }

    // src0 restrictions: no Imm.
    // (src1 is checked because after the swap it will occupy the src0 slot.)
    if (src1->isImm() || src1->isAddrExp())
    {
        return false;
    }

    // special check for mul: don't put DW on src1
    if (opcode() == G4_mul)
    {
        if (IS_DTYPE(src0->getType()) && !IS_DTYPE(src1->getType()))
        {
            return false;
        }
    }

    return true;
}
1511 // fix src2 def/use to implicitSrc def/use
fixMACSrc2DefUse()1512 void G4_INST::fixMACSrc2DefUse()
1513 {
1514     if (op != G4_mac)
1515     {
1516         return;
1517     }
1518     for (DEF_EDGE_LIST_ITER iter = defInstList.begin();
1519         iter != defInstList.end();
1520         iter++)
1521     {
1522         if ((*iter).second == Opnd_src2)
1523         {
1524             (*iter).second = Opnd_implAccSrc;
1525             G4_INST* defInst = (*iter).first;
1526             for (USE_EDGE_LIST_ITER useIter = defInst->useInstList.begin();
1527                 useIter != defInst->useInstList.end();
1528                 ++useIter)
1529             {
1530                 if (((*useIter).first == this) &&
1531                     ((*useIter).second == Opnd_src2))
1532                 {
1533                     (*useIter).second = Opnd_implAccSrc;
1534                     break;
1535                 }
1536             }
1537             break;
1538         }
1539     }
1540 }
1541 
1542 // a raw move is a move with
1543 // -- no saturation or src modifiers
1544 // -- same dst and src type
1545 // -- no conditional modifier (predicate is ok)
isRawMov() const1546 bool G4_INST::isRawMov() const
1547 {
1548     return op == G4_mov && !sat && dst->getType() == srcs[0]->getType() &&
1549         getCondMod() == NULL &&
1550         (srcs[0]->isImm() ||
1551         (srcs[0]->isSrcRegRegion() && srcs[0]->asSrcRegRegion()->getModifier() == Mod_src_undef));
1552 }
1553 
hasACCSrc() const1554 bool G4_INST::hasACCSrc() const
1555 {
1556     if (implAccSrc ||
1557         (srcs[0] && srcs[0]->isSrcRegRegion() && srcs[0]->asSrcRegRegion()->isAccReg()))
1558     {
1559         return true;
1560     }
1561     return false;
1562 }
1563 
1564 // check if acc is possibly used by this instruction
hasACCOpnd() const1565 bool G4_INST::hasACCOpnd() const
1566 {
1567     return (isAccWrCtrlInst() ||
1568         implAccSrc ||
1569         implAccDst ||
1570         (op == G4_mulh &&
1571         IS_DTYPE(srcs[0]->getType()) && IS_DTYPE(srcs[1]->getType())) ||
1572         (dst && dst->isAccReg()) ||
1573         (srcs[0] && srcs[0]->isAccReg()) ||
1574         (srcs[1] && srcs[1]->isAccReg()) ||
1575         (srcs[2] && srcs[2]->isAccReg()) ||
1576         op == G4_madw);
1577 }
1578 
getOpExecType(int & extypesize)1579 G4_Type G4_INST::getOpExecType(int& extypesize)
1580 {
1581     G4_Type extype;
1582     if (isRawMov())
1583     {
1584         extype = srcs[0]->getType();
1585     }
1586     else
1587     {
1588         extype = getExecType2();
1589     }
1590     if (IS_VINTTYPE(extype))
1591     {
1592         extypesize = numEltPerGRF<Type_UB>()/2;
1593     }
1594     else if (IS_VFTYPE(extype))
1595     {
1596         extypesize = numEltPerGRF<Type_UB>();
1597     }
1598     else
1599     {
1600         extypesize = TypeSize(extype);
1601     }
1602 
1603     return extype;
1604 }
1605 
// Classifies the mov 'Inst' (srcTy with modifier srcMod -> dstTy) as one
// of: Copy, ZExt/SExt/Trunc (integer), FPUpConv/FPDownConv (FP), or
// IntToFP/FPToInt (conversions). Returns SuperMov for combinations that
// cannot be classified as a simple mov.
static G4_INST::MovType getMovType(
    const G4_INST* Inst, G4_Type dstTy, G4_Type srcTy, G4_SrcModifier srcMod)
{
    // COPY when dst & src types are the same.
    if (dstTy == srcTy)
        return G4_INST::Copy;

    bool dstIsFP = IS_TYPE_FLOAT_ALL(dstTy);
    bool srcIsFP = IS_TYPE_FLOAT_ALL(srcTy);

    // If dst & src are not both FPs or both Integers, that MOV must be
    // conversions from Integer to FP or vice versa.
    if (dstIsFP != srcIsFP) {
        if (dstIsFP)
            return G4_INST::IntToFP;

        ASSERT_USER(srcIsFP, "Unexpected source type!");
        return G4_INST::FPToInt;
    }

    // If they are both FPs, that MOV must be either up or down conversion.
    // Note it could not be a COPY as dst & src are different here.
    if (dstIsFP) {
        ASSERT_USER(srcIsFP, "Unexpected source type!");

        // TODO: Do we need to treat 'vf' differently?

        if (TypeSize(srcTy) < TypeSize(dstTy))
            return G4_INST::FPUpConv;

        ASSERT_USER(TypeSize(srcTy) > TypeSize(dstTy),
            "Unexpected FP source and destination type sizes!");
        return G4_INST::FPDownConv;
    }

    // They are both Integers. The destination signedness is ignored here to
    // detect the mov type as it really does not matter without saturation nor
    // condition modifier.

    ASSERT_USER(!IS_VINTTYPE(dstTy),
                "Unexpected immediate types are used as dst type!");

    // Always treat 'v' as SExt as they will always be extended even for
    // BYTE-sized types.
    if (srcTy == Type_V) {
        // If the sign bit is 0, then zext is the same as sext.
        // prefer zext as it allows more propagation.
        G4_Operand *Op0 = Inst->getSrc(0);
        if (Op0->isImm() && Op0->asImm()->isSignBitZero())
            return G4_INST::ZExt;
        return G4_INST::SExt;
    }

    // Always treat 'uv' as ZExt as they will always be extended even for
    // BYTE-sized types.
    if (srcTy == Type_UV)
        return G4_INST::ZExt;

    // Treat that mov as truncation.
    if (TypeSize(srcTy) > TypeSize(dstTy))
    {
        // A non-NOT source modifier on a signed source is applied at the
        // wider type, so this cannot be treated as a plain truncation.
        if (IS_SIGNED_INT(srcTy) &&
           srcMod != Mod_src_undef &&
           srcMod != Mod_Not)
        {
            return G4_INST::SuperMov;
        }
        else
        {
            return G4_INST::Trunc;
        }
    }

    // Treat that mov as sign extend or zero extend based on the signedness of
    // the source type only.
    if (TypeSize(srcTy) < TypeSize(dstTy)) {
        if (IS_SIGNED_INT(srcTy)) {
            // Treat ABS as zero-extension (the result is non-negative).
            if (srcMod == Mod_Abs)
                return G4_INST::ZExt;
            // If the sign bit is 0, then zext is the same as sext.
            // prefer zext as it allows more propagation.
            G4_Operand *Op0 = Inst->getSrc(0);
            if (Op0->isImm() && Op0->asImm()->isSignBitZero())
                return G4_INST::ZExt;

            return G4_INST::SExt;
        }
        else if (srcMod == Mod_Minus || srcMod == Mod_Minus_Abs)
        {   // SrcMod=negate means that number is signed
            return G4_INST::SExt;
        }
        return G4_INST::ZExt;
    }

    // Otherwise, treat it as COPY they are the same in bit size.
    // Treat ABS as zero-extension.
    if (IS_SIGNED_INT(srcTy) && srcMod == Mod_Abs)
        return G4_INST::ZExt;
    return G4_INST::Copy;
}
1707 
// Check whether this instruction is a mov whose source may be copy-
// propagated into its uses. Returns the classified MovType (from
// getMovType) when propagation is legal; returns SuperMov when the mov
// must be kept. FPDownConvSafe is returned for the single-use unsafe-CP
// down-conversion case.
G4_INST::MovType G4_INST::canPropagate() const
{
    G4_Declare* topDcl = NULL;

    if (dst == NULL)
    {
        return SuperMov;
    }

    topDcl = dst->getTopDcl();

    if (op != G4_mov
        // Do not eliminate if either sat or condMod is present.
        || getSaturate() || getCondMod()
        // Do not eliminate if there's no use (dead or side-effect code?)
        || useInstList.size() == 0
        // Do not eliminate stack call return value passing instructions.
        // Do not eliminate vars marked with Output attribute
        || (topDcl && topDcl->isOutput()))
    {
        return SuperMov;
    }

    // can't propagate stack call related variables (Arg, Retval, SP, FP)
    if (topDcl)
    {
        G4_Declare* rootDcl = topDcl->getRootDeclare();
        if (builder.isPreDefFEStackVar(rootDcl) || builder.isPreDefArg(rootDcl) ||
            builder.isPreDefRet(rootDcl))
        {
            return SuperMov;
        }
    }


    // Do not eliminate MOV/COPY to Acc/flag registers.
    if (dst->isAccReg() || dst->isFlag())
    {
        return SuperMov;
    }

    // Retain side effect of writing to debug register.
    if (dst->isDbgReg())
    {
        return SuperMov;
    }

    G4_Operand *src = srcs[0];

    if (src->isRelocImm())
    {
        return SuperMov;
    }

    // only support flag propagation for simd1 copy moves
    if (src->isFlag())
    {
        if (getExecSize() != g4::SIMD1 || src->getType() != dst->getType())
        {
            return SuperMov;
        }
    }

    // Do not propagate through copy of `acc0` if its execution size does not match the native size,
    // as some latest passes (e.g., fixAddCSubb) rely on the acc0 copy move for correctness
    if (src->isAccReg() && getExecSize() != builder.getNativeExecSize())
    {
        return SuperMov;
    }

    if (builder.kernel.fg.globalOpndHT.isOpndGlobal(dst))
    {
        return SuperMov;
    }

    G4_Type dstType = dst->getType();
    G4_Type srcType = src->getType();

    // Platforms without a byte ALU cannot execute byte-typed operands.
    if (!builder.hasByteALU()
        && (TypeSize(dstType) == 1 || TypeSize(srcType) == 1))
    {
        return SuperMov;
    }

    G4_SrcModifier srcMod = Mod_src_undef;
    if (src->isSrcRegRegion()) {
        srcMod = src->asSrcRegRegion()->getModifier();
    }

    MovType MT = getMovType(this, dstType, srcType, srcMod);

    //Disabling mix mode copy propogation
    if (!builder.hasMixMode() &&
        ((IS_TYPE_F32_F64(srcType) && isLowPrecisionFloatTy(dstType)) ||
        (isLowPrecisionFloatTy(srcType) && IS_TYPE_F32_F64(dstType))))
    {
        return SuperMov;
    }

    // Selectively enable copy propagation on the detected mov type.
    switch (MT) {
    default:
        return SuperMov;
    case Copy:
    case ZExt:
    case SExt:
        // COPY and integer extending are allowed.
        break;
    case Trunc: {
        if (!src->isSrcRegRegion())
            return SuperMov;
        G4_SrcRegRegion *src0 = src->asSrcRegRegion();
        if (src0->getRegion()->isContiguous(getExecSize())) {
            // Truncation of a contiguous region is folded into a strided
            // read; the new horizontal stride must stay within HW limits.
            unsigned newHS = TypeSize(srcType) / TypeSize(dstType);
            if (newHS > 4) {
                // Rule out Q -> B. WHY?
                return SuperMov;
            }
        } else if (!src0->isScalar()) {
            return SuperMov;
        }
        break;
    }
    case FPUpConv:
        // For FPUpConv, only HF -> F is allowed.
        if (!(srcType == builder.getMixModeType() && dstType == Type_F))
            return SuperMov;
        break;
    case FPDownConv:
    {
        if (IS_TYPE_F32_F64(srcType) &&
            builder.getMixModeType() == dstType &&
            builder.getOption(vISA_enableUnsafeCP_DF) &&
            useInstList.size() == 1)
            return FPDownConvSafe;
        break;
    }
    // TODO: Enable IntToFP or vice versa on constant.
    }

    return MT;
}
1851 
canPropagateBinaryToTernary() const1852 bool G4_INST::canPropagateBinaryToTernary() const
1853 {
1854     if (opcode() != G4_add && opcode() != G4_mul)
1855         return false; // constrain just to a few ops for the moment
1856     else if (dst == nullptr)
1857         return false;
1858     else if (!dst->getBase()->isRegVar() && !dst->getBase()->isPhyGreg())
1859         return false; // must be GRF dst
1860     else if (dst->isIndirect())
1861         return false; // must not be indirect
1862     else if (dst->getHorzStride() != 1)
1863         return false; // must be <1>
1864     else if (
1865         dst->getType() != Type_D && dst->getType() != Type_UD &&
1866         dst->getType() != Type_Q && dst->getType() != Type_UQ)
1867         return false; // dst has to be :d or :ud (for now)
1868     else if (builder.kernel.fg.globalOpndHT.isOpndGlobal(dst))
1869         return false; // writes to globals must be visible
1870     else if (getNumSrc() != 2)
1871         return false; // must be binary
1872     else if (getPredicate())
1873         return false; // no predicates
1874     else if (getExecSize() != 1 && dst->getSubRegOff() != 0)
1875         return false; // must be dst.0 or SIMD1 to any subreg
1876     else if (getImplAccDst() || getImplAccSrc())
1877         return false; // no {AccWrEn}
1878     else if (getSaturate() || getCondMod())
1879         return false; // do not eliminate if either sat or condMod is present.
1880     else if (useInstList.size() == 0)
1881         return false; // do not eliminate if there's no use (dead or side-effect code?)
1882 
1883     G4_Declare* topDcl = dst->getTopDcl();
1884     if (topDcl) {
1885         // Do not eliminate stack call return value passing instructions.
1886         // Do not eliminate vars marked with Output attribute.
1887         if (topDcl->isOutput())
1888             return false;
1889         G4_Declare* rootDcl = topDcl->getRootDeclare();
1890         if (builder.isPreDefFEStackVar(rootDcl) || builder.isPreDefArg(rootDcl) ||
1891             builder.isPreDefRet(rootDcl))
1892         {
1893             // can't propagate stack call related variables (Arg, Retval, SP, FP)
1894             return false;
1895         }
1896     }
1897 
1898     for (int srcIx = 0; srcIx < getNumSrc(); srcIx++) {
1899         G4_Operand *src = srcs[srcIx];
1900 
1901         if (!src->isSrcRegRegion() && !src->isImm()) {
1902             return false; // only GRF
1903         } else if (src->isRelocImm()) {
1904             return false;
1905         }
1906         if (src->isSrcRegRegion()) {
1907             const G4_SrcRegRegion *srr = src->asSrcRegRegion();
1908             if (!srr->getBase()->isRegVar() && !srr->getBase()->isPhyGreg()) {
1909                 return false; // has to be GRF
1910             } else if (srr->isIndirect()) {
1911                 return false; // has to be direct
1912             }
1913         }
1914     }
1915 
1916     return true;
1917 }
1918 
// Check to see whether the given type is supported by this opcode + operand. Mainly focus on integer ops
// This is used by copy propagation and def-hoisting to determine if the resulting instruction is legal
bool G4_INST::isLegalType(G4_Type type, Gen4_Operand_Number opndNum) const
{
    bool isSrc = (opndNum == Opnd_src0 || opndNum == Opnd_src1 || opndNum == Opnd_src2);
    switch (op)
    {
    default:
        // ToDo: Make this function more complete by adding more opcodes
        // keep alphabetical order when adding to make it easier to maintain
        return true;
    case G4_addc:
        return type == Type_UD;
    case G4_bfe:
    case G4_bfi1:
    case G4_bfi2:
        // additionally check src and dst have same type
        // (for a src operand compare against dst; for the dst compare against src0)
        return (type == Type_D || type == Type_UD) &&
         (isSrc ? type == dst->getType() : type == getSrc(0)->getType());
    case G4_bfrev:
        return type == Type_UD;
    case G4_cbit:
        return type == Type_UB || type == Type_UW || type == Type_UD;
    case G4_fbh:
        return type == Type_D || type == Type_UD;
    case G4_fbl:
        return type == Type_UD;
    case G4_lzd:
        return type == Type_D || type == Type_UD;
    case G4_sad2:
    case G4_sada2:
        return type == Type_B || type == Type_UB;
    case G4_subb:
        return type == Type_UD;
    case G4_mov:
        // Avoid mov r7.0<1>:hf  0x76543210:v
        if (IS_VINTTYPE(type) &&
            (IS_FTYPE(dst->getType()) || IS_HFTYPE(dst->getType())))
        {
            return false;
        }
        return true;
    case G4_bfn:
        // do not allow copy propagation to change BFN operand type
        if (isSrc && type != getOperand(opndNum)->getType())
        {
            return false;
        }
        // fall through: bfn otherwise shares add3's W/UW/D/UD type check
    case G4_add3:
        return type == Type_W || type == Type_UW || type == Type_D || type == Type_UD;
    }
}
1972 
1973 // returns true if inst supports only F type for both src and dst
isFloatOnly() const1974 bool G4_INST::isFloatOnly() const
1975 {
1976     switch (op)
1977     {
1978     default:
1979         return false;
1980     case G4_dp2:
1981     case G4_dp3:
1982     case G4_dp4:
1983     case G4_dph:
1984     case G4_frc:
1985     case G4_line:
1986     case G4_lrp:
1987     case G4_pln:
1988     case G4_rndd:
1989     case G4_rnde:
1990     case G4_rndu:
1991     case G4_rndz:
1992         return true;
1993     }
1994 }
1995 
1996 /// isSignSensitive() - Check whether this instruction is sign sensitive on the
1997 /// specified source operand.
isSignSensitive(Gen4_Operand_Number opndNum) const1998 bool G4_INST::isSignSensitive(Gen4_Operand_Number opndNum) const
1999 {
2000     const G4_Operand *use = getOperand(opndNum);
2001     G4_Type useType = use->getType();
2002     G4_Type dstType = dst->getType();
2003 
2004     // If extending is required, most of insts are sign sensitive.
2005     if (TypeSize(dstType) > TypeSize(useType)) {
2006         return true;
2007     }
2008 
2009     switch (op) {
2010     case G4_asr:
2011         if (opndNum != Opnd_src0)
2012             break;
2013         // FALL THROUGH
2014     case G4_mach:
2015     case G4_fbh:
2016     case G4_mulh:
2017     case G4_sel:
2018     case G4_cmp:
2019     case G4_cmpn:
2020     case G4_madw:
2021         return true;
2022     case G4_mov:
2023         // inttofp is sign sensitive
2024         return IS_TYPE_INT(useType) && IS_TYPE_FLOAT_ALL(dstType);
2025     default:
2026         break;
2027     }
2028     // By default, inst is regarded as sign insensitive.
2029     return false;
2030 }
2031 
// Compute the operand type to use if the MOV 'mov' (already classified as
// MovType MT) were copy-propagated into this instruction's operand 'opndNum'.
// Returns Type_UNDEF when the propagation is not legal for that MovType.
G4_Type G4_INST::getPropType(
    Gen4_Operand_Number opndNum, MovType MT, const G4_INST *mov) const
{
    const G4_Operand *use = getOperand(opndNum);
    G4_Type useType = use->getType();
    G4_Type srcType = mov->getSrc(0)->getType();

    // Source modifier on the MOV's source, if it is a register region.
    G4_SrcModifier srcMod = Mod_src_undef;
    if (mov->getSrc(0)->isSrcRegRegion()) {
        srcMod = mov->getSrc(0)->asSrcRegRegion()->getModifier();
    }
    // Source modifier on the use operand, if it is a register region.
    G4_SrcModifier useMod = Mod_src_undef;
    if (use->isSrcRegRegion()) {
        useMod = use->asSrcRegRegion()->getModifier();
    }

    bool useIsFP = IS_TYPE_FLOAT_ALL(useType);
    bool srcIsFP = IS_TYPE_FLOAT_ALL(srcType);
    // Different numeric type.
    bool diffNumTy = useIsFP != srcIsFP;

    // TODO: Once we handle IntToFp, this condition should be checked
    // individually for each MovType.

    switch (MT) {
    case Copy:
        // Different numeric type with src mod cannot be propagated.
        if (diffNumTy && srcMod != Mod_src_undef)
            return Type_UNDEF;
        // Fp is simply to use useType.
        if (useIsFP)
            return useType;
        // Int needs to consider whether the use is sign-sensitive and the src
        // modifier.
        if (isSignSensitive(opndNum)) {
            switch (srcMod) {
            case Mod_Not:
            case Mod_Minus:
            case Mod_Minus_Abs:
                // A negating/inverting modifier cannot be reinterpreted
                // through an unsigned use type.
                if (IS_UNSIGNED_INT(useType))
                    return Type_UNDEF;
                // Assume the combination of srcMod/srcType is valid.
                // FALL THROUGH
            case Mod_Abs:
                return srcType;
            default:
                break;
            }
        }
        else if (srcMod == Mod_Abs && IS_UNSIGNED_INT(useType) &&
                 IS_SIGNED_INT(srcType))
            return srcType;
        return useType;
    case ZExt:
        // Different numeric type with src zero-extended cannot be propagated.
        if (diffNumTy)
            return Type_UNDEF;
        // (sext (zext x)) is equal to (zext x)
        return srcType;
    case SExt:
        // Different numeric type with src sign-extended cannot be propagated.
        if (diffNumTy)
            return Type_UNDEF;
        // (zext (sext x)) is not equal to (sext x)
        if (IS_UNSIGNED_INT(useType))
            return Type_UNDEF;
        // Check if there's any modifier on the use.
        switch (useMod) {
        case Mod_Not:
        case Mod_Minus:
        case Mod_Minus_Abs:
            if (IS_QTYPE(useType) && IS_DTYPE(srcType)) {
                // (- (sext x)) is not equal to (sext (-x)) due to the corner case
                // where x is INT_MIN and -x is still INT_MIN without being
                // extended.
                return Type_UNDEF;
            }
            // FALL THROUGH
        default:
            break;
        }
        return srcType;
    case Trunc:
        if (diffNumTy)
            return Type_UNDEF;
        // Truncation always use the useType but the original source operand.
        // As a result, region needs changing to access the truncated bits
        // only.
        return useType;
    case FPUpConv:
        // Different numeric type with src up-converted cannot be propagated.
        if (diffNumTy)
            return Type_UNDEF;
        return srcType;
    case FPDownConvSafe:
        // Value-safe FP down-conversion keeps the original (wider) src type.
        return srcType;
    default:
        break;
    }

    return Type_UNDEF;
}
2134 
isLegalImmType(G4_Type type)2135 static bool isLegalImmType(G4_Type type)
2136 {
2137     return type != Type_BF;
2138     return true;
2139 }
2140 
2141 // cases that we do not propagate
2142 // 0. use inst does not support the type of the operand being propagated
2143 // 1. use inst is align16 instruction
2144 // 2. first source of line
2145 // 3. indirect source to compressed instructions or math instructions
2146 // 4. byte src to if/while instructions
2147 // 5. src with modifier to logic inst on BDW
2148 // 6. When useinst is lifetime.end
2149 // 7. use inst does not have dst
// Decide whether this MOV instruction's source may be copy-propagated into
// operand 'opndNum' of 'useInst'.
//   MT            - classification of this MOV (Copy/ZExt/SExt/Trunc/...)
//   inSimdFlow    - true if this def sits inside divergent SIMD control flow
//   statelessAddr - true when propagating a stateless address, which relaxes
//                   several size/region checks below
// Returns false on any condition that would make the propagation illegal or
// unprofitable; returns true only if every check passes.
bool G4_INST::canPropagateTo(
    G4_INST *useInst, Gen4_Operand_Number opndNum, MovType MT, bool inSimdFlow, bool statelessAddr)
{
    G4_Operand *src = srcs[0];
    bool indirectSrc = src->isSrcRegRegion() &&
                       src->asSrcRegRegion()->getRegAccess() != Direct;
    bool hasModifier = src->isSrcRegRegion() &&
                       src->asSrcRegRegion()->getModifier() != Mod_src_undef;
    G4_Type dstType = dst->getType();
    G4_Type srcType = src->getType();

    G4_Operand *use = useInst->getOperand(opndNum);
    G4_Type useType = use->getType();

    //If the operand to be copied is acc register, need to check if the use operand can use acc register
    if (src->isAccReg())
    {
        if (!useInst->canSrcBeAccBeforeHWConform(opndNum))
        {
            return false;
        }
    }

    if (useInst->is2SrcAlign16())
    {
        // don't copy propagate for the legacy dp* instructions,
        // as we are missing some HW conformity checks for them
        return false;
    }

    // Skip lifetime.
    if (useInst->isLifeTimeEnd())
    {
        return false;
    }

    // Skip dpas as it has no region (maybe too conservative)
    if (useInst->isDpas())
    {
        return false;
    }

    // skip the instruction has no dst. e.g. G4_pseudo_fcall
    if (useInst->getDst() == nullptr)
        return false;

    // limit flag copy propagation to opcode known to work for now
    if (src->isFlag() && (useInst->opcode() != G4_not && useInst->opcode() != G4_and))
    {
        return false;
    }

    if (isMixedMode())
    {
        // FIXME: what's this for?
        if (execSize < g4::SIMD16 && MT == FPDownConvSafe && useInst->execSize == g4::SIMD16 &&
            !useInst->isMixedMode())
        {
            return false;
        }

        G4_opcode useOp = useInst->opcode();

        // Mixed-mode (HF/F) sources are only propagated into this small
        // whitelist of opcodes.
        if (useOp != G4_mov &&
            useOp != G4_mul &&
            useOp != G4_pseudo_mad &&
            useOp != G4_add &&
            useOp != G4_sel &&
            useOp != G4_cmp)
        {
            return false;
        }
    }
    else if (srcType != useType && (useInst->opcode() == G4_mulh || useInst->opcode() == G4_madw))
    {
        // don't propagate widening ops into a mul/mach
        //   mov  T:d  SRC:w
        //   ...
        //   mach ... T:d ...
        // mach requires 32b types only
        return false;
    }


    // special checks for message desc/extended desc, which must be either a0 or imm
    if (useInst->isSend())
    {
        auto msgDescOpnd = useInst->isSplitSend() ? Opnd_src2 : Opnd_src1;
        if (opndNum == msgDescOpnd)
        {
            if (!src->isImm() && !src->isAddress())
            {
                return false;
            }
        }
        if (opndNum == Opnd_src3)
        {
            // there are some HW restrictions that prevent imm exdesc (e.g., on MRT write),
            // so we conservatively disable copy prop here
            return false;
        }
    }

    // The following are copied from local dataflow analysis.
    // TODO: re-examine..
    if (((opndNum == Opnd_src0 && useInst->isSend()) && !statelessAddr) ||
        (opndNum == Opnd_src1 && useInst->isSplitSend()))
    {
        return false;
    }

    // True for a pseudo-mad whose dst is any floating-point type.
    auto isFloatPseudoMAD = [](G4_INST *inst)
    {
        return inst->opcode() == G4_pseudo_mad && IS_TYPE_FLOAT_ALL(inst->getDst()->getType());
    };

    //     mov (16|M0) r47.0 1:w
    // (W) add (16|M0) r49.0 r47.0 r45.0
    //
    // FIXME: remove this once DU/UD chain are computed correctly.
    //
    // Only skip when the defInst ('this') is defined in SIMD CF.
    if (useInst->isWriteEnableInst() && !isWriteEnableInst() && inSimdFlow)
    {
        return false;
    }

    if (useInst->opcode() == G4_fcvt)
    {
        // fcvt is not allowed to have immediate src.
        if (src->isImm() ||
            !src->isSrcRegRegion() ||
            !(src->asSrcRegRegion()->getRegion()->isContiguous(useInst->getExecSize())))
        {
            return false;
        }
    }
    if (useInst->opcode() == G4_srnd)
    {
        // srnd rZ.0<1>:ub  rX.0<1;1,0>:hf rY.0<1;1,0>:hf
        //   operands should be packed.
        if (useInst->getDst()->getType() == Type_UB &&
            src->isSrcRegRegion() &&
            !(src->asSrcRegRegion()->getRegion()->isContiguous(useInst->getExecSize())))
        {
            return false;
        }
    }

    // Immediates cannot go into float pseudo-mad or math, nor replace a use
    // that carries a source modifier; indirect sources are likewise barred
    // from float pseudo-mad and math.
    if (src->isImm())
    {
        if (isFloatPseudoMAD(useInst) || useInst->opcode() == G4_math ||
            use->asSrcRegRegion()->hasModifier())
        {
            return false;
        }
    } else if (indirectSrc &&
               (isFloatPseudoMAD(useInst) || useInst->opcode() == G4_math))
    {
        return false;
    }
    if (getGRFSize() == 64 &&
        (useInst->opcode() == G4_dpas || useInst->opcode() == G4_dpasw) &&
        (opndNum == Opnd_src0 || opndNum == Opnd_src1))
    {
        uint32_t leftBoundInBytes = src->getLeftBound() * src->getTypeSize();
        // left bound should be 2grf aligned to propagate into dpas.
        if (leftBoundInBytes % (numEltPerGRF<Type_UB>()*2))
        {
            return false;
        }
    }

    // FIXME: to add specific checks for other instructions.
    G4_opcode useInst_op = useInst->opcode();

    if (useInst_op == G4_madm || (useInst->isMath() && useInst->asMathInst()->isIEEEMath()))
    {
        // do not propagate if useInst uses mme registers
        return false;
    }
    if ((useInst_op == G4_line && opndNum == Opnd_src0) ||
        (hasModifier && G4_Inst_Table[useInst_op].instType == InstTypeLogic))
    {
        return false;
    }

    bool isVxHSrc = indirectSrc && src->asSrcRegRegion()->getRegion()->isRegionWH();
    if (isVxHSrc && (useInst->getExecSize() != execSize || execSize >= g4::SIMD8))
    {
        // copy propagating VxH region may result in address spills later so it's usually a net loss
        return false;
    }

    if ((useInst_op == G4_asr || useInst_op == G4_shl || useInst_op == G4_shr) &&
        opndNum == Opnd_src0 && src->getTypeSize() < use->getTypeSize())
    {
        // Handle cases such as
        //     mov  A:q  B:d
        //     asr  r:d  A:q  C:q
        //  if C is immediate and its value is in 0:31 (for d), it is okay to prop;
        //  otherwise, no.
        G4_Operand* src1 = useInst->getOperand(Opnd_src1);
        if (src1->isImm())
        {
            // shiftAmt is LSB[0:useTypeBits - 1]
            int64_t v = src1->asImm()->getImm();
            uint32_t shiftAmt = (uint32_t)((uint64_t)v & (use->getTypeSize()*8 - 1));
            uint32_t nbits = 8 * src->getTypeSize();
            // Shifting by at least the narrower type's width would produce
            // different results after propagation.
            if (shiftAmt >= nbits)
            {
                return false;
            }
        }
        else
        {
            return false;
        }
    }

    // In general, to check whether that MOV could be propagated:
    //
    //  dst/T1 = src/T0;
    //  op(..., use/T2, ...);
    //
    // We need firstly check whether 'dst' and 'use' are exactly the same
    // variable regardless data type.

    // Check T1 and T2 has the same bit/byte size. Otherwise, it's not legal to
    // be propagated.
    // TODO: Revisit later if exection mask is guaranteed to be NoMask.
    if (TypeSize(dstType) != TypeSize(useType) && !statelessAddr) {
        return false;
    }

    // Do not propagate if def type is float and use type is int, or vice
    // versa.
    // NOTE: Such usage is possible from bitcast (not through this MOV but the
    // reference in the use insn) from one type to another.
    // TODO: Revisit this later to handle the case where this MOV insn is
    // exactly a COPY. The useType should be used instead.
    if (MT != Copy && ((IS_TYPE_FLOAT_ALL(dstType) && IS_TYPE_INT(useType)) ||
                       (IS_TYPE_INT(dstType) && IS_TYPE_FLOAT_ALL(useType))))
    {
        return false;
    }

    // A plain copy with a source modifier must not change the type the
    // modifier is evaluated under.
    if (MT == Copy &&
        hasModifier &&
        dstType != useType)
    {
        return false;
    }

    if (hasModifier && !useInst->canSupportSrcModifier())
    {
        return false;
    }

    // Check 'dst' of MOV and 'use' are the same variable. Otherwise, it's not
    // legal to be propagated.
    G4_CmpRelation rel = dst->compareOperand(use);
    if (rel != Rel_eq)
    {
        return false;
    }

    // Type to be used after propagation. Use srcType by default.
    G4_Type propType = useInst->getPropType(opndNum, MT, this);

    if (propType == Type_UNDEF || (src->isImm() && !isLegalImmType(propType)))
    {
        return false;
    }

    // bfloat specific checks
    if (propType == Type_BF)
    {
        // If the useInst is G4_pseudo_mad and the use operand has source modifier, a invalid bf->bf mov with source modifier
        // may be inserted in fixMADInst(). So avoid propagating to G4_pseudo_mad source with source modifier.
        // TODO: a mov is not always inserted for G4_pseudo_mad source with source modifier since gen mad inst supports source
        // modifier. So for the no mov inserted case, avoid propagating may miss this opotimize. So, do we need to check if a mov
        // is really needed for G4_pseudo_mad source here? But the same check code in fixMADInst() seems very complicated?
        if (use->asSrcRegRegion()->hasModifier() && (useInst->isMov() || useInst->opcode() == G4_pseudo_mad))
        {
            // BF_CVT does not like source modifier
            return false;
        }
        if (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar() &&
            useInst->opcode() != G4_mov)
        {
            // HW has bug with scalar bfloat in mix mode instructions
            return false;
        }
        if (useInst->getDst()->getType() != Type_F)
        {
            // we currently don't handle BF->HF or BF->DF conversion
            return false;
        }
    }

    // Don't propagate unsupported propType.
    if (!useInst->isLegalType(propType, opndNum))
    {
        return false;
    }

    // TODO: Revisit this later as IntToFp could be folded on specific insts,
    // such as add, cmp, and mul, when types of all source operands could be
    // consistent.
    if (!(useInst->isRawMov() && dstType == useType) &&
        !(MT == Copy && propType == useType) &&
        ((IS_FTYPE(dstType) && (IS_TYPE_INT(propType) || IS_VINTTYPE(propType))) ||
         (IS_TYPE_INT(dstType) && (IS_FTYPE(propType) || IS_VFTYPE(propType)))))
    {
        return false;
    }

    // Only propagate when this MOV is the unique reaching def of the use.
    if (useInst->getSingleDef(opndNum) == nullptr)
    {
        return false;
    }

    // Cannot generally safely propagate replicated vectors.
    unsigned dstElSize = TypeSize(dstType);
    unsigned srcElSize = TypeSize(propType);
    unsigned useElSize = TypeSize(useType);

    const RegionDesc *rd =
        src->isSrcRegRegion() ? src->asSrcRegRegion()->getRegion() : nullptr;
    G4_ExecSize newExecSize = useInst->getExecSize();
    if ((useElSize != dstElSize && !statelessAddr) &&
        (!src->isSrcRegRegion()
         || rd->isRepeatRegion(execSize)
         || !(rd->isFlatRegion() && rd->isPackedRegion())))
    {
        return false;
    }

    // Skip propagate scalar copies into the additive operand (src2) of integer
    // pseudo mad.
    if (!builder.hasAlign1Ternary())
    {
        if (opndNum == Opnd_src2 && useInst->opcode() == G4_pseudo_mad &&
            IS_TYPE_INT(useType) && rd && rd->isScalar())
            return false;
    }

    // Check repeat region
    bool sameDefUseELSize = (dstElSize == useElSize);
    bool sameExecSize = (execSize == newExecSize);
    const RegionDesc *useRd =
        use->isSrcRegRegion() ? use->asSrcRegRegion()->getRegion() : nullptr;
    bool repeatUseRegion = useRd && useRd->isRepeatRegion(newExecSize);
    bool scalarUse = useRd && useRd->isScalar();
    bool repeatSrcRegion = (rd && rd->isRepeatRegion(execSize));
    if (!sameExecSize && !statelessAddr &&
        !((sameDefUseELSize && scalarUse) ||
          (!repeatUseRegion && rd && rd->isFlatRegion() && rd->isPackedRegion()) ||
          (repeatUseRegion && sameDefUseELSize && (src->isImm() || !repeatSrcRegion))))
    {
        return false;
    }

    // Be conserversative, do not bother to do complicated region compositions.
    // There are three variables to compute the composition:
    // (1) the dst stride
    // (2) the source region
    // (3) the use source region

    // dStride, the dst stride
    // stride1, stride2 must be positive
    auto isComposable = [=](unsigned dStride, unsigned stride1,
                            unsigned stride2) -> bool
    {
        MUST_BE_TRUE(stride1 && stride2, "scalar region not expected");

        // composition is rd1 (or rd2).
        // If two variables are trivial, then the other variable could be
        // arbitrary. E.g.
        //
        // mov (8) V81(0,0)<1>:w V80(0,0)<1;1,0>:w
        // add (16) V82(0,0)<1>:w V81(0,0)<0;8,1>:w 0xa:w
        //
        // where rd1 has stride 1, dStride = 1, rd2 is non single strided.
        if ((stride1 == 1 && dStride == 1) || (stride2 == 1 && dStride == 1))
          return true;

        // If either stride1 or stride2 equals UndefVal, then there is no easy
        // formula to do the composition unless dStride == 1 and the other has
        // stride 1. This case is covered by the first check.
        //
        // To be composable, both regions need to be single strided (i.e. value
        // != UndefVal). This check is simplified by the value UndefVal (64).
        return stride1 * stride2 * dStride <= 32;
    };

    if (!sameExecSize && rd && useRd)
    {
        // the compoisition is also scalar.
        if (!rd->isScalar() && !useRd->isScalar())
        {
            G4_DstRegRegion *dstRegion = dst;
            uint16_t dstStride = dstRegion->getHorzStride();

            // A value to indicate this region is non-single strided.
            // Make it larger than 32 to simplify/unify the checking.
            const uint16_t UndefVal = 64;

            uint16_t stride1 = UndefVal;
            if (rd->isContiguous(execSize))
                stride1 = 1;
            else
                rd->isSingleNonUnitStride(execSize, stride1);

            uint16_t stride2 = UndefVal;
            if (useRd->isContiguous(newExecSize))
                stride2 = 1;
            else
                useRd->isSingleNonUnitStride(newExecSize, stride2);

            if (!isComposable(dstStride, stride1, stride2))
                return false;
        }
    }

    // check data type alignment
    if ((srcElSize < useElSize) &&
        (dstElSize == srcElSize) &&
        (execSize > g4::SIMD1) &&
        (!src->isImm()) &&
        ((src->getByteOffset() % useElSize) != 0))
    {
        return false;
    }

    if (src->isImm() && use->asSrcRegRegion()->hasModifier())
    {
        //FIXME: do we need to worry about signal bit in NaN being dropped?
        if (IS_TYPE_INT(srcType))
        {
            // we can't represent -(INT_MIN) or abs(INT_MIN)
            int64_t value = src->asImm()->getImm();
            switch (propType)
            {
            case Type_Q:
            case Type_UQ:
                return value != LLONG_MIN;
            default:
                return value != INT_MIN;
            }
        }
    }

    return true;
}
2606 
2607 // check if this inst can be hoisted
2608 // assume only MOV inst is checked
// Check whether this MOV is a candidate for def-hoisting, i.e. whether the
// copy it performs could be folded back into the instruction(s) defining its
// source. 'simdBB' indicates the block is under divergent SIMD control flow.
// Note: 'opt' is currently unused here.
bool G4_INST::canHoist(bool simdBB, const Options *opt) const
{
    assert(op == G4_mov && "defHoisting only handles mov");
    if (dst == NULL)
    {
        return false;
    }

    G4_Operand *src = srcs[0];
    // check attributes of src and number of defs
    bool archRegSrc = (src->isFlag() || src->isAreg() || src->isAddress());
    bool indirectSrc = (src->getTopDcl() && src->getTopDcl()->getAddressed()) || src->getRegAccess() != Direct;
    // Multiple defs are only handled in the simple case: no predicate,
    // direct dst access, and not inside divergent SIMD control flow.
    bool noMultiDefOpt = ((defInstList.size() > 1) &&
        (predicate || (dst->getRegAccess() != Direct) || simdBB));
    if (src->isImm() ||
        archRegSrc ||
        indirectSrc ||
        (src->isSrcRegRegion() && src->asSrcRegRegion()->getModifier() != Mod_src_undef) ||
        (defInstList.size() == 0) ||
        noMultiDefOpt)
    {
        return false;
    }

    // check type
    G4_Type dstType, srcType;
    dstType = dst->getType();
    srcType = src->getType();

    // no dst type promotion after hoisting
    if (!Is_Type_Included(dstType, srcType, builder) ||
        // if multi def, src and dst should have the same type size
        (defInstList.size() > 1 &&
        (Operand_Type_Rank(srcType) != Operand_Type_Rank(dstType) ||
        // if multidef and used as a scalar, execution size should be one.
        (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar() && execSize > g4::SIMD1))))
    {
        return false;
    }

    // no opt repeat region
    unsigned short src_wd = src->asSrcRegRegion()->getRegion()->width;
    if ((src_wd != execSize &&
        (src->asSrcRegRegion()->getRegion()->vertStride < (src_wd * src->asSrcRegRegion()->getRegion()->horzStride))) ||
        // actually we can hoist if src is a scalar and target inst has no pred or cond mod.
        (execSize > g4::SIMD1 && src->asSrcRegRegion()->isScalar()))
    {
        return false;
    }

    // Do not hoist away a copy whose source is a kernel output.
    if (src->getTopDcl() && src->getTopDcl()->isOutput())
    {
        return false;
    }

    return true;
}
2666 
2667 // check if this instruction can be hoisted to defInst
canHoistTo(const G4_INST * defInst,bool simdBB) const2668 bool G4_INST::canHoistTo(const G4_INST *defInst, bool simdBB) const
2669 {
2670     assert(op == G4_mov && "defHoisting only handles mov");
2671     bool indirect_dst = (dst->getRegAccess() != Direct);
2672 
2673     auto def_dst = defInst->getDst();
2674 
2675     if (!def_dst)
2676     {
2677         // can this actually happen?
2678         return false;
2679     }
2680     G4_Type defDstType = def_dst->getType();
2681     G4_Type dstType = dst->getType(), srcType = srcs[0]->getType();
2682     unsigned int srcElSize = TypeSize(srcType);
2683     unsigned int dstElSize = TypeSize(dstType);
2684     unsigned int defDstElSize = TypeSize(defDstType);
2685 
2686     // cannot hoist an accumulator access into an instruction
2687     // that doesn't have a dst hz stride that matches source
2688     //   def (..) T<1> .. acc:d
2689     //   use (..) ...<2>:d  T<1>
2690     // ==>
2691     //   def2 (..) ...<2>:d ... acc
2692     //                 ^ dst stride mismatch means we mustn't hoist
2693     if (defInst->useAcc() && dst->getExecTypeSize() != srcElSize) {
2694         return false;
2695     }
2696 
2697     bool rawMovInst = isRawMov();
2698     bool cantHoistMAD =
2699         (defInst->opcode() == G4_pseudo_mad &&
2700             !(IS_TYPE_FLOAT_ALL(dstType) && IS_TYPE_FLOAT_ALL(defDstType)));
2701     if ((defInst->useInstList.size() != 1) ||
2702         (defInst->opcode() == G4_sad2) ||
2703         (defInst->opcode() == G4_sada2) ||
2704         (defInst->opcode() == G4_cbit && dstType != defDstType) ||
2705         (defInst->opcode() == G4_dp4a && dstType != defDstType) ||
2706         ((cantHoistMAD || (defInst->opcode() == G4_math)) &&
2707          (indirect_dst || (dstType != defDstType && !rawMovInst))))
2708     {
2709         return false;
2710     }
2711 
2712     if (!defInst->isLegalType(dstType, Opnd_dst))
2713     {
2714         return false;
2715     }
2716 
2717     if (isMixedMode())
2718     {
2719         G4_opcode defOp = defInst->opcode();
2720 
2721         if (defOp != G4_mov &&
2722             defOp != G4_mul &&
2723             defOp != G4_pseudo_mad &&
2724             defOp != G4_add &&
2725             defOp != G4_sel &&
2726             defOp != G4_cmp)
2727         {
2728             return false;
2729         }
2730         if (!builder.hasMixMode())
2731         {
2732             // normally we should disable the opt, but for the special case where
2733             // defInst is a move with integer source, we can still hoist since it
2734             // won't produce a mixed mode inst
2735             if (!(defInst->isMov() && IS_TYPE_INT(defInst->getSrc(0)->getType())))
2736             {
2737                 return false;
2738             }
2739         }
2740         if (!builder.getOption(vISA_ignoreBFRounding) && dstType == Type_BF && defOp != G4_mov)
2741         {
2742             // F->BF move has RNE mode while mix mode BF uses RTZ due to HW bug
2743             // so we have to disallow the def-hoisting
2744             return false;
2745         }
2746     }
2747 
2748     if (dst->isAddress() && defInst->getNumSrc() == 3)
2749     {
2750         // no A0 dst for ternary instructions
2751         return false;
2752     }
2753 
2754     // compare boudaries and bitset
2755     if ((def_dst->getLeftBound() < srcs[0]->getLeftBound()) ||
2756         (def_dst->getRightBound() > srcs[0]->getRightBound()))
2757     {
2758         return false;
2759     }
2760 
2761     if (getSaturate() && !defInst->canSupportSaturate())
2762     {
2763         return false;
2764     }
2765 
2766     // check mixed type conversion
2767     // TODO: cleanup this part since mixed type check of the first half is already checked in canHoist.
2768     if ((!(defInst->isRawMov() && (defDstType == srcType)) &&
2769         ((IS_FTYPE(dstType) && (IS_TYPE_INT(srcType) || IS_VINTTYPE(srcType))) ||
2770         ((IS_FTYPE(srcType) || IS_VFTYPE(srcType)) && IS_TYPE_INT(dstType)))) ||
2771         (!rawMovInst &&
2772         ((IS_FTYPE(defDstType) && IS_TYPE_INT(defInst->getExecType())) ||
2773         (IS_FTYPE(defInst->getExecType())  && IS_TYPE_INT(defDstType)))))
2774     {
2775         return false;
2776     }
2777 
2778     if (!rawMovInst && (defInst->getSrc(0) &&
2779         (IS_DFTYPE(defInst->getSrc(0)->getType()) || IS_FTYPE(defInst->getSrc(0)->getType()))) &&
2780         (IS_SIGNED_INT(defDstType) || IS_UNSIGNED_INT(defDstType)))
2781     {
2782         // Sequence that should not be optimized:
2783         // mov V1:d    V2:df
2784         // mov V3:uw    V1:d
2785         //
2786         // This is *NOT* a candidate for:
2787         // mov V3:uw    V2:df
2788         //
2789         // In general, df/f->int performs saturation and unless value of
2790         // df/f is known, the result of mov may differ based on type
2791         // of dst.
2792         return false;
2793     }
2794 
2795     // no def hoisting for sends for now
2796     if (defInst->isSend())
2797     {
2798         return false;
2799     }
2800 
2801     if (defInst->opcode() == G4_mov && defInst->getSrc(0)->isFlag())
2802     {
2803         // TODO: check if use is a predicate, if not, can propagate?
2804         return false;
2805     }
2806 
2807     if (simdBB && (defInst->isWriteEnableInst() ^ isWriteEnableInst()))
2808     {
2809         // no opt if one isNoMask but the other is not
2810         return false;
2811     }
2812 
2813     if (defInst->getMaskOffset() != getMaskOffset() &&
2814         (simdBB || getPredicate() || getCondMod() ||
2815          defInst->getPredicate() || defInst->getCondMod()))
2816     {
2817         // no opt if their mask offset do not match,
2818         // and mov/defInst has flags
2819         return false;
2820     }
2821 
2822     if ((getPredicate() || getCondMod()) && (defInst->getPredicate() || defInst->getCondMod()))
2823     {
2824         // can't have both inst using flags
2825         return false;
2826     }
2827 
2828     bool same_type_size = def_dst->getTypeSize() == TypeSize(srcType);
2829     bool scalarSrc = srcs[0]->asSrcRegRegion()->isScalar();
2830     // handle predicated MOV and float def
2831     if ((getPredicate() && (execSize > g4::SIMD1) && !same_type_size) ||
2832         (IS_FTYPE(defDstType) && (defDstType != srcType) && (dstType != srcType)))
2833     {
2834         return false;
2835     }
2836 
2837     // if used as scalar and repeated region, dst should be packed
2838     // add(2) v2<1>:w v3 v4
2839     // mov(2) v5<2>:d  V2<0;1,0>:d
2840     if (scalarSrc && !same_type_size &&
2841         (execSize > g4::SIMD1) && (dst->getHorzStride() != 1))
2842     {
2843         return false;
2844     }
2845 
2846     // if indirect source is repeat region, or defhoisting will make it a repeat region,
2847     // no opt
2848     if (srcs[0]->asSrcRegRegion()->getRegion()->isRepeatRegion(execSize) &&
2849         !scalarSrc)
2850     {
2851         return false;
2852     }
2853 
2854     // check type conversion
2855     if (IS_SIGNED_INT(dstType) && (defInst->opcode() == G4_mov) &&
2856         (TypeSize(dstType) > srcElSize) &&
2857         ((IS_SIGNED_INT(defDstType) && IS_UNSIGNED_INT(defInst->getSrc(0)->getType())) ||
2858         (IS_UNSIGNED_INT(defDstType) && IS_SIGNED_INT(defInst->getSrc(0)->getType()))))
2859     {
2860         return false;
2861     }
2862 
2863     // check alignment and saturate
2864     if (((srcElSize > defDstElSize) || defInst->getSaturate()) && (srcType != dstType))
2865     {
2866         return false;
2867     }
2868 
2869     uint16_t dstHS = dst->getHorzStride();
2870     uint16_t srcHS = 0;
2871     const RegionDesc *srcRd = srcs[0]->asSrcRegRegion()->getRegion();
2872     if (!srcRd->isSingleStride(execSize, srcHS))
2873     {
2874         return false;
2875     }
2876     if ((srcElSize < defDstElSize) && ((dstHS > 1) || (srcHS > 1)))
2877     {
2878         return false;
2879     }
2880     if ((dstElSize != defDstElSize) && (srcElSize == dstElSize) &&
2881         (indirect_dst || ((dst->getByteOffset() % defDstElSize) != 0) ||
2882         (dstHS != srcHS)))
2883     {
2884         return false;
2885     }
2886 
2887     // dont hoist stack calls related variables (Arg, Retval, SP, FP)
2888     if (defInst->getDst() && defInst->getDst()->getTopDcl())
2889     {
2890         G4_Declare* defDstDcl = defInst->getDst()->getTopDcl()->getRootDeclare();
2891         if (builder.isPreDefFEStackVar(defDstDcl) || builder.isPreDefArg(defDstDcl) ||
2892             builder.isPreDefRet(defDstDcl))
2893         {
2894             return false;
2895         }
2896     }
2897 
2898     // For mov HF F, we have to check if the def Inst supports HF
2899     if (dstType != Type_F && defInst->isFloatOnly() && !isRawMov())
2900     {
2901         return false;
2902     }
2903 
2904     // Before:
2905     // or (8) V100(0,0)<1>:d ...
2906     // or (8) V100(1,0)<1>:d ...
2907     // mov (16) V101(0,0)<1>:b    V102(0,0)<16;16,1>:w <-- V102 is alias of V100
2908     // mov (16) V101(0,16)<1>:b   V102(1,0)<16;16,1>:w
2909 
2910     // After (invalid optimization):
2911     // or (8) V100(0,0)<1>:d ...
2912     // or (8) V100(0,4)<1>:d ...
2913     if (defDstType != srcType)
2914     {
2915         if (isRawMov() == false)
2916         {
2917             return false;
2918         }
2919     }
2920 
2921     // As dst's type of shl inst decides what shifting amt should be used,
2922     // make sure shifting amt would not be changed after doing hoisting.
2923     //    shl (1) V00(0,0)<1>:q V101(0,0):w  V102(0,0)<0;1,0>:q
2924     //    mov(1) V103(0, 0)<1>:b V100(0, 0)<0;1,0 >:q
2925     // Cannot do it for this case.
2926     if (defInst->opcode() == G4_shl || defInst->opcode() == G4_shr || defInst->opcode() == G4_asr)
2927     {
2928         uint32_t defSrc0Bytes = defInst->getSrc(0)->getTypeSize();
2929         bool QMode = (defDstElSize == 8 || defSrc0Bytes == 8);
2930         if ((QMode && defSrc0Bytes != 8 && dstElSize != 8) ||
2931             (!QMode && dstElSize == 8))
2932         {
2933             // Disable it; otherwise shift's mode is changed illegally!
2934             return false;
2935         }
2936     }
2937 
2938     // Cannot do hoisting if the use inst has src modifier.
2939     if (getSrc(0)->asSrcRegRegion()->hasModifier())
2940     {
2941         return false;
2942     }
2943 
2944     if (getGRFSize() == 64 &&
2945         (defInst->opcode() == G4_dpas || defInst->opcode() == G4_dpasw))
2946     {
2947         uint32_t leftBoundInBytes = dst->getLeftBound() * dst->getTypeSize();
2948         // left bound should be 2grf aligned to hoist dst into dpas.
2949         if (leftBoundInBytes % (numEltPerGRF<Type_UB>() * 2))
2950         {
2951             return false;
2952         }
2953     }
2954     if (defInst->opcode() == G4_fcvt)
2955     {
2956         return false;
2957     }
2958     if (defInst->opcode() == G4_srnd)
2959     {
2960         return false;
2961     }
2962 
2963     return true;
2964 }
2965 
2966 // check if the sources of an inst is commutative
2967 // besides the property shown in inst table, some INT MUL instructions
2968 // are not commutative due to HW restrictions
isCommutative() const2969 bool G4_INST::isCommutative() const
2970 {
2971     //TODO: we can invert condMod of cmp to swap sources
2972     if (!(G4_Inst_Table[op].attributes & ATTR_COMMUTATIVE) || op == G4_cmp)
2973         return false;
2974 
2975     // for mul we can do D*W but not W*D
2976     if (op == G4_mul)
2977     {
2978         if (IS_DTYPE(srcs[0]->getType()))
2979         {
2980             return false;
2981         }
2982     }
2983     return true;
2984 }
2985 
hasNULLDst() const2986 bool G4_INST::hasNULLDst() const
2987 {
2988     if (dst && dst->isNullReg())
2989     {
2990         return true;
2991     }
2992 
2993     return false;
2994 }
2995 
goodTwoGRFDst(bool & evenSplitDst)2996 bool G4_INST::goodTwoGRFDst(bool& evenSplitDst)
2997 {
2998     evenSplitDst = false;
2999     // The following applies to all platforms
3000     // The problem is , the first case is really an instruction with two destination registers.
3001     // in which case, hardware breaks into two operations. When this happens, hardware cannot update flag registers.
3002     // I.e., if execution size is 8 or less and the destination register is 2, flag updates are not supported.
3003     // -naveen
3004 
3005     if (!dst || hasNULLDst())
3006     {
3007         evenSplitDst = true;
3008         return true;
3009     }
3010     else
3011     {
3012         evenSplitDst = dst->evenlySplitCrossGRF(execSize);
3013         // check if elements evenly split between two GRFs
3014         if (evenSplitDst)
3015         {
3016             return true;
3017         }
3018         else
3019         {
3020             return false;
3021         }
3022     }
3023 }
3024 
// check if there is WAW, WAR, RAW dependency between the passing-in inst and this instruction
// there is no check for the case that two instructions are both send, since these checks are
// only used in def-hoisting and copy propagation
// WAR check: returns true if this instruction Writes a location that
// `inst` Reads. Our explicit dst, condMod (flag), and implicit acc dst
// are each compared against every read operand of `inst`
// (srcs 0-3, predicate, implicit acc src).
bool G4_INST::isWARdep(G4_INST* inst)
{
    // send message operand is not tracked here; msg0 stays null, so the
    // msg0 checks below are currently no-ops (kept for symmetry)
    G4_Operand* msg0 = NULL;
    G4_Operand* src0_0 = inst->getSrc(0);
    G4_Operand* src0_1 = inst->getSrc(1);
    G4_Operand* src0_2 = inst->getSrc(2);
    G4_Operand* src0_3 = inst->getSrc(3);
    G4_Operand* implicitSrc0 = inst->getImplAccSrc();
    G4_Predicate* pred0 = inst->getPredicate();

    G4_Operand* dst1 = dst;

    // our explicit GRF dst vs. everything inst reads
    if (dst1 && !hasNULLDst())
    {

        if (
            (src0_0 && src0_0->compareOperand(dst1) != Rel_disjoint) ||
            (src0_1 && src0_1->compareOperand(dst1) != Rel_disjoint) ||
            (src0_2 && src0_2->compareOperand(dst1) != Rel_disjoint) ||
            (src0_3 && src0_3->compareOperand(dst1) != Rel_disjoint) ||
            (msg0 && (msg0->compareOperand(dst1) != Rel_disjoint)) ||
            (pred0 && (pred0->compareOperand(dst1) != Rel_disjoint)) ||
            (implicitSrc0 && (implicitSrc0->compareOperand(dst1) != Rel_disjoint)))
        {
            return true;
        }
    }

    // our condMod (flag write) vs. inst's predicate or flag-typed srcs
    if (mod)
    {
        if ((pred0 && pred0->compareOperand(mod) != Rel_disjoint) ||
            (src0_0 && src0_0->isFlag() && src0_0->compareOperand(mod) != Rel_disjoint) ||
            (src0_1 && src0_1->isFlag() && src0_1->compareOperand(mod) != Rel_disjoint) ||
            (src0_2 && src0_2->isFlag() && src0_2->compareOperand(mod) != Rel_disjoint))
        {
            return true;
        }
    }

    // our implicit acc dst vs. inst's implicit acc src or acc-typed srcs
    if (implAccDst)
    {
        if ((implicitSrc0 && implicitSrc0->compareOperand(implAccDst) != Rel_disjoint) ||
            (src0_0 && src0_0->isAccReg() && src0_0->compareOperand(implAccDst) != Rel_disjoint) ||
            (src0_1 && src0_1->isAccReg() && src0_1->compareOperand(implAccDst) != Rel_disjoint) ||
            (src0_2 && src0_2->isAccReg() && src0_2->compareOperand(implAccDst) != Rel_disjoint))
        {
            return true;
        }
    }
    return false;
}
3079 
// WAW check: returns true if this instruction and `inst` write
// overlapping locations. Each write of `inst` (dst, implicit acc dst,
// condMod/flag) is compared against each write of this instruction.
bool G4_INST::isWAWdep(G4_INST *inst)
{
    G4_Operand *dst0 = inst->getDst();
    G4_Operand *dst1 = dst;
    G4_CondMod *cMod0 = inst->getCondMod();
    G4_CondMod *cMod1 = mod;
    G4_Operand *implicitDst0 = inst->getImplAccDst();
    G4_Operand *implicitDst1 = implAccDst;

    // a missing or null-reg dst on this instruction writes nothing
    bool NULLDst1 = !dst1 || hasNULLDst();

    // inst's explicit dst vs. our dst / implicit acc dst / flag write
    if (dst0 && !inst->hasNULLDst())
    {
        if ((!NULLDst1 && dst1->compareOperand(dst0) != Rel_disjoint) ||
            (implicitDst1 && implicitDst1->compareOperand(dst0) != Rel_disjoint) ||
            (cMod1 && cMod1->getBase() && cMod1->compareOperand(dst0) != Rel_disjoint))
        {
            return true;
        }
    }

    // inst's implicit acc dst vs. our dst / implicit acc dst
    if (implicitDst0)
    {
        if ((!NULLDst1 && dst1->compareOperand(implicitDst0) != Rel_disjoint) ||
            (implicitDst1 && implicitDst1->compareOperand(implicitDst0) != Rel_disjoint))
        {
            return true;
        }
    }

    // inst's flag (condMod) write vs. our dst / flag write
    if (cMod0 && cMod0->getBase())
    {
        if ((!NULLDst1 && dst1->compareOperand(cMod0) != Rel_disjoint) ||
            (cMod1 && cMod1->getBase() && cMod1->compareOperand(cMod0) != Rel_disjoint))
        {
            return true;
        }
    }

    return false;
}
isRAWdep(G4_INST * inst)3120 bool G4_INST::isRAWdep(G4_INST *inst)
3121 {
3122     G4_Operand *dst0 = inst->getDst();
3123     G4_CondMod *cMod0 = inst->getCondMod();
3124     G4_Operand *implicitDst0   = inst->getImplAccDst();
3125     G4_Operand *msg1 = NULL;
3126     G4_Predicate *pred1   = getPredicate();
3127     G4_Operand *src1_0 = getSrc(0);
3128     G4_Operand *src1_1 = getSrc(1);
3129     G4_Operand *src1_2 = getSrc(2);
3130     G4_Operand* src1_3 = getSrc(3);
3131     G4_Operand *implicitSrc1   = implAccSrc;
3132 
3133     bool NULLSrc1 = (opcode() == G4_math && src1_1->isNullReg());
3134     if (dst0 && !inst->hasNULLDst())
3135     {
3136         if ((src1_0 && src1_0->compareOperand(dst0) != Rel_disjoint) ||
3137             (src1_1 && !NULLSrc1 && src1_1->compareOperand(dst0) != Rel_disjoint) ||
3138             (src1_2 && src1_2->compareOperand(dst0) != Rel_disjoint) ||
3139             (src1_3 && src1_3->compareOperand(dst0) != Rel_disjoint) ||
3140             (msg1 && msg1->compareOperand(dst0) != Rel_disjoint) ||
3141             (pred1 && pred1->compareOperand(dst0) != Rel_disjoint) ||
3142             (implicitSrc1 && implicitSrc1->compareOperand(dst0) != Rel_disjoint))
3143         {
3144             return true;
3145         }
3146     }
3147 
3148     if (cMod0 && cMod0->getBase())
3149     {
3150         if ((pred1 && pred1->compareOperand(cMod0) != Rel_disjoint) ||
3151             (src1_0 && src1_0->isFlag() && src1_0->compareOperand(cMod0) != Rel_disjoint) ||
3152             (src1_2 && src1_2->isFlag() && src1_2->compareOperand(cMod0) != Rel_disjoint) ||
3153             (src1_1 && src1_1->isFlag() && src1_1->compareOperand(cMod0) != Rel_disjoint))
3154         {
3155             return true;
3156         }
3157     }
3158 
3159     if (implicitDst0)
3160     {
3161         if ((implicitSrc1 && implicitSrc1->compareOperand(implicitDst0) != Rel_disjoint) ||
3162             (src1_0 && src1_0->isAccReg() && src1_0->compareOperand(implicitDst0) != Rel_disjoint) ||
3163             (src1_2 && src1_2->isAccReg() && src1_2->compareOperand(implicitDst0) != Rel_disjoint) ||
3164             (src1_1 && src1_1->isAccReg() && src1_1->compareOperand(implicitDst0) != Rel_disjoint))
3165         {
3166             return true;
3167         }
3168     }
3169     return false;
3170 }
3171 
detectComprInst() const3172 bool G4_INST::detectComprInst() const
3173 {
3174     enum class ComprInstStates : unsigned char { U, T, F };
3175 
3176     G4_Type execType = getExecType();
3177     ComprInstStates comprInst = ComprInstStates::U;
3178 
3179     // Compressed instructions must have a minimum execution size of
3180     // at least 8.
3181     if (execSize < g4::SIMD8)
3182     {
3183         comprInst = ComprInstStates::F;
3184     }
3185 
3186     // Compressed instructions must have a minimum execution size of
3187     // at least 16 if the execution type is less than DF.
3188     else if (dst &&
3189              dst->getHorzStride() != UNDEFINED_SHORT &&
3190              dst->getType() != Type_UNDEF)
3191     {
3192         if ((unsigned)execSize * dst->getTypeSize() * dst->getHorzStride() >
3193             numEltPerGRF<Type_UB>())
3194         {
3195             comprInst = ComprInstStates::T;
3196         }
3197         else
3198         {
3199             comprInst = ComprInstStates::F;
3200         }
3201     }
3202 
3203     // Uncompressed instructions can only operate on a max of 4 DFs or
3204     // 8 DF4/F/DWs or 16 W/Bs (the only exception being packed byte
3205     // moves which always have destinations).
3206     else if ((unsigned)execSize * TypeSize(execType) > numEltPerGRF<Type_UB>())
3207     {
3208         comprInst = ComprInstStates::T;
3209     }
3210 
3211     else
3212     {
3213         comprInst = ComprInstStates::F;
3214     }
3215 
3216     return (comprInst == ComprInstStates::T);
3217 }
3218 
3219 /*
3220  * Check to see if the interpretation of the i/p src region is unaffected by
3221  * virtue of it making it a src of the compressed op, as opposed to (if
3222  * possible) it appearing within a regular uncompressed op with the same exec
3223  * size.
3224  * Register-indirect operands are NOT compression invariant. The following 4 rules
3225  * are used to determine compression invariant register-direct opnds:
3226  *    1. constants, scalars, and ARF regions/registers are always compression invariant
3227  *    2. if both the dst region and the i/p source region are native packed
3228  *       regions, and the GRF source region is additionally of type W/UW
3229  *    3. the src region covers (i.e. vs(region) * rows(region)) exactly two
3230  *       registers (strides allowed), except when the dst region is a native
3231  *       packed region and the GRF source has packed rows of type W/UW
3232  *    4. the first src of line op is always considered compression invariant
3233  *       (this is a special case quadruple region of <0;4,1>)
3234  * e.g.
3235  *   (Both srcs are compression invariant in the following examples)
3236  *      add (16) r10.0<1>:d  r12.0<0;1,0>:w  0x80:w {CC}
3237  *      add (16) r10.0<2>:w  r12.0<8;8,1>:d  r14.0<16;8,2>:w {CC}
3238  *      add (16) r10.0<1>:d  r12.0<16;8,2>:w r14.0<32;8,4>:b {CC}
3239  *      add (16) r10.0<1>:d  r12.0<8;8,1>:w  r14.0<8;8,1>:w {CC}
3240  *      add (16) r10.0<1>:d  r12.0<4;4,1>:w  r14.0<4;4,1>:d {CC}
3241  *      add (32) r10.0<1>:w  r12.0<8;8,1>:w  r14.0<16;8,2>:b {CC}
3242  *      add (8)  r10.0<1>:df r12.0<4;4,1>:df r14.0<4;4,1>:df {CC}
3243  *      mov (8)  r10.0<1>:df r12.0<4;4,1>:w {CC}
3244  *   (Only the first src is compression invariant in the following examples)
3245  *      add (16) r10.0<1>:d  r12.0<8;8,1>:w  r14.0<16;8,2>:b {CC}
3246  *      add (16) r10.0<2>:w  r14.0<32;8,1>:b r12.0<16;8,1>:w {CC}
3247  *      add (16) r10.0<2>:w  r12.0<4;4,1>:d  r14.0<8;8,1>:w {CC}
3248  *      add (32) r10.0<1>:w  r12.0<8;8,1>:w  r14.0<8;8,1>:b {CC}
3249  *   (Neither src is compression invariant in the following examples)
3250  *      add (16) r10.0<2>:w  r12.0<8;8,1>:w  r14.0<16;8,2>:b {CC}
3251  *      add (32) r10.0<1>:w  r12.0<8;8,1>:b  r14.0<8;8,1>:b {CC}
3252  *      mov (8)  r10.0<1>:df r12.0<4;4,1>:dw {CC}
3253  * Inputs:
3254  *      src - the i/p src operand region
3255  *      src_pos - the position that the src operand appears in the list
3256  *                of src operands
3257  * Assumptions:
3258  *    - this function is only valid for compressed ops and it is invalid
3259  *      to call it for uncompressed ops
3260  */
// See the rule list in the comment above: determines whether `src`
// (the operand at position `srcPos`) reads the same data whether the
// op is executed compressed or uncompressed.
// Only valid to call on compressed ops.
bool
G4_INST::isComprInvariantSrcRegion(G4_SrcRegRegion* src, int srcPos)
{
    if (src == NULL)
    {
        // absent operand: trivially invariant
        return true;
    }
    else if (src->isImm() || src->isAddrExp())
    {
        // rule 1: constants are always compression invariant
        return true;
    }
    else if (src->getRegAccess() != Direct)
    {
        // register-indirect operands are never compression invariant
        return false;
    }
    else if (src->getBase()->asRegVar()->getDeclare()->getRegFile() != G4_GRF &&
             src->getBase()->asRegVar()->getDeclare()->getRegFile() != G4_INPUT)
    {
         // rule 1: ARF regions/registers are always compression invariant
         return true;
    }

    const RegionDesc* region = src->getRegion();

    if (opcode() == G4_line && srcPos == 0)
    {
        // rule 4: first src of line is the special-case <0;4,1> region
        return true;
    }
    else if (region->isScalar())
    {
        // rule 1: scalars are always compression invariant
        return true;
    }
    else
    {
        // rules 2 & 3: compare the region's total byte footprint against
        // exactly two GRFs, with the packed-W/UW exception
        int num_rows  = getExecSize() / src->getRegion()->width;
        int type_sz   = (int)src->getTypeSize();
        int byte_size = src->getRegion()->vertStride * type_sz * num_rows;

        if (getDst() && getDst()->isNativePackedRegion() &&
            IS_WTYPE(src->getType())) {
            // rule 2: packed dst + packed W/UW src region is invariant;
            // packed rows only (not a fully packed region) is not
            if (src->isNativePackedRegion()) {
                return true;
            }
            else if (src->isNativePackedRowRegion()) {
                return false;
            }
        }
        if (byte_size == 2 * numEltPerGRF<Type_UB>()) {
            // rule 3: region covers exactly two registers
            return true;
        }
        else {
            return false;
        }
    }
}
3315 
isPartialWrite() const3316 bool G4_INST::isPartialWrite() const
3317 {
3318     G4_Predicate* aPred = predicate;
3319     if (aPred && aPred->isSameAsNoMask())
3320     {
3321         // equivalent to NoMask (W) without predicate
3322         aPred = nullptr;
3323     }
3324 
3325     return (aPred != NULL && op != G4_sel) || op == G4_smov;
3326 }
3327 
isPartialWriteForSpill(bool inSIMDCF) const3328 bool G4_INST::isPartialWriteForSpill(bool inSIMDCF) const
3329 {
3330     if (!getDst() || hasNULLDst())
3331     {
3332         // inst does not write to GRF
3333         return false;
3334     }
3335 
3336     if (isPartialWrite())
3337     {
3338         return true;
3339     }
3340 
3341     if (inSIMDCF && !isWriteEnableInst())
3342     {
3343         if (builder.usesStack() || !(builder.hasMaskForScratchMsg() && getDst()->getElemSize() == 4))
3344         {
3345             // scratch message only supports DWord mask
3346             // also we can't use the scratch message when under stack call
3347             return true;
3348         }
3349     }
3350 
3351     return false;
3352 }
3353 
3354 
isAccSrcInst() const3355 bool G4_INST::isAccSrcInst() const
3356 {
3357     if (srcs[0] && srcs[0]->isSrcRegRegion() && srcs[0]->asSrcRegRegion()->getBase()->isAccReg())
3358     {
3359         return true;
3360     }
3361     else if (getNumSrc() == 3 && srcs[1] != nullptr)
3362     {
3363         if (srcs[1]->isSrcRegRegion() && srcs[1]->asSrcRegRegion()->getBase()->isAccReg())
3364         {
3365             return true;
3366         }
3367     }
3368     return false;
3369 }
3370 
3371 // Check if this instruction has an explicit acc destination
isAccDstInst() const3372 bool G4_INST::isAccDstInst() const
3373 {
3374     if (dst != NULL && dst->getBase()->isAccReg())
3375     {
3376         return true;
3377     }
3378     return false;
3379 }
3380 
isArithAddr() const3381 bool G4_INST::isArithAddr() const
3382 {
3383     if (srcs[1] != NULL)
3384         return isArithmetic() && srcs[1]->isAddrExp();
3385     else
3386         return false;
3387 }
3388 
isMovAddr() const3389 bool G4_INST::isMovAddr() const
3390 {
3391     if (srcs[0] != NULL)
3392         return isMov() && srcs[0]->isAddrExp();
3393     return false;
3394 }
3395 
3396 //
3397 // Check if the operands of send instruction obey the symbolic register rule
3398 // ToDo: this is obsolete and should be removed
3399 //
isValidSymbolOperand(bool & dst_valid,bool * srcs_valid) const3400 bool G4_INST::isValidSymbolOperand(bool &dst_valid, bool *srcs_valid) const
3401 {
3402     MUST_BE_TRUE(srcs_valid, ERROR_INTERNAL_ARGUMENT);
3403 
3404     bool obeyRule = true;
3405     if (dst && dst->getBase()->isRegVar())
3406     {
3407         dst_valid = dst->obeySymbolRegRule();
3408         if (!dst_valid)
3409             obeyRule = false;
3410     }
3411     else
3412         dst_valid = false;                          // does not change obeyRule for non-register-variable
3413 
3414     for (unsigned i = 0; i < G4_MAX_SRCS; i++)
3415     {
3416         G4_Operand* src = getSrc(i);
3417         if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getBase()->isRegVar())
3418         {
3419             srcs_valid[i] = src->asSrcRegRegion()->obeySymbolRegRule();
3420             if (!srcs_valid[i])
3421                 obeyRule = false;
3422         }
3423         else
3424             srcs_valid[i] = false;                  // does not change obeyRule for non-register-variable
3425     }
3426 
3427     return obeyRule;
3428 }
3429 
getCondModBase() const3430 const G4_VarBase* G4_INST::getCondModBase() const
3431 {
3432     if (!getCondMod())
3433         return nullptr;
3434 
3435     return getCondMod()->getBase();
3436 }
3437 
isOptBarrier() const3438 bool G4_INST::isOptBarrier() const
3439 {
3440     if (op == G4_join)
3441     {
3442         return true;
3443     }
3444 
3445     if (isIntrinsic() && asIntrinsicInst()->getIntrinsicId() == Intrinsic::MemFence)
3446     {
3447         return true;
3448     }
3449 
3450     // any instructions that access special ARFs is considered a opt barrier
3451     // this includes any ARF that is not address/flag/acc
3452     if (dst != NULL)
3453     {
3454         if (dst->isAreg())
3455         {
3456             if (dst->isNReg() ||
3457                 dst->isSrReg() ||
3458                 dst->isCrReg() ||
3459                 dst->isTmReg() ||
3460                 dst->isTDRReg())
3461             {
3462                 return true;
3463             }
3464         }
3465     }
3466 
3467     for (int i = 0; i < getNumSrc(); i++)
3468     {
3469         if (getSrc(i))
3470         {
3471             if (getSrc(i)->isAreg())
3472             {
3473                 if (getSrc(i)->isNReg() ||
3474                     getSrc(i)->isSrReg() ||
3475                     getSrc(i)->isCrReg() ||
3476                     getSrc(i)->isTmReg() ||
3477                     getSrc(i)->isTDRReg())
3478                 {
3479                     return true;
3480                 }
3481             }
3482         }
3483     }
3484     return false;
3485 }
3486 
3487 
emitPredWrEn(std::ostream & output,G4_INST & inst)3488 static void emitPredWrEn(std::ostream& output, G4_INST &inst)
3489 {
3490     G4_Predicate *pred = inst.getPredicate();
3491     bool isNoMask = (inst.getOption() & InstOpt_WriteEnable) != 0;
3492 
3493     if (pred) {
3494         output << "(";
3495         if (isNoMask)
3496             output << "W&";
3497         pred->emit_body(output, false);
3498         output << ") ";
3499     } else if (isNoMask) {
3500         output << "(W) ";
3501     } else {
3502         output << "    "; // align for predication (.....)
3503     }
3504 }
3505 
emitExecSize(std::ostream & output,const G4_INST & inst)3506 static void emitExecSize(std::ostream& output, const G4_INST &inst)
3507 {
3508     auto execSize = static_cast<int>(inst.getExecSize());
3509     if (inst.opcode() != G4_nop && inst.opcode() != G4_wait)
3510     {
3511         output << '(';
3512         if (execSize == UNDEFINED_EXEC_SIZE) {
3513             output << "??";
3514         } else {
3515             output << execSize;
3516         }
3517         if (int execOffset = inst.getMaskOffset()) {
3518             // non-zero channel offset
3519             output << "|M" << execOffset;
3520         }
3521         output << ") ";
3522     }
3523 }
3524 
// the syntax column width of beginning instruction info
3526 //  (P1.0) and (16)     ...
3527 //         nop
3528 //         and (16|M0)  ...
3529 //                      ^ aligns operand start to same place here
3530 static const int INST_START_COLUMN_WIDTH = 24;
3531 
3532 // emits the first part of an instruction in an aligned column
emitInstructionStartColumn(std::ostream & output,G4_INST & inst)3533 static void emitInstructionStartColumn(std::ostream& output, G4_INST &inst)
3534 {
3535     std::stringstream oupPfx;
3536     emitPredWrEn(oupPfx, inst);
3537 
3538     oupPfx << G4_Inst_Table[inst.opcode()].str;
3539     if (inst.isIntrinsic())
3540     {
3541         oupPfx << "." << inst.asIntrinsicInst()->getName();
3542         if (inst.isSpillIntrinsic())
3543         {
3544             oupPfx << "." << inst.asSpillIntrinsic()->getNumRows();
3545         }
3546         else if (inst.isFillIntrinsic())
3547         {
3548             oupPfx << "." << inst.asFillIntrinsic()->getNumRows();
3549         }
3550     }
3551     else if (inst.opcode() == G4_goto)
3552     {
3553         oupPfx << (inst.asCFInst()->isBackward() ? ".bwd" : ".fwd");
3554     }
3555     else if (inst.isBfn()) {
3556         oupPfx << "." << fmtHex(inst.asBfnInst()->getBooleanFuncCtrl(), 2);
3557     }
3558     else if (inst.isMath() && inst.asMathInst()->getMathCtrl() != MATH_RESERVED)
3559     {
3560         oupPfx << "." << MathOpNames[inst.asMathInst()->getMathCtrl()];
3561     }
3562 
3563     oupPfx << ' ';
3564     emitExecSize(oupPfx, inst);
3565 
3566     G4_CondMod *mod = inst.getCondMod();
3567     if (mod) {
3568         oupPfx << ' ';
3569         mod->emit(oupPfx);
3570     }
3571 
3572     std::string pfx = oupPfx.str();
3573     output << pfx;
3574     for (int i = 0; i < INST_START_COLUMN_WIDTH - (int)pfx.size(); i++)
3575         output << ' ';
3576 }
3577 
3578 
// Emits the full textual form of this instruction to `output`.
// For a label, prints "name:" (plus a "// do" hint for loop-start
// labels). Otherwise prints the aligned start column (predication,
// opcode, exec size, condMod), the dst, each src, scratch-offset info
// for spill/fill intrinsics, JIP/UIP labels for control flow, the
// option list, and an id trailer comment.
// symbol_dst / symbol_srcs select symbolic vs. physical register
// printing per operand (symbol_srcs may be null for all-physical).
void G4_INST::emit_inst(std::ostream& output, bool symbol_dst, bool *symbol_srcs)
{
    if (isLabel())
    {
        srcs[0]->emit(output);
        output << ":";
        if (((G4_Label*)srcs[0])->isStartLoopLabel())
            output << " // do";
    }
    else
    {
        // predication, opcode, execsize, condition, ...
        emitInstructionStartColumn(output, *this);

        // spill intrinsics print their scratch target in place of a dst
        if (isSpillIntrinsic())
        {
            output << ' ';
            output << "Scratch[" << asSpillIntrinsic()->getOffset() << "x" << numEltPerGRF<Type_UB>() << "]";
        }
        else if (dst)
        {
            output << ' ';
            if (sat)
                output << "(sat)";
            dst->emit(output, symbol_dst);
        } // else: may not have dst (e.g. branch)

        auto numSrcOpnds = getNumSrc();
        for (int i = 0; i < numSrcOpnds; i++)
        {
            if (getSrc(i))
            {
                output << "  ";
                if (symbol_srcs != NULL)
                {
                    getSrc(i)->emit(output, symbol_srcs[i]);  // emit symbolic/physical register depends on the flag
                }
                else
                {
                    getSrc(i)->emit(output, false);   // emit physical register
                }
            }
        }

        // fill intrinsics print their scratch source after the srcs
        if (isFillIntrinsic())
        {
            output << "  ";
            output << "Scratch[" << asFillIntrinsic()->getOffset() << "x" << numEltPerGRF<Type_UB>() << "] ";
        }

        // control-flow jump targets (JIP then UIP)
        if (isFlowControl() && asCFInst()->getJip())
        {
            output << "  ";
            asCFInst()->getJip()->emit(output);
        }

        if (isFlowControl() && asCFInst()->getUip())
        {
            output << "  ";
            asCFInst()->getUip()->emit(output);
        }

        // "{opts}" list, then the "// ids" trailer when a vISA offset exists
        emit_options(output);
        if (getCISAOff() != -1) {
            output << " // ";
            emitInstIds(output);
        }
    } // end: non-label
} // G4_INST::emit_inst
3648 
3649 
// Emits the id trailer for an instruction: "#srcLine:$visaId:&lexId:"
// followed by SWSB token locations (when SWSB is enabled) and the
// binary PC as "[hex]". Each id is printed only when set.
void G4_INST::emitInstIds(std::ostream& output) const
{
    int srcLine = getLineNo();
    if (srcLine != 0) {
        output << "#" << srcLine << ":";
    }

    int vISAId = getCISAOff();
    if (vISAId != -1) {
        output << "$" << vISAId << ":";
    }

    // NOTE(review): unsigned-vs-(-1) compare relies on the implicit
    // conversion to 0xFFFFFFFF, the "unset" marker — intentional
    uint32_t genId = getLexicalId();
    if (genId != -1) {
        output << "&" << genId << ":";
    }

    if (builder.hasSWSB())
    {
        // each SWSB token location prints as "token.depId:"
        unsigned tokenLocNum = getTokenLocationNum();
        for (unsigned i = 0; i < tokenLocNum; i++)
        {
            unsigned short token = 0;
            uint32_t depId = getTokenLoc(i, token);
            output << token << "." << depId << ":";
        }
    }

    // gen binary offset (PC), printed as 5-digit hex when available
    int64_t pc = getGenOffset();
    if (pc != -1) {
        output << "[" << fmtHexBody(pc, 5) << "]";
    }
}
3683 
3684 
//
// Here we add a parameter symbolreg instead of using the global option
// Options::symbolReg, because we should output non-symbolic registers
// when dumping dot files
//
void G4_INST::emit(std::ostream& output, bool symbolreg, bool dotStyle)
{
    bool dst_valid = true;
    bool srcs_valid[G4_MAX_SRCS];

    if (symbolreg)
    {
        // nop and labels have no symbolic operands to validate
        if (op==G4_nop || isLabel())
        {
            emit_inst(output, false, NULL);
            return;
        }

        //
        // Emit as comment if there is invalid operand, then emit instruction
        // based on the situation of operand
        //
        if (!isValidSymbolOperand(dst_valid, srcs_valid))
        {
            if (!dotStyle)
            {
                output << "//";
                // the commented line shows every operand symbolically
                bool srcs_valid1[G4_MAX_SRCS];
                for (unsigned i = 0; i < G4_MAX_SRCS; i++)
                    srcs_valid1[i] = true;
                emit_inst(output, true, srcs_valid1); // emit comments
                output << std::endl;
            }
        }
        emit_inst(output, dst_valid, srcs_valid); // emit instruction
    }
    else
        emit_inst(output, false, NULL); // emit instruction with physical register
}
3723 
operator <<(std::ostream & os,G4_INST & inst)3724 std::ostream& operator<<(std::ostream& os, G4_INST& inst)
3725 {
3726     inst.emit(os, false, false);
3727     return os;
3728 }
3729 
3730 // add instruction options; only wrap in braces {...}
3731 // if there's at least one option
3732 // instructions are assumed Align1 and only Align16 will be explicitly stated
emit_options(std::ostream & output) const3733 void G4_INST::emit_options(std::ostream& output) const
3734 {
3735     std::stringstream opts;
3736     bool first = true;
3737     auto emitOption = [&](const std::string &str) {
3738         if (first) {
3739             first = false;
3740         } else {
3741             opts << ",";
3742         }
3743         opts << str;
3744     };
3745 
3746 
3747     ////////////////////////////////////////////////////////////
3748     // SWSB options
3749     if (getDistance() != 0) {
3750         std::stringstream dists;
3751         switch (getDistanceTypeXe()) {
3752         case DistanceType::DIST:                    break;
3753         case DistanceType::DISTALL:   dists << 'A'; break;
3754         case DistanceType::DISTINT:   dists << 'I'; break;
3755         case DistanceType::DISTFLOAT: dists << 'F'; break;
3756         case DistanceType::DISTLONG:  dists << 'L'; break;
3757         case DistanceType::DISTMATH:  dists << 'M'; break;
3758         default:                      dists << "?"; break;
3759         }
3760         dists << '@' << (int)getDistance();
3761         emitOption(dists.str());
3762     }
3763 
3764     std::stringstream tks;
3765     std::string tks1;
3766     auto id = getToken();
3767     SWSBTokenType tkType = getTokenType();
3768     switch (tkType) {
3769     case TOKEN_NONE:
3770     case SB_SET:      break;
3771     case NoACCSBSet:  tks1 = "NoACC"; break;
3772     case AFTER_READ:  tks1 = ".R"; break;
3773     case AFTER_WRITE: tks1 = ".W"; break;
3774     case READ_ALL:    tks1 = ".R*"; break;
3775     case WRITE_ALL:   tks1 = ".W*"; break;
3776     default:          tks1 = ".??"; break;
3777     }
3778     if (tkType != TOKEN_NONE)
3779     {
3780         if (tkType != NoACCSBSet)
3781         {
3782             tks << '$' << (int)id << tks1;
3783         }
3784 
3785         if (tks1.size())
3786         {
3787             tks << tks1;
3788         }
3789         emitOption(tks.str());
3790     }
3791 
3792     ////////////////////////////////////////////////
3793     // bitset options
3794     G4_InstOpts currOpts = option;
3795     if (isEOT()) {
3796         currOpts |= InstOpt_EOT;
3797     }
3798 
3799     // strip out stuff we handle elsewhere
3800     currOpts &= ~(InstOpt_QuarterMasks | InstOpt_WriteEnable);
3801     unsigned short optIdx = 0;
3802     while (currOpts && 0xFFFFFFFF != InstOptInfo[optIdx].optMask)
3803     {
3804         if (currOpts & InstOptInfo[optIdx].optMask)
3805         {
3806             emitOption(InstOptInfo[optIdx].optStr);
3807             currOpts &= ~InstOptInfo[optIdx].optMask; // clear this bit
3808         }
3809         optIdx++;
3810     }
3811 
3812     ////////////////////////////////////////////////
3813     // for older Align16-supporting platforms
3814     // absense implies Align1
3815     if (isAligned16Inst()) {
3816         emitOption("Align16");
3817     }
3818 
3819     //////////////////////////////////////////////////
3820     // only include braces {...} if there's something
3821     auto optsStr = opts.str();
3822     if (!optsStr.empty())
3823         output << " {" << optsStr << "}";
3824 }
3825 
3826 
// Textual names for the operand-number enum (generated by the OPND_NUM_ENUM
// X-macro); indexed by the operand-role value stored in def/use lists.
static const char* const operandString[] =
{
    OPND_NUM_ENUM(STRINGIFY)
};
3831 
emitDefUse(std::ostream & output) const3832 void G4_INST::emitDefUse(std::ostream& output) const
3833 {
3834     output << "Def:\n";
3835     for (auto iter = defInstList.begin(), iterEnd = defInstList.end(); iter != iterEnd; ++iter)
3836     {
3837         G4_INST* inst = (*iter).first;
3838         inst->emit(output);
3839         output << "\t" << operandString[(*iter).second];
3840         output << "\n";
3841     }
3842     output << "Use:\n";
3843     for (auto iter = useInstList.begin(), iterEnd = useInstList.end(); iter != iterEnd; ++iter)
3844     {
3845         G4_INST* inst = (*iter).first;
3846         inst->emit(output);
3847         output << "\t" << operandString[(*iter).second];
3848         output << "\n";
3849     }
3850 }
3851 
isMixedMode() const3852 bool G4_INST::isMixedMode() const
3853 {
3854     if (mayExceedTwoGRF() || !getDst())
3855     {
3856         return false;
3857     }
3858     for (int i = 0; i < getNumSrc(); ++i)
3859     {
3860         G4_Operand *tOpnd = getSrc(i);
3861         if (!tOpnd)
3862         {
3863             continue;
3864         }
3865 
3866         G4_Type srcType = tOpnd->getType();
3867         G4_Type dstType = getDst()->getType();
3868 
3869         if ((dstType == builder.getMixModeType() || srcType == builder.getMixModeType()) &&
3870             dstType != srcType)
3871         {
3872             // do not consider int<->float conversion as mixed type
3873             if (!IS_TYPE_INT(dstType) && !IS_TYPE_INT(srcType))
3874             {
3875                 return true;
3876             }
3877         }
3878     }
3879 
3880     return false;
3881 }
3882 
setMsgDesc(G4_SendDesc * in)3883 void G4_InstSend::setMsgDesc(G4_SendDesc *in)
3884 {
3885     assert(in && "null descriptor not expected");
3886 #if defined(_DEBUG)
3887     if (in && in->getExecSize() == g4::SIMD_UNDEFINED)
3888     {
3889         DEBUG_MSG("Msg Desc has execSize undefined!\n");
3890     }
3891 #endif
3892     msgDesc = in;
3893     resetRightBound((G4_Operand*)dst);
3894     resetRightBound(srcs[0]);
3895 }
3896 
isDirectSplittableSend()3897 bool G4_InstSend::isDirectSplittableSend()
3898 {
3899     unsigned short elemSize = dst->getElemSize();
3900     SFID funcID = msgDesc->getSFID();
3901     const G4_SendDescRaw *desc = getMsgDescRaw();
3902     if (desc == nullptr) {
3903         // load/store messages are unsplittable for now
3904         return false;
3905     }
3906     switch (funcID)
3907     {
3908     case SFID::DP_DC1:
3909         switch (desc->getHdcMessageType())
3910         {
3911         case DC1_A64_SCATTERED_READ:   //emask need be vertically cut.
3912             return false;
3913 
3914         case DC1_A64_UNTYPED_SURFACE_READ:  //SVM gather 4: emask can be reused if the per-channel data is larger than 1 GRF
3915         case DC1_UNTYPED_SURFACE_READ:   //VISA gather 4
3916         case DC1_TYPED_SURFACE_READ:   //Gather 4 typed
3917             if (elemSize * execSize > (int)numEltPerGRF<Type_UB>() &&
3918                 elemSize * execSize % numEltPerGRF<Type_UB>() == 0)
3919             {
3920                 return true;
3921             }
3922             else
3923             {
3924                 return false;
3925             }
3926 
3927         default: return false;
3928         }
3929     case SFID::DP_DC2:
3930         switch (desc->getHdcMessageType())
3931         {
3932         case DC2_UNTYPED_SURFACE_READ:   //gather 4 scaled :  emask can be reused if the per-channel data is larger than 1 GRF
3933         case DC2_A64_UNTYPED_SURFACE_READ: //SVM gather 4 scaled
3934             if (elemSize * execSize > (int)numEltPerGRF<Type_UB>() &&
3935                 elemSize * execSize % numEltPerGRF<Type_UB>() == 0)
3936             {
3937                 return true;
3938             }
3939             else
3940             {
3941                 return false;
3942             }
3943 
3944         case DC2_BYTE_SCATTERED_READ:   //scaled byte scattered read: gather_scaled, handled as block read write, nomask
3945             return true;
3946 
3947         default: return false;
3948         }
3949     case SFID::DP_DC0:
3950         switch (desc->getHdcMessageType())
3951         {
3952         case DC_DWORD_SCATTERED_READ:   //dword scattered read: emask need be vertically cut according to splitting
3953         case DC_BYTE_SCATTERED_READ:       //byte scattered read
3954             return false;
3955         case DC_ALIGNED_OWORD_BLOCK_READ: //Nomask
3956         case DC_OWORD_BLOCK_READ:
3957             return true;
3958         default: return false;
3959         }
3960     case SFID::SAMPLER:
3961         return true;
3962     default: return false;
3963     }
3964 
3965     return false;
3966 }
3967 
3968 
3969 //
3970 // emit send instruction with symbolic/physical register operand depending on the operand check
3971 //
emit_send(std::ostream & output,bool symbol_dst,bool * symbol_srcs)3972 void G4_InstSend::emit_send(std::ostream& output, bool symbol_dst, bool *symbol_srcs)
3973 {
3974     emitInstructionStartColumn(output, *this);
3975 
3976     output << ' ';
3977     dst->emit(output, symbol_dst);
3978 
3979     output << ' ';
3980     G4_Operand* currSrc = srcs[0];
3981     if (currSrc->isSrcRegRegion()) {
3982         // only output reg var & reg off; don't output region desc and type
3983         currSrc->asSrcRegRegion()->emitRegVarOff(output, false);
3984     } else {
3985         currSrc->emit(output, false); //emit CurrDst
3986     }
3987     output << ' ';
3988 
3989     if (isSplitSend())
3990     {
3991         // emit src1
3992         srcs[1]->asSrcRegRegion()->emitRegVarOff(output, false);
3993         output << ' ';
3994     }
3995 
3996     // emit exDesc if srcs[3] is not null.
3997     // It should always be a0.2 unless it was constant folded
3998     if (isSplitSend() && srcs[3])
3999     {
4000         srcs[3]->emit(output, false);
4001         output << ' ';
4002     }
4003     else
4004     {
4005         if (getMsgDescRaw()) {
4006             std::ios::fmtflags outFlags(output.flags());
4007             output << fmtHex(getMsgDescRaw()->getExtendedDesc());
4008             output << ' ';
4009             output.flags(outFlags);
4010         }
4011     }
4012 
4013     // emit msgDesc (2 for sends and 1 for send). Last operand shown in asm.
4014     int msgDescId = isSplitSend() ? 2 : 1;
4015     srcs[msgDescId]->emit(output, false);
4016 
4017     emit_options(output);
4018 }
4019 
emit_send(std::ostream & output,bool dotStyle)4020 void G4_InstSend::emit_send(std::ostream& output, bool dotStyle)
4021 {
4022     emit_send(output, false, NULL);
4023 }
4024 
emit_send_desc(std::ostream & output)4025 void G4_InstSend::emit_send_desc(std::ostream& output)
4026 {
4027     const G4_INST* sendInst = this;
4028 
4029     // Emit a text description of the descriptor if it is available
4030     G4_SendDesc* msgDesc = sendInst->getMsgDesc();
4031     output << " // ";
4032     if (getCISAOff() != -1) {
4033         emitInstIds(output);
4034         output << "; ";
4035     }
4036 
4037     auto desc = msgDesc->getDescription();
4038     if (!desc.empty()) {
4039         output << msgDesc->getDescription();
4040     }
4041     if (const auto *rawDesc = sendInst->getMsgDescRaw()) {
4042     }
4043 
4044     output << ", resLen=" << msgDesc->getDstLenRegs();
4045     output << ", msgLen=" << msgDesc->getSrc0LenRegs();
4046     if (isSplitSend())
4047     {
4048         output << ", extMsgLen=" << msgDesc->getSrc1LenRegs();
4049     }
4050 
4051     if (msgDesc->isBarrier())
4052     {
4053         output << ", barrier";
4054     }
4055 }
4056 
4057 
4058 // print r#
emit(std::ostream & output,bool symbolreg)4059 void G4_Greg::emit(std::ostream& output, bool symbolreg)
4060 {
4061     output << "r" << getRegNum();
4062 }
4063 
emit(std::ostream & output,bool symbolreg)4064 void G4_Areg::emit(std::ostream& output, bool symbolreg)
4065 {
4066     switch (getArchRegType())
4067     {
4068     case AREG_NULL:    output << "null";  break;
4069     case AREG_A0:      output << "a0";    break;
4070     case AREG_ACC0:    output << "acc0";  break;
4071     case AREG_ACC1:    output << "acc1";  break;
4072     case AREG_MASK0:   output << "ce0";   break;
4073     case AREG_MS0:     output << "ms0";   break;
4074     case AREG_DBG:     output << "dbg0";  break;
4075     case AREG_SR0:     output << "sr0";   break;
4076     case AREG_CR0:     output << "cr0";   break;
4077     case AREG_TM0:     output << "tm0";   break;
4078     case AREG_N0:      output << "n0";    break;
4079     case AREG_N1:      output << "n1";    break;
4080     case AREG_IP:      output << "ip";    break;
4081     case AREG_F0:      output << "f0";    break;
4082     case AREG_F1:      output << "f1";    break;
4083     case AREG_TDR0:    output << "tdr0";  break;
4084     case AREG_SP:      output << "sp";    break;
4085     case AREG_F2:      output << "f2";    break;
4086     case AREG_F3:      output << "f3";    break;
4087     default:
4088         output << "unknown architecture reg";
4089         MUST_BE_TRUE(false, ERROR_UNKNOWN);
4090     }
4091 }
4092 
4093 //
// initialize all values identical to rgn's
4095 //
G4_SrcRegRegion(G4_SrcRegRegion & rgn)4096 G4_SrcRegRegion::G4_SrcRegRegion(G4_SrcRegRegion &rgn)
4097     : G4_Operand(G4_Operand::srcRegRegion), acc(rgn.acc), regOff(rgn.regOff), subRegOff(rgn.subRegOff)
4098 {
4099     base = rgn.base;
4100     mod = rgn.mod;
4101     immAddrOff = rgn.immAddrOff;
4102     desc = rgn.desc;
4103     type = rgn.type;
4104     // copy swizzle value
4105     char *sw1 = swizzle, *sw2 = rgn.swizzle;
4106     while (*sw2) *sw1++ = *sw2++;
4107     *sw1 = *sw2;
4108     accRegSel = rgn.accRegSel;
4109 
4110     // FIXME: it's rather suspicious that we are copying internal fields this way
4111     bitVec[0] = rgn.bitVec[0];
4112     bitVec[1] = rgn.bitVec[1];
4113 
4114     top_dcl = rgn.top_dcl;
4115     left_bound = rgn.left_bound;
4116     right_bound = rgn.right_bound;
4117     byteOffset = rgn.byteOffset;
4118     rightBoundSet = rgn.rightBoundSet;
4119 }
4120 
4121 //
4122 // return true if rng and this have the same reg region
4123 //
sameSrcRegRegion(G4_SrcRegRegion & rgn)4124 bool G4_SrcRegRegion::sameSrcRegRegion(G4_SrcRegRegion& rgn)
4125 {
4126     return base == rgn.base &&
4127            acc == rgn.acc &&
4128            mod == rgn.mod &&
4129            strcmp(swizzle,rgn.swizzle) == 0 &&
4130            desc == rgn.desc &&
4131            regOff == rgn.regOff &&
4132            subRegOff == rgn.subRegOff &&
4133            immAddrOff == rgn.immAddrOff &&
4134            type == rgn.type &&
4135            accRegSel == rgn.accRegSel;
4136 }
4137 
4138 // compute max execution size starting from the current pos.
4139 // power of two. cross-GRF boundary is allowed if the region is evenly split.
// cross half-GRF should guarantee evenly split
getMaxExecSize(int pos,uint8_t maxExSize,bool allowCrossGRF,uint16_t & vs,uint16_t & wd,bool & twoGRFsrc)4141 uint8_t G4_SrcRegRegion::getMaxExecSize(int pos, uint8_t maxExSize, bool allowCrossGRF, uint16_t &vs, uint16_t &wd, bool &twoGRFsrc)
4142 {
4143     if (isRightBoundSet() == false)
4144     {
4145         getInst()->computeRightBound(this);
4146     }
4147 
4148     twoGRFsrc = false;
4149     vs = 0;
4150     wd = 0;
4151     if (isScalar())
4152     {
4153         vs = 0;
4154         wd = 1;
4155         return maxExSize;
4156     }
4157     else if (acc != Direct)
4158     {
4159         // assume this operand is kosher (i.e., does not cross GRF) as the vISA spec requires it
4160         vs = desc->vertStride;
4161         wd = desc->width;
4162         return roundDownPow2(maxExSize);
4163     }
4164 
4165     // align16 operands
4166     if (desc->isRegionV())
4167     {
4168         vs = desc->vertStride;
4169         wd = desc->width;
4170         if (desc->horzStride == 0)
4171         {
4172             return roundDownPow2(maxExSize);
4173         }
4174 
4175         uint32_t elSize = getTypeSize();
4176         uint8_t maxSize = 0;
4177 
4178         uint32_t prevPos = pos * elSize;
4179         uint8_t numEleInFristGRF = 0, numEleInSecondGRF = 0;
4180         uint32_t newLB = getLeftBound() + prevPos;
4181         bool crossGRF = (newLB / numEltPerGRF<Type_UB>() != getRightBound() / numEltPerGRF<Type_UB>()),
4182             inFirstGRF = true;
4183 
4184         for (int i = pos + 4; i < (pos + maxExSize); i += 4)
4185         {
4186             uint32_t currPos = i * elSize;
4187 
4188             // check cross GRF boundary
4189             if (crossGRF && inFirstGRF)
4190             {
4191                 uint32_t newRB = getLeftBound() + currPos - 1;
4192                 uint32_t leftGRF = newLB / numEltPerGRF<Type_UB>(), rightGRF = newRB / numEltPerGRF<Type_UB>();
4193                 if (leftGRF != rightGRF)
4194                 {
4195                     inFirstGRF = false;
4196                     numEleInFristGRF = maxSize;
4197                     newLB = newRB;
4198                 }
4199             }
4200 
4201             maxSize += 4;
4202 
4203             if (numEleInFristGRF)
4204             {
4205                 numEleInSecondGRF += 4;
4206                 if (numEleInSecondGRF == numEleInFristGRF)
4207                 {
4208                     twoGRFsrc = true;
4209                     break;
4210                 }
4211             }
4212         }
4213         if (numEleInSecondGRF < numEleInFristGRF)
4214         {
4215             twoGRFsrc = false;
4216             maxSize = numEleInFristGRF;
4217         }
4218         return maxSize;
4219     }
4220 
4221     // align1 direct
4222     uint32_t elSize = TypeSize(type);
4223     uint8_t maxSize = 1;
4224 
4225     bool alignToRow = pos % desc->width == 0;
4226 
4227     // region may not be contiguous/single stride depending on the start position
4228     bool contRegion = desc->isContiguous(maxExSize + (pos % desc->width));
4229 
4230     uint16_t vStride = 1;
4231     if (contRegion || desc->isSingleNonUnitStride(maxExSize + (pos % desc->width), vStride))
4232     {
4233         // apparently the old code actually allows GRF-crossing as long as it's evenly divided
4234         // (the function comment lied), so we have to try all exec sizes from the largest possible. sigh..
4235         vs = vStride;
4236         wd = 1;
4237         // we need to be careful with start byte here since maxExSize may not be same as inst exec size
4238         // e.g., say this is called on
4239         // mov (16) V44_m(2,0)<1>:f V43_in(1,19)<16;8,1>:ub
4240         // with pos 8 and maxExSize 8
4241         // the region is considered single stride in this case, but is not with the original exsize (16),
4242         // so we can't just multiply stride with type size to get starting offset
4243         uint32_t startByte = (getLeftBound() + getByteOffset(pos)) % numEltPerGRF<Type_UB>();
4244         int retExecSize = 1;
4245         int execTypeSize = vStride * getElemSize();
4246         int exSizes[] = { 32, 16, 8, 4, 2 };
4247 
4248         for (auto size : exSizes)
4249         {
4250             if (maxExSize < size)
4251             {
4252                 continue;
4253             }
4254             if (startByte + (size - 1) * execTypeSize + getElemSize() <= numEltPerGRF<Type_UB>())
4255             {
4256                 // no GRF crossing (we don't count the padding bytes after the last element)
4257                 retExecSize = size;
4258                 break;
4259             }
4260             else if (allowCrossGRF)
4261             {
4262                 int numEltInFirstGRF = (numEltPerGRF<Type_UB>() - startByte) / execTypeSize;
4263                 // startByte may not be aligned to exec type size (e.g., r1.1<2;1,0>:b).  We need to increment by 1 in this case
4264                 if ((numEltPerGRF<Type_UB>() - startByte) % execTypeSize != 0)
4265                 {
4266                     numEltInFirstGRF += 1;
4267                 }
4268                 if (numEltInFirstGRF == size - numEltInFirstGRF)
4269                 {
4270                     twoGRFsrc = true;
4271                     retExecSize = size;
4272                     break;
4273                 }
4274             }
4275         }
4276 
4277         return (uint8_t)retExecSize;
4278     }
4279 
4280     // conservative.
4281     // Here we assume that no cross width if row size is larger than width
4282     // mul (16) V112(0,0)<1>:f V111(0,0)<16;16,1>:f r1.0<1;4,0>:f
4283     if (!alignToRow && !contRegion && desc->vertStride != 0 && desc->horzStride != 0)
4284     {
4285         wd = vs = (uint16_t)roundDownPow2((pos/desc->width + 1) * desc->width - pos);
4286 
4287         // Need to check whether this subregion crosses grf or not.
4288         // E.g. the second half does cross a grf:
4289         // mov (8) V41(0, 9)<1> V58(2, 8)<32;8,4>
4290         //
4291         // Given a linearized index, compute its byte offset relative to the
4292         // first element (index 0).
4293         auto computeOffset = [=](unsigned index) -> unsigned {
4294             unsigned typeSize = TypeSize(type);
4295             unsigned offset = (index % desc->width) * desc->horzStride * typeSize;
4296             offset += (index / desc->width) * desc->vertStride * typeSize;
4297             return offset;
4298         };
4299 
4300         // Since a single element cannot cross a grf, checking the first byte of the
4301         // first and last element is sufficient.
4302         // FIXME: fix other places with this logic.
4303         unsigned firstPos = getLeftBound() + computeOffset((unsigned)pos);
4304         unsigned lastPos = getLeftBound() + computeOffset((unsigned)(pos + wd - 1));
4305         twoGRFsrc = firstPos / numEltPerGRF<Type_UB>() != lastPos / numEltPerGRF<Type_UB>();
4306 
4307         return (uint8_t)wd;
4308     }
4309 
4310     uint8_t posInFirstRow = pos%desc->width, eleInRow = 1, eleInFirstRow = desc->width - posInFirstRow;
4311     uint8_t pow2 = roundDownPow2(eleInFirstRow);
4312 
4313     if (eleInFirstRow != pow2 && !contRegion)
4314     {
4315         wd = pow2;
4316         vs = wd * desc->horzStride;
4317         return pow2;
4318     }
4319 
4320     uint32_t prevPos = (pos/desc->width * desc->vertStride + posInFirstRow * desc->horzStride) * elSize;
4321     uint8_t numEleInFristGRF = 0, numEleInSecondGRF = 0;
4322     bool crossRow = false;
4323     uint32_t newLB = getLeftBound() + prevPos;
4324     bool crossGRF = (newLB / numEltPerGRF<Type_UB>() != getRightBound() / numEltPerGRF<Type_UB>()),
4325         inFirstGRF = true;
4326     bool negVS = (desc->vertStride < desc->horzStride * desc->width);
4327 
4328     for (int i = pos + 1; i < (pos + maxExSize); i++)
4329     {
4330         uint8_t posInRow = i % desc->width;
4331         uint32_t currPos = ((i / desc->width) * desc->vertStride + posInRow * desc->horzStride) * elSize;
4332 
4333         // check cross row boundary
4334         if ((!contRegion || desc->vertStride == 0) && posInRow == 0)
4335         {
4336             uint8_t pow2Val = roundDownPow2(eleInRow);
4337             if (pow2Val != eleInRow  ||
4338                 ((desc->vertStride == 0 || negVS) && !alignToRow))
4339             {
4340                 // this happens in the first row
4341                 wd = maxSize = pow2Val;
4342                 vs = wd * desc->horzStride;
4343                 break;
4344             }
4345             else if (wd == 0)
4346             {
4347                 // <2;4,1>
4348                 wd = eleInRow;
4349                 if (alignToRow)
4350                 {
4351                     vs= desc->vertStride;
4352                 }
4353                 else
4354                 {
4355                     vs = (currPos - prevPos) / elSize;
4356                 }
4357             }
4358             crossRow = true;
4359             eleInRow = 0;
4360         }
4361 
4362         // check cross GRF boundary
4363         if (crossGRF && inFirstGRF)
4364         {
4365             uint32_t newRB = getLeftBound() + currPos + elSize - 1;
4366             uint32_t leftGRF = newLB / numEltPerGRF<Type_UB>(), rightGRF = newRB / numEltPerGRF<Type_UB>();
4367             if (leftGRF != rightGRF)
4368             {
4369                 inFirstGRF = false;
4370                 uint8_t pow2Val = roundDownPow2(maxSize);
4371 
4372                 // if number of element in first GRF is not power of 2, or
4373                 // subregister offset of two GRFs are different and not contiguous(too conservative?)
4374                 if (pow2Val != maxSize ||
4375                     (!contRegion && !(alignToRow && maxSize <= desc->width) && newLB % numEltPerGRF<Type_UB>() != (getLeftBound() + currPos) % numEltPerGRF<Type_UB>()))
4376                 {
4377                     maxSize = pow2Val;
4378                     if (wd == 0)
4379                     {
4380                         wd = pow2Val;
4381                         vs = wd * desc->horzStride;
4382                     }
4383                     break;
4384                 }
4385                 else if (wd == 0)
4386                 {
4387                     wd = maxSize < desc->width ? maxSize : desc->width;
4388                     vs = (currPos - prevPos) / elSize;
4389                 }
4390                 numEleInFristGRF = maxSize;
4391                 newLB = newRB;
4392             }
4393         }
4394 
4395         maxSize++;
4396         eleInRow++;
4397         // make sure the number of elements in two rows are the same
4398         if (crossRow && eleInRow == eleInFirstRow && !alignToRow && !contRegion)
4399         {
4400             break;
4401         }
4402 
4403         if (numEleInFristGRF)
4404         {
4405             numEleInSecondGRF++;
4406             if (numEleInSecondGRF == numEleInFristGRF)
4407             {
4408                 twoGRFsrc = true;
4409                 break;
4410             }
4411         }
4412     }
4413     if (wd == 0)
4414     {
4415         // contiguous region
4416         wd = pow2;
4417         vs = wd * desc->horzStride;
4418     }
4419     if (numEleInSecondGRF < numEleInFristGRF)
4420     {
4421         maxSize = numEleInFristGRF;
4422     }
4423     return maxSize;
4424 }
4425 
4426 //
4427 // output (Var+refOff).subRegOff
4428 //
printRegVarOff(std::ostream & output,G4_Operand * opnd,short regOff,short subRegOff,short immAddrOff,G4_Type type,bool symbolreg,bool printSubReg)4429 void printRegVarOff(std::ostream&  output,
4430                     G4_Operand*    opnd,
4431                     short          regOff,  // base+regOff is the starting register
4432                     short          subRegOff, // sub reg offset
4433                     short          immAddrOff, // imm addr offset
4434                     G4_Type        type,
4435                     bool symbolreg,
4436                     bool printSubReg)
4437 //
4438 // symbolreg == false,  output physcial register operand
4439 // symbolreg == true,   output symbolic register operand
4440 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
4441 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
4442 // between these two states, that may have potential side effects.
4443 //
4444 {
4445     short subRegOffset = (subRegOff != (short) UNDEFINED_SHORT) ? subRegOff : 0;
4446 
4447     G4_RegAccess acc = opnd->getRegAccess();
4448     G4_VarBase* base = opnd->getBase();
4449     if (acc == Direct)
4450     {
4451         MUST_BE_TRUE(regOff != (short) UNDEFINED_SHORT,
4452             ERROR_INTERNAL_ARGUMENT);
4453 
4454         if (base->isRegVar())
4455         {
4456             G4_RegVar* baseVar = static_cast<G4_RegVar*>(base);
4457             int declOpSize = baseVar->getDeclare()->getElemSize();
4458             uint16_t thisOpSize = TypeSize(type);
4459 
4460             if (baseVar->isPhyRegAssigned())
4461             {
4462 
4463                 if (symbolreg && !base->isFlag())
4464                 {
4465                     //
4466                     // No matter the type of register and if the allocation successed, we output format <symbol>(RegOff, SubRegOff)
4467                     // Note: we have check if the register allocation  successed when emit the declare!
4468                     //
4469                     output << base->asRegVar()->getName() << "(" << regOff << "," << subRegOff << ")";
4470                     return;
4471                 }
4472 
4473                 if (baseVar->getPhyReg()->isGreg())
4474                 {
4475                     int regNum = 0, subRegNum = 0;
4476                     uint32_t byteAddress = opnd->getLinearizedStart();
4477 
4478                     if (baseVar->getDeclare()->getGRFBaseOffset() == 0)
4479                     {
4480                         // This is before RA and getLineariedStart() only contains the left bound
4481                         // we have to add the declare's phyreg
4482                         byteAddress += baseVar->getPhyReg()->asGreg()->getRegNum() * getGRFSize() + baseVar->getPhyRegOff() * TypeSize(type);
4483                     }
4484 
4485                     regNum = byteAddress / getGRFSize();
4486                     subRegNum = (byteAddress % getGRFSize()) / TypeSize(type);
4487 
4488 
4489                      output << "r" << regNum;
4490                      if (printSubReg)
4491                      {
4492                          output << "." << subRegNum;
4493                      }
4494                 }
4495                 else if (baseVar->getPhyReg()->isAreg())
4496                 {
4497                     (static_cast<G4_Areg*>(baseVar->getPhyReg()))->emit(output);
4498                     if (!baseVar->isNullReg())
4499                     {
4500                         unsigned ArfSubRegNum = baseVar->getPhyRegOff();
4501 
4502                         //ArfSubRegNum is in unit of declOpSize
4503                         //transform ArfSubRegNum to unit of thisOpSize
4504                         if (thisOpSize != declOpSize)
4505                         {
4506                             if (!opnd->getInst()->isPseudoKill())
4507                             {
4508                                 MUST_BE_TRUE((ArfSubRegNum * declOpSize) % thisOpSize == 0,
4509                                     ERROR_DATA_RANGE("ARF sub-register number"));
4510                             }
4511                             ArfSubRegNum = (ArfSubRegNum * declOpSize) / thisOpSize;
4512                         }
4513 
4514                         unsigned subreg = ArfSubRegNum + subRegOffset;
4515                         output << '.' << subreg;
4516                     }
4517                 }
4518                 else
4519                     MUST_BE_TRUE(false, ERROR_UNKNOWN);
4520             }
4521             else        // physical register not allocated
4522             {
4523                 baseVar->emit(output);
4524                 output << '(' << regOff << ',' << subRegOff << ')';
4525             }
4526         }
4527         else //This is not a RegVar
4528         {
4529             if (base->isAccReg() && regOff != 0)
4530             {
4531                 bool valid;
4532                 int regNum = base->ExRegNum(valid);
4533                 output << "acc" << regNum + regOff;
4534             }
4535             else
4536             {
4537                 base->emit(output);
4538             }
4539             if (!base->isNullReg() && !base->isIpReg() && !base->isNReg() && subRegOff != (short) UNDEFINED_SHORT && printSubReg)
4540             {
4541                 output << '.' << subRegOff;
4542             }
4543         }
4544     }
4545     else //This is an indirect access
4546     {
4547         if (acc == IndirGRF)
4548         {
4549             output << "r[";
4550         }
4551         else //Unknown access type
4552         {
4553             MUST_BE_TRUE(false, ERROR_UNKNOWN);
4554         }
4555 
4556         if (base->isRegVar())
4557         {
4558             MUST_BE_TRUE(regOff == 0, ERROR_INTERNAL_ARGUMENT);
4559             G4_RegVar* baseVar = static_cast<G4_RegVar*>(base);
4560             if (baseVar->isPhyRegAssigned())
4561             {
4562                 MUST_BE_TRUE(baseVar->getPhyReg()->isAreg(), ERROR_UNKNOWN);
4563 
4564                 if (symbolreg)
4565                 {
4566                     output << baseVar->getName();
4567                     output << '(' << regOff << ',' << subRegOffset << ")," << immAddrOff << ']';
4568                 }
4569                 else
4570                 {
4571                     (static_cast<G4_Areg*>(baseVar->getPhyReg()))->emit(output);
4572                     output << '.' << (baseVar->getPhyRegOff() + subRegOffset);
4573                     {
4574                         output << ", " << immAddrOff << ']';
4575                     }
4576                 }
4577             }
4578             else //No register assigned yet
4579             {
4580                 baseVar->emit(output);
4581                 output << '(' << regOff << ',' << subRegOff << ')';
4582                 output << ", " << immAddrOff << ']';
4583             }
4584         }
4585         else if (base->isAreg())
4586         {
4587             (static_cast<G4_Areg*>(base))->emit(output);
4588             output << '.' << subRegOffset;
4589             {
4590                 output << ", " << immAddrOff << ']';
4591             }
4592         }
4593         else
4594         {
4595             MUST_BE_TRUE(false, "Unknown base variable type for indirect access");
4596         }
4597     }
4598 }
4599 
4600 //
4601 // output <modifier>(Var+refOff).subRegOff<16;16,1>.xxyy
4602 //
4603 // symbolreg == false,  output <modifier>(Var+refOff).subRegOff<16;16,1>.xxyy
4604 // symbolreg == true,   output <modifier><symbol>(RegOff, SubRegOff)<16;16,1> in symbolic register emit
4605 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
4606 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
4607 // between these two states, that may have potential side effects.
4608 //
// Print this source operand to 'output' in the form
//   <modifier>Var(regOff,subRegOff)<region><swizzle|acc>:type
// (symbolic-register form when symbolreg is true; see the comment above).
void G4_SrcRegRegion::emit(std::ostream& output, bool symbolreg)
{
    // Source modifier first: "-", "(abs)", "-(abs)"; Mod_Not also prints "-".
    if (mod != Mod_src_undef)
    {
        output << SrcModifierStr[mod];
    }

    //
    // output Var(refOff,subRegOff)
    //
    emitRegVarOff(output, symbolreg);
    //
    // output <vertStride;width,horzStride>
    //
    // do not emit region for null reg
    // do not emit region for macro madm
    if (desc && !base->isNullReg() && !base->isNReg() && !isAccRegValid())// rgn == NULL, the default region is used
    {
        // Align1 ternary (3-src, ICLLP+) sources print the two-field <V;H> form.
        bool align1ternary = inst && inst->getNumSrc() == 3 && inst->getPlatform() >= GENX_ICLLP &&
            !inst->isSend() && inst->isAligned1Inst();

        // RegionV is invalid for SRC operands
        if (desc->isRegionWH())
        {
            output << "<" << desc->width << "," << desc->horzStride << ">";
        }
        else if (desc->isRegionSW()) // support <0/4> for Src of Align16 instruction
        {
            output << "<" << desc->vertStride << ">";
        }
        else if (desc->vertStride == UNDEFINED_SHORT && desc->width == UNDEFINED_SHORT)
        {
            // Only the horizontal stride is defined; print the one-field form.
            output << "<" << desc->horzStride << ">";
        }
        else
        {
            if (align1ternary)
            {
                // format is <V;H> with W derived from V and H
                output << "<" << desc->vertStride << ";" << desc->horzStride << ">";
            }
            else if (!isWithSwizzle())
            {
                // do not print region for align16 sources
                output << "<" << desc->vertStride << ";" << desc->width << "," << desc->horzStride << ">";
            }
        }
    }

    if (isAccRegValid())
    {
        // no vertical stride for 3-source instruction
        if (inst->getNumSrc() != 3 && desc)
        {
            output << "<" << desc->vertStride << ">";
        }

        // output acc2~acc9
        if (getAccRegSel() == NOACC)
        {
            output << ".noacc";
        }
        else
        {
            output <<".acc"<< (getAccRegSel()+2);
        }
    }
    else if (*swizzle)
    {
        // Align16 swizzle suffix, e.g. ".xyzw" or ".r".
        output << "." << swizzle;
    }

    if (Type_UNDEF != type)
    {
        if (!symbolreg || acc != Direct)                // can output register data type for indirect addressing in any time
            output << ':' << TypeSymbol(type);
    }
}
4687 
//
// Return true if this src is a scalar, e.g.:
//   V82(1,0)<0>.xxxx:f   --- detected via the replicate swizzle
//   V82(1,0)<0;1,0>:f    --- detected via the region
//

isScalar() const4698 bool G4_SrcRegRegion::isScalar() const
4699 
4700 {
4701 
4702     if (!isWithSwizzle())
4703     {
4704 
4705         return getRegion()->isScalar(); // check <0;1,0>
4706     }
4707     else
4708     {
4709         return swizzle[0] == 'r';
4710     }
4711 
4712 }
4713 
4714 
4715 //
4716 // This function is used to check if the src operand obey the rule of symbolic register. We need this function to check the operand before we emit an instruction
4717 //
obeySymbolRegRule() const4718 bool G4_SrcRegRegion::obeySymbolRegRule() const
4719 {
4720     if (!base->isRegVar())          // only for reg var
4721         return false;
4722 
4723     if (base->asRegVar()->getDeclare()->isSpilled())
4724     {
4725         return false;
4726     }
4727 
4728     //
4729     // Rule-3: No swizzle .xyzw
4730     //
4731     if (*swizzle)
4732     {
4733         return false;
4734     }
4735     //
4736     // Rule-4: do not support date type redefinition in direct addressing
4737     //
4738     if (Type_UNDEF != type)
4739     {
4740          if (base->isRegVar() && acc == Direct && base->asRegVar()->getDeclare()->getElemType() != type)        // check if the data type is the same as in declare
4741          {
4742              return false;
4743          }
4744     }
4745 
4746     return true;
4747 }
4748 
4749 //
4750 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
4751 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
4752 // between these two states, that may have potential side effects.
4753 //
emitRegVarOff(std::ostream & output,bool symbolreg)4754 void G4_SrcRegRegion::emitRegVarOff(std::ostream& output, bool symbolreg)
4755 {
4756     bool printSubReg = true;
4757     if (inst && inst->isSend())
4758     {
4759         printSubReg = false;
4760     }
4761     printRegVarOff(output, this, regOff,subRegOff,immAddrOff,type, symbolreg, printSubReg);
4762 }
4763 
//
// Initialize all values identical to rgn's.
//
G4_DstRegRegion(G4_DstRegRegion & rgn)4767 G4_DstRegRegion::G4_DstRegRegion(G4_DstRegRegion &rgn)
4768     : G4_Operand(G4_Operand::dstRegRegion)
4769 {
4770     acc = rgn.acc;
4771     base = rgn.base;
4772     regOff = rgn.regOff;
4773     subRegOff = rgn.subRegOff;
4774     immAddrOff = rgn.immAddrOff;
4775     horzStride = rgn.horzStride;
4776     type = rgn.type;
4777     writeMask = rgn.writeMask;
4778     accRegSel = rgn.accRegSel;
4779 
4780     top_dcl = rgn.top_dcl;
4781     left_bound = rgn.left_bound;
4782     right_bound = rgn.right_bound;
4783     bitVec[0] = rgn.bitVec[0];
4784     bitVec[1] = rgn.bitVec[1];
4785     byteOffset = rgn.byteOffset;
4786     rightBoundSet = rgn.rightBoundSet;
4787 }
4788 
// Compute this destination's left bound (offset of its first element) and
// the cached byteOffset. Flag bounds are in BITs; everything else in bytes.
// Also resolves top_dcl through the declare alias chain.
void G4_DstRegRegion::computeLeftBound()
{
    top_dcl = NULL;
    uint32_t newregoff = regOff, offset = 0;
    if (base && base->isRegVar())
    {
        top_dcl = base->asRegVar()->getDeclare();
        // A GRF RegVar without a declare contributes its physical register
        // number as the row offset instead.
        if (!top_dcl && base->asRegVar()->isGreg())
        {
            newregoff = base->asRegVar()->asGreg()->getRegNum();
        }
    }

    // Walk the alias chain to the root declare, accumulating alias offsets.
    if (top_dcl)
    {
        while (top_dcl->getAliasDeclare())
        {
            offset += top_dcl->getAliasOffset();
            top_dcl = top_dcl->getAliasDeclare();
        }
    }

    if (base && base->isFlag())
    {
        // Flag registers: 16 bits per sub-register, 32 bits per flag register.
        if (base->isRegVar())
        {
            if (base->asRegVar()->getPhyReg())
            {
                left_bound = base->asRegVar()->getPhyRegOff() * 16;   // the bound of flag register is in unit of BIT
                left_bound += subRegOff * 16;
                left_bound += base->asRegVar()->getPhyReg()->asAreg()->getFlagNum() * 32;
            }
            else
            {
                left_bound = subRegOff * 16;
            }
        }
        else
        {
            left_bound = subRegOff * 16;
            left_bound += base->asAreg()->getFlagNum() * 32;
        }

        byteOffset = left_bound / 8;
    }
    else if (base != NULL && base->isAccReg())
    {
        // Accumulators: acc1 (or row offset 1) lives one GRF above acc0.
        left_bound = subRegOff * TypeSize(type);
        if (base->asAreg()->getArchRegType() == AREG_ACC1 || regOff == 1)
        {
            left_bound += getGRFSize();
        }
        byteOffset = left_bound;
    } else if (top_dcl) {
        if (acc == Direct) {
            left_bound = offset + newregoff * numEltPerGRF<Type_UB>() + subRegOff * TypeSize(type);
            if (top_dcl->getTotalElems() * top_dcl->getElemSize() >= (int)numEltPerGRF<Type_UB>()) {
                byteOffset = left_bound;
            }
            else {
                // Sub-GRF declare: fold the sub-register alignment into the
                // byte offset (at least word alignment).
                unsigned alignOff = TypeSize(type) > TypeSize(Type_W) ?
                    TypeSize(type) : TypeSize(Type_W);

                if (top_dcl->getSubRegAlign() == Even_Word || top_dcl->getSubRegAlign() >= Four_Word) {
                    alignOff = top_dcl->getSubRegAlign() * 2;
                }

                byteOffset = left_bound + alignOff;
            }
        }
        else {
            // Indirect: the bound tracks the address register element.
            left_bound = subRegOff * TypeSize(ADDR_REG_TYPE);
            byteOffset = TypeSize(type);
        }
    } else { // arch reg
        left_bound = 0;
        byteOffset = left_bound;
    }
}
4868 //
// Initialize all values identical to rgn's, except for the base operand.
4870 // Caller is responsible for allocating base operand and making sure it doesn't
4871 // mess up the operands' hash table.
4872 //
G4_DstRegRegion(G4_DstRegRegion & rgn,G4_VarBase * new_base)4873 G4_DstRegRegion::G4_DstRegRegion(G4_DstRegRegion &rgn, G4_VarBase *new_base)
4874     : G4_Operand(G4_Operand::dstRegRegion)
4875 {
4876     acc = rgn.acc;
4877     regOff = rgn.regOff;
4878     subRegOff = rgn.subRegOff;
4879     immAddrOff = rgn.immAddrOff;
4880     horzStride = rgn.horzStride;
4881     type = rgn.type;
4882     writeMask = rgn.writeMask;
4883     base = new_base;
4884 
4885     computeLeftBound();
4886     rightBoundSet = false;
4887 }
4888 
setDstBitVec(uint8_t exec_size)4889 void G4_DstRegRegion::setDstBitVec(uint8_t exec_size)
4890 {
4891     // byte level footprint computing bit vectors.
4892     uint64_t footprint0 = 0;
4893     uint64_t footprint1 = 0;
4894 
4895     unsigned short type_size = getTypeSize();
4896     unsigned short s_size = horzStride * type_size;
4897 
4898     // General cases.
4899     uint64_t bit_seq = TypeFootprint(type);
4900     for (uint8_t i = 0; i < exec_size; ++i)
4901     {
4902         int eltOffset = i * s_size;
4903         // no element can cross 64-byte boundary
4904         if (eltOffset >= 64)
4905         {
4906             footprint1 |= bit_seq << (eltOffset - 64);
4907         }
4908         else
4909         {
4910             footprint0 |= bit_seq << eltOffset;
4911         }
4912     }
4913 
4914     bitVec[0] = footprint0;
4915     bitVec[1] = footprint1;
4916 
4917     return;
4918 }
4919 
// Compute, cache (rightBoundSet), and return this destination's right bound
// for the given execution size; also fills the footprint bit vectors.
// Flag destinations are measured in bits, everything else in bytes.
unsigned G4_DstRegRegion::computeRightBound(uint8_t exec_size)
{
    bitVec[0] = 0;
    bitVec[1] = 0;

    if (base->isFlag()) {
        unsigned int totalBits = 0;
        if (G4_Inst_Table[inst->opcode()].instType != InstTypePseudoLogic)
        {
            // mov (1) f0.1<1>:uw ...
            // subreg is 1 if it's a 32 bit flag and we want to set the upper 16 bits
            left_bound = subRegOff * 16;
            totalBits = TypeBitSize(type);
        }
        else
        {
            /*
                we need to set leftBound for pseudo instruction
                so that it creates use/def links correctly in the control flow graph between
                cmp instruction and pseudo instruction.
                This matters when we break up SIMD32 instruction in to two SIMD16 with H1/H2 masks.
                The bound for compare for H2 will be [15,31], and this has to match.
                Without this no use/def link was created which caused issues in logic optimization.
                Also it produces incorrect behavior in any operation that relies on compareOperand.
            */
            left_bound = inst->getMaskOffset();
            totalBits = exec_size;
        }

        right_bound = left_bound + totalBits - 1;

        // All bits in [0, totalBits) are written; 32-bit case avoids UB from
        // a full-width shift.
        bitVec[0] = totalBits == 32 ? 0xFFFFFFFF : (1 << totalBits) - 1;
    }
    else
    {
        // For call, the return addr is always set as if simd2.
        if (inst->isCall() || inst->isFCall())
        {
            exec_size = 2;
        }

        if (acc == Direct)
        {
            setDstBitVec(exec_size);

            unsigned short type_size = TypeSize(type);
            unsigned short s_size = horzStride * type_size;
            unsigned totalBytes = (exec_size - 1) * s_size + type_size;

            // For wide dst instructions like madw opcode, the dst(SOA layout) size should be the sum of low result size and high
            // result size, and also both low and high results are GRF-aligned.
            if (INST_WIDE_DST(inst->opcode()))
            {
                unsigned totalBytesDstLow = (totalBytes + getGRFSize() - 1) & (~(getGRFSize() - 1)); // GRF-aligned
                totalBytes = totalBytesDstLow * 2;
            }

            right_bound = left_bound + totalBytes - 1;
        }
        else
        {
            // Indirect destination: the footprint is the 2-byte address
            // register element, not the pointed-to region.
            bitVec[0] |= 0x3;
            right_bound = left_bound + TypeSize(ADDR_REG_TYPE) - 1;
        }
    }
    rightBoundSet = true;
    return right_bound;
}
4989 
4990 /// compare regRegion to opnd
4991 /// regRegion is either a SrcRegRegion or DstRegRegion, opnd can be any G4_operand
4992 /// We put this in a separate function since G4_DstRegRegion and G4_SrcRegRegion
4993 /// should have (nearly) identical code for compareOperand
compareRegRegionToOperand(G4_Operand * regRegion,G4_Operand * opnd)4994 static G4_CmpRelation compareRegRegionToOperand(G4_Operand* regRegion, G4_Operand* opnd)
4995 {
4996     assert((regRegion->isSrcRegRegion() || regRegion->isDstRegRegion()) && "expect either src or dst regRegion");
4997     bool legal_opnd = opnd->isSrcRegRegion() || opnd->isDstRegRegion() || opnd->isPredicate() || opnd->isCondMod() || opnd->isAddrExp();
4998     G4_VarBase* myBase = regRegion->getBase();
4999     G4_VarBase *opndBase = opnd->getBase();
5000     G4_RegAccess myAcc = regRegion->getRegAccess();
5001     G4_RegAccess opndAcc = opnd->getRegAccess();
5002     G4_Declare* myDcl = regRegion->getTopDcl();
5003     G4_Declare* opndDcl = opnd->getTopDcl();
5004     if (opnd->isAddrExp())
5005     {
5006         opndBase = opnd->asAddrExp()->getRegVar()->getBaseRegVar();
5007         opndDcl = opnd->asAddrExp()->getRegVar()->getDeclare();
5008     }
5009 
5010     if (regRegion->isAddrExp())
5011     {
5012         myBase = opnd->asAddrExp()->getRegVar()->getBaseRegVar();
5013         myDcl = opnd->asAddrExp()->getRegVar()->getDeclare();
5014     }
5015 
5016     if (!legal_opnd || myBase == nullptr || opndBase == nullptr)
5017     {
5018         // a null base operand can never interfere with anything
5019         return Rel_disjoint;
5020     }
5021 
5022     if (myDcl == opndDcl && opndDcl != nullptr)
5023     {
5024         // special checks for pseudo kills
5025         G4_INST* myInst = regRegion->getInst();
5026         G4_INST* opndInst = opnd->getInst();
5027         if (myInst && (myInst->isPseudoKill() || myInst->isLifeTimeEnd()))
5028         {
5029             return Rel_interfere;
5030         }
5031 
5032         if (opndInst && (opndInst->isPseudoKill() || opndInst->isLifeTimeEnd()))
5033         {
5034             return Rel_interfere;
5035         }
5036 
5037         if (opnd->isAddrExp() || regRegion->isAddrExp())
5038         {
5039             return Rel_interfere;
5040         }
5041     }
5042 
5043     if (opndAcc == myAcc && myAcc != Direct)
5044     {
5045         // two indirect are assumed to interfere in the absence of pointer analysis
5046         return Rel_interfere;
5047     }
5048     else if (opndAcc != myAcc)
5049     {
5050         // direct v. indirect
5051         // the two may inteferce if the direct operand is either an address-taken GRF or an address operand
5052         // we could make the check tighter by considering the offsets of the address operand,
5053         // but it won't much difference in practice
5054         auto mayInterfereWithIndirect = [](G4_Operand* direct, G4_Operand* indirect)
5055         {
5056             assert((direct->getRegAccess() == Direct && indirect->getRegAccess() == IndirGRF) &&
5057                 "first opereand should be direct and second indirect");
5058             return (direct->getTopDcl() && direct->getTopDcl()->getAddressed()) ||
5059                 (direct->isAddress() && direct->getTopDcl() == indirect->getTopDcl());
5060         };
5061 
5062         if ((opndAcc != Direct && mayInterfereWithIndirect(regRegion, opnd)) ||
5063             (myAcc != Direct && mayInterfereWithIndirect(opnd, regRegion)))
5064         {
5065             return Rel_interfere;
5066         }
5067         return Rel_disjoint;
5068     }
5069 
5070     // both are physically assigned.
5071     G4_VarBase *myPhyReg = myBase->isRegVar() ? myBase->asRegVar()->getPhyReg() : myBase;
5072     G4_VarBase *opndPhyReg = opndBase->isRegVar() ? opndBase->asRegVar()->getPhyReg() : opndBase;
5073     if (myPhyReg && opndPhyReg)
5074     {
5075         assert(myPhyReg->isPhyReg() && opndPhyReg->isPhyReg());
5076         if (myPhyReg->getKind() != opndPhyReg->getKind())
5077             return Rel_disjoint;
5078 
5079         if (myPhyReg->isPhyAreg())
5080         {
5081             if (myPhyReg->asAreg()->getArchRegType() == AREG_NULL)
5082             {
5083                 //like NaN, a null ARF is disjoint to everyone including itself
5084                 return Rel_disjoint;
5085             }
5086 
5087             // TODO: this is not accurate for flag/acc/address.
5088             return (myPhyReg->asAreg()->getArchRegType() ==
5089                   opndPhyReg->asAreg()->getArchRegType()) ? Rel_eq : Rel_disjoint;
5090         }
5091 
5092         // TODO: handle physically assigned GRF reg. Right now this should
5093         // not happen prior to RA.
5094     }
5095 
5096     if (myBase->getKind() != opndBase->getKind())
5097     {
5098         return Rel_disjoint;
5099     }
5100 
5101     if (myDcl != opndDcl)
5102     {
5103         return Rel_disjoint;
5104     }
5105 
5106     unsigned int left_bound2 = opnd->getLeftBound(), right_bound2 = opnd->getRightBound();
5107     uint32_t myLeftBound = regRegion->getLeftBound();
5108     uint32_t myRightBound = regRegion->getRightBound();
5109 
5110     {
5111         uint64_t opndBitVecL = opnd->getBitVecL(), opndBitVecH = opnd->getBitVecH();
5112         uint64_t myBitVecL = regRegion->getBitVecL(), myBitVecH = regRegion->getBitVecH();
5113         if (myRightBound < left_bound2 || right_bound2 < myLeftBound)
5114         {
5115             return Rel_disjoint;
5116         }
5117         else if (myLeftBound == left_bound2 &&
5118             myRightBound == right_bound2 &&
5119             myBitVecL == opndBitVecL && myBitVecH == opndBitVecH)
5120         {
5121             return Rel_eq;
5122         }
5123         else
5124         {
5125             // First consider if any operand is > two GRFs. If so we just compare the bound
5126             // as such operands are assumed to touch every element within the bound.
5127             bool meExceedTwoGRF = (myRightBound - myLeftBound) > 2u * getGRFSize();
5128             bool opndExceedTwoGRF = (right_bound2 - left_bound2) > 2u * getGRFSize();
5129             if (meExceedTwoGRF || opndExceedTwoGRF)
5130             {
5131                 if (left_bound2 >= myLeftBound && right_bound2 <= myRightBound)
5132                 {
5133                     return Rel_gt;
5134                 }
5135                 else if (myLeftBound >= left_bound2 && myRightBound <= right_bound2)
5136                 {
5137                     return Rel_lt;
5138                 }
5139                 return Rel_interfere;
5140             }
5141 
5142             // Now both operands are within two GRFs, compare their footprint to get precise relations
5143             int maskSize = 2 * getGRFSize();
5144             if (myDcl)
5145             {
5146                 maskSize = myDcl->getRegVar()->isFlag() ? myDcl->getNumberFlagElements()
5147                     : myDcl->getByteSize();
5148             }
5149             BitSet myBitSet(maskSize, false);
5150             BitSet otherBitSet(maskSize, false);
5151             regRegion->updateFootPrint(myBitSet, true);
5152             opnd->updateFootPrint(otherBitSet, true);
5153 
5154             BitSet tmp = myBitSet;
5155             myBitSet &= otherBitSet;
5156             if (myBitSet.isEmpty())
5157             {
5158                 return Rel_disjoint;
5159             }
5160 
5161             myBitSet = tmp;
5162             myBitSet -= otherBitSet;
5163             if (myBitSet.isEmpty())
5164             {
5165                 return Rel_lt;
5166             }
5167             otherBitSet -= tmp;
5168             return otherBitSet.isEmpty() ? Rel_gt : Rel_interfere;
5169         }
5170     }
5171 }
5172 
// Compare this destination region against opnd; delegates to the shared
// src/dst implementation above.
G4_CmpRelation G4_DstRegRegion::compareOperand(G4_Operand *opnd)
{
    return compareRegRegionToOperand(this, opnd);
}
5177 
isNativeType() const5178 bool G4_DstRegRegion::isNativeType() const
5179 {
5180     G4_Type type = getType();
5181 
5182     if (IS_WTYPE(type) || IS_DTYPE(type) || IS_FTYPE(type) || type == Type_DF) {
5183         return true;
5184     }
5185     else {
5186         return false;
5187     }
5188 }
5189 
isNativePackedRowRegion() const5190 bool G4_DstRegRegion::isNativePackedRowRegion() const
5191 {
5192     if (isNativeType()) {
5193         return horzStride  == 1;
5194     }
5195     else {
5196         return false;
5197     }
5198 }
5199 
// For destinations, a packed region is the same as a packed row region.
bool G4_DstRegRegion::isNativePackedRegion() const
{
    return isNativePackedRowRegion();
}
5204 
coverGRF(uint16_t numGRF,uint8_t execSize)5205 bool G4_DstRegRegion::coverGRF(uint16_t numGRF, uint8_t execSize)
5206 {
5207     uint32_t size = numEltPerGRF<Type_UB>() * numGRF;
5208     uint32_t range = getRightBound() - getLeftBound() + 1;
5209     if (acc == Direct)
5210     {
5211         if (range == size)
5212         {
5213             return true;
5214         }
5215         if (horzStride > 1)
5216         {
5217             if (size == execSize * horzStride * TypeSize(type))
5218             {
5219                 return true;
5220             }
5221         }
5222     }
5223     else
5224     {
5225         if (size == execSize * horzStride * TypeSize(type))
5226         {
5227             return true;
5228         }
5229     }
5230     return false;
5231 }
5232 
5233 // Check if dst satisfies the following conditions(for platforms before BDW):
5234 //The destination region is entirely contained in the lower OWord of a register.
5235 //The destination region is entirely contained in the upper OWord of a register.
5236 //The destination elements are evenly split between the two OWords of a register.
5237 
bool G4_DstRegRegion::goodOneGRFDst(uint8_t execSize)
{
    if (acc != Direct)
    {
        // Indirect: accept only if the strided footprint is exactly one GRF.
        return horzStride * TypeSize(type) * execSize == numEltPerGRF<Type_UB>();
    }
    // Midpoint of the region, counting the trailing stride gap.
    uint32_t halfSize = (getRightBound() - getLeftBound() + 1 + (horzStride - 1) * getTypeSize()) / 2;
    uint32_t middle = getLeftBound() + halfSize;
    // Accept when both bounds land in the same half-GRF (OWord), or when the
    // region spans two adjacent half-GRFs with the midpoint at the same
    // in-half offset as the start (evenly split).
    if (getLeftBound()/(numEltPerGRF<Type_UB>()/2) == getRightBound()/(numEltPerGRF<Type_UB>()/2) ||
        (getLeftBound()/(numEltPerGRF<Type_UB>()/2) == (getRightBound()/(numEltPerGRF<Type_UB>()/2) - 1) &&
        getLeftBound()%(numEltPerGRF<Type_UB>()/2) == middle%(numEltPerGRF<Type_UB>()/2)))
    {
        return true;
    }
    return false;
}
5254 
// A good two-GRF dst is one whose elements split evenly across the GRF
// boundary; see evenlySplitCrossGRF below.
bool G4_DstRegRegion::goodtwoGRFDst(uint8_t execSize)
{
    return evenlySplitCrossGRF(execSize);
}
5259 
5260 // this is true if dst crosses GRF and has same number of elements in both GRFs
5261 // (i.e, the middle element has same GRF offset as the start element)
evenlySplitCrossGRF(uint8_t execSize)5262 bool G4_DstRegRegion::evenlySplitCrossGRF(uint8_t execSize)
5263 {
5264     // check number of elements in first GRF.
5265     MUST_BE_TRUE(acc == Direct, "Indirect operand can not cross GRF boundary.");
5266 
5267     if (execSize == 1)
5268     {
5269         return false;
5270     }
5271 
5272     int halfBytes = left_bound + horzStride * TypeSize(type) * (execSize / 2);
5273     int halfOffset = halfBytes % numEltPerGRF<Type_UB>();
5274     int startOffset = left_bound % numEltPerGRF<Type_UB>();
5275     return halfOffset == startOffset;
5276 }
5277 
5278 /*
5279  * check if the input opnd is align to GRF
5280  * if the first level dcl is not aligned to GRF or sub register offset of this opnd is not multiple GRFs, including 0,
5281  * return true.
5282  */
/*
 * Check if this operand is GRF-aligned: the sub-register byte offset and the
 * accumulated alias offset must be multiples of the GRF size, and the root
 * declare must either be explicitly GRF-aligned, big enough, or already
 * assigned to a GRF-aligned physical address.
 */
bool G4_DstRegRegion::checkGRFAlign() const
{
    bool GRF_aligned = false;
    unsigned byte_subregoff = subRegOff * TypeSize(type);

    // The sub-register byte offset must itself be GRF-aligned.
    if (byte_subregoff  % numEltPerGRF<Type_UB>() != 0)
    {
        return false;
    }

    if (base)
    {
        if (base->isRegVar())
        {
            G4_Declare *dcl = base->asRegVar()->getDeclare();

            if (dcl)
            {
                G4_Declare *aliasdcl = dcl;

                // Resolve the alias chain; a non-GRF-aligned alias offset
                // disqualifies the operand.
                unsigned aliasOffset = 0;
                while (aliasdcl->getAliasDeclare())
                {
                    aliasOffset += aliasdcl->getAliasOffset();
                    aliasdcl = aliasdcl->getAliasDeclare();
                }
                if (aliasOffset % numEltPerGRF<Type_UB>() != 0)
                {
                    return false;
                }

                // Aligned if the root declare is explicitly GRF-aligned or
                // occupies at least one full GRF.
                // NOTE(review): the size expression multiplies getElemSize()
                // twice (rows * elemSize * elemSize); this looks like a typo
                // for the declare's byte size (e.g. rows * numElems *
                // elemSize) -- confirm against G4_Declare::getByteSize.
                if (aliasdcl->getSubRegAlign() >= GRFALIGN ||
                    aliasdcl->getNumRows() * aliasdcl->getElemSize() * aliasdcl->getElemSize() >= (int)numEltPerGRF<Type_UB>()) {
                        return true;
                }
            }
            else if (base->asRegVar()->isPhyRegAssigned() &&
                base->asRegVar()->getByteAddr() % numEltPerGRF<Type_UB>() == 0)
            {
                    return true;
            }
        }
    }

    return GRF_aligned;
}
5329 
5330 //
5331 // returns true if this operand (must be either Src or DstRegRegion) has a fixed subreg offset.
5332 // This is true only if
5333 // -- operand is direct,
5334 // -- operand has assigned GRF (i.e., input), or
5335 // -- base declare is a GRF variable that is GRF-aligned
5336 // if true, the subreg offset is also returned via offset in bytes
5337 // Note this always returns false for ARFs (flag, addr, etc.)
5338 //
regionHasFixedSubreg(G4_Operand * opnd,uint32_t & offset)5339 static bool regionHasFixedSubreg(G4_Operand* opnd, uint32_t& offset)
5340 {
5341     assert(opnd->isSrcRegRegion() || opnd->isDstRegRegion());
5342     short subRegOff = 0;
5343     if (opnd->isSrcRegRegion())
5344     {
5345         if (opnd->asSrcRegRegion()->getRegAccess() != Direct)
5346         {
5347             return false;
5348         }
5349         subRegOff = opnd->asSrcRegRegion()->getSubRegOff();
5350     }
5351     else if (opnd->isDstRegRegion())
5352     {
5353         if (opnd->asDstRegRegion()->getRegAccess() != Direct)
5354         {
5355             return false;
5356         }
5357         subRegOff = opnd->asDstRegRegion()->getSubRegOff();
5358     }
5359 
5360     G4_VarBase* base = opnd->getBase();
5361 
5362     if (base == NULL || !base->isRegVar() || !base->asRegVar()->getDeclare()->useGRF())
5363     {
5364         return false;
5365     }
5366 
5367     if (base->asRegVar()->isPhyRegAssigned())
5368     {
5369         offset = (subRegOff + base->asRegVar()->getPhyRegOff()) * TypeSize(opnd->getType());
5370         offset %= getGRFSize();
5371         return true;
5372     }
5373 
5374     uint32_t subregByte = 0;
5375     G4_Declare *rootDcl = base->asRegVar()->getDeclare()->getRootDeclare(subregByte);
5376     subregByte += subRegOff * TypeSize(opnd->getType());
5377 
5378     if (rootDcl->getSubRegAlign() < GRFALIGN)
5379     {
5380         return false;
5381     }
5382     offset = subregByte % numEltPerGRF<Type_UB>();
5383 
5384     return true;
5385 }
5386 
5387 
// See regionHasFixedSubreg above for the conditions; 'offset' receives the
// fixed sub-register byte offset when this returns true.
bool G4_DstRegRegion::hasFixedSubregOffset(uint32_t& offset)
{
    return regionHasFixedSubreg(this, offset);
}
5392 
5393 // compute max execution size starting from the current pos.
5394 // power of two. no cross GRF boundary is allowed now.
5395 // TODO: cross GRF is allowed in BDW+.
5396 // cross half-GRF should guaranttee evenly split
// Compute the maximum execution size (a power of two) for this dst when
// execution starts at element `pos`, limited by the requested `maxExSize`.
// `twoGRFsrc` is true when some source operand spans two GRFs; in that case
// the dst must also split cleanly at the half-GRF boundary.
uint8_t G4_DstRegRegion::getMaxExecSize(int pos, uint8_t maxExSize, bool twoGRFsrc)
{
    if (acc != Direct)
    {
        // Indirect dst: byte bounds are unknown, just round down to pow2.
        return roundDownPow2(maxExSize);
    }

    uint8_t elSize = (uint8_t)getTypeSize();
    // Bytes advanced per execution channel (stride * element size).
    uint8_t exTypeSize = horzStride * elSize;
    uint8_t maxSize = roundDownPow2(maxExSize);
    // Byte bounds of the region if the full maxExSize were executed.
    uint32_t newLB = getLeftBound() + pos * exTypeSize,
        newRB = newLB + (maxExSize - 1) * exTypeSize + elSize - 1;
    uint32_t leftGRF = newLB / numEltPerGRF<Type_UB>(), rightGRF = newRB / numEltPerGRF<Type_UB>();
    // pre-BDW does not allow cross GRF dst except full 2-GRF dst.
    // BDW+ allows if elements are evenly split between two GRFs
    bool crossGRF = false;
    if (isCrossGRFDst())
    {
        // check cross GRF boundary
        uint8_t byteInFirstGRF = ((leftGRF + 1) * numEltPerGRF<Type_UB>() - newLB);
        uint8_t eleInFirstGRF = byteInFirstGRF / exTypeSize +
            // v20(0,17)<2>:ub and simd size is 16
            ((byteInFirstGRF % exTypeSize != 0) && (byteInFirstGRF % exTypeSize >= elSize) ? 1 : 0);

        if (leftGRF != rightGRF)
        {
            uint8_t pow2 = roundDownPow2(eleInFirstGRF);
            if (pow2 != eleInFirstGRF)
            {
                // First GRF holds a non-pow2 element count: shrink to it.
                maxSize = pow2;
                newRB = newLB + (maxSize - 1) * exTypeSize + elSize - 1;
            }
            else
            {
                // number of elements in first GRF is power of 2 and HS is not used to cross GRF
                // search into second GRF
                // if number of elements in second GRF >= number of elements in first GRF
                uint8_t byteInSecondGRF = (newRB + 1) % numEltPerGRF<Type_UB>();
                uint8_t eleInSecondGRF = byteInSecondGRF / exTypeSize + (horzStride > 1 ? 1 : 0);
                if (eleInSecondGRF >= eleInFirstGRF)
                {
                    // Evenly split between the two GRFs: allow the crossing.
                    crossGRF = true;
                    maxSize = eleInFirstGRF * 2;
                }
            }
        }
    }
    // check if cross half-GRF boundary
    // FIXME: if we know that the new srcs are all in one GRF, we do not have to do the following check.
    if (!crossGRF && twoGRFsrc)
    {
        uint32_t halfGRFSize = numEltPerGRF<Type_UB>() / 2;
        if (newLB / halfGRFSize != newRB / halfGRFSize)
        {
            uint32_t middlePoint = (newRB + (horzStride - 1) * elSize - newLB + 1) / 2;
            // check middle point
            if ((middlePoint + newLB) % halfGRFSize != 0)
            {
                // check size before half-GRF
                uint8_t sizeBeforeMidGRF = ((leftGRF * numEltPerGRF<Type_UB>() + halfGRFSize) - newLB + exTypeSize - 1) / exTypeSize;
                uint8_t pow2Size = roundDownPow2(sizeBeforeMidGRF);
                // V36(0,1)<4>:ud is split into 2x2
                if (sizeBeforeMidGRF <= (maxSize >> 1) && pow2Size == sizeBeforeMidGRF)
                {
                    maxSize = 2 * pow2Size;
                }
                else
                {
                    maxSize = pow2Size;
                }
            }
        }
    }

    return maxSize;
}
5473 //
5474 // output (Var+refOff).subRegOff<1><WriteMask>
5475 //
5476 // symbolreg == false,  output <modifier>(Var+refOff).subRegOff<16;16,1>.xxyy
5477 // symbolreg == true,   output <modifier><symbol>(RegOff, SubRegOff)<16;16,1> in symbolic register emit
5478 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
5479 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
5480 // between these two states, that may have potential side effects.
5481 //
// Print this dst operand: the register/offset part, then the region
// (horizontal stride), then the align16 write mask or the madm acc
// selector, and finally the ":<type>" suffix. The else-if order below is
// significant: sends, align16, and madm dsts suppress the stride.
void G4_DstRegRegion::emit(std::ostream& output, bool symbolreg)
{
    //
    // output Var(refOff,subRegOff)
    //
    emitRegVarOff(output, symbolreg);

    //
    // output <horzStride>
    //
    if (inst != NULL && inst->isSplitSend())
    {
        // do nothing for sends
    }
    else if (writeMask != NoChannelEnable)
    {
        // do nothing for align16 instructions
    }
    else if (isAccRegValid())
    {
        // do nothing for madm
    }
    else if (horzStride != UNDEFINED_SHORT)
    {
        output << '<' << horzStride << '>';
    }
    else if (base->isAreg())
    {
        // architecture registers default to stride 1
        output << "<1>";
    }
    else if (base->isNullReg())
    {
        // do not emit region for null reg
    }
    else if (base->isFlag())
    {
        output << "<1>";
    }
    else
    {
        MUST_BE_TRUE(false, "No default region specified");
    }

    if (isAccRegValid())
    {
        // output acc2~acc9
        if (getAccRegSel() == NOACC)
        {
            output << ".noacc";
        }
        else
        {
            output <<".acc"<< (getAccRegSel()+2);
        }
    }
    else if (writeMask != NoChannelEnable)
    {
        // align16 write mask, e.g. ".xyzw"
        output << "." << getChannelEnableStr(writeMask);
    }

    if (Type_UNDEF != type)
    {
        if (!symbolreg || acc != Direct)                // can output register data type for indirect addressing in any time
            output << ':' << TypeSymbol(type);
    }
}
5548 
5549 //
5550 // This function is used to check if the src operand obey the rule of symbolic register. We need this function to check the operand before we emit an instruction
5551 //
// Return true if this dst operand satisfies the symbolic-register emission
// rules (reg-var based, not spilled, no align16 swizzle, and no data-type
// redefinition under direct addressing).
bool G4_DstRegRegion::obeySymbolRegRule() const
{
    if (!base->isRegVar())          // only for reg var
        return false;
    if (base->asRegVar()->getDeclare()->isSpilled())
    {
        return false;
    }
    //
    // For dst operand, we do not have Rule-2
    // Rule-2: must have register region or default register region
    //
    // Rule-3: No swizzle .xyzw
    //
    if (writeMask != NoChannelEnable)
    {
        return false;
    }
    //
    // Rule-4: do not support data type redefinition in direct addressing
    //
    if (Type_UNDEF != type)
    {
         if (base->isRegVar() && acc == Direct && base->asRegVar()->getDeclare()->getElemType() != type)        // check if the data type is the same as in declare
         {
             return false;
         }
    }

    return true;
}
5583 
5584 //
5585 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
5586 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
5587 // between these two states, that may have potential side effects.
5588 //
emitRegVarOff(std::ostream & output,bool symbolreg)5589 void G4_DstRegRegion::emitRegVarOff(std::ostream& output, bool symbolreg)
5590 {
5591     bool printSubReg = true;
5592     if (inst != NULL && inst->isSplitSend())
5593     {
5594         printSubReg = false;
5595     }
5596     printRegVarOff(output, this, regOff,subRegOff,immAddrOff,type, symbolreg, printSubReg);
5597 }
5598 
5599 //
5600 // return true if prd and this are the same inst predicate
5601 //
samePredicate(const G4_Predicate & prd) const5602 bool G4_Predicate::samePredicate(const G4_Predicate& prd) const
5603 {
5604     return getBase() == prd.getBase() &&
5605            state == prd.state &&
5606            subRegOff == prd.subRegOff &&
5607            control == prd.control;
5608 }
5609 //
5610 // return true if mod and this are the same condition modifier
5611 //
sameCondMod(const G4_CondMod & m) const5612 bool G4_CondMod::sameCondMod(const G4_CondMod& m) const
5613 {
5614     return getBase() == m.getBase() &&
5615            mod == m.mod &&
5616            subRegOff == m.subRegOff;
5617 }
5618 
5619 //
5620 // create all physical register operands
5621 //
PhyRegPool(Mem_Manager & m,unsigned int maxRegisterNumber)5622 PhyRegPool::PhyRegPool(Mem_Manager& m, unsigned int maxRegisterNumber)
5623 {
5624     maxGRFNum = maxRegisterNumber;
5625 
5626     GRF_Table = (G4_Greg**)m.alloc(sizeof(G4_Greg*) * maxGRFNum);
5627     // create General Registers
5628     for (unsigned int i = 0; i < maxGRFNum; i++)
5629         GRF_Table[i] = new (m) G4_Greg(i);
5630 
5631     for (unsigned i = 0; i < AREG_LAST; i++)
5632     {
5633         ARF_Table[i] = nullptr;
5634     }
5635 
5636     // create Architecture Registers
5637     ARF_Table[AREG_NULL]     = new (m) G4_Areg(AREG_NULL);
5638     ARF_Table[AREG_A0]       = new (m) G4_Areg(AREG_A0);
5639     ARF_Table[AREG_ACC0]     = new (m) G4_Areg(AREG_ACC0);
5640     ARF_Table[AREG_ACC1]     = new (m) G4_Areg(AREG_ACC1);
5641     ARF_Table[AREG_MASK0]    = new (m) G4_Areg(AREG_MASK0);
5642     ARF_Table[AREG_MS0]      = new (m) G4_Areg(AREG_MS0);
5643     ARF_Table[AREG_DBG]      = new (m) G4_Areg(AREG_DBG);
5644     ARF_Table[AREG_SR0]      = new (m) G4_Areg(AREG_SR0);
5645     ARF_Table[AREG_CR0]      = new (m) G4_Areg(AREG_CR0);
5646     ARF_Table[AREG_TM0]      = new (m) G4_Areg(AREG_TM0);
5647     ARF_Table[AREG_N0]       = new (m) G4_Areg(AREG_N0);
5648     ARF_Table[AREG_N1]       = new (m) G4_Areg(AREG_N1);
5649     ARF_Table[AREG_IP]       = new (m) G4_Areg(AREG_IP);
5650     ARF_Table[AREG_F0]       = new (m) G4_Areg(AREG_F0);
5651     ARF_Table[AREG_F1]       = new (m) G4_Areg(AREG_F1);
5652     ARF_Table[AREG_TDR0]     = new (m) G4_Areg(AREG_TDR0);
5653     ARF_Table[AREG_SP]       = new (m)G4_Areg(AREG_SP);
5654     ARF_Table[AREG_F2]       = new (m) G4_Areg(AREG_F2);
5655     ARF_Table[AREG_F3]       = new (m) G4_Areg(AREG_F3);
5656 }
5657 
rebuildRegPool(Mem_Manager & m,unsigned int numRegisters)5658 void PhyRegPool::rebuildRegPool(Mem_Manager& m, unsigned int numRegisters)
5659 {
5660     maxGRFNum = numRegisters;
5661 
5662     GRF_Table = (G4_Greg**)m.alloc(sizeof(G4_Greg*) * maxGRFNum);
5663     // create General Registers
5664     for (unsigned int i = 0; i < maxGRFNum; i++)
5665         GRF_Table[i] = new (m) G4_Greg(i);
5666 }
5667 
// Mark the underlying reg var as even-GRF aligned.
void G4_Declare::setEvenAlign()
{
    regVar->setEvenAlign();
}
5672 
// Set the sub-register alignment requirement on the underlying reg var.
void G4_Declare::setSubRegAlign(G4_SubReg_Align subAl)
{
    regVar->setSubRegAlignment(subAl);
}
5677 
// Query whether the underlying reg var requires even-GRF alignment.
bool G4_Declare::isEvenAlign() const
{
    return regVar->isEvenAlign();
}
5682 
// Query the sub-register alignment requirement of the underlying reg var.
G4_SubReg_Align G4_Declare::getSubRegAlign() const
{
    return regVar->getSubRegAlignment();
}
5687 
copyAlign(G4_Declare * dcl)5688 void G4_Declare::copyAlign(G4_Declare* dcl)
5689 {
5690     if (dcl->isEvenAlign())
5691     {
5692         setEvenAlign();
5693     }
5694     regVar->setSubRegAlignment(dcl->getSubRegAlign());
5695 }
5696 
// Print this declare as a "//.declare" comment line: name, register file,
// byte size, element type, alias info, sub-register alignment, then any
// physical register assignment or spill location, and liveness markers
// (Input / Output / Input_Output).
void G4_Declare::emit(std::ostream &output) const
{

    output << "//.declare " << name;
    output << " rf=";
    if (useGRF())
    {
        output << 'r';
    }
    else if (regFile == G4_ADDRESS)
    {
        output << 'a';
    }
    else if (regFile == G4_SCALAR)
    {
        output << 's';
    }
    else if (regFile == G4_FLAG)
    {
        output << 'f';
    }
    else
    {
        MUST_BE_TRUE(false, ERROR_UNKNOWN); //unhandled case
    }

    output << " size=" << getByteSize();
    if (Type_UNDEF != elemType)
    {
        output << " type=" << TypeSymbol(elemType);
    }
    if (AliasDCL)
    {
        output << " alias=" << AliasDCL->getName() << "+" << getAliasOffset();
    }
    output << " align=" << getSubRegAlign() << " words";
    if (regVar->isPhyRegAssigned())
    {
        // Assigned: print the physical location (GRF, address, or flag).
        G4_VarBase* phyreg = regVar->getPhyReg();
        if (phyreg->isGreg())
        {
            output << " (r" << phyreg->asGreg()->getRegNum() << "." << regVar->getPhyRegOff() << ")";
        }
        else if (phyreg->isAddress())
        {
            output << " (a0." << regVar->getPhyRegOff() << ")";
        }
        else if (phyreg->isFlag())
        {
            bool valid = false;
            output << " (f" << phyreg->asAreg()->ExRegNum(valid) << "." << regVar->getPhyRegOff() << ")";
        }
    }
    else if (isSpilled())
    {
        if (spillDCL)
        {
            // flag/addr spill
            output << " (spilled -> " << spillDCL->getName() << ")";
        }
        else
        {
            // GRF spill
            auto GRFOffset = getRegVar()->getDisp() / getGRFSize();
            if (!AliasDCL)
            {
                output << " (spilled -> Scratch[" << GRFOffset << "x" << (int)getGRFSize() << "])";
            }
            else
            {
                output << " (spilled)";
            }
        }
    }

    if (liveIn && liveOut)
    {
        output << " Input_Output";
    }
    else if (liveIn)
    {
        output << " Input";
    }
    else if (liveOut)
    {
        output << " Output";
    }

    output << "\n";
}
5787 
emit(std::ostream & output,bool symbolreg)5788 void G4_Predicate::emit(std::ostream& output, bool symbolreg)
5789 {
5790     output << "(";
5791     emit_body(output, symbolreg);
5792     output << ") ";
5793 }
5794 
emit_body(std::ostream & output,bool symbolreg)5795 void G4_Predicate::emit_body(std::ostream& output, bool symbolreg)
5796 {
5797     static const char* align16ControlNames[] =
5798     {
5799         "",
5800         "xyzw",
5801         "x",
5802         "y",
5803         "z",
5804         "w"
5805         "any4h",
5806         "all4h"
5807     };
5808 
5809     if (state == PredState_Minus)
5810     {
5811         output << '!';
5812     }
5813 
5814     if (getBase()->asRegVar()->isPhyRegAssigned())
5815     {
5816         getBase()->asRegVar()->getPhyReg()->emit(output);
5817         output << "." << getBase()->asRegVar()->getPhyRegOff();
5818     }
5819     else
5820     {
5821         getBase()->emit(output);
5822         if (subRegOff != UNDEFINED_SHORT)
5823         {
5824             output << '.' << subRegOff;
5825         }
5826     }
5827 
5828     if (align16Control != PRED_ALIGN16_DEFAULT)
5829     {
5830         output << "." << align16ControlNames[align16Control];
5831     }
5832     else
5833     {
5834         if (control != PRED_DEFAULT)
5835         {
5836             output << '.';
5837             switch (control)
5838             {
5839             case PRED_ANY2H:
5840                 output << "any2h";
5841                 break;
5842             case PRED_ANY4H:
5843                 output << "any4h";
5844                 break;
5845             case PRED_ANY8H:
5846                 output << "any8h";
5847                 break;
5848             case PRED_ANY16H:
5849                 output << "any16h";
5850                 break;
5851             case PRED_ANY32H:
5852                 output << "any32h";
5853                 break;
5854             case PRED_ALL2H:
5855                 output << "all2h";
5856                 break;
5857             case PRED_ALL4H:
5858                 output << "all4h";
5859                 break;
5860             case PRED_ALL8H:
5861                 output << "all8h";
5862                 break;
5863             case PRED_ALL16H:
5864                 output << "all16h";
5865                 break;
5866             case PRED_ALL32H:
5867                 output << "all32h";
5868                 break;
5869             case PRED_ANYV:
5870                 output << "anyv";
5871                 break;
5872             case PRED_ALLV:
5873                 output << "allv";
5874                 break;
5875             default:
5876                 // do nothing
5877                 break;
5878             }
5879         }
5880     }
5881 }
5882 
// Copy constructor: duplicates the predicate-specific state (polarity,
// subregister, controls) and the G4_Operand bookkeeping fields (top
// declare, bounds, footprint bit-vectors, byte offset).
G4_Predicate::G4_Predicate(G4_Predicate &prd)
    : G4_Operand(G4_Operand::predicate, prd.getBase())
{
    state = prd.state;
    subRegOff = prd.subRegOff;
    control = prd.control;
    align16Control = prd.align16Control;

    top_dcl = prd.top_dcl;
    left_bound = prd.left_bound;
    right_bound = prd.right_bound;
    bitVec[0] = prd.bitVec[0];
    bitVec[1] = prd.bitVec[1];
    byteOffset = prd.byteOffset;
    rightBoundSet = prd.rightBoundSet;
    isPredicateSameAsNoMask = prd.isPredicateSameAsNoMask;
}
5900 
// Compute (and cache) this predicate's right bound, in bits.
// The footprint spans max(exec_size, predicate-control group size) bits
// starting at the instruction's mask offset.
unsigned G4_Predicate::computeRightBound(uint8_t exec_size)
{
    rightBoundSet = true;
    bitVec[0] = 0;
    bitVec[1] = 0;

    uint16_t group_size = (uint16_t)getPredCtrlGroupSize();
    uint16_t totalBits = (exec_size > group_size) ? exec_size : group_size;

    if (inst)
        left_bound = inst->getMaskOffset();

    right_bound = left_bound + totalBits - 1;

    // exec_size == 32 is special-cased so we never shift by the full
    // 32-bit width of the literal.
    bitVec[0] = exec_size == 32 ? 0xFFFFFFFF : (1 << exec_size) - 1;

    return right_bound;
}
5919 
compareBound(uint32_t myLB,uint32_t myRB,uint32_t otherLB,uint32_t otherRB)5920 static G4_CmpRelation compareBound(uint32_t myLB, uint32_t myRB, uint32_t otherLB, uint32_t otherRB)
5921 {
5922     if (myLB == otherLB && myRB == otherRB)
5923     {
5924         return Rel_eq;
5925     }
5926     else if (myRB < otherLB || otherRB < myLB)
5927     {
5928         return Rel_disjoint;
5929     }
5930     else if (myLB <= otherLB && myRB >= otherRB)
5931     {
5932         return Rel_gt;
5933     }
5934     else if (myLB >= otherLB && myRB <= otherRB)
5935     {
5936         return Rel_lt;
5937     }
5938     else
5939     {
5940         return Rel_interfere;
5941     }
5942 }
5943 
5944 /// compare flag to opnd
5945 /// flag is either a G4_Predicate or G4_CondMod, opnd can be any G4_operand
5946 /// We put this in a separate function since G4_Predicate and G4_CondMod
5947 /// should have identical code for compareOperand
static G4_CmpRelation compareFlagToOperand(G4_Operand* flag, G4_Operand* opnd)
{
    assert((flag->isPredicate() || flag->isCondMod()) && "expect either predicate or conditional modifier");

    // Only register-region or flag-like operands can overlap a flag.
    bool legalOpnd = opnd->isSrcRegRegion() || opnd->isDstRegRegion() || opnd->isPredicate() || opnd->isCondMod();
    G4_VarBase* myBase = flag->getBase();
    G4_VarBase *opndBase = opnd->getBase();

    if (!legalOpnd || myBase == nullptr || opndBase == nullptr || !opndBase->isFlag())
    {
        return Rel_disjoint;
    }

    // flags with different base declare definitely do not interfere (we do not consider physical flags here)
    if (flag->getTopDcl() != opnd->getTopDcl())
    {
        return Rel_disjoint;
    }

    // Do we generate pseudo kill on flags?
    // Treat kill / lifetime-end pseudo instructions conservatively as
    // interference.
    G4_INST* opndInst = opnd->getInst();
    if (opndInst && (opndInst->isPseudoKill() || opndInst->isLifeTimeEnd()))
    {
        return Rel_interfere;
    }

    // Same declare: decide by comparing the bit-level bounds.
    return compareBound(flag->getLeftBound(), flag->getRightBound(), opnd->getLeftBound(), opnd->getRightBound());
}
5976 
// Predicates share the flag comparison logic; see compareFlagToOperand.
G4_CmpRelation G4_Predicate::compareOperand(G4_Operand *opnd)
{
    return compareFlagToOperand(this, opnd);
}
5981 
5982 // remove half of the bitvector and change right bound
// Shrink the predicate footprint and right bound for a split instruction.
// NOTE(review): the header comment above says "half", but shiftLen is
// range >> 2 (a quarter of the bit range); confirm which is intended
// before changing either the comment or the shift amount.
void G4_Predicate::splitPred()
{
    uint16_t range = getRightBound() - getLeftBound() + 1;
    uint16_t shiftLen = range >> 2;
    right_bound = getLeftBound() + shiftLen - 1;

    // Drop the low shiftLen bits of the lower footprint word.
    bitVec[0] = ((uint32_t)getBitVecL()) >> shiftLen;
}
5991 
emit(std::ostream & output,bool symbolreg)5992 void G4_CondMod::emit(std::ostream& output, bool symbolreg)
5993 {
5994     static const char* const CondModStr[Mod_cond_undef] =
5995     {
5996         "ze",  // zero
5997         "eq",  // equal
5998         "nz", // not zero
5999         "ne", // not equal
6000         "gt",  // greater
6001         "ge", // greater or equal
6002         "lt",  // less
6003         "le", // less or equal
6004         "ov",  // overflow
6005         "ri",  // round increment
6006         "un",  // unorder (NaN)
6007     };
6008     output << "(" <<  CondModStr[mod] << ")";
6009     if (getBase() == nullptr)
6010     {
6011         output << "f0.0";
6012     } else if (getBase()->asRegVar()->isPhyRegAssigned()) {
6013         getBase()->asRegVar()->getPhyReg()->emit(output);
6014         output << "." << getBase()->asRegVar()->getPhyRegOff();
6015     } else {
6016         getBase()->emit(output);
6017         if (subRegOff != UNDEFINED_SHORT)
6018         {
6019             output << '.' << subRegOff;
6020         }
6021     }
6022 }
// Copy constructor: duplicates the condition and subregister as well as
// the G4_Operand bookkeeping fields (top declare, bounds, footprint
// bit-vectors, byte offset).
G4_CondMod::G4_CondMod(G4_CondMod &cMod)
    : G4_Operand(G4_Operand::condMod, cMod.getBase())
{
    mod = cMod.mod;
    subRegOff = cMod.subRegOff;

    top_dcl = cMod.top_dcl;
    left_bound = cMod.left_bound;
    right_bound = cMod.right_bound;
    bitVec[0] = cMod.bitVec[0];
    bitVec[1] = cMod.bitVec[1];
    byteOffset = cMod.byteOffset;
    rightBoundSet = cMod.rightBoundSet;
}
6037 
// Compute (and cache) this condition modifier's right bound, in bits:
// exec_size bits starting at the instruction's mask offset.
unsigned G4_CondMod::computeRightBound(uint8_t exec_size)
{
    bitVec[0] = 0;
    bitVec[1] = 0;
    rightBoundSet = true;

    if (inst)
        left_bound = inst->getMaskOffset();

    right_bound = left_bound + exec_size - 1;

    // exec_size == 32 is special-cased so we never shift by the full
    // 32-bit width of the literal.
    bitVec[0] = exec_size == 32 ? 0xFFFFFFFF : (1 << exec_size) - 1;

    return right_bound;
}
6053 
6054 /// same as G4_Predicate::compareOperand
// Condition modifiers share the flag comparison logic with predicates;
// see compareFlagToOperand.
G4_CmpRelation G4_CondMod::compareOperand(G4_Operand *opnd)
{
    return compareFlagToOperand(this, opnd);
}
6059 
6060 
6061 // remove half of the bitvector and change right bound
// Shrink the cond-mod footprint and right bound for a split instruction.
// NOTE(review): the header comment above says "half", but shiftLen is
// range >> 2 (a quarter of the bit range) — same as G4_Predicate::splitPred;
// confirm which is intended before changing either.
void G4_CondMod::splitCondMod()
{
    uint16_t range = getRightBound() - getLeftBound() + 1;
    uint16_t shiftLen = range >> 2;
    right_bound = getLeftBound() + shiftLen - 1;

    // Drop the low shiftLen bits of the lower footprint word.
    bitVec[0] = ((uint32_t)getBitVecL()) >> shiftLen;
}
isEqualTo(G4_Imm & imm1) const6070 bool G4_Imm::isEqualTo(G4_Imm& imm1) const
6071 {
6072     return (imm1.getType() == type) && (imm1.getImm() == imm.num);
6073 }
6074 
6075 // check if an immedate is in the range of type
// Check whether the 64-bit value `imm` fits in the representable range of
// type `ty`. Types not listed (e.g. float/packed types) report false.
bool G4_Imm::isInTypeRange(int64_t imm, G4_Type ty)
{
    switch (ty)
    {
    case Type_D:
        return imm >= (int)MIN_DWORD_VALUE && imm <= (int)MAX_DWORD_VALUE;
    case Type_Q:
        // Every int64_t is representable as a signed qword.
        return true;
    case Type_UQ:
        // Non-negative values fit an unsigned qword.
        return imm >= 0;
    case Type_UD:
        return (imm >= (unsigned)MIN_UDWORD_VALUE && imm <= (unsigned)MAX_UDWORD_VALUE);
    case Type_W:
        return (imm >= (int)MIN_WORD_VALUE && imm <= (int)MAX_WORD_VALUE);
    case Type_UW:
        return (imm >= (int)MIN_UWORD_VALUE && imm <= (int)MAX_UWORD_VALUE);
    case Type_B:
        return (imm >= (int)MIN_CHAR_VALUE && imm <= (int)MAX_CHAR_VALUE);
    case Type_UB:
        return (imm >= (int)MIN_UCHAR_VALUE && imm <= (int)MAX_UCHAR_VALUE);
    default:
        break;
    }

    return false;
}
6102 
isZero() const6103 bool G4_Imm::isZero() const
6104 {
6105     if (IS_TYPE_F32_F64(type))
6106     {
6107         if (type == Type_F)
6108         {
6109             return (imm.fp32 == 0.0f);
6110         }
6111         return (imm.fp == 0.0);
6112     }
6113     return (imm.num == 0);
6114 }
6115 
// Return true only when the immediate's sign bit(s) are provably zero.
bool G4_Imm::isSignBitZero() const
{
    G4_Type Ty = getType();
    int64_t val = getInt();
    switch (Ty) {
    case Type_B:
    case Type_W:
    case Type_D:
    case Type_Q:
        // NOTE(review): `val > 0` is conservative — val == 0 also has a
        // zero sign bit but reports false here. Confirm callers only rely
        // on "true means safe" before relaxing this to `val >= 0`.
        return val > 0;
    case Type_V:
        // Packed 4-bit vector: 0x88888888 masks bit 3 (the top bit) of
        // each nibble in the low 32 bits.
        return ((uint64_t)val & 0x88888888) == 0;
    default:
        break;
    }
    // Other types: make no claim.
    return false;
}
6133 
compareOperand(G4_Operand * opnd)6134 G4_CmpRelation G4_Imm::compareOperand(G4_Operand *opnd)
6135 {
6136     G4_CmpRelation rel = Rel_disjoint;
6137     if (opnd->isImm() && isEqualTo(opnd->asImm()))
6138     {
6139         return Rel_eq;
6140     }
6141     return rel;
6142 }
6143 
// Print the immediate in hex (with "0x" prefix) at the width implied by
// its type, followed by the ":<type>" suffix. Stream format flags are
// saved and restored around the hex output.
void G4_Imm::emit(std::ostream& output, bool symbolreg)
{
    //
    // we only emit hex in this function
    //
    std::ios::fmtflags outFlags(output.flags());
    output.flags(std::ios_base::hex | std::ios_base::showbase);

    short word;
    if (type == Type_DF)
    {
        // double: print the full 64-bit integer view of the bits
        output << (uint64_t)imm.num;
    }
    else if (type == Type_F)
    {
        // float: print the 32-bit integer view of the bits
        output << imm.num32;
    }
    else if (type == Type_W || type == Type_UW || type == Type_B || type == Type_UB)
    {
        // sub-dword integers print as a 16-bit word
        word = (short)imm.num;
        output << word;
    }
    else if (type == Type_D || type == Type_UD)
    {
        // 32-bit int
        output << (int)imm.num;
    }
    else
    {
        // 64-bit int
        output << imm.num;
    }

    output.flags(outFlags);

    if (Type_UNDEF != type)
    {
        output << ':' << TypeSymbol(type);
    }
}
6184 
6185 // emit number, automatically select the format according to its original format
emitAutoFmt(std::ostream & output)6186 void G4_Imm::emitAutoFmt(std::ostream& output)
6187 {
6188     if (Type_F == type)
6189     {
6190         output << imm.fp32;
6191     }
6192     else if (Type_DF == type)
6193     {
6194        output << imm.fp;
6195     }
6196     else if (Type_W == type || Type_B == type)
6197     {
6198         output << (short)imm.num;
6199     }
6200     else if (Type_D == type)
6201     {
6202         output << imm.num;
6203     }
6204     else //unsigned value
6205     {
6206         output << (unsigned)imm.num;
6207     }
6208 
6209     if (Type_UNDEF != type)
6210     {
6211         output << ':' << TypeSymbol(type);
6212     }
6213 }
6214 
typecastVals(int64_t value,G4_Type type)6215 int64_t G4_Imm::typecastVals(int64_t value, G4_Type type)
6216 {
6217     int64_t retVal = 0;
6218     switch (type)
6219     {
6220     case Type_UD:
6221     case Type_UV:
6222     case Type_VF:
6223     {
6224         retVal = (int64_t)((unsigned)value);
6225         break;
6226     }
6227     case Type_D:
6228     case Type_V:
6229     {
6230         retVal = (int64_t)((int)value);
6231         break;
6232     }
6233     case Type_UW:
6234     {
6235         retVal = (int64_t)((uint16_t)value);
6236         break;
6237     }
6238     case Type_W:
6239     {
6240         retVal = (int64_t)((int16_t)value);
6241         break;
6242     }
6243     case Type_UB:
6244     {
6245         retVal = (int64_t)((uint8_t)value);
6246         break;
6247     }
6248     case Type_B:
6249     {
6250         retVal = (int64_t)((int8_t)value);
6251         break;
6252     }
6253     default:
6254     {
6255         // Dont do float conversions
6256         retVal = value;
6257     }
6258     }
6259     return retVal;
6260 }
6261 
6262 G4_RegVar *
getNonTransientBaseRegVar()6263 G4_RegVarTransient::getNonTransientBaseRegVar ()
6264 {
6265     G4_RegVar * base;
6266     for (base = getBaseRegVar (); base->isRegVarTransient (); base = base->getBaseRegVar ());
6267     return base;
6268 }
6269 
6270 G4_RegVar *
getAbsBaseRegVar()6271 G4_RegVarTransient::getAbsBaseRegVar ()
6272 {
6273     G4_RegVar * base;
6274     for (base = getBaseRegVar (); base->getBaseRegVar () != base; base = base->getBaseRegVar ());
6275     return base;
6276 }
6277 
6278 G4_RegVar *
getAbsBaseRegVar()6279 G4_RegVarTmp::getAbsBaseRegVar ()
6280 {
6281     G4_RegVar * base;
6282     for (base = getBaseRegVar (); base->getBaseRegVar () != base; base = base->getBaseRegVar ());
6283     return base;
6284 }
6285 
6286 void
emit(std::ostream & output,bool symbolreg)6287 G4_RegVar::emit(std::ostream& output, bool symbolreg)
6288 {
6289 
6290     output << decl->getName();
6291     if (reg.phyReg != NULL)
6292     {
6293         output << "(";
6294         reg.phyReg->emit(output);
6295         output << '.' << reg.subRegOff << ':' <<
6296             TypeSymbol(getDeclare()->getElemType()) << ")";
6297     }
6298 }
// Evaluate this address expression to a byte address: the addressed
// register's byte address (or, if that register was spilled, the GRF base
// offset of its addr-taken spill/fill declare) plus the expression offset.
int G4_AddrExp::eval()
{
    int byteAddr = 0;

    if (m_addressedReg->getPhyReg() == NULL)
    {
        // address taken range is spilled
        G4_Declare* addrTakenSpillFillDcl = m_addressedReg->getDeclare()->getAddrTakenSpillFill();
        MUST_BE_TRUE(addrTakenSpillFillDcl != NULL, "No addr taken spill fill register found!");
        byteAddr = addrTakenSpillFillDcl->getGRFBaseOffset();
    }
    else
    {
        byteAddr = m_addressedReg->getByteAddr(); //let's assume the unsigned=>int won't overflow for now.
    }

    // byteAddr += offsetInEle * addressedReg->getDeclare()->getElemSize();
    byteAddr += m_offset;

    return byteAddr;
}
// Print the address expression as "&reg+offset".
void G4_AddrExp::emit(std::ostream& output, bool symbolreg)
{
    output << '&';
    m_addressedReg->emit(output);
    output << '+' << m_offset;
}
6326 
// Compute this src region's left bound (bytes for registers, bits for
// flags) and cache the root declare in top_dcl. right_bound is finalized
// here only for scalar regions; other cases set a placeholder that is
// patched later once the execution size is known.
void G4_SrcRegRegion::computeLeftBound()
{
    top_dcl = NULL;
    unsigned newregoff = regOff, offset = 0;

    if (base)
    {
        if (base->isRegVar())
        {
            top_dcl = base->asRegVar()->getDeclare();
            if (!top_dcl && base->asRegVar()->isGreg())
            {
                // Pre-assigned physical GRF with no declare: use its number.
                newregoff = base->asRegVar()->asGreg()->getRegNum();
            }
        }
    }

    if (top_dcl)
    {
        // Walk the alias chain to the root declare, accumulating offsets.
        while (top_dcl->getAliasDeclare())
        {
            offset += top_dcl->getAliasOffset();
            top_dcl = top_dcl->getAliasDeclare();
        }
    }

    if (base != NULL && base->isFlag())
    {
        if (base->isRegVar())
        {
            if (base->asRegVar()->getPhyReg())
            {
                left_bound = base->asRegVar()->getPhyRegOff() * 16;   // the bound of flag register is in unit of BIT
                left_bound += subRegOff * 16;
                left_bound += base->asRegVar()->getPhyReg()->asAreg()->getFlagNum() * 32;
            }
            else
            {
                left_bound = subRegOff * 16;
            }
        }
        else
        {
            left_bound = subRegOff * 16;
            left_bound += base->asAreg()->getFlagNum() * 32;
        }

        right_bound = 0;
    }
    else if (base != NULL && base->isAccReg())
    {
        left_bound = subRegOff * TypeSize(type);
        if (base->asAreg()->getArchRegType() == AREG_ACC1)
        {
            left_bound += 32;  // TODO: size of ACC is assumed to be 32 BYTEs.
        }
        byteOffset = left_bound;
    }
    else if (top_dcl)
    {
        if (acc == Direct)
        {
            left_bound = offset + newregoff * numEltPerGRF<Type_UB>() + subRegOff * TypeSize(type);
            if (top_dcl->getTotalElems() * top_dcl->getElemSize() >= (int)numEltPerGRF<Type_UB>())
            {
                byteOffset = left_bound;
            }
            else
            {
                // Small declare: account for its sub-register alignment
                // when computing the byte offset.
                unsigned alignOff = TypeSize(type) > TypeSize(Type_W) ?
                    TypeSize(type) : TypeSize(Type_W);
                if (top_dcl->getSubRegAlign() == Even_Word || top_dcl->getSubRegAlign() >= Four_Word)
                {
                    alignOff = top_dcl->getSubRegAlign() * 2;
                }
                byteOffset = left_bound + alignOff;
            }
        }
        else
        {
            // Indirect: the bound refers to the address register itself.
            left_bound = subRegOff * TypeSize(ADDR_REG_TYPE);
            byteOffset = TypeSize(type);
        }

        if (desc && desc->isScalar())
        {
            right_bound = left_bound + TypeSize(type) - 1;
        }
        else
        {
            right_bound = 0;
            // for other cases, we need execution size and instruction compression attr, so we just set
            // partial value here, which will be patched later
            // right_bound = desc->horzStride * TypeSize(type);
            // patch it with *exec_size + left_bound
            // if vertical stride == 0 and width < exec_size, divide it by 2
        }
    }
    else
    {  //arch reg
        left_bound = 0;
        byteOffset = left_bound;
    }
}
6431 
setSrcBitVec(uint8_t exec_size)6432 void G4_SrcRegRegion::setSrcBitVec(uint8_t exec_size)
6433 {
6434     uint64_t bit_seq = TypeFootprint(type);
6435     unsigned short typeSize = TypeSize(type);
6436 
6437     uint64_t footPrint0 = 0;
6438     uint64_t footPrint1 = 0;
6439 
6440     MUST_BE_TRUE(exec_size >= desc->width, "exec size must be >= width");
6441     if (desc->isScalar())
6442     {
6443         footPrint0 = bit_seq;
6444     }
6445     else if (desc->isContiguous(exec_size))
6446     {
6447         // fast path
6448         int totalBytes = exec_size * typeSize;
6449         MUST_BE_TRUE(totalBytes <= 2 * getGRFSize(), "total bytes exceed 2 GRFs");
6450 
6451         footPrint0 = totalBytes < 64 ? (1ULL << totalBytes) - 1 : ULLONG_MAX;
6452         if (totalBytes > 64)
6453         {
6454             footPrint1 = totalBytes == 128 ? ULLONG_MAX : (1ULL << (totalBytes - 64)) - 1;
6455         }
6456     }
6457     else
6458     {
6459         for (int i = 0, numRows = exec_size / desc->width; i < numRows; ++i)
6460         {
6461             for (int j = 0; j < desc->width; ++j)
6462             {
6463                 int eltOffset = i * desc->vertStride * typeSize + j * desc->horzStride * typeSize;
6464                 // no element can cross 64-byte boundary
6465                 if (eltOffset >= 64)
6466                 {
6467                     footPrint1 |= bit_seq << (eltOffset - 64);
6468                 }
6469                 else
6470                 {
6471                     footPrint0 |= bit_seq << eltOffset;
6472                 }
6473             }
6474         }
6475     }
6476 
6477     bitVec[0] = footPrint0;
6478     bitVec[1] = footPrint1;
6479 }
6480 
// Compute the right bound (inclusive last byte — or bit for flag operands)
// of this source region for the given execution size, and populate the
// footprint bit-vectors. Returns the computed right bound.
unsigned G4_SrcRegRegion::computeRightBound(uint8_t exec_size)
{
    // a scalar region behaves as <0;1,0>
    unsigned short hs = desc->isScalar() ? 1 : desc->horzStride;
    unsigned short vs = desc->isScalar() ? 0 : desc->vertStride;
    rightBoundSet = true;
    unsigned short typeSize = TypeSize(type);

    bitVec[0] = 0;
    bitVec[1] = 0;
    if (base->isFlag())
    {
        unsigned int totalBits = 0;
        if (G4_Inst_Table[inst->opcode()].instType != InstTypePseudoLogic)
        {
            // mov (1) ... fx.1<0;1,0>:uw
            left_bound = subRegOff * 16;
            // clamp to the smaller of the flag's declared bit count and the
            // operand type's bit width
            totalBits = base->asRegVar()->getDeclare()->getNumberFlagElements() < TypeBitSize(type) ?
                        base->asRegVar()->getDeclare()->getNumberFlagElements() : TypeBitSize(type);
        }
        else
        {
            /*
                we need to set leftBound for pseudo intruction
                so that it creates use/def links correctly in the control flow graph between
                cmp instruction and pseudo instruction.
                This matters when we break up SIMD32 instruction in to two SIMD16 with H1/H2 masks.
                The bound for compare for H2 will be [15,31], and this has to match.
                Without this no use/def link was created which caused issues in logic optimization.
                Also it produce incorrect behavior in any operation that relies on compareOperand.
            */
            left_bound = inst->getMaskOffset();
            totalBits = exec_size;
        }

        right_bound = left_bound + totalBits - 1;

        bitVec[0] = totalBits == 32 ? 0xFFFFFFFF : (1 << totalBits) - 1;
    }
    else
    {
        if (acc == Direct)
        {
            // return/fret implicitly read two elements (the return address)
            if (inst->isReturn() || inst->isFReturn())
            {
                exec_size = 2;
            }

            setSrcBitVec(exec_size);

            if (desc->isScalar())
            {
                right_bound = left_bound + typeSize - 1;
            }
            else
            {
                int num_rows = exec_size / desc->width;
                if (num_rows > 0)
                {
                    // last byte of the last element of the last row
                    right_bound =
                        left_bound +
                        (num_rows - 1) * vs * typeSize +
                        hs * (desc->width - 1) * typeSize +
                        typeSize - 1;
                }
                else
                {
                    // this fix applies to inplicit acc src
                    // usually when we compute new rb after inst splitting,
                    // the region is still the old one.
                    // exec_size may be smaller than width
                    right_bound =
                        left_bound +
                        hs * (exec_size - 1) * typeSize +
                        typeSize - 1;
                }
            }
        }
        else
        {
            // indirect: footprint covers the address sub-registers read
            // (2 bytes each), not the GRF data they point at
            unsigned short numAddrSubReg = 1;
            if (desc->isRegionWH())
            {
                numAddrSubReg = exec_size/desc->width;
            }
            for (uint16_t i = 0; i < numAddrSubReg; i++)
            {
                bitVec[0] |= ((uint64_t) 0x3) << (i * 2);
            }
            right_bound = left_bound + TypeSize(ADDR_REG_TYPE) * numAddrSubReg - 1;
        }
    }
    return right_bound;
}
6574 
// Compare this region's footprint against opnd via the shared
// region-vs-operand helper; returns their G4_CmpRelation.
G4_CmpRelation G4_SrcRegRegion::compareOperand(G4_Operand *opnd)
{
    return compareRegRegionToOperand(this, opnd);
}
6579 
isNativeType() const6580 bool G4_SrcRegRegion::isNativeType() const
6581 {
6582     G4_Type type = getType();
6583 
6584     if (IS_WTYPE(type) || IS_DTYPE(type) || IS_FTYPE(type) || type == Type_DF) {
6585         return true;
6586     }
6587     else {
6588         return false;
6589     }
6590 }
6591 
isNativePackedRowRegion() const6592 bool G4_SrcRegRegion::isNativePackedRowRegion() const
6593 {
6594     if (isNativeType())
6595     {
6596         // A single element row is always packed.
6597         return (desc->horzStride == 1) ||
6598                (desc->width == 1 && desc->horzStride == 0);
6599     }
6600 
6601     return false;
6602 }
6603 
isNativePackedRegion() const6604 bool G4_SrcRegRegion::isNativePackedRegion() const
6605 {
6606     return isNativePackedRowRegion() && desc->vertStride == desc->width;
6607 }
6608 
coverTwoGRF()6609 bool G4_SrcRegRegion::coverTwoGRF()
6610 {
6611     uint16_t range = getRightBound() - getLeftBound() + 1;
6612     if (range < numEltPerGRF<Type_UB>())
6613         return false;
6614     if (desc->horzStride > 1)
6615     {
6616         range += (desc->horzStride - 1) * TypeSize(type);
6617     }
6618     if (range == numEltPerGRF<Type_UB>() * 2 &&
6619         (desc->vertStride == desc->horzStride * desc->width ||
6620          desc->isContiguous(getInst()->getExecSize())))
6621     {
6622         return true;
6623     }
6624     return false;
6625 }
6626 // Assumption:
6627 // operand crosses GRF boundary
// For a source region that crosses a GRF boundary, decide whether it has the
// same number of elements in each of the two GRFs (so the instruction can be
// split evenly). Out-parameters additionally report:
//   sameSubRegOff  - both halves start at the same sub-register offset
//   vertCrossGRF   - no single row straddles the GRF boundary
//   contRegion     - the region is single-strided for the inst's exec size
//   eleInFirstGRF  - element count landing in the first GRF
bool G4_SrcRegRegion::evenlySplitCrossGRF(uint8_t execSize, bool &sameSubRegOff,
    bool &vertCrossGRF, bool &contRegion, uint8_t &eleInFirstGRF)
{
    // always return true since all align16 instructions are generated by JIT
    // later on when we have other execution types for align16 instructions,
    // fix the following if to check src element distribution.
    // FIXME: do we need to check HS here?
    if (desc->isRegionV())
    {
        sameSubRegOff = true;
        vertCrossGRF = true;
        contRegion = true;
        return true;
    }
    vertCrossGRF = true;
    contRegion = desc->isSingleStride(getInst()->getExecSize());
    MUST_BE_TRUE(acc == Direct, "Indirect operand can not cross GRF boundary.");
    uint8_t firstSubRegOff = getLeftBound() % numEltPerGRF<Type_UB>();
    uint8_t left = firstSubRegOff;
    uint8_t typeSize = (uint8_t)TypeSize(type);
    // byte distance between consecutive execution elements
    uint8_t execTySize = (desc->horzStride == 0 ? 1 : desc->horzStride) * typeSize;
    // byte extent of one row, from its first byte to one past its last element
    uint8_t lastEltEndByte = desc->horzStride * (desc->width - 1) * typeSize + typeSize;
    uint8_t realRowSize = lastEltEndByte;
    // check number of elements in first GRF.
    eleInFirstGRF = 0;
    while (left < numEltPerGRF<Type_UB>())
    {
        if (left + realRowSize  <= (int)numEltPerGRF<Type_UB>())
        {
            // whole row fits in the first GRF
            // realRowSize is used to handle V12(0,17)<32;8,2>:b
            eleInFirstGRF += desc->width;
            left += desc->vertStride * TypeSize(type);
        }
        else
        {
            // this row straddles the boundary: count element by element how
            // many of its elements still end within the first GRF
            vertCrossGRF = false;
            // V12(0,17)<32;8,2>:b is a good two GRF source
            eleInFirstGRF++;
            uint8_t newLeft = left + typeSize;
            newLeft += execTySize;
            while (newLeft < numEltPerGRF<Type_UB>())
            {
                eleInFirstGRF++;
                newLeft += execTySize;
            }
            if (newLeft == numEltPerGRF<Type_UB>())
            {
                // the next element starts exactly on the GRF boundary
                eleInFirstGRF++;
                if (eleInFirstGRF % desc->width == 0)
                {
                    // row completed: advance by the vertical stride
                    left += desc->vertStride * TypeSize(type);
                }
                else
                {
                    left = newLeft + (execTySize - typeSize);
                }
            }
            else if (eleInFirstGRF % desc->width == 0)
            {
                left += desc->vertStride * TypeSize(type);
            }
            else if (typeSize == execTySize)
            {
                left = newLeft;
            }
            else
            {
                left = newLeft - typeSize;
            }
        }
    }
    uint8_t secondSubRegOff = left % numEltPerGRF<Type_UB>();

    sameSubRegOff = (firstSubRegOff == secondSubRegOff);
    // TODO: this guaranttees that there are equal number fo elements in each GRF, but not the distribution of elements in each of them.
    if (eleInFirstGRF * 2 == execSize)
    {
        return true;
    }
    return false;
}
6709 
// Simpler variant: for a direct, GRF-crossing source region, check whether
// an equal number of elements falls on each side of the GRF boundary.
bool G4_SrcRegRegion::evenlySplitCrossGRF(uint8_t execSize)
{
    // check number of elements in first GRF.
    MUST_BE_TRUE(acc == Direct, "Indirect operand can not cross GRF boundary.");
    // bytes remaining in the first GRF starting at the operand's left bound
    uint16_t sizeInFirstGRF = numEltPerGRF<Type_UB>() - getLeftBound() % numEltPerGRF<Type_UB>();
    uint16_t vertSize = desc->vertStride * getElemSize();
    // byte distance between consecutive execution elements
    uint16_t execTypeSize = desc->horzStride == 0 ? getElemSize() : desc->horzStride * getElemSize();
    // upper estimate of elements in the first GRF; refined below
    uint16_t numEle = (sizeInFirstGRF + execTypeSize - 1)/ execTypeSize;
    uint16_t rowSize = desc->horzStride == 0 ? execTypeSize : desc->width * execTypeSize,
        numRows = desc->vertStride == 0 ? 1 : execSize/desc->width,
        numElePerRow = rowSize / execTypeSize,
        numExecEmePerRow = desc->horzStride == 0 ? 1 : desc->width;

    if (sizeInFirstGRF <= vertSize)
    {
        // at most one row starts in the first GRF; cap at the row width
        if (numEle >= desc->width)
        {
            numEle = desc->width;
        }
    }
    else if (desc->vertStride > desc->width)
    {
        // rows are spread out: whole rows plus the partial last row
        numEle = sizeInFirstGRF/vertSize * numExecEmePerRow +
            ((sizeInFirstGRF%vertSize > rowSize) ? numExecEmePerRow : (sizeInFirstGRF%vertSize + execTypeSize - 1) / execTypeSize);
    }

    // total elements: by row count when rows do not overlap, otherwise from
    // the operand's byte span
    uint16_t totalNumEle = (desc->vertStride >= numElePerRow) ? (numRows * numExecEmePerRow) :
        (getRightBound() - getLeftBound() + 1) / execTypeSize;

    // TODO: this guarantees that there are equal number of elements in each GRF, but not the distribution of elements in each of them.
    if (numEle * 2 == totalNumEle)
    {
        return true;
    }
    return false;
}
6746 
6747 /*
6748  * check if the input opnd is align to GRF
6749  * if the first level dcl is not aligned to GRF or sub register offset of this opnd is not multiple GRFs, including 0,
6750  * return true.
6751  */
checkGRFAlign()6752 bool G4_SrcRegRegion::checkGRFAlign() {
6753 
6754     bool GRF_aligned = false;
6755     uint32_t byte_subregoff = subRegOff * getTypeSize();
6756 
6757     if (byte_subregoff  % numEltPerGRF<Type_UB>() != 0) {
6758         return false;
6759     }
6760 
6761     if (base) {
6762         if (base->isRegVar()) {
6763             G4_Declare *dcl = base->asRegVar()->getDeclare();
6764 
6765             if (dcl) {
6766                 G4_Declare *aliasdcl = dcl;
6767 
6768                 unsigned aliasOffset = 0;
6769                 while (aliasdcl->getAliasDeclare())
6770                 {
6771                     aliasOffset += aliasdcl->getAliasOffset();
6772                     aliasdcl = aliasdcl->getAliasDeclare();
6773                 }
6774                 if (aliasOffset % numEltPerGRF<Type_UB>() != 0)
6775                 {
6776                     return false;
6777                 }
6778 
6779                 if (aliasdcl->getSubRegAlign() >= GRFALIGN ||
6780                     aliasdcl->getNumRows() * aliasdcl->getElemSize() * aliasdcl->getElemSize() >= (int)numEltPerGRF<Type_UB>()) {
6781                         return true;
6782                 }
6783             }else if (base->asRegVar()->isPhyRegAssigned() &&
6784                 base->asRegVar()->getByteAddr() % numEltPerGRF<Type_UB>() == 0) {
6785                     return true;
6786             }
6787         }
6788     }
6789 
6790     return GRF_aligned;
6791 }
6792 
6793 //
6794 // returns true if this SrcRegRegion has a fixed subreg offset (in bytes).
6795 // This is true only if
6796 // -- src is direct
6797 // -- base declare is a GRF variable that is GRF-aligned
6798 // if true, the subreg offset is also returned via offset
6799 // Note this always returns false for ARFs (flag, addr, etc.)
6800 //
// Delegates to the shared region helper; see the comment block above for
// the exact conditions and the meaning of the returned offset.
bool G4_SrcRegRegion::hasFixedSubregOffset(uint32_t& offset)
{
    return regionHasFixedSubreg(this, offset);
}
6805 
6806 /*
6807  * Return true if the src operand has a native type and has a packed (stride
6808  * of 1) region.
6809  */
isNativePackedSrcRegion()6810 bool G4_SrcRegRegion::isNativePackedSrcRegion()
6811 {
6812     return isNativePackedRowRegion() &&
6813             (desc->vertStride == desc->width);
6814 }
6815 
emit(std::ostream & output) const6816 void RegionDesc::emit(std::ostream& output) const
6817 {
6818     if (isRegionV())
6819     {
6820         output << '<' << horzStride << '>';
6821     }
6822     else if (isRegionWH())
6823     {
6824         output << '<' << width << ',' << horzStride << '>';
6825     }
6826     else
6827     {
6828         output << '<' << vertStride << ';' << width << ',' << horzStride << '>';
6829     }
6830 }
6831 
// Print the label's name; the symbolreg flag is unused for labels.
void G4_Label::emit(std::ostream& output, bool symbolreg)
{
    output << label;
}
6836 
getByteAddr() const6837 unsigned G4_RegVar::getByteAddr() const
6838 {
6839     MUST_BE_TRUE(reg.phyReg != NULL, ERROR_UNKNOWN);
6840     if (reg.phyReg->isGreg())
6841     {
6842         return reg.phyReg->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() +
6843             reg.subRegOff * decl->getElemSize();
6844     }
6845     if (reg.phyReg->isA0())
6846     {
6847         return reg.subRegOff * TypeSize(Type_UW);
6848     }
6849 
6850     MUST_BE_TRUE(false, ERROR_UNKNOWN);
6851     return 0;
6852 }
6853 
// Tighten this variable's sub-register alignment requirement. Alignment may
// only grow more restrictive; a request that neither divides nor is divided
// by the current alignment is a conflict and asserts.
void G4_RegVar::setSubRegAlignment(G4_SubReg_Align subAlg)
{
    // sub reg alignment can only be more restricted than prior setting
    MUST_BE_TRUE(subAlign == Any || subAlign == subAlg || subAlign % 2 == 0,
                 ERROR_UNKNOWN);
    if (subAlign > subAlg)
    {
        MUST_BE_TRUE(subAlign % subAlg == 0, "Sub reg alignment conflict");
        // do nothing; keep the original (more restricted) alignment
    }
    else
    {
        MUST_BE_TRUE(subAlg % subAlign == 0, "Sub reg alignment conflict");
        subAlign = subAlg;
    }
}
6870 
6871 // For implicit Acc operands, left bound depends on
6872 //    a) Inst execution type
6873 //    b) Qtr control
6874 //
6875 // This function handles relevant cases, including hw intricacies
6876 // and updates left bound only.
6877 //
computeLeftBoundForImplAcc(G4_Operand * opnd)6878 void G4_INST::computeLeftBoundForImplAcc(G4_Operand* opnd)
6879 {
6880     if (opnd != NULL)
6881     {
6882         G4_Type extype;
6883         int extypesize;
6884         extype = getOpExecType(extypesize);
6885 
6886         if ((IS_WTYPE(extype) || IS_DTYPE(extype)))
6887         {
6888             // This condition is a result of HW Conformity requirement
6889             // that for exec type = D/DW, only acc0 is used even when
6890             // qtr control is set to Q2/H2
6891             opnd->setLeftBound(0);
6892         }
6893         else
6894         {
6895             if (opnd->isSrcRegRegion())
6896             {
6897                 opnd->asSrcRegRegion()->computeLeftBound();
6898             }
6899             else if (opnd->isDstRegRegion())
6900             {
6901                 opnd->asDstRegRegion()->computeLeftBound();
6902             }
6903         }
6904     }
6905 }
6906 
6907 //
6908 // Normalize an operand's bitvec footprint based on its left bound
6909 // and update the given bitset.
6910 // If isSet is true, we set all bits that are covered by this operand.
6911 // If isSet os false, we clear all bits that are covered by this operand.
6912 //
updateFootPrint(BitSet & footprint,bool isSet)6913 void G4_Operand::updateFootPrint(BitSet& footprint, bool isSet)
6914 {
6915     unsigned N = NUM_BITS_PER_ELT;
6916     unsigned lb = getLeftBound();
6917     unsigned rb = getRightBound();
6918     const bool doFastPath = true; // for debugging
6919 
6920     if (doFastPath && lb % N == 0 && (rb + 1) % N == 0)
6921     {
6922         // lb is 32-byte aligned, set one dword at a time
6923         unsigned idx = lb / N;
6924         unsigned endIdx = rb / N;
6925         // get the precise footprint for the first two GRF
6926         for (int i = 0; i < 2 && idx <= endIdx; ++i, ++idx)
6927         {
6928             uint64_t bits = getBitVecL();
6929             uint32_t bitVal = (uint32_t)(i % 2 ? bits >> N : bits);
6930             if (isSet)
6931             {
6932                 footprint.setElt(idx, bitVal);
6933             }
6934             else
6935             {
6936                 footprint.resetElt(idx, bitVal);
6937             }
6938         }
6939         if (getGRFSize() > 32)
6940         {
6941             for (int i = 0; i < 2 && idx <= endIdx; ++i, ++idx)
6942             {
6943                 uint64_t bits = getBitVecH();
6944                 uint32_t bitVal = (uint32_t)(i % 2 ? bits >> N : bits);
6945                 if (isSet)
6946                 {
6947                     footprint.setElt(idx, bitVal);
6948                 }
6949                 else
6950                 {
6951                     footprint.resetElt(idx, bitVal);
6952                 }
6953             }
6954         }
6955 
6956         // beyond the first two GRF we assume every byte is touched
6957         while (idx <= endIdx)
6958         {
6959             if (isSet)
6960             {
6961                 footprint.setElt(idx, 0xFFFFFFFF);
6962             }
6963             else
6964             {
6965                 footprint.resetElt(idx, 0xFFFFFFFF);
6966             }
6967             idx++;
6968         }
6969     }
6970     else
6971     {
6972         // handle unaligned case
6973         uint64_t mask0 = getBitVecL();
6974         unsigned j = lb;
6975         for (unsigned i = 0; i < 64 && j <= rb; ++i, ++j)
6976         {
6977             if (mask0 & (1ULL << i))
6978                 footprint.set(j, isSet);
6979         }
6980         if (getGRFSize() > 32)
6981         {
6982             uint64_t mask1 = getBitVecH();
6983             for (unsigned i = 0; i < 64 && j <= rb; ++i, ++j)
6984             {
6985                 if (mask1 & (1ULL << i))
6986                     footprint.set(j, isSet);
6987             }
6988         }
6989         while (j++ <= rb)
6990             footprint.set(j, isSet);
6991     }
6992 }
6993 
6994 // update bit vector for this operand based on it size
6995 // We assume all bytes are touched
setBitVecFromSize(uint32_t NBytes)6996 void G4_Operand::setBitVecFromSize(uint32_t NBytes)
6997 {
6998     bitVec[0] = NBytes < 64 ? (1ULL << NBytes) - 1 : ULLONG_MAX;
6999     bitVec[1] = 0;
7000     if (getGRFSize() > 32 && NBytes >= 64)
7001     {
7002         bitVec[1] = (NBytes < 64 * 2) ? (1ULL << (NBytes - 64)) - 1 : ULLONG_MAX;
7003     }
7004 }
7005 
7006 // Left and right bound for every operand is based off
7007 // top most dcl.
7008 // For flag register as dst/src/pred/cond mod, each bit of
7009 // bitset represents corresponding bit of flag.
7010 // For indirect access, right bound is set to sum of
7011 // left bound and 15. The constant 15 is derived by the
7012 // fact that address register is accessed as Type_UW which
7013 // means 16 bits. right bound represents closed interval
7014 // so 1 is subtracted.
7015 // For direct access of GRF, each bit of bitset represents
7016 // correcponding byte of operand.
// Recompute the given operand's right bound for this instruction, handling
// the special cases (pln's wide second source, pseudo kill/use covering the
// whole variable, fill/spill intrinsics) before falling back to the
// operand's own region-based computation. Implicit acc/flag operands are
// additionally shifted by the instruction's mask offset.
void G4_INST::computeRightBound(G4_Operand* opnd)
{
    associateOpndWithInst(opnd, this);

    if (opnd &&
        opnd->isImm() == false &&
        opnd->isNullReg() == false)
    {
        bool done = false;

        if (done == false && op == G4_pln && opnd == srcs[1])
        {
            // pln's second source spans twice the nominal region: double the
            // effective exec size at SIMD8 and below, and stretch the bound
            // again for the wider case
            opnd->computeRightBound(execSize > g4::SIMD8 ? execSize : execSize * 2);
            if (execSize > g4::SIMD8)
            {
                opnd->setRightBound(opnd->right_bound * 2 - opnd->getLeftBound() + 1);
            }

            done = true;
        }
        else if (done == false && (isPseudoKill() || isPseudoUse()))
        {
            // pseudo kills/use write/read entire variable
            G4_Declare* topdcl = opnd->getBase()->asRegVar()->getDeclare()->getRootDeclare();
            opnd->setRightBound(topdcl->getByteSize() - 1);

            done = true;
        }
        else if (done == false && isFillIntrinsic())
        {
            asFillIntrinsic()->computeRightBound(opnd);
            done = true;
        }
        else if (done == false && isSpillIntrinsic())
        {
            asSpillIntrinsic()->computeRightBound(opnd);
            done = true;
        }

        if (done == false)
        {
            opnd->computeRightBound(execSize);

            if (getMaskOffset() > 0 &&
                ((opnd == getImplAccSrc()) ||
                (opnd == getImplAccDst())))
            {
                // for ARF (flag, acc) we have to adjust its bound based on the emask
                // We have to reset LB since the original instruction may have a non default emask
                opnd->setLeftBound(0);
                opnd->computeRightBound(execSize);
                unsigned int multiplicationFactor = 1;
                bool exceptionBoundsComputation = false;
                if (opnd->isAccReg())
                {
                    // Right bound granularity is in terms of
                    // bytes for Acc registers
                    multiplicationFactor = 4;
                }

                if (opnd == getImplAccDst() || opnd == getImplAccSrc())
                {
                    G4_Type extype;
                    int extypesize;
                    extype = getOpExecType(extypesize);

                    if ((IS_WTYPE(extype) || IS_DTYPE(extype)))
                    {
                        // This condition is a result of HW Conformity requirement
                        // that for exec type = D/DW, only acc0 is used even when
                        // qtr control is set to Q2/H2
                        opnd->setLeftBound(0);
                        opnd->setRightBound(31);
                        exceptionBoundsComputation = true;
                    }
                }

                if (exceptionBoundsComputation == false)
                {
                    // Update left/right bound as per inst mask offset
                    opnd->setLeftBound(opnd->left_bound + (getMaskOffset() * multiplicationFactor));
                    opnd->setRightBound(opnd->right_bound + (getMaskOffset () * multiplicationFactor));
                }
            }

            done = true;
        }
    }
}
7106 
// Send-specific right-bound computation: payload and response sizes come
// from the message descriptor (in GRF units) rather than from the operand's
// region; other operands fall back to the regular region rules.
void G4_InstSend::computeRightBound(G4_Operand* opnd)
{
    associateOpndWithInst(opnd, this);

    if (opnd && !opnd->isImm() && !opnd->isNullReg())
    {
        auto computeSendOperandBound = [](G4_Operand* opnd, int numReg)
        {
            if (numReg == 0)
            {
                return;
            }

            // Sends read/write in units of GRF. With a narrower simd width,
            // the variable may have size smaller than one GRF, or smaller
            // the reponse or message length. In this case, limit the right
            // bound up to the variable size.
            unsigned LB = opnd->left_bound;
            unsigned RB = std::min(opnd->getTopDcl()->getByteSize(),
                LB + numReg * numEltPerGRF<Type_UB>()) - 1;

            unsigned NBytes = RB - LB + 1;
            opnd->setBitVecFromSize(NBytes);
            opnd->setRightBound(RB);
        };

        if (srcs[0] == opnd || (isSplitSend() && srcs[1] == opnd))
        {
            // For send instruction's msg operand rightbound depends
            // on msg descriptor
            uint16_t numReg = (srcs[0] == opnd) ?
                getMsgDesc()->getSrc0LenRegs() : getMsgDesc()->getSrc1LenRegs();
            computeSendOperandBound(opnd, numReg);
        }
        else if (dst == opnd)
        {
            // Compute right bound for dst operand
            const auto *desc = getMsgDesc();
            uint32_t dstBytes = desc->getDstLenBytes();
            if (dstBytes < getGRFSize()) {
                // sub-GRF response, e.g. OWord block read x1
                opnd->setBitVecL((1ULL << dstBytes) - 1);
                opnd->setRightBound(opnd->left_bound + dstBytes - 1);

            } else {
                uint16_t numReg = desc->getDstLenRegs();
                computeSendOperandBound(opnd, numReg);
            }
        }
        else
        {
            // any other operand (e.g. predicate): regular region rules
            opnd->computeRightBound(execSize);
        }
    }

}
7163 
// Recompute bounds for all ARF operands of this instruction: predicate,
// condition modifier, and the implicit accumulator src/dst.
void G4_INST::computeARFRightBound()
{
    computeRightBound(predicate);
    computeRightBound(mod);
    computeRightBound(implAccSrc);
    computeRightBound(implAccDst);
}
7171 
7172 
7173 // This function should only be invoked after computePReg() function
7174 // has been invoked. The function computePReg() is invoked by computePhyReg()
7175 // just before scheduling and post-RA.
7176 // For GRF type variables this function returns linearized byte offset into
7177 // GRF file. So if a variable is assigned r1 and its left bound is 0, this
7178 // function will return (1 * 32) + 0 = 32.
7179 // For non-GRF variables, GRF base offset value is 0 so value returned will
7180 // be left bound.
7181 // This function works for both, G4_SrcRegRegion as well as G4_DstRegRegion.
getLinearizedStart()7182 unsigned int G4_Operand::getLinearizedStart()
7183 {
7184     unsigned linearizedStart = getLeftBound();
7185     G4_VarBase* base = getBase();
7186 
7187     if (base && base->isRegVar())
7188     {
7189         // LB is computed based on the root variable, so we have to go all the way up
7190         G4_Declare* dcl = base->asRegVar()->getDeclare();
7191         linearizedStart += dcl->getGRFBaseOffset();
7192         linearizedStart -= dcl->getOffsetFromBase();
7193     }
7194 
7195     return linearizedStart;
7196 }
7197 
7198 // Just like getLinearizedStart(), this function returns linearized byte
7199 // offset of end of variable. For eg, if a variable is assigned r1 and
7200 // region is type dword with inst exec size = 16, linearized end will be
7201 // (63 - 0 + 32) = 95.
7202 // Here, right bound is 63 since the region accesses 64 bytes,
7203 // left bound is 0 since region access begins at byte 0,
7204 // linearizedStart() will return 32 since r1 is allocated to the region.
7205 // This function works for both, G4_SrcRegRegion as well as G4_DstRegRegion.
getLinearizedEnd()7206 unsigned int G4_Operand::getLinearizedEnd()
7207 {
7208     return (getRightBound() - getLeftBound() + getLinearizedStart());
7209 }
7210 
// Debug-only helper: emit this operand to stderr (no-op in release builds).
void G4_Operand::dump() const
{
#if _DEBUG
    const_cast<G4_Operand *>(this)->emit(std::cerr, false);
#endif
}
7217 
setPredicate(G4_Predicate * p)7218 void G4_INST::setPredicate(G4_Predicate* p)
7219 {
7220     if (predicate && predicate->getInst() == this)
7221     {
7222         predicate->setInst(NULL);
7223     }
7224 
7225     predicate = p;
7226 
7227     associateOpndWithInst(p, this);
7228     computeRightBound(p);
7229 }
7230 
setSrc(G4_Operand * opnd,unsigned i)7231 void G4_INST::setSrc(G4_Operand* opnd, unsigned i)
7232 {
7233     if (isPseudoAddrMovIntrinsic())
7234     {
7235         asIntrinsicInst()->setIntrinsicSrc(opnd, i);
7236         return;
7237     }
7238 
7239     MUST_BE_TRUE(i < G4_MAX_SRCS, ERROR_INTERNAL_ARGUMENT);
7240 
7241     if (srcs[i] != NULL)
7242     {
7243         if ((srcs[0] == srcs[i] && i != 0) ||
7244             (srcs[1] == srcs[i] && i != 1) ||
7245             (srcs[2] == srcs[i] && i != 2) ||
7246             (srcs[3] == srcs[i] && i != 3))
7247         {
7248             // opnd is present in some other
7249             // index of srcs so dont set its
7250             // inst to NULL
7251         }
7252         else
7253         {
7254             if (srcs[i]->getInst() == this)
7255             {
7256                 srcs[i]->setInst(NULL);
7257             }
7258         }
7259     }
7260 
7261     srcs[i] = opnd;
7262 
7263     associateOpndWithInst(opnd, this);
7264     resetRightBound(opnd);
7265 }
7266 
setDest(G4_DstRegRegion * opnd)7267 void G4_INST::setDest(G4_DstRegRegion* opnd)
7268 {
7269     if (dst != NULL && dst->getInst() == this)
7270     {
7271         dst->setInst(NULL);
7272     }
7273 
7274     dst = opnd;
7275 
7276     associateOpndWithInst(opnd, this);
7277     resetRightBound(opnd);
7278 }
7279 
setCondMod(G4_CondMod * m)7280 void G4_INST::setCondMod(G4_CondMod* m)
7281 {
7282     if (mod && mod->getInst() == this)
7283     {
7284         mod->setInst(NULL);
7285     }
7286 
7287     mod = m;
7288 
7289     associateOpndWithInst(m, this);
7290     computeRightBound(m);
7291 }
7292 
setImplAccSrc(G4_Operand * opnd)7293 void G4_INST::setImplAccSrc(G4_Operand* opnd)
7294 {
7295     if (implAccSrc != NULL && implAccSrc->getInst() == this)
7296     {
7297         implAccSrc->setInst(NULL);
7298     }
7299 
7300     implAccSrc = opnd;
7301 
7302     associateOpndWithInst(opnd, this);
7303     computeRightBound(opnd);
7304 }
7305 
setImplAccDst(G4_DstRegRegion * opnd)7306 void G4_INST::setImplAccDst(G4_DstRegRegion* opnd)
7307 {
7308     if (implAccDst != NULL && implAccDst->getInst() == this)
7309     {
7310         implAccDst->setInst(NULL);
7311     }
7312 
7313     implAccDst = opnd;
7314 
7315     associateOpndWithInst(opnd, this);
7316     computeRightBound(opnd);
7317 }
7318 
7319 // get simd lane mask for this instruction. For example,
7320 //      add  (8|M8) ...
7321 // will have 0xFF00, which lane 8-15
getExecLaneMask() const7322 unsigned G4_INST::getExecLaneMask() const
7323 {
7324     unsigned maskbits = (1 << getExecSize()) - 1;
7325     unsigned chanOffset = getMaskOffset();
7326     return (maskbits << chanOffset);
7327 }
7328 
print(std::ostream & OS) const7329 void G4_INST::print(std::ostream& OS) const
7330 {
7331     G4_INST& inst = const_cast<G4_INST&>(*this);
7332     if (!inst.isLabel())
7333         OS << "\t";
7334     inst.emit(OS, false, false);
7335     OS << "\n";
7336 }
7337 
// Debug convenience: print this instruction to stderr.
void G4_INST::dump() const
{
    print(std::cerr);
}
7342 
canSupportSaturate() const7343 bool G4_INST::canSupportSaturate() const
7344 {
7345     if (op == G4_mul || op == G4_pseudo_mad)
7346     {
7347         for (int i = 0, numSrc = getNumSrc(); i < numSrc; ++i)
7348         {
7349             if (IS_DTYPE(getSrc(i)->getType()))
7350             {
7351                 return false;
7352             }
7353         }
7354         return true;
7355     }
7356 
7357     if (isIntrinsic() || op == G4_mulh || op == G4_madw)
7358     {
7359         return false;
7360     }
7361 
7362     // note that IGA will return false for any opcode it does not recognize
7363     // If your psuedo opcode needs to support saturation you must add explicit check before this
7364     return InstSupportsSaturationIGA(getPlatform(), *this, builder);
7365 }
7366 
canSupportCondMod() const7367 bool G4_INST::canSupportCondMod() const
7368 {
7369     if (!builder.hasCondModForTernary() && getNumSrc() == 3)
7370     {
7371         return false;
7372     }
7373 
7374     if (op == G4_mul)
7375     {
7376         // can't support conditional modifiers if source is DW and dst is not QW
7377         bool dwordSrc = false;
7378         for (int i = 0, numSrc = getNumSrc(); i < numSrc; ++i)
7379         {
7380             if (IS_DTYPE(getSrc(i)->getType()))
7381             {
7382                 dwordSrc = true;
7383                 break;
7384             }
7385         }
7386         if (dwordSrc && !IS_QTYPE(getDst()->getType()))
7387         {
7388             return false;
7389         }
7390         return true;
7391     }
7392     else if (op == G4_pseudo_mad)
7393     {
7394         // no cond mod for D * W
7395         G4_Operand* src0 = getSrc(0);
7396         G4_Operand* src1 = getSrc(1);
7397         if (IS_DTYPE(src0->getType()) || IS_DTYPE(src1->getType()))
7398         {
7399             return false;
7400         }
7401         return true;
7402     }
7403 
7404     if (op == G4_mov)
7405     {
7406         return dst->getType() != Type_BF && getSrc(0)->getType() != Type_BF;
7407     }
7408 
7409     // ToDo: replace with IGA model
7410     return ((op == G4_add) ||
7411         (op == G4_and) ||
7412         (op == G4_addc) ||
7413         (op == G4_asr) ||
7414         (op == G4_avg) ||
7415         (op == G4_dp2) ||
7416         (op == G4_dp3) ||
7417         (op == G4_dp4) ||
7418         (op == G4_dp4a) ||
7419         (op == G4_dph) ||
7420         (op == G4_dp4a) ||
7421         (op == G4_frc) ||
7422         (op == G4_line) ||
7423         (op == G4_lrp) ||
7424         (op == G4_lzd) ||
7425         (op == G4_mac) ||
7426         (op == G4_mach) ||
7427         (op == G4_mad) ||
7428         (op == G4_mov) ||
7429         (op == G4_mul) ||
7430         (op == G4_not) ||
7431         (op == G4_or) ||
7432         (op == G4_pln) ||
7433         (op == G4_rndd) ||
7434         (op == G4_rnde) ||
7435         (op == G4_rndu) ||
7436         (op == G4_rndz) ||
7437         (op == G4_sad2) ||
7438         (op == G4_sada2) ||
7439         (op == G4_shl) ||
7440         (op == G4_shr) ||
7441         (op == G4_subb) ||
7442         (op == G4_xor));
7443 }
7444 
canSupportSrcModifier() const7445 bool G4_INST::canSupportSrcModifier() const
7446 {
7447     if (opcode() == G4_mov)
7448     {
7449         if (getDst()->getType() == Type_BF)
7450         {
7451             return false;
7452         }
7453     }
7454 
7455     if (opcode() == G4_pseudo_mad)
7456     {
7457         return true;
7458     }
7459 
7460     // note that IGA will return false for any opcode it does not recognize
7461     // If your psuedo opcode needs to support source modifier you must add
7462     // explicit check before this
7463     return InstSupportsSrcModifierIGA(getPlatform(), *this, builder);
7464 }
7465 
7466 // convert (execsize, offset) into emask option
7467 // if no such mask option exists, return InstOpt_NoOpt
offsetToMask(int execSize,int offset,bool nibOk)7468 G4_InstOption G4_INST::offsetToMask(int execSize, int offset, bool nibOk)
7469 {
7470     switch (execSize)
7471     {
7472     case 32:
7473         return InstOpt_M0;
7474     case 16:
7475         switch (offset)
7476         {
7477         case 0:
7478             return InstOpt_M0;
7479         case 16:
7480             return InstOpt_M16;
7481         default:
7482             return InstOpt_NoOpt;
7483         }
7484     case 8:
7485         switch (offset)
7486         {
7487         case 0:
7488             return InstOpt_M0;
7489         case 8:
7490             return InstOpt_M8;
7491         case 16:
7492             return InstOpt_M16;
7493         case 24:
7494             return InstOpt_M24;
7495         default:
7496             return InstOpt_NoOpt;
7497         }
7498     case 4:
7499         if (nibOk)
7500         {
7501             switch (offset)
7502             {
7503             case 0:
7504                 return InstOpt_M0;
7505             case 4:
7506                 return InstOpt_M4;
7507             case 8:
7508                 return InstOpt_M8;
7509             case 12:
7510                 return InstOpt_M12;
7511             case 16:
7512                 return InstOpt_M16;
7513             case 20:
7514                 return InstOpt_M20;
7515             case 24:
7516                 return InstOpt_M24;
7517             case 28:
7518                 return InstOpt_M28;
7519             default:
7520                 return InstOpt_NoOpt;
7521             }
7522         }
7523         else
7524         {
7525             return InstOpt_NoOpt;
7526         }
7527     default:
7528         return InstOpt_NoOpt;
7529     }
7530 }
7531 
// Set the channel-enable write mask for this destination region.
void G4_DstRegRegion::setWriteMask(ChannelEnable wm)
{
    writeMask = wm;
}
7536 
// Copy the swizzle string into this source region's fixed-size buffer.
// Asserts that the string (plus its terminator) fits in max_swizzle bytes.
void G4_SrcRegRegion::setSwizzle(const char* sw)
{
    MUST_BE_TRUE((int)strlen(sw) <  max_swizzle, ERROR_INTERNAL_ARGUMENT);
    strcpy_s(swizzle, max_swizzle, sw);
}
7542 
7543 // convert contiguous regions to <N;N,1> form subject to the requirment
7544 // that width is not used to cross GRF
7545 // This is done because <1;1,0>/<2;2,1> require crossbar muxes and thus incur a performance penalty
7546 // This should only be called after RA when we know the actual subreg offset
void G4_SrcRegRegion::rewriteContiguousRegion(IR_Builder& builder, uint16_t opNum)
{
    int execSize = inst->getExecSize();
    // Only rewrite regions that are contiguous over the full exec size.
    if (execSize == 1 || !desc->isContiguous(execSize))
    {
        return;
    }
    uint32_t eltSize = getTypeSize();
    // Byte offset of the region's start within its GRF, and one past its
    // last byte; both feed the GRF-crossing checks below.
    uint32_t subRegOffset = getLinearizedStart() % numEltPerGRF<Type_UB>();
    uint32_t endOffset = subRegOffset + inst->getExecSize() * eltSize;

    bool isAlign1Ternary = builder.hasAlign1Ternary() && inst->getNumSrc() == 3;

    if (builder.doNotRewriteContiguousRegion())
    {
        // 2-src and 3-src src0/1: normalize region to <1;1,0>
        // 3-src src2: normalize region to <2;2,1> since it only supports horz stride
        setRegion(isAlign1Ternary && opNum == 2 ? builder.createRegionDesc(2, 2, 1) : builder.getRegionStride1(), true);
        return;
    }

    if (inst->getNumSrc() < 3)
    {
        // do <16;16,1> for HF/W if possible
        // NOTE(review): this tests the subRegOff field, not the linearized
        // subRegOffset computed above -- confirm that is intended.
        if (subRegOff == 0 && execSize == 16 && eltSize == 2)
        {
            setRegion(builder.createRegionDesc(16, 16, 1), true);
            return;
        }
    }

    // Find a width that does not cross GRF from <8;8,1>, <4;4,1>, to <2;2,1>
    // NOTE(review): the lambda's offset/eltSize parameters shadow the outer
    // locals; the body reads the captured subRegOffset, never the offset
    // parameter, so the parameters are effectively unused.
    auto getWidth = [=](unsigned offset, unsigned eltSize) -> unsigned
    {
        unsigned Widths[] = { 8, 4, 2 };
        for (auto w : Widths)
        {
            // Width must not exceed the exec size.
            if (w > inst->getExecSize())
                continue;

            if (w * eltSize > numEltPerGRF<Type_UB>())
            {
                // <8;8,1> is not allowed for 64-bit type
                continue;
            }

            // Accept w when the whole region stays in one GRF, or when the
            // start offset is aligned so no row of width w crosses a GRF.
            if (endOffset <= numEltPerGRF<Type_UB>() ||
                subRegOffset % (w * eltSize) == 0)
            {
                return w;
            }
        }

        // width >= 2 crosses GRF
        return 0;
    };

    unsigned short w = (unsigned short)getWidth(subRegOffset, eltSize);

    // Newer ternary encodings keep <1;1,0> for src0/src1 when only a
    // narrow (or no) width was found.
    if (builder.newTernaryStride() && isAlign1Ternary && (w == 2 || w == 0) && opNum != 2)
    {
        setRegion(builder.getRegionStride1(), true);
        return;
    }

    if (w)
    {
        setRegion(builder.createRegionDesc(w, w, 1), true);
    }
    else if (isAlign1Ternary)
    {
        // binary encoding asserts on <1;1,0> region for 3-src inst, so force change it to <2;2,1>
        setRegion(builder.createRegionDesc(2, 2, 1), true);
    }
}
7622 
getLiveIntervals(std::vector<std::pair<uint32_t,uint32_t>> & intervals)7623 void LiveIntervalInfo::getLiveIntervals(std::vector<std::pair<uint32_t, uint32_t>>& intervals)
7624 {
7625     for (auto&& it : liveIntervals)
7626     {
7627         intervals.push_back(it);
7628     }
7629 }
7630 
// Record the live interval [start, end] for this variable, merging it
// into the existing sorted list of disjoint intervals.
void LiveIntervalInfo::addLiveInterval(uint32_t start, uint32_t end)
{
    if (liveIntervals.size() == 0)
    {
        // First interval recorded.
        liveIntervals.emplace_back(start, end);
    }
    else if (start - liveIntervals.back().second <= 1)
    {
        // New interval starts at, inside-adjacent to, or one past the last
        // interval's end: extend it. (When start precedes that end, the
        // unsigned subtraction wraps to a large value and this test fails,
        // falling through to the cases below.)
        liveIntervals.back().second = end;
    }
    else if (liveIntervals.back().second < start)
    {
        // Strictly after every existing interval: append a new one.
        liveIntervals.emplace_back(start, end);
    }
    else if (liveIntervals.front().first >= start && liveIntervals.back().second <= end)
    {
        // New interval covers the entire existing list: collapse to one.
        liveIntervals.clear();
        liveIntervals.emplace_back(start, end);
    }
    else
    {
        // General case: the new interval lands somewhere in the middle.
        // Phase 1 (while !inserted): find the interval to extend, or the
        // position to insert at. Phase 2 (after inserted): absorb following
        // intervals that the grown interval now overlaps or touches.
        bool inserted = false;
        uint32_t newEnd = end;
        for (auto liveIt = liveIntervals.begin(); liveIt != liveIntervals.end();)
        {
            auto& lr = (*liveIt);

            if (!inserted)
            {
                if (lr.first <= start && lr.second >= newEnd)
                {
                    // Fully contained in an existing interval: nothing to do.
                    inserted = true;
                    break;
                }
                else if (lr.first <= start && lr.second > start && lr.second <= newEnd)
                {
                    // Extend existing sub-interval
                    lr.second = newEnd;
                    inserted = true;
                    ++liveIt;
                    continue;
                }
                else if ((start - lr.second) <= 1u)
                {
                    // start is at or immediately after this interval's end:
                    // extend it forward.
                    lr.second = newEnd;
                    inserted = true;
                    ++liveIt;
                    continue;
                }
                else if (lr.first > start)
                {
                    // Insert new sub-interval
                    liveIntervals.insert(liveIt, std::make_pair(start, newEnd));
                    inserted = true;
                    continue;
                }
            }
            else
            {
                if (lr.first > newEnd)
                    break;
                else if (lr.first == newEnd)
                {
                    // Back-to-back with the grown interval: fuse them.
                    newEnd = lr.second;
                    auto newLRIt = liveIt;
                    --newLRIt;
                    (*newLRIt).second = newEnd;
                    liveIt = liveIntervals.erase(liveIt);
                    continue;
                }
                else if (lr.second <= newEnd)
                {
                    // Entirely inside the grown interval: drop it.
                    liveIt = liveIntervals.erase(liveIt);
                    continue;
                }
                else if(lr.first < newEnd && lr.second > newEnd)
                {
                    // Partially overlaps the grown interval: adopt its end
                    // and drop it.
                    auto newLRIt = liveIt;
                    --newLRIt;
                    (*newLRIt).second = lr.second;
                    liveIntervals.erase(liveIt);
                    break;
                }
            }
            ++liveIt;
        }

        if (!inserted)
        {
            // Ran off the end without inserting: extend or append at tail.
            if (start - liveIntervals.back().second <= 1)
                liveIntervals.back().second = end;
            else
                liveIntervals.emplace_back(start, end);
        }
    }
}
7727 
// Mark cisaOff as live for this variable: extend an adjacent or containing
// interval, or insert a new single-point interval in sorted position.
void LiveIntervalInfo::liveAt(uint32_t cisaOff)
{
    // Offsets with no vISA mapping are not tracked.
    if (cisaOff == UNMAPPABLE_VISA_INDEX)
        return;

    // Now iterate over all intervals and check which one should
    // be extended. If none, start a new one.
    bool added = false;
    auto prev = liveIntervals.begin();

    for (auto it = liveIntervals.begin(), itEnd = liveIntervals.end();
        it != itEnd;
        prev = it++)
    {
        auto& item = (*it);

        if (added)
        {
            // The previous interval was just extended forward; if it now
            // touches (or abuts) this one, merge the two and stop.
            if (((*prev).second == item.first) ||
                ((*prev).second == item.first - 1))
            {
                prev->second = item.second;
                it = liveIntervals.erase(it);
                break;
            }
            else
            {
                break;
            }
        }

        // Offset immediately precedes this interval: grow it backward.
        if (item.first == cisaOff + 1)
        {
            item.first = cisaOff;
            added = true;
            break;
        }

        // Offset immediately follows this interval: grow it forward, and
        // keep iterating so a now-touching successor can be merged above.
        if (item.second == cisaOff - 1)
        {
            item.second = cisaOff;
            added = true;
            continue;
        }

        // Offset already covered by this interval: nothing to change.
        if (!added &&
            item.first <= cisaOff &&
            item.second >= cisaOff)
        {
            added = true;
            break;
        }

        // Passed the insertion point: add a new single-point interval here.
        if (item.first > cisaOff)
        {
            liveIntervals.insert(it, std::make_pair(cisaOff, cisaOff));
            added = true;
            break;
        }
    }

    if (!added)
    {
        // Offset lies beyond all existing intervals (or the list was empty).
        liveIntervals.push_back(std::make_pair(cisaOff, cisaOff));
    }
}
7795 
supportsNullDst() const7796 bool G4_INST::supportsNullDst() const
7797 {
7798     if (isSend())
7799     {
7800         return true;
7801     }
7802     if (builder.getPlatform() >= GENX_PVC && dst->getTypeSize() == 1)
7803     {
7804         // null:b not supported
7805         return false;
7806     }
7807     return getNumSrc() != 3 && !(op == G4_pln && !builder.doPlane());
7808 }
7809 
// True if this is a 3-src instruction eligible for Align1 ternary encoding:
// the platform supports it and the instruction does not exceed two GRFs.
bool G4_INST::isAlign1Ternary() const
{
    return builder.hasAlign1Ternary() && getNumSrc() == 3 && !mayExceedTwoGRF();
}
7814 
7815 // Detect packed low-precision instruction. This is used by the scheduler.
7816 // - all src and dst are GRF and of :hf type and "packed".
7817 // (src is also packed when it is replicated scalar).
7818 // Two cases are possible:
7819 // 1)   add (16)    r1.0<1>:hf   r2.0<8;8,1>:hf   r3.0<8;8,1>:hf    { Align1, H1 }
7820 // 2)   add (16)    r1.0<1>:hf   r2.0<8;8,1>:hf   r3.0<0;1,0>:hf    { Align1, H1 }
isFastHFInstruction(void) const7821 bool G4_INST::isFastHFInstruction(void) const {
7822     if (getExecSize() < g4::SIMD16) {
7823         return false;
7824     }
7825     bool isHF = false;
7826     for (int op_i = 0, op_e = getNumSrc(); op_i < op_e; ++op_i) {
7827         G4_Operand *opnd = getSrc(op_i);
7828         if (! opnd) {
7829             continue;
7830         }
7831         if (!IS_HFTYPE(opnd->getType())) {
7832             return false;
7833         }
7834         if (opnd->isSrcRegRegion()) {
7835             G4_SrcRegRegion *srcRgn = opnd->asSrcRegRegion();
7836             if (! srcRgn->getRegion()->isContiguous(getExecSize())) {
7837                 return false;
7838             }
7839         }
7840         isHF = true;
7841     }
7842     return isHF;
7843 }
7844 
7845 
// Clear this declare's physical register assignment so the register
// allocator can run again, unless the declare must keep its assignment
// (inputs, architectural registers, builtins, stack-call and stack/frame
// bookkeeping declares).
void G4_Declare::prepareForRealloc(G4_Kernel* kernel)
{
    // Reset allocated register if this dcl is not an input
    // or a pre-defined variable.
    auto& builder = kernel->fg.builder;

    setGRFBaseOffset(0);

    // Skip: kernel inputs, declares already unassigned, architectural
    // registers, every builtin declare, stack-call arg/ret declares, and
    // the scratch/stack/frame pointer declares owned by the flow graph.
    if (getRegFile() != G4_RegFileKind::G4_INPUT &&
        getRegVar()->isPhyRegAssigned() &&
        !getRegVar()->isAreg() &&
        this != builder->getBuiltinA0() &&
        this != builder->getBuiltinR0() &&
        this != builder->getBuiltinA0Dot2() &&
        this != builder->getBuiltinBindlessSampler() &&
        this != builder->getBuiltinHWTID() &&
        this != builder->getBuiltinT252() &&
        this != builder->getStackCallRet() &&
        this != builder->getStackCallArg() &&
        this != builder->getBEFP() &&
        this != builder->getBESP() &&
        this != kernel->fg.getScratchRegDcl() &&
        this != kernel->fg.getStackPtrDcl() &&
        this != kernel->fg.getFramePtrDcl())
    {
        // Drop the physical register and mark the displacement unassigned.
        getRegVar()->resetPhyReg();
        getRegVar()->setDisp(UINT_MAX);
    }
}
7875 
mayExpandToAccMacro() const7876 bool G4_INST::mayExpandToAccMacro() const
7877 {
7878     auto isDMul = [](const G4_INST *Inst) {
7879         return Inst->opcode() == G4_mul && (IS_QTYPE(Inst->getDst()->getType()) ||
7880                                             (IS_DTYPE(Inst->getSrc(0)->getType()) &&
7881                                              IS_DTYPE(Inst->getSrc(1)->getType())));
7882     };
7883 
7884     auto mayBeMAC = [&](const G4_INST *Inst) {
7885        if (Inst->opcode() != G4_pseudo_mad)
7886            return false;
7887        if (IS_TYPE_FLOAT_ALL(Inst->getDst()->getType()) &&
7888            builder.getOption(vISA_forceFPMAD))
7889            return false;
7890        return true;
7891     };
7892 
7893     return opcode() == G4_mach ||
7894            opcode() == G4_mulh ||
7895            opcode() == G4_madw ||
7896            isDMul(this) ||
7897            mayBeMAC(this) ||
7898            (opcode() == G4_pln && !builder.doPlane());
7899 }
7900 
canExecSizeBeAcc(Gen4_Operand_Number opndNum) const7901 bool G4_INST::canExecSizeBeAcc(Gen4_Operand_Number opndNum) const
7902 {
7903     switch (dst->getType())
7904     {
7905     case Type_HF:
7906     case Type_BF:
7907         if (builder.relaxedACCRestrictions())
7908         {
7909             if (!((isMixedMode() && getExecSize() == g4::SIMD8) ||
7910                 (getExecSize() == g4::SIMD16)))
7911             {
7912                 return false;
7913             }
7914         }
7915         else
7916         {
7917             if (getExecSize() != G4_ExecSize(builder.getNativeExecSize() * 2))
7918             {
7919                 return false;
7920             }
7921         }
7922         break;
7923     case Type_W:
7924     case Type_UW:
7925         if (getExecSize() != G4_ExecSize(builder.getNativeExecSize() * 2))
7926         {
7927             return false;
7928         }
7929         break;
7930     case Type_F:
7931         if (getExecSize() != G4_ExecSize(builder.getNativeExecSize() * 2) &&
7932             getExecSize() != builder.getNativeExecSize())
7933         {
7934             return false;
7935         }
7936         break;
7937     case Type_DF:
7938         if (!builder.useAccForDF())
7939         {
7940             return false;
7941         }
7942         if (getExecSize() != builder.getNativeExecSize() &&
7943             getExecSize() != G4_ExecSize(builder.getNativeExecSize() / 2))
7944         {
7945             return false;
7946         }
7947         break;
7948     case Type_D:
7949     case Type_UD:
7950         if (getExecSize() != builder.getNativeExecSize())
7951         {
7952             return false;
7953         }
7954         if (opndNum != Opnd_dst && isSignSensitive(opndNum))
7955         {
7956             return false;
7957         }
7958         break;
7959     default:
7960         return false;
7961     }
7962 
7963     return true;
7964 }
7965 
// returns true if dst may be replaced by an explicit acc
// in addition to opcode-specific checks, we require
// -- dst must be GRF
// -- contiguous regions
// -- simd8 for D/UD, simd8/16 for F, simd16 for HF/W, other types not allowed
bool G4_INST::canDstBeAcc() const
{
    if (mayExpandToAccMacro())
    {
        // while this should not prevent dst from becoming acc (mul/plane macros use
        // acc as temp so should not affect final dst), later HW conformity is not equipped
        // to deal with such code so we disable the substitution
        return false;
    }

    // dst must exist, belong to a user variable, and be packed
    // (horizontal stride 1).
    if (dst == nullptr || dst->getTopDcl() == nullptr || dst->getHorzStride() != 1)
    {
        return false;
    }

    if (dst->getTopDcl()->getRegFile() != G4_GRF)
    {
        return false;
    }

    // DF destination requires FP64 accumulator support.
    if (!builder.hasFP64Acc() && dst->getType() == Type_DF)
    {
        return false;
    }

    // src0 may not have indirect regioning
    if (!builder.accDstforIndirectSrc() && getSrc(0) && getSrc(0)->isSrcRegRegion())
    {
        auto src0Region = getSrc(0)->asSrcRegRegion();
        if (src0Region->getRegAccess() == IndirGRF)
        {
            return false;
        }
    }

    // The exec size must be compatible with acc for the dst type.
    if (!canExecSizeBeAcc(Opnd_dst))
    {
        return false;
    }

    // Saturated integer results cannot target acc.
    if (getSaturate() && IS_INT(dst->getType()))
    {
        return false;
    }

    if (!builder.relaxedACCRestrictions())
    {
        if (dst->getType() == builder.getMixModeType() && isMixedMode())
        {
            // acc can't be used as packed f16 for mix mode instruction as it doesn't support regioning
            return false;
        }
    }

    if (builder.avoidAccDstWithIndirectSource())
    {
        // On such platforms any indirect source disqualifies an acc dst.
        for (int i = 0, numSrc = getNumSrc(); i < numSrc; ++i)
        {
            bool indirectSrc = getSrc(i) && getSrc(i)->isSrcRegRegion() &&
                getSrc(i)->asSrcRegRegion()->getRegAccess() != Direct;
            if (indirectSrc)
            {
                return false;
            }
        }
    }

    if (isMath())
    {
        return builder.hasMathAcc();
    }

    // Opcode-specific rules.
    switch (opcode())
    {
    case G4_add:
    case G4_and:
    case G4_asr:
    case G4_avg:
    case G4_frc:
    case G4_lzd:
    case G4_mul:
    case G4_not:
    case G4_or:
    case G4_rndd:
    case G4_rnde:
    case G4_rndu:
    case G4_rndz:
    case G4_shr:
    case G4_smov:
    case G4_xor:
    case G4_rol:
    case G4_ror:
        return true;
    case G4_sel:
        // sel seems to fail with int acc for some strange reason (sign extension?)
        return getCondMod() ? IS_TYPE_FLOAT_ALL(dst->getType()) : true;
    case G4_cmp:
    case G4_cmpn:
        // disable for now since it's causing some SKL tests to fail
        return false;
    case G4_mov:
        if (builder.hasFormatConversionACCRestrictions())
        {
            // Only like-kind conversions (int<->int, F/HF<->F/HF, DF<->DF)
            // may go through acc on these platforms.
            const bool allowedICombination = (IS_DTYPE(getSrc(0)->getType()) || getSrc(0)->getType() == Type_W || getSrc(0)->getType() == Type_UW) &&
                (IS_DTYPE(dst->getType()) || dst->getType() == Type_W || dst->getType() == Type_UW);
            const bool allowedFCombination = (getSrc(0)->getType() == Type_F || getSrc(0)->getType() == Type_HF) &&
                (dst->getType() == Type_F || dst->getType() == Type_HF);
            const bool allowedDFCombination = getSrc(0)->getType() == Type_DF &&
                dst->getType() == Type_DF;

            if (builder.restrictedACCRestrictions() && allowedFCombination)
            {
                uint16_t dstStride = dst->getHorzStride();
                uint16_t srcStride = 0;
                if (getSrc(0)->isSrcRegRegion())
                {
                    G4_SrcRegRegion* src = getSrc(0)->asSrcRegRegion();
                    const RegionDesc* region = src->getRegion();

                    if (!region->isSingleStride(execSize, srcStride))
                    {
                        return false;
                    }

                    //The bitmapping is model by the element size * element stride.
                    // No matter dst is float or half float.
                    // Pack and un-pack happen only in the destination register, so no matter dst is F or HF, it's not allowed to be replaced with ACC if bitmapping swizzles.
                    // If both dst and src are HF type, swizzle is not allowed as well.
                    //FIXME, mov (16|M0)   acc0.0<1>:f, r28<1;1,0>:hf can be passed in HW.
                    if ((dst->getType() != src->getType() || dst->getType() == Type_HF) &&
                        (dstStride * dst->getTypeSize() != srcStride * src->getTypeSize()))
                    {
                        return false;
                    }
                }
            }

            if (!allowedICombination && !allowedFCombination && !allowedDFCombination)
            {
                return false;
            }
        }
        return builder.relaxedACCRestrictions() || !getSrc(0)->isAccReg();
    case G4_pln:
        // we can't use acc if plane will be expanded
        return builder.doPlane();
    case G4_madm:
        return builder.useAccForMadm();
    case G4_mad:
    case G4_csel:
        return builder.canMadHaveAcc();
    case G4_dp4a:
        return builder.relaxedACCRestrictions2();
    case G4_bfn:
    case G4_add3:
        return true;
    default:
        return false;
    }
}
8130 
8131 // returns true if src0 may be replaced by an explicit acc
8132 // in addition to opcode-specific checks, we require
8133 // -- contiguous regions
8134 // -- simd8 for D/UD, simd8/16 for F, simd16 for HF/W, other types not allowed
bool G4_INST::canSrcBeAccBeforeHWConform(Gen4_Operand_Number opndNum) const
{
    int srcId = getSrcNum(opndNum);
    assert((srcId == 0 || srcId == 1 || srcId == 2) && "must be either src0, src1 or src2");

    // src2 may only be replaced by acc on platforms with the relaxed restriction.
    if (!builder.relaxedACCRestrictions3() && srcId == 2)
    {
        return false;
    }

    // Only source register regions are candidates for acc substitution.
    if (getSrc(srcId) == nullptr || !getSrc(srcId)->isSrcRegRegion())
    {
        return false;
    }

    // Instructions that may later expand into an acc-using macro must keep
    // the architectural acc free for that expansion.
    if (mayExpandToAccMacro())
    {
        return false;
    }

    G4_SrcRegRegion* src = getSrc(srcId)->asSrcRegRegion();
    if (srcId == 1 && src->hasModifier())
    {
        // some platforms allow float src1 acc modifiers,
        // while some don't allow src1 acc modifier at all.
        if (!IS_TYPE_FLOAT_ALL(src->getType()) || !builder.relaxedACCRestrictions())
        {
            return false;
        }
    }
    // acc substitution requires a contiguous source region.
    if (!src->getRegion()->isContiguous(getExecSize()))
    {
        return false;
    }

    // Relaxed-ACC platforms disallow low-precision float acc sources in
    // mixed-mode instructions.
    if (builder.relaxedACCRestrictions() &&
        isMixedMode() &&
        isLowPrecisionFloatTy(src->getType()))
    {
        return false;
    }

    // The execution size must be one acc can cover for this operand.
    if (!canExecSizeBeAcc(opndNum))
    {
        return false;
    }

    if (opcode() == G4_mad && srcId == 0 &&
        !builder.canMadHaveSrc0Acc())
    {
        // mac's implicit acc gets its region from dst, so we have to check src and
        // dst have the same type
        if (src->getType() != dst->getType())
        {
            return false;
        }
    }

    if (IS_TYPE_FLOAT_ALL(src->getType()) ^ IS_TYPE_FLOAT_ALL(getDst()->getType()))
    {
        // no float <-> int conversion for acc source
        return false;
    }

    // Math instructions may take an acc source only where HW supports it.
    if (isMath())
    {
        return builder.hasMathAcc();
    }
    // Remaining checks are opcode-specific.
    switch (opcode())
    {
    case G4_add:
    case G4_asr:
    case G4_avg:
    case G4_cmp:
    case G4_cmpn:
    case G4_frc:
    case G4_lzd:
    case G4_rndd:
    case G4_rnde:
    case G4_rndu:
    case G4_rndz:
    case G4_sel:
    case G4_shl:
    case G4_shr:
    case G4_smov:
    case G4_rol:
    case G4_ror:
        return true;
    case G4_mov:
        if (builder.hasFormatConversionACCRestrictions())
        {
            // Only int<->int (D/W), F/HF<->F/HF, and DF<->DF moves may use acc.
            const bool allowedICombination = (IS_DTYPE(src->getType()) || src->getType() == Type_W || src->getType() == Type_UW) &&
                (IS_DTYPE(dst->getType()) || dst->getType() == Type_W || dst->getType() == Type_UW);
            const bool allowedFCombination = (src->getType() == Type_F || src->getType() == Type_HF) &&
                (dst->getType() == Type_F || dst->getType() == Type_HF);
            const bool allowedDFCombination = src->getType() == Type_DF &&
                dst->getType() == Type_DF;

            if (builder.restrictedACCRestrictions() && allowedFCombination)
            {
                uint16_t dstStride = dst->getHorzStride();
                uint16_t srcStride = 0;
                const RegionDesc* region = src->getRegion();

                // Source must have a single uniform stride to compare layouts.
                if (!region->isSingleStride(execSize, srcStride))
                {
                    return false;
                }

                //The bitmapping is model by the element size * element stride.
                //When dst type is different with src type, or both are HF type.
                //FIXME, currently, r35 in following case cannot be replaced with acc
                // mov (16|M0) r35.0<1>:f r25.0<1;1,0>:f
                // mov (16|M0) r36.0<1>:hf r35.0<1;1,0>:f
                // the restriction may be relaxed after validation of HW team
                if ((dst->getType() != src->getType() || dst->getType() == Type_HF) &&
                    (dstStride * dst->getTypeSize() != srcStride * src->getTypeSize()))
                {
                    return false;
                }
            }

            if (!allowedICombination && !allowedFCombination && !allowedDFCombination)
            {
                return false;
            }
        }
        // acc-to-acc mov is only allowed on relaxed-ACC platforms.
        return builder.relaxedACCRestrictions() || !getDst()->isAccReg();
    case G4_madm:
        return builder.useAccForMadm();
    case G4_mad:
        // no int acc if it's used as mul operand
        return builder.canMadHaveAcc() &&
            ((srcId == 1 && (IS_FTYPE(src->getType()) || (src->getType() == Type_DF))) ||
                (srcId == 0 && src->getModifier() == Mod_src_undef) ||
                (srcId == 0 && builder.relaxedACCRestrictions_1()) ||
                (srcId == 2 && (IS_FTYPE(src->getType()) || (src->getType() == Type_DF))));
    case G4_csel:
        return builder.canMadHaveAcc();
    case G4_mul:
        return IS_TYPE_FLOAT_ALL(src->getType());
    case G4_and:
    case G4_not:
    case G4_or:
    case G4_xor:
        // logic ops: acc allowed only without a source modifier
        return src->getModifier() == Mod_src_undef;
    case G4_pln:
        return builder.doPlane() && src->getModifier() == Mod_src_undef;
    case G4_dp4a:
        if (builder.restrictedACCRestrictions())
        {
            return srcId == 0;
        }
        return builder.relaxedACCRestrictions2();
    case G4_bfn:
    case G4_add3:
        return true;
    default:
        return false;
    }
}
8296 
canSrcBeAccAfterHWConform(Gen4_Operand_Number opndNum) const8297 bool G4_INST::canSrcBeAccAfterHWConform(Gen4_Operand_Number opndNum) const
8298 {
8299     int srcId = getSrcNum(opndNum);
8300     G4_SrcRegRegion* src = getSrc(srcId)->asSrcRegRegion();
8301 
8302     // dst must be GRF-aligned
8303     if ((getDst()->getLinearizedStart() % numEltPerGRF<Type_UB>()) != 0)
8304     {
8305         if (!(isMixedMode() && builder.getPlatform() == XeHP_SDV))
8306             return false;
8307     }
8308 
8309     // check that src0 and dst have the same type/alignment
8310     auto dstEltSize = getDst()->getHorzStride() * getDst()->getTypeSize();
8311     if (dstEltSize > TypeSize(src->getType()))
8312     {
8313         return false;
8314     }
8315     else if (isLowPrecisionFloatTy(getDst()->getType()) && src->getType() == Type_F &&
8316         dstEltSize == 2)
8317     {
8318         if (builder.relaxedACCRestrictions())
8319         {
8320             //When source is float or half float from accumulator register and destination is half float with a stride of 1,
8321             //the source must register aligned. i.e., source must have offset zero.
8322             if ((src->getLinearizedStart() % numEltPerGRF<Type_UB>()) != 0)
8323             {
8324                 return false;
8325             }
8326         }
8327         else
8328         {
8329             // no acc for mix mode inst with packed HF dst
8330             return false;
8331         }
8332     }
8333 
8334     return true;
8335 }
8336 
canSrcBeAcc(Gen4_Operand_Number opndNum) const8337 bool G4_INST::canSrcBeAcc(Gen4_Operand_Number opndNum) const
8338 {
8339     return canSrcBeAccBeforeHWConform(opndNum) && canSrcBeAccAfterHWConform(opndNum);
8340 }
8341 
getPlatform() const8342 TARGET_PLATFORM G4_INST::getPlatform() const
8343 {
8344     return builder.getPlatform();
8345 }
8346 
cloneInst()8347 G4_INST* G4_INST::cloneInst()
8348 {
8349     // return nullptr if new derived class hasnt implemented
8350     // its own cloneInst()
8351     if (!isBaseInst() && !isCFInst())
8352         return nullptr;
8353 
8354     // Return a clone of current instruction.
8355     // This functionality is expected to be used by optimizations
8356     // such as rematerialization that require creating a copy
8357     // of instructions.
8358     auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8359     G4_INST* newInst = nullptr;
8360     auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8361     auto condMod = nonConstBuilder->duplicateOperand(getCondMod());
8362     auto dst = nonConstBuilder->duplicateOperand(getDst());
8363     auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8364     auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8365     auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));
8366     auto accSrc = nonConstBuilder->duplicateOperand(getImplAccSrc());
8367     auto accDst = nonConstBuilder->duplicateOperand(getImplAccDst());
8368 
8369     if (isSend())
8370     {
8371         MUST_BE_TRUE(false, "cloning send not yet supported");
8372     }
8373     else
8374     {
8375         newInst = nonConstBuilder->createInternalInst(prd, op, condMod, getSaturate(), getExecSize(),
8376             dst, src0, src1, option);
8377 
8378         if (src2)
8379             newInst->setSrc(src2, 2);
8380 
8381         if (accSrc)
8382             newInst->setImplAccSrc(accSrc);
8383 
8384         if (accDst)
8385             newInst->setImplAccDst(accDst);
8386     }
8387 
8388     return newInst;
8389 }
8390 
cloneInst()8391 G4_INST* G4_InstSend::cloneInst()
8392 {
8393     auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8394     G4_INST* newInst = nullptr;
8395     auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8396     auto dst = nonConstBuilder->duplicateOperand(getDst());
8397     auto src0 = nonConstBuilder->duplicateOperand(getSrc(0))->asSrcRegRegion();
8398 
8399     if (isSplitSend())
8400     {
8401         // desc -> src2, extDesc -> src3
8402         auto src1 = nonConstBuilder->duplicateOperand(getSrc(1))->asSrcRegRegion();
8403         auto desc = nonConstBuilder->duplicateOperand(getSrc(2));
8404         auto extDesc = nonConstBuilder->duplicateOperand(getSrc(3));
8405         newInst = nonConstBuilder->createInternalSplitSendInst(getExecSize(), dst, src0, src1, desc,
8406             getOption(), getMsgDescRaw(), extDesc);
8407         if (prd)
8408         {
8409             newInst->setPredicate(prd);
8410         }
8411     }
8412     else
8413     {
8414         auto desc = nonConstBuilder->duplicateOperand(getSrc(1));
8415         // desc -> src1, no extDesc (must be imm and stored in SendMsgDesc)
8416         newInst = nonConstBuilder->createInternalSendInst(prd, op, getExecSize(),
8417             dst, src0, desc, getOption(), getMsgDesc());
8418     }
8419 
8420     return newInst;
8421 }
8422 
G4_InstIntrinsic(const IR_Builder & builder,G4_Predicate * prd,Intrinsic intrinId,G4_ExecSize execSize,G4_DstRegRegion * d,G4_Operand * s0,G4_Operand * s1,G4_Operand * s2,G4_Operand * s3,G4_Operand * s4,G4_Operand * s5,G4_Operand * s6,G4_Operand * s7,G4_InstOpts opt)8423 G4_InstIntrinsic::G4_InstIntrinsic(
8424     const IR_Builder& builder,
8425     G4_Predicate* prd,
8426     Intrinsic intrinId,
8427     G4_ExecSize execSize,
8428     G4_DstRegRegion* d,
8429     G4_Operand* s0,
8430     G4_Operand* s1,
8431     G4_Operand* s2,
8432     G4_Operand* s3,
8433     G4_Operand* s4,
8434     G4_Operand* s5,
8435     G4_Operand* s6,
8436     G4_Operand* s7,
8437     G4_InstOpts opt) :
8438     G4_INST(builder, prd, G4_intrinsic, nullptr, g4::NOSAT, execSize, d, nullptr, nullptr, nullptr, opt),
8439     intrinsicId(intrinId), tmpGRFStart(-1), tmpAddrStart(-1), tmpFlagStart(-1)
8440 {
8441     srcs[0] = s0;
8442     srcs[1] = s1;
8443     srcs[2] = s2;
8444     srcs[3] = s3;
8445     srcs[4] = s4;
8446     srcs[5] = s5;
8447     srcs[6] = s6;
8448     srcs[7] = s7;
8449 
8450     resetRightBound(s0);
8451     resetRightBound(s1);
8452     resetRightBound(s2);
8453     resetRightBound(s3);
8454     resetRightBound(s4);
8455     resetRightBound(s5);
8456     resetRightBound(s6);
8457     resetRightBound(s7);
8458 
8459     associateOpndWithInst(s0, this);
8460     associateOpndWithInst(s1, this);
8461     associateOpndWithInst(s2, this);
8462     associateOpndWithInst(s3, this);
8463     associateOpndWithInst(s4, this);
8464     associateOpndWithInst(s5, this);
8465     associateOpndWithInst(s6, this);
8466     associateOpndWithInst(s7, this);
8467 }
8468 
getIntrinsicSrc(unsigned i) const8469 G4_Operand* G4_InstIntrinsic::getIntrinsicSrc(unsigned i) const
8470 {
8471     MUST_BE_TRUE(i < G4_MAX_INTRINSIC_SRCS, ERROR_INTERNAL_ARGUMENT);
8472     return srcs[i];
8473 }
8474 
getOperand(Gen4_Operand_Number opnd_num) const8475 G4_Operand* G4_InstIntrinsic::getOperand(Gen4_Operand_Number opnd_num) const
8476 {
8477     switch (opnd_num) {
8478     case Opnd_src0: return srcs[0];
8479     case Opnd_src1: return srcs[1];
8480     case Opnd_src2: return srcs[2];
8481     case Opnd_src3: return srcs[3];
8482     case Opnd_src4: return srcs[4];
8483     case Opnd_src5: return srcs[5];
8484     case Opnd_src6: return srcs[6];
8485     case Opnd_src7: return srcs[7];
8486     default:
8487         MUST_BE_TRUE(0, "Operand number is out of range.");
8488         break;
8489     }
8490     return NULL;
8491 }
8492 
setIntrinsicSrc(G4_Operand * opnd,unsigned i)8493 void G4_InstIntrinsic::setIntrinsicSrc(G4_Operand* opnd, unsigned i)
8494 {
8495     MUST_BE_TRUE(i < G4_MAX_INTRINSIC_SRCS, ERROR_INTERNAL_ARGUMENT);
8496 
8497     if (srcs[i] != NULL)
8498     {
8499         if (srcs[i]->getInst() == (G4_INST *)this)
8500         {
8501             srcs[i]->setInst(NULL);
8502         }
8503     }
8504     srcs[i] = opnd;
8505 
8506     associateOpndWithInst(opnd, (G4_INST*)this);
8507     resetRightBound(opnd);
8508 }
8509 
cloneInst()8510 G4_INST* G4_InstIntrinsic::cloneInst()
8511 {
8512     auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8513     auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8514     auto dst = nonConstBuilder->duplicateOperand(getDst());
8515     auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8516     auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8517     auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));
8518 
8519     return nonConstBuilder->createInternalIntrinsicInst(prd, getIntrinsicId(), getExecSize(), dst,
8520         src0, src1, src2, option);
8521 }
8522 
isLegal(unsigned vs,unsigned w,unsigned hs)8523 bool RegionDesc::isLegal(unsigned vs, unsigned w, unsigned hs)
8524 {
8525     auto isPositiveAndLegal = [](unsigned val, unsigned high) {
8526         if (val == UNDEFINED_SHORT)
8527             return true;
8528         if (val > high || val == 0)
8529             return false;
8530         return ((val - 1) & val) == 0;
8531     };
8532     return isPositiveAndLegal(w, 16) &&
8533             (vs == 0 || isPositiveAndLegal(vs, 32)) &&
8534             (hs == 0 || isPositiveAndLegal(hs, 16));
8535 }
8536 
getRegionDescKind(uint16_t size,uint16_t vstride,uint16_t width,uint16_t hstride)8537 RegionDesc::RegionDescKind RegionDesc::getRegionDescKind(
8538     uint16_t size, uint16_t vstride,
8539     uint16_t width, uint16_t hstride)
8540 {
8541     // Skip special cases.
8542     if (vstride == UNDEFINED_SHORT || width == UNDEFINED_SHORT ||
8543         hstride == UNDEFINED_SHORT)
8544         return RK_Other;
8545 
8546     // <0;1,0>
8547     if (size == 1 || (vstride == 0 && hstride == 0) ||
8548         (vstride == 0 && width == 1))
8549         return RK_Stride0;
8550 
8551     // <1;1,0>
8552     if ((vstride == 1 && width == 1) || (size <= width && hstride == 1) ||
8553         (vstride == width && hstride == 1))
8554         return RK_Stride1;
8555 
8556     // <N;1,0>
8557     uint16_t stride = 0;
8558     if (vstride == width * hstride || width == size)
8559     {
8560         stride = hstride;
8561     }
8562     else if (width == 1 && hstride == 0)
8563     {
8564         stride = vstride;
8565     }
8566 
8567     return (stride == 2) ? RK_Stride2 : (stride == 4) ? RK_Stride4
8568                                                       : RK_Other;
8569 }
8570 
isContiguous(unsigned ExSize) const8571 bool RegionDesc::isContiguous(unsigned ExSize) const
8572 {
8573     if (vertStride == 1 && width == 1)
8574         return true;
8575     if (vertStride == width && horzStride == 1)
8576         return true;
8577 
8578     return (ExSize == 1) ||
8579             (ExSize <= (unsigned)width && horzStride == 1);
8580 }
isSingleNonUnitStride(uint32_t execSize,uint16_t & stride) const8581 bool RegionDesc::isSingleNonUnitStride(uint32_t execSize, uint16_t& stride) const
8582 {
8583     if (isScalar() || isContiguous(execSize))
8584     {
8585         return false;
8586     }
8587 
8588     if (vertStride == width * horzStride || width == execSize)
8589     {
8590         stride = horzStride;
8591         return true;
8592     }
8593 
8594     if (horzStride == 0 && width == 1)
8595     {
8596         stride = vertStride;
8597         return true;
8598     }
8599 
8600     return false;
8601 }
8602 
isSingleStride(uint32_t execSize,uint16_t & stride) const8603 bool RegionDesc::isSingleStride(uint32_t execSize, uint16_t &stride) const
8604 {
8605     if (isScalar())
8606     {
8607         stride = 0;
8608         return true;
8609     }
8610     if (isContiguous(execSize))
8611     {
8612         stride = 1;
8613         return true;
8614     }
8615 
8616     return isSingleNonUnitStride(execSize, stride);
8617 }
8618 
cloneInst()8619 G4_INST* G4_InstMath::cloneInst()
8620 {
8621     auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8622     auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8623     auto dst = nonConstBuilder->duplicateOperand(getDst());
8624     auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8625     auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8626 
8627     return nonConstBuilder->createInternalMathInst(
8628         prd, getSaturate(), getExecSize(),
8629         dst, src0, src1, getMathCtrl(), option);
8630 }
8631 
cloneInst()8632 G4_INST* G4_InstBfn::cloneInst()
8633 {
8634     auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8635     auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8636     auto condMod = nonConstBuilder->duplicateOperand(getCondMod());
8637     auto dst = nonConstBuilder->duplicateOperand(getDst());
8638     auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8639     auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8640     auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));
8641     return nonConstBuilder->createInternalBfnInst(
8642         getBooleanFuncCtrl(), prd, condMod, getSaturate(), getExecSize(),
8643         dst, src0, src1, src2, option);
8644 }
8645 
cloneInst()8646 G4_INST* G4_InstDpas::cloneInst()
8647 {
8648     auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8649     auto dst = nonConstBuilder->duplicateOperand(getDst());
8650     auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8651     auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8652     auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));
8653     auto src3 = nonConstBuilder->duplicateOperand(getSrc(3));
8654     return nonConstBuilder->createInternalDpasInst(
8655         op, getExecSize(),
8656         dst, src0, src1, src2, src3, option,
8657         getSrc2Precision(), getSrc1Precision(), getSystolicDepth(), getRepeatCount());
8658 }
8659 
isInt() const8660 bool G4_InstDpas::isInt() const
8661 {
8662     // Check Src1 is enough.
8663     switch (Src1Precision)
8664     {
8665     case GenPrecision::S8:
8666     case GenPrecision::U8:
8667     case GenPrecision::S4:
8668     case GenPrecision::U4:
8669     case GenPrecision::S2:
8670     case GenPrecision::U2:
8671         return true;
8672     default:
8673         break;
8674     }
8675     return false;
8676 }
8677 
is2xInt8() const8678 bool G4_InstDpas::is2xInt8() const
8679 {
8680     if ((Src1Precision == GenPrecision::S4 || Src1Precision == GenPrecision::U4 ||
8681          Src1Precision == GenPrecision::S2 || Src1Precision == GenPrecision::U2)
8682         &&
8683         (Src2Precision == GenPrecision::S4 || Src2Precision == GenPrecision::U4 ||
8684          Src2Precision == GenPrecision::S2 || Src2Precision == GenPrecision::U2))
8685     {
8686         return true;
8687     }
8688     return false;
8689 }
8690 
getOpsPerChan() const8691 uint8_t G4_InstDpas::getOpsPerChan() const
8692 {
8693     if (isBF16() || isFP16())
8694         return OPS_PER_CHAN_2;
8695     else if (isTF32())
8696         return OPS_PER_CHAN_1;
8697     else if (isBF8())
8698         return OPS_PER_CHAN_4;
8699     else if (is2xInt8())
8700         return OPS_PER_CHAN_8;
8701     // int8
8702     return OPS_PER_CHAN_4;
8703 }
8704 
// Compute the right bound (and footprint bit-vector) of a dpas operand.
// dpas operand sizes are determined by the systolic depth, repeat count,
// and per-operand precision rather than by the generic region rules.
void G4_InstDpas::computeRightBound(G4_Operand* opnd)
{
    associateOpndWithInst(opnd, this);
    if (opnd && !opnd->isImm() && !opnd->isNullReg())
    {
        G4_InstDpas* dpasInst = asDpasInst();
        uint8_t D = dpasInst->getSystolicDepth();   // systolic depth
        uint8_t C = dpasInst->getRepeatCount();     // repeat count

        // Record the byte span [leftBound, rightBound] on the operand.
        auto computeDpasOperandBound = [](G4_Operand* opnd, unsigned leftBound, unsigned rightBound)
        {
            unsigned NBytes = rightBound - leftBound + 1;
            opnd->setBitVecFromSize(NBytes);
            opnd->setRightBound(rightBound);
        };

        if (opnd == dst || (opnd == srcs[0] && !opnd->isNullReg()))
        {
            // dst and src0 are always packed, and RB is exec_size * type_size * rep_count
            auto opndSize = builder.getNativeExecSize() * opnd->getTypeSize() * C;
            computeDpasOperandBound(opnd, opnd->left_bound, opnd->left_bound + opndSize - 1);
        }
        else if (opnd == srcs[1])
        {
            uint32_t bytesPerLane = dpasInst->getSrc1SizePerLaneInByte();
            uint8_t src1_D = D;

            // Each lanes needs (src1_D * bytesPerLane) bytes, and it's multiple of DW!
            uint32_t bytesPerLaneForAllDepth = bytesPerLane * src1_D;
            bytesPerLaneForAllDepth = ((bytesPerLaneForAllDepth + 3) / 4) * 4;

            uint32_t bytes = bytesPerLaneForAllDepth * builder.getNativeExecSize();
            computeDpasOperandBound(opnd, opnd->left_bound, opnd->left_bound + bytes - 1);
        }
        else if (opnd == srcs[2])
        {
            // src2 is uniform.
            uint32_t bytesPerLane = dpasInst->getSrc2SizePerLaneInByte();
            uint32_t bytes = bytesPerLane * D * C;
            // dpasw shares src2 between paired EUs, so it covers only half
            // the repeat count (rounded up).
            if (op == G4_dpasw) {
                bytes = bytesPerLane * D * ((C + 1) / 2);
            }
            computeDpasOperandBound(opnd, opnd->left_bound, opnd->left_bound + bytes - 1);
        }
    }
}
8751 
inheritDIFrom(const G4_INST * inst)8752 void G4_INST::inheritDIFrom(const G4_INST* inst)
8753 {
8754     // Copy over debug info from inst
8755     setLocation(inst->getLocation());
8756     setCISAOff(getCISAOff() == UndefinedCisaOffset ? inst->getCISAOff() : getCISAOff());
8757 }
8758 
inheritSWSBFrom(const G4_INST * inst)8759 void G4_INST::inheritSWSBFrom(const G4_INST* inst)
8760 {
8761     // Copy the SWSB info
8762     setDistance(inst->getDistance());
8763     setLexicalId(inst->getLexicalId());
8764 
8765     setDistanceTypeXe(inst->getDistanceTypeXe());
8766     unsigned short token = inst->getToken();
8767     setToken(token);
8768     SWSBTokenType type = inst->getTokenType();
8769     setTokenType(type);
8770 }
8771