1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "IGC/common/StringMacros.hpp"
10 #include "visa_igc_common_header.h"
11 #include "Common_ISA.h"
12 #include "Common_ISA_util.h"
13 #include "Common_ISA_framework.h"
14 #include "JitterDataStruct.h"
15 #include "VISAKernel.h"
16 #include "G4_IR.hpp"
17 #include "BuildIR.h"
18 #include "BinaryEncodingIGA.h"
19
20 #include <iomanip>
21
22 using namespace vISA;
23
// Printable form of each source modifier, indexed by G4_SrcModifier.
// NOTE(review): Mod_Not deliberately prints as "-" (same as Mod_Minus) in
// textual dumps -- confirm this is the intended rendering.
static const char* const SrcModifierStr[Mod_src_undef] =
{
    "-",       // Mod_Minus
    "(abs)",   // Mod_Abs
    "-(abs)",  // Mod_Minus_Abs
    "-"        // Mod_Not (print as -)
};
31
// Maps each instruction-option bit to its assembly-text mnemonic; the table
// is terminated by the InstOpt_END sentinel.
static const G4_InstOptInfo InstOptInfo[] =
{
    {InstOpt_Align16,     "Align16"},
    {InstOpt_M0,          "M0"},
    {InstOpt_M4,          "M4"},
    {InstOpt_M8,          "M8"},
    {InstOpt_M12,         "M12"},
    {InstOpt_M16,         "M16"},
    {InstOpt_M20,         "M20"},
    {InstOpt_M24,         "M24"},
    {InstOpt_M28,         "M28"},
    {InstOpt_Switch,      "Switch"},
    {InstOpt_Atomic,      "Atomic"},
    {InstOpt_NoDDChk,     "NoDDChk"},
    {InstOpt_NoDDClr,     "NoDDClr"},
    {InstOpt_WriteEnable, "NoMask"},
    {InstOpt_BreakPoint,  "BreakPoint"},
    {InstOpt_EOT,         "EOT"},
    {InstOpt_AccWrCtrl,   "AccWrEn"},
    {InstOpt_Compacted,   "Compacted"},
    {InstOpt_NoCompact,   "NoCompact"},
    {InstOpt_NoSrcDepSet, "NoSrcDepSet"},
    {InstOpt_NoPreempt,   "NoPreempt"},
    {InstOpt_Serialize,   "Serialize"},
    {InstOpt_END,         "END"}
};
58
// Expansion hooks for G4Instruction.h: each HANDLE_INST/HANDLE_NAME_INST
// entry becomes one G4_Inst_Info row. HANDLE_INST stringizes the opcode name;
// HANDLE_NAME_INST supplies an explicit display name.
#define HANDLE_INST(op, nsrc, ndst, type, plat, attr) \
    { G4_##op, #op, nsrc, ndst, type, plat, attr },


#define HANDLE_NAME_INST(op, name, nsrc, ndst, type, plat, attr) \
    { G4_##op, name, nsrc, ndst, type, plat, attr },

// Opcode description table, indexed by G4_opcode.
const G4_Inst_Info G4_Inst_Table[] = {
#include "G4Instruction.h"
};
69
70
getChannelEnableStr(ChannelEnable channel)71 static const char* getChannelEnableStr(ChannelEnable channel)
72 {
73 switch (channel)
74 {
75 case NoChannelEnable:
76 return "";
77 case ChannelEnable_X:
78 return "x";
79 case ChannelEnable_Y:
80 return "y";
81 case ChannelEnable_XY:
82 return "xy";
83 case ChannelEnable_Z:
84 return "z";
85 case ChannelEnable_W:
86 return "w";
87 case ChannelEnable_ZW:
88 return "zw";
89 case ChannelEnable_XYZW:
90 return "xyzw";
91 default:
92 MUST_BE_TRUE(false, "unsupported channel enable");
93 return "";
94 }
95 }
96
97 //global functions
// Round n down to the largest power of two that is <= n (returns 0 for n==0).
uint8_t roundDownPow2(uint8_t n)
{
    // Scan with a wider type: with a uint8_t accumulator the shift wraps to 0
    // once it passes 128, and the loop then never terminates for n >= 128.
    unsigned int i = 1;
    while (n >= i) i <<= 1;
    return (uint8_t)(i >> 1);
}
104
105 /* Return the base rank for the input type ignoring the signed/unsigned
106 * aspect of types.
107 * - Types of higher precision have higher ranks.
108 * - Floating types have higher precision than all integer types.
109 */
Operand_Type_Base_Rank(G4_Type type)110 static short Operand_Type_Base_Rank(G4_Type type)
111 {
112 short type_size = (short)TypeSize(type);
113 short type_rank = type_size * 2;
114
115 if (type == Type_V || type == Type_UV)
116 {
117 type_rank = (short)TypeSize(Type_W);
118 }
119 else if (type == Type_VF)
120 {
121 type_rank = (short)TypeSize(Type_F);
122 }
123 else if (IS_TYPE_FLOAT_ALL(type))
124 {
125 type_rank += 2;
126 }
127
128 return type_rank;
129 }
130
131 /* Return the rank for the input type.
132 * - Types of higher precision have higher ranks.
133 * - Floating types have higher precision than all integer types.
134 * - Unsigned types have a higher rank than a signed type with the same
135 * precision.
136 */
Operand_Type_Rank(G4_Type type)137 static short Operand_Type_Rank(G4_Type type)
138 {
139 short type_rank = Operand_Type_Base_Rank(type);
140
141 switch (type) {
142 case Type_UB:
143 case Type_UW:
144 case Type_UD: {
145 type_rank++;
146 break;
147 }
148 default: {
149 // No nothing.
150 break;
151 }
152 }
153
154 return type_rank;
155 }
156
// check if type1 can be represented by type2, i.e. whether every value of
// type1 is also expressible as type2 (so an operand may be widened safely).
static bool Is_Type_Included(G4_Type type1, G4_Type type2, const IR_Builder& builder)
{
    if (type1 == type2)
    {
        return true;
    }

    // Float and Int types are never subtype of each other
    if (IS_TYPE_FLOAT_ALL(type1) ^ IS_TYPE_FLOAT_ALL(type2))
    {
        return false;
    }
    // Opt-in unsafe copy propagation: treat F as included in the platform's
    // mix-mode float type (post-BDW, guarded by vISA_enableUnsafeCP_DF).
    if (type1 == Type_F && type2 == builder.getMixModeType() &&
        builder.getPlatform() > GENX_BDW && builder.getOption(vISA_enableUnsafeCP_DF))
    {
        return true;
    }

    if (Operand_Type_Rank(type1) < Operand_Type_Rank(type2))
    {
        // unsigned value fits in any wider unsigned type
        if ((IS_UNSIGNED_INT(type1) || type1 == Type_UV) &&
            (IS_UNSIGNED_INT(type2) || type2 == Type_UV))
        {
            return true;
        }
        // signed value fits in any wider signed type
        else if ((IS_SIGNED_INT(type1) || type1 == Type_V) &&
            (IS_SIGNED_INT(type2) || type2 == Type_V))
        {
            return true;
        }
        // small unsigned fits in any wider int type, signed or unsigned
        else if ((type1 == Type_UB || type1 == Type_UW || type1 == Type_UV) && IS_TYPE_INT(type2))
        {
            return true;
        }
        // mix-mode float (e.g. HF) is included in F when mixed mode is legal
        else if (builder.hasMixMode() && type1 == builder.getMixModeType() && type2 == Type_F)
        {
            return true;
        }
    }
    return false;
}
199
resetRightBound(G4_Operand * opnd)200 static void resetRightBound(G4_Operand* opnd)
201 {
202 if (opnd) {
203 opnd->unsetRightBound();
204 }
205 }
206
associateOpndWithInst(G4_Operand * opnd,G4_INST * inst)207 static void associateOpndWithInst(G4_Operand* opnd, G4_INST* inst)
208 {
209 if (opnd) {
210 opnd->setInst(inst);
211 }
212 }
213
// Core instruction constructor: records opcode, predicate, conditional
// modifier, saturation, execution size, destination and up to four sources;
// then invalidates cached operand bounds (they depend on the instruction)
// and back-links every operand to this instruction.
G4_INST::G4_INST(
    const IR_Builder& irb,
    G4_Predicate* prd,
    G4_opcode o,
    G4_CondMod* m,
    G4_Sat s,
    G4_ExecSize size,
    G4_DstRegRegion* d,
    G4_Operand* s0,
    G4_Operand* s1,
    G4_Operand* s2,
    G4_Operand* s3,
    G4_InstOpts opt) :
    op(o), dst(d), predicate(prd), mod(m), option(opt),
    useInstList(irb.getAllocator()),
    defInstList(irb.getAllocator()),
    localId(0),
    srcCISAoff(UndefinedCisaOffset),
    sat(s ? 1 : 0),
    evenlySplitInst(false),
    execSize(size),
    bin(nullptr),
    builder(irb)
{
    srcs[0] = s0;
    srcs[1] = s1;
    srcs[2] = s2;
    srcs[3] = s3;

    dead = false;
    skipPostRA = false;
    // Implicit accumulator operands start unset; set later if needed.
    implAccSrc = nullptr;
    implAccDst = nullptr;

    // dst/src bounds are computed lazily; drop any stale cached values.
    resetRightBound(dst);
    resetRightBound(s0);
    resetRightBound(s1);
    resetRightBound(s2);
    resetRightBound(s3);
    // Predicate/condMod bounds are computed eagerly here.
    computeRightBound(predicate);
    computeRightBound(mod);

    associateOpndWithInst(dst, this);
    associateOpndWithInst(s0, this);
    associateOpndWithInst(s1, this);
    associateOpndWithInst(s2, this);
    associateOpndWithInst(s3, this);
    associateOpndWithInst(predicate, this);
    associateOpndWithInst(mod, this);
}
264
// Single-payload send: src0 is the message payload, src1 the descriptor
// operand. The message descriptor md also records the execution size.
G4_InstSend::G4_InstSend(
    const IR_Builder& builder,
    G4_Predicate* prd,
    G4_opcode o,
    G4_ExecSize size,
    G4_DstRegRegion* dst,
    G4_SrcRegRegion* payload,
    G4_Operand* desc,
    G4_InstOpts opt,
    G4_SendDesc* md) :
    G4_INST(builder, prd, o, nullptr, g4::NOSAT, size, dst, payload, desc, opt),
    msgDesc(md)
{
    md->setExecSize(size);
}
280
// Split send: two payload sources (payload, src1) plus descriptor and
// extended descriptor (stored as src3). md also records the execution size.
G4_InstSend::G4_InstSend(
    const IR_Builder& builder,
    G4_Predicate* prd,
    G4_opcode o,
    G4_ExecSize size,
    G4_DstRegRegion* dst,
    G4_SrcRegRegion* payload,
    G4_SrcRegRegion* src1,
    G4_Operand* desc,
    G4_Operand* extDesc,
    G4_InstOpts opt,
    G4_SendDesc* md) :
    G4_INST(builder, prd, o, nullptr, g4::NOSAT, size, dst, payload, src1, desc, opt),
    msgDesc(md)
{
    // extended descriptor occupies the fourth source slot
    setSrc(extDesc, 3);
    md->setExecSize(size);
}
299
setOpcode(G4_opcode opcd)300 void G4_INST::setOpcode(G4_opcode opcd)
301 {
302 MUST_BE_TRUE(opcd < G4_NUM_OPCODE &&
303 (G4_Inst_Table[op].instType == G4_Inst_Table[opcd].instType ||
304 G4_Inst_Table[opcd].instType == InstTypeMov ||
305 (
306 (G4_Inst_Table[op].instType == InstTypeMov ||
307 G4_Inst_Table[op].instType == InstTypeArith ||
308 G4_Inst_Table[op].instType == InstTypeLogic ||
309 G4_Inst_Table[op].instType == InstTypePseudoLogic ||
310 G4_Inst_Table[op].instType == InstTypeVector) &&
311
312 (G4_Inst_Table[opcd].instType == InstTypeMov ||
313 G4_Inst_Table[opcd].instType == InstTypeArith ||
314 G4_Inst_Table[opcd].instType == InstTypeLogic ||
315 G4_Inst_Table[opcd].instType == InstTypeVector)
316 ) ||
317 opcd == G4_label),
318 "setOpcode would change the intruction class, which is illegal.");
319
320 bool resetBounds = false;
321
322 if (op != opcd)
323 {
324 resetBounds = true;
325 }
326
327 op = opcd;
328
329 if (resetBounds)
330 {
331 resetRightBound(dst);
332 resetRightBound(srcs[0]);
333 resetRightBound(srcs[1]);
334 resetRightBound(srcs[2]);
335 resetRightBound(predicate);
336 resetRightBound(mod);
337 resetRightBound(implAccDst);
338 resetRightBound(implAccSrc);
339 }
340 }
341
setExecSize(G4_ExecSize s)342 void G4_INST::setExecSize(G4_ExecSize s)
343 {
344 bool resetBounds = false;
345
346 if (execSize != s)
347 {
348 resetBounds = true;
349 }
350
351 execSize = s;
352
353 if (resetBounds)
354 {
355 resetRightBound(dst);
356 resetRightBound(srcs[0]);
357 resetRightBound(srcs[1]);
358 resetRightBound(srcs[2]);
359 resetRightBound(predicate);
360 resetRightBound(mod);
361 resetRightBound(implAccDst);
362 resetRightBound(implAccSrc);
363 }
364 }
365
//
// We assume no mixed int and float source type, but mixed HF and F is ok
//
// Returns the execution type: the widest source type after the promotion
// rules below (W is the floor; D/Q for wide ints; float types pass through;
// HF<->int conversions force a wider exec type).
G4_Type G4_INST::getExecType() const
{
    G4_Type execType = Type_W;

    // special handling for int divide, as it supports D/UD sources only, while
    // vISA DIV allows B/W types
    // FIXME: if there are more instructions like this, we may need to reorder fixDstAlignment()
    // so that it happens after all sources are fixed and we can get the correct execution type
    if (isMath() && asMathInst()->isMathIntDiv())
    {
        return Type_D;
    }

    if (opcode() == G4_fcvt)
    {
        // fcvt : cvt b/w standard type and other special float type.
        // execution type is the standard type.
        G4_Type srcTy = srcs[0]->getType();
        if (IS_TYPE_FLOAT_ALL(srcTy))
        {
            return srcTy;
        }
        // If src isn't standard float type, dst must be!
        return dst->getType();
    }
    if (opcode() == G4_srnd)
    {
        // srnd: src0 is either hf or f
        return srcs[0]->getType();
    }
    // Take the widest source: ints promote to D/Q, float types pass through.
    for (unsigned i = 0; i < G4_MAX_SRCS; i++)
    {
        G4_Operand* src = getSrc(i);
        if (src != NULL)
        {
            G4_Type srcType = src->getType();
            if (TypeSize(srcType) >= TypeSize(execType))
            {
                if (IS_DTYPE(srcType))
                {
                    execType = Type_D;
                }
                else if (IS_QTYPE(srcType))
                {
                    execType = Type_Q;
                }
                else if (IS_TYPE_FLOAT_ALL(srcType))
                {
                    execType = srcType;
                }
            }
        }
    }

    // int <-> HF conversion requires exec type to be dword
    // we don't consider Q<->HF since there are special checks in fixMov() for them
    if (dst)
    {
        G4_Type dstType = dst->getType();
        if (IS_HFTYPE(dstType) && (IS_TYPE_INT(execType) && !IS_QTYPE(execType)))
        {
            execType = Type_D;
        }
        else if (IS_HFTYPE(execType) && (IS_TYPE_INT(dstType) && !IS_QTYPE(dstType)))
        {
            execType = Type_F;
        }
    }

    return execType;
}
440
// V and VF are treated differently here from the above function
// FIXME: Why do we need two functions???
// Variant of getExecType that preserves packed-vector types (V/UV) and maps
// VF to F; low-precision float sources (and BF in BF-mix mode) get special
// handling. Several branches break out early, so source order matters.
G4_Type G4_INST::getExecType2() const
{
    G4_Type execType = Type_W;

    // special handling for int divide, as it supports D/UD sources only, while
    // vISA DIV allows B/W types
    if (isMath() && asMathInst()->isMathIntDiv())
    {
        return Type_D;
    }

    for (unsigned i = 0; i < G4_MAX_SRCS; i++)
    {
        G4_Operand* src = getSrc(i);
        if (src == NULL)
        {
            continue;
        }
        G4_Type srcType = srcs[i]->getType();
        // BF in BF-mix mode executes as F (no early break: a wider source
        // later may still override).
        if (builder.hasBFMixMode() && srcType == Type_BF)
        {
            execType = Type_F;
        }
        else if (isLowPrecisionFloatTy(srcType) &&
            TypeSize(srcType) >= TypeSize(execType))
        {
            execType = srcType;
            break;
        }
        else if (srcType == Type_V)
        {
            execType = Type_V;
            break;
        }
        else if (srcType == Type_UV)
        {
            execType = Type_UV;
            break;
        }
        else if (IS_DFTYPE(srcType) && !IS_DFTYPE(execType))
        {
            execType = src->getType();
            break;
        }
        else if ((IS_FTYPE(srcType) || srcType == Type_VF) &&
            !IS_DFTYPE(execType) && !IS_FTYPE(execType))
        {
            execType = Type_F;
        }
        else if (IS_DTYPE(srcType) &&
            TypeSize(srcType) >= TypeSize(execType) &&
            !IS_DFTYPE(execType) && !IS_FTYPE(execType))
        {
            execType = Type_D;
        }
        else if (IS_QTYPE(srcType) &&
            TypeSize(srcType) >= TypeSize(execType) &&
            !IS_DFTYPE(execType) && !IS_FTYPE(execType))
        {
            execType = Type_Q;
        }
    }

    // int <-> HF conversion requires exec type to be dword
    // we don't consider Q<->HF since there are special checks in fixMov() for them
    if (dst)
    {
        G4_Type dstType = dst->getType();
        if (IS_HFTYPE(dstType) && (IS_TYPE_INT(execType) && !IS_QTYPE(execType)))
        {
            execType = Type_D;
        }
        else if (IS_HFTYPE(execType) && (IS_TYPE_INT(dstType) && !IS_QTYPE(dstType)))
        {
            execType = Type_F;
        }
    }

    return execType;
}
523
getMaskOffset() const524 uint16_t G4_INST::getMaskOffset() const
525 {
526 unsigned maskOption = (getOption() & InstOpt_QuarterMasks);
527
528 if (!builder.hasNibCtrl())
529 {
530 assert(maskOption != InstOpt_M4 && maskOption != InstOpt_M12 && maskOption != InstOpt_M20 &&
531 maskOption != InstOpt_M28 && "nibCtrl is not supported on this platform");
532 }
533
534 switch (maskOption)
535 {
536 case InstOpt_NoOpt:
537 return 0;
538 case InstOpt_M0:
539 return 0;
540 case InstOpt_M4:
541 return 4;
542 case InstOpt_M8:
543 return 8;
544 case InstOpt_M12:
545 return 12;
546 case InstOpt_M16:
547 return 16;
548 case InstOpt_M20:
549 return 20;
550 case InstOpt_M24:
551 return 24;
552 case InstOpt_M28:
553 return 28;
554 default:
555 MUST_BE_TRUE(0, "Incorrect instruction execution mask");
556 return 0;
557 }
558 }
559
setMetadata(const std::string & key,MDNode * value)560 void G4_INST::setMetadata(const std::string& key, MDNode* value)
561 {
562 if (!MD)
563 {
564 MD = const_cast<IR_Builder&>(builder).allocateMD();
565 }
566 MD->setMetadata(key, value);
567 }
568
setComments(const std::string & str)569 void G4_INST::setComments(const std::string& str)
570 {
571 // we create a new MDNode the assumption is that comment should be unique and there is no opportunity for sharing
572 auto node = const_cast<IR_Builder&>(builder).allocateMDString(str);
573 setMetadata(Metadata::InstComment, node);
574 }
575
addComment(const std::string & comment)576 void G4_INST::addComment(const std::string& comment) {
577 std::string comments = getComments();
578 if (!comments.empty()) { // add a separator
579 comments += "; ";
580 }
581 comments += comment;
582 setComments(comments);
583 }
584
setTokenLoc(unsigned short token,unsigned globalID)585 void G4_INST::setTokenLoc(unsigned short token, unsigned globalID)
586 {
587 if (!builder.getOption(vISA_SBIDDepLoc))
588 {
589 return;
590 }
591 auto tokenLoc = getMetadata(Metadata::TokenLoc);
592 if (!tokenLoc)
593 {
594 auto node = const_cast<IR_Builder&>(builder).allocateMDTokenLocation(token, globalID);
595 setMetadata(Metadata::TokenLoc, node);
596 }
597 else
598 {
599 MDTokenLocation* tokenL = tokenLoc->asMDTokenLocation();
600 tokenL->addTokenLocation(token, globalID);
601 }
602 }
603
604 //
605 // remove all references to this inst in other inst's use_list
606 // this is used when we want to delete this instruction
removeAllDefs()607 void G4_INST::removeAllDefs()
608 {
609 for (auto&& item : defInstList)
610 {
611 G4_INST *def = item.first;
612 def->useInstList.remove_if(
613 [&](USE_DEF_NODE node) { return node.first == this; });
614 }
615 defInstList.clear();
616 }
617
removeAllUses()618 void G4_INST::removeAllUses()
619 {
620 for (auto&& item : useInstList)
621 {
622 G4_INST *user = item.first;
623 user->defInstList.remove_if(
624 [&](USE_DEF_NODE node) { return node.first == this; });
625 }
626 useInstList.clear();
627 }
628
629 //
630 // remove def/use for opndNum, which must be a source
631 // (i.e., not Opnd_dst/Opnd_condMod/Opnd_implAccDst)
removeDefUse(Gen4_Operand_Number opndNum)632 void G4_INST::removeDefUse(Gen4_Operand_Number opndNum)
633 {
634 DEF_EDGE_LIST_ITER iter = defInstList.begin();
635 while (iter != defInstList.end())
636 {
637 if ((*iter).second == opndNum)
638 {
639 auto defInst = (*iter).first;
640 defInst->useInstList.remove_if(
641 [&](USE_DEF_NODE node) { return node.first == this && node.second == opndNum; });
642 DEF_EDGE_LIST_ITER curr_iter = iter++;
643 defInstList.erase(curr_iter);
644 }
645 else
646 {
647 ++iter;
648 }
649 }
650 }
651
getOperand(Gen4_Operand_Number opnd_num) const652 const G4_Operand* G4_INST::getOperand(Gen4_Operand_Number opnd_num) const
653 {
654 switch (opnd_num) {
655 case Opnd_dst: return (G4_Operand*) dst;
656 case Opnd_src0: return srcs[0];
657 case Opnd_src1: return srcs[1];
658 case Opnd_src2: return srcs[2];
659 case Opnd_src3: return srcs[3];
660 case Opnd_pred: return (G4_Operand*)predicate;
661 case Opnd_condMod: return (G4_Operand*)mod;
662 case Opnd_implAccSrc: return implAccSrc;
663 case Opnd_implAccDst: return (G4_Operand*) implAccDst;
664 default:
665 MUST_BE_TRUE(0, "Operand number is out of range.");
666 break;
667 }
668 return NULL;
669 }
670
eraseUse(USE_EDGE_LIST_ITER iter)671 USE_EDGE_LIST_ITER G4_INST::eraseUse(USE_EDGE_LIST_ITER iter)
672 {
673 G4_INST *useInst = iter->first;
674 useInst->defInstList.remove_if(
675 [&](USE_DEF_NODE node) { return node.first == this && node.second == iter->second; });
676 return useInstList.erase(iter);
677 }
678
// Transfer definitions used in this[opndNum1] to definitions used in
// inst2[opndNum2] and update definitions's def-use chain accordingly.
// After the move, duplicate edges are collapsed via sort+unique.
void G4_INST::transferDef(G4_INST *inst2, Gen4_Operand_Number opndNum1, Gen4_Operand_Number opndNum2)
{
    DEF_EDGE_LIST_ITER iter = defInstList.begin();
    while (iter != defInstList.end())
    {
        auto defInst = (*iter).first;
        if ((*iter).second == opndNum1)
        {
            // gcc 5.0 doesn't like emplace_back for some reason
            inst2->defInstList.push_back(USE_DEF_NODE(defInst, opndNum2));
            // Retarget the def's use edge from this inst to inst2.
            defInst->useInstList.remove_if(
                [&](USE_DEF_NODE node) { return node.second == opndNum1 && node.first == this; });
            defInst->useInstList.push_back(USE_DEF_NODE(inst2, opndNum2));
            DEF_EDGE_LIST_ITER curr_iter = iter++;
            defInstList.erase(curr_iter);

            //Remove the redundant d/u node.
            //Due to the instruction optimization, such as merge scalars, redundant d/u info may be generated.
            //Such as the case:
            //(W) shl (1)  V3429(0,0)<1>:d  V3380(0,0)<0;1,0>:d  0x17:w
            //(W) shl (1)  V3430(0,0)<1>:d  V3381(0,0)<0;1,0>:d  0x17:w
            //(W) add (1)  V3432(0,0)<1>:d  0x43800000:d  -V3429(0,0)<0;1,0>:d
            //(W) add (1)  V3433(0,0)<1>:d  0x43800000:d  -V3430(0,0)<0;1,0>:d
            //==>
            //(W) shl (2)  Merged138(0,0)<1>:d  Merged139(0,0)<1;1,0>:d  0x17:w
            //(W) add (2)  Merged140(0,0)<1>:d  0x43800000:d  -Merged138(0,0)<1;1,0>:d
            inst2->defInstList.sort();
            inst2->defInstList.unique();
            defInst->useInstList.sort();
            defInst->useInstList.unique();
        }
        else
        {
            ++iter;
        }
    }
}
718
// This copies, from this definition's source opndNum1, all of its defintions to
// inst2's source opndNum2. This is used for example by copy propagation to copy
// the def-use link of the move to the use instruction.
//
// If 'checked' is true, then this only copies those effective defs to inst2.
//
void G4_INST::copyDef(
    G4_INST *inst2,
    Gen4_Operand_Number opndNum1,
    Gen4_Operand_Number opndNum2,
    bool checked)
{
    for (auto I = def_begin(); I != def_end(); ++I)
    {
        if (I->second == opndNum1)
        {
            // If checked is enabled, then compare inst2[opndNum] with this
            // definition. Skip if this is not an effective use.
            if (checked)
            {
                G4_Operand *use = inst2->getOperand(opndNum2);
                ASSERT_USER(use, "null operand unexpected");
                G4_Operand *dst = I->first->getOperand(Opnd_dst);
                G4_Operand *condMod = I->first->getOperand(Opnd_condMod);
                // Effective iff the def's dst or condMod overlaps the use.
                if ((dst && use->compareOperand(dst) != Rel_disjoint) ||
                    (condMod && use->compareOperand(condMod) != Rel_disjoint))
                {
                    // OK
                }
                else
                {
                    // Skip to the next def.
                    continue;
                }
            }
            I->first->addDefUse(inst2, opndNum2);
        }
    }
    // Collapse duplicate edges introduced above.
    inst2->defInstList.unique();
}
759
/// Copy this instruction's defs to inst2. When 'checked' is set, only defs
/// whose dst/condMod/implicit-acc output overlaps the corresponding operand
/// of inst2 are copied (i.e. only "effective" defs).
void G4_INST::copyDefsTo(G4_INST *inst2, bool checked)
{
    if (this == inst2)
        return;

    for (auto I = def_begin(), E = def_end(); I != E; ++I)
    {
        G4_Operand *use = inst2->getOperand(I->second);
        // Copy when the corresponding use operand is not null.
        if (!use)
            continue;

        if (checked)
        {
            G4_Operand *dst = I->first->getOperand(Opnd_dst);
            G4_Operand *condMod = I->first->getOperand(Opnd_condMod);
            G4_Operand* implicitAccDef = I->first->getImplAccDst();
            if ((dst && use->compareOperand(dst) != Rel_disjoint) ||
                (condMod && use->compareOperand(condMod) != Rel_disjoint) ||
                (implicitAccDef && use->compareOperand(implicitAccDef) != Rel_disjoint))
            {
                // OK
            }
            else
            {
                // Skip to the next def.
                continue;
            }
        }

        // inst2[I->second] is defined by I->first.
        I->first->addDefUse(inst2, I->second);
    }
}
795
/// Copy this instruction's uses to inst2. When 'checked' is set, only uses
/// that overlap inst2's dst/condMod/implicit-acc output are copied (i.e.
/// only uses inst2 can actually define).
void G4_INST::copyUsesTo(G4_INST *inst2, bool checked)
{
    if (this == inst2)
        return;

    for (auto I = use_begin(), E = use_end(); I != E; ++I)
    {
        if (checked)
        {
            G4_Operand *use = I->first->getOperand(I->second);
            ASSERT_USER(use, "null operand unexpected");

            G4_Operand *dst = inst2->getOperand(Opnd_dst);
            G4_Operand *condMod = inst2->getOperand(Opnd_condMod);
            G4_Operand *implicitAccDef = inst2->getImplAccDst();
            if ((dst && use->compareOperand(dst) != Rel_disjoint) ||
                (condMod && use->compareOperand(condMod) != Rel_disjoint) ||
                (implicitAccDef && use->compareOperand(implicitAccDef) != Rel_disjoint))
            {
                // OK
            }
            else
            {
                // Skip to the next use.
                continue;
            }
        }

        // I->first[I->second] is defined by inst2.
        inst2->addDefUse(I->first, I->second);
    }
}
829
830 // This transfers this instructions' useInstList to inst2's,
831 // and update each use's defInstList to point to inst2.
832 // this instruction's use is destroyed in the process.
833 // if keepExisting is true, it will preserve inst2's existing uses.
transferUse(G4_INST * inst2,bool keepExisting)834 void G4_INST::transferUse(G4_INST *inst2, bool keepExisting)
835 {
836 if (this == inst2)
837 {
838 return;
839 }
840
841 if (!keepExisting)
842 {
843 inst2->removeAllUses();
844 }
845
846 copyUsesTo(inst2, false);
847 removeAllUses();
848 }
849
850 //
851 // remove all references of this inst in other inst's def list
852 // this is used when we want to delete this instruction
removeUseOfInst()853 void G4_INST::removeUseOfInst()
854 {
855 for (auto&& node : defInstList)
856 {
857 auto defInst = node.first;
858 defInst->useInstList.remove_if(
859 [&](USE_DEF_NODE node) { return node.first == this;});
860 }
861 }
862
// remove the faked def-instructions in def list, which is resulted from instruction spliting
// A def edge is "faked" if the def's output no longer overlaps the
// corresponding source operand of this instruction (Rel_disjoint).
void G4_INST::trimDefInstList()
{
    // trim def list
    DEF_EDGE_LIST_ITER iter = defInstList.begin();
    // since ACC is only exposed in ARCTAN intrinsic translation, there is no instruction split with ACC
    while (iter != defInstList.end())
    {
        G4_Operand *src = getOperand((*iter).second);

        if (src == nullptr)
        {
            // it's possible the source is entirely gone (e.g., predicate removed)
            iter = defInstList.erase(iter);
            continue;
        }
        G4_CmpRelation rel = Rel_undef;
        if (src->isFlag())
        {
            // Flag sources may be defined by a condMod or a flag dst.
            if ((*iter).first->getCondMod())
            {
                rel = src->compareOperand((*iter).first->getCondMod());
            }
            else if ((*iter).first->getDst())
            {
                if ((*iter).first->hasNULLDst())
                {
                    rel = Rel_disjoint;
                }
                else
                {
                    rel = src->compareOperand((*iter).first->getDst());
                }
            }
        }
        else
        {
            rel = src->compareOperand((*iter).first->getDst());
        }

        if (rel == Rel_disjoint)
        {
            // remove this def-use
            // assumption: no duplicate def-use info
            USE_EDGE_LIST_ITER useIter = (*iter).first->useInstList.begin();
            while (useIter != (*iter).first->useInstList.end())
            {
                // NOTE(review): only use edges at Opnd_src2 are removed here,
                // regardless of (*iter).second -- looks suspicious; confirm
                // this is intentional (e.g. split only affects src2) before
                // changing.
                if ((*useIter).first == this && (*useIter).second == Opnd_src2)
                {
                    (*iter).first->useInstList.erase(useIter);
                    break;
                }
                useIter++;
            }
            DEF_EDGE_LIST_ITER tmpIter = iter;
            iter++;
            defInstList.erase(tmpIter);
            continue;
        }
        iter++;
    }
}
925
isDFInstruction() const926 bool G4_INST::isDFInstruction() const
927 {
928 G4_Operand* dst = getDst();
929 if (dst && (dst->getType() == Type_DF))
930 {
931 return true;
932 }
933 for (int i = 0; i < getNumSrc(); i++)
934 {
935 G4_Operand* src = getSrc(i);
936 if (src && (src->getType() == Type_DF))
937 {
938 return true;
939 }
940 }
941 return false;
942 }
943
isMathPipeInst() const944 bool G4_INST::isMathPipeInst() const
945 {
946 if (isMath())
947 {
948 return true;
949 }
950
951
952 return false;
953 }
954
distanceHonourInstruction() const955 bool G4_INST::distanceHonourInstruction() const
956 {
957 if (isSend() || op == G4_nop || isWait() || isDpas())
958 {
959 return false;
960 }
961 if (isMathPipeInst())
962 {
963 if (builder.getPlatform() >= GENX_PVC)
964 {
965 return true;
966 }
967 return false;
968 }
969 return true;
970 }
971
tokenHonourInstruction() const972 bool G4_INST::tokenHonourInstruction() const
973 {
974 if (isSend() || isDpas())
975 {
976 return true;
977 }
978 else
979 {
980 if (isMathPipeInst())
981 {
982 if (builder.getPlatform() >= GENX_PVC)
983 {
984 return false;
985 }
986 return true;
987 }
988 return false;
989 }
990 }
991
hasNoPipe()992 bool G4_INST::hasNoPipe()
993 {
994 if (op == G4_wait || op == G4_halt || op == G4_nop)
995 {
996 return true;
997 }
998 // PVC only
999 if (op == G4_sync_fence)
1000 {
1001 return true;
1002 }
1003 return false;
1004 }
1005
1006
isLongPipeType(G4_Type type) const1007 bool G4_INST::isLongPipeType(G4_Type type) const
1008 {
1009 if (builder.hasPartialInt64Support())
1010 {
1011 return type == Type_DF;
1012 }
1013 return IS_TYPE_LONG(type);
1014 }
1015
isIntegerPipeType(G4_Type type) const1016 bool G4_INST::isIntegerPipeType(G4_Type type) const
1017 {
1018 if (IS_TYPE_INTEGER(type))
1019 {
1020 return true;
1021 }
1022
1023 if (builder.hasPartialInt64Support())
1024 {
1025 return type == Type_UQ || type == Type_Q;
1026 }
1027
1028 return false;
1029 }
1030
isJEUPipeInstructionXe() const1031 bool G4_INST::isJEUPipeInstructionXe() const
1032 {
1033 if (op == G4_jmpi ||
1034 op == G4_if ||
1035 op == G4_else ||
1036 op == G4_endif ||
1037 op == G4_break ||
1038 op == G4_join ||
1039 op == G4_cont ||
1040 op == G4_while ||
1041 op == G4_brc ||
1042 op == G4_brd ||
1043 op == G4_goto ||
1044 op == G4_call ||
1045 op == G4_return)
1046 {
1047 return true;
1048 }
1049 return false;
1050 }
1051
1052
isLongPipeInstructionXe() const1053 bool G4_INST::isLongPipeInstructionXe() const
1054 {
1055 if (isJEUPipeInstructionXe())
1056 {
1057 return false;
1058 }
1059
1060 if (!distanceHonourInstruction())
1061 {
1062 return false;
1063 }
1064
1065 if (builder.hasFixedCycleMathPipeline() &&
1066 isMath())
1067 {
1068 return false;
1069 }
1070
1071
1072 const G4_Operand* dst = getDst();
1073 if (dst && isLongPipeType(dst->getType()))
1074 {
1075 return true;
1076 }
1077
1078 if (!builder.hasPartialInt64Support())
1079 {
1080 for (int i = 0; i < G4_MAX_SRCS; i++)
1081 {
1082 const G4_Operand* src = getSrc(i);
1083 if (src && isLongPipeType(src->getType()))
1084 {
1085 return true;
1086 }
1087 }
1088 }
1089
1090 return false;
1091 }
1092
isIntegerPipeInstructionXe() const1093 bool G4_INST::isIntegerPipeInstructionXe() const
1094 {
1095 if (isJEUPipeInstructionXe())
1096 {
1097 return true;
1098 }
1099
1100 if (!distanceHonourInstruction())
1101 {
1102 return false;
1103 }
1104
1105 if (isLongPipeInstructionXe())
1106 {
1107 return false;
1108 }
1109
1110
1111 if (builder.hasFixedCycleMathPipeline() &&
1112 isMath())
1113 {
1114 return false;
1115 }
1116 if (op == G4_fcvt)
1117 {
1118 return false;
1119 }
1120 if (op == G4_srnd)
1121 {
1122 return false;
1123 }
1124
1125 G4_Operand* dst = getDst();
1126 if (dst && isIntegerPipeType(dst->getType()))
1127 {
1128 return true;
1129 }
1130
1131 if (builder.hasQ2FInIntegerPipe() && dst->getType() == Type_F)
1132 {
1133 const G4_Operand* src = getSrc(0);
1134 if (src && (src->getType() == Type_Q || src->getType() == Type_UQ))
1135 {
1136 return true;
1137 }
1138 }
1139
1140 if (!dst)
1141 {
1142 const G4_Operand* src = getSrc(0);
1143 if (src && isIntegerPipeType(src->getType()))
1144 {
1145 return true;
1146 }
1147 }
1148
1149 return false;
1150 }
1151
// True if this instruction executes in the Xe float pipe: an in-order,
// non-JEU, non-long-pipe, non-fixed-math instruction whose dst (or, when
// dst is absent, src0) is F/HF/BF; fcvt and srnd always qualify. The
// Q->F exception applies on platforms where that conversion runs on the
// integer pipe instead.
bool G4_INST::isFloatPipeInstructionXe() const
{
    if (isJEUPipeInstructionXe())
    {
        return false;
    }

    if (!distanceHonourInstruction())
    {
        return false;
    }


    if (isLongPipeInstructionXe())
    {
        return false;
    }

    if (builder.hasFixedCycleMathPipeline() &&
        isMath())
    {
        return false;
    }
    // Special float conversions always run on the float pipe.
    if (opcode() == G4_fcvt)
    {
        return true;
    }
    if (opcode() == G4_srnd)
    {
        return true;
    }

    const G4_Operand* dst = getDst();
    if (dst &&
        (dst->getType() == Type_F ||
         dst->getType() == Type_HF ||
         dst->getType() == Type_BF))
    {
        // Q->F is handled by the integer pipe on some platforms.
        if (builder.hasQ2FInIntegerPipe() && dst->getType() == Type_F)
        {
            const G4_Operand* src = getSrc(0);
            if (src && (src->getType() == Type_Q || src->getType() == Type_UQ))
            {
                return false;
            }
        }
        return true;
    }

    // No dst: classify by src0's type instead.
    if (!dst)
    {
        const G4_Operand* src = getSrc(0);
        if (src &&
            (src->getType() == Type_F ||
             src->getType() == Type_HF ||
             src->getType() == Type_BF))
        {
            return true;
        }
    }

    return false;
}
1215
getDataTypePipeXe(G4_Type type)1216 SB_INST_PIPE G4_INST::getDataTypePipeXe(G4_Type type)
1217 {
1218 switch (type)
1219 {
1220 case Type_UB:
1221 case Type_B:
1222 case Type_UW:
1223 case Type_W:
1224 case Type_UD:
1225 case Type_D:
1226 case Type_UV:
1227 case Type_V:
1228 return PIPE_INT;
1229
1230 case Type_Q:
1231 case Type_UQ:
1232 if (builder.hasPartialInt64Support())
1233 {
1234 return PIPE_INT;
1235 }
1236 return PIPE_LONG;
1237
1238 case Type_DF:
1239 return PIPE_LONG;
1240
1241 case Type_HF:
1242 case Type_F:
1243 case Type_VF:
1244 case Type_NF:
1245 case Type_BF:
1246 return PIPE_FLOAT;
1247
1248 default:
1249 return PIPE_NONE;
1250 }
1251
1252 return PIPE_NONE;
1253 }
1254
// Classify this instruction into its Xe execution pipe. Order matters:
// long > int > float for distance-tracked instructions, then fixed-cycle
// math, then the token-tracked pipes (dpas/math/send); anything left must
// be a no-pipe instruction (wait/halt/nop/sync.fence).
SB_INST_PIPE G4_INST::getInstructionPipeXe()
{

    if (isLongPipeInstructionXe())
    {
        return PIPE_LONG;
    }

    if (isIntegerPipeInstructionXe())
    {
        return PIPE_INT;
    }

    if (isFloatPipeInstructionXe())
    {
        return PIPE_FLOAT;
    }

    if (builder.hasFixedCycleMathPipeline() &&
        isMath())
    {
        return PIPE_MATH;
    }

    if (tokenHonourInstruction())
    {
        if (isDpas())
        {
            return PIPE_DPAS;
        }
        if (isMathPipeInst())
        {
            return PIPE_MATH;
        }
        if (isSend())
        {
            return PIPE_SEND;
        }

        ASSERT_USER(0, "Wrong token pipe instruction!");
    }

    ASSERT_USER(hasNoPipe(), "No pipe instruction");
    return PIPE_NONE;
}
1300
// Format the integral value 't' as an uppercase hex string without a
// "0x" prefix, left-padded with zeros to at least 'cols' digits.
// Byte-sized types are widened to int first so they print as numbers
// rather than as characters.
template <typename T>
static std::string fmtHexBody(T t, int cols = 0)
{
    std::stringstream out;
    out << std::hex << std::uppercase << std::setfill('0') << std::setw(cols);
    if (sizeof(t) == 1)
        out << (int)t;
    else
        out << t;
    return out.str();
}
1313
// Format 't' as an uppercase hex string with a "0x" prefix, padding the
// digit portion to at least 'cols' columns (see fmtHexBody).
template <typename T>
static std::string fmtHex(T t, int cols = 0)
{
    return "0x" + fmtHexBody(t, cols);
}
1321
1322
1323 #ifdef _DEBUG
// Print one def->use edge to 'os': the defining instruction, the using
// instruction, and the user's operand at position 'pos' that consumes
// the def.  Debug-build helper for dumpDefUse.
static void printDefUseImpl(
    std::ostream &os, G4_INST *def, G4_INST *use, Gen4_Operand_Number pos)
{
    os << "\n  def: ";
    def->emit(os);
    os << "\n user: ";
    use->emit(os);
    os << "\n opnd: ";
    use->getOperand(pos)->emit(os);
    os << "\n";
}
1335 #endif
1336
dumpDefUse(std::ostream & os)1337 void G4_INST::dumpDefUse(std::ostream &os)
1338 {
1339 #if _DEBUG
1340 std::cerr << "\n------------ defs ------------\n";
1341 for (auto&& UD : defInstList)
1342 {
1343 printDefUseImpl(std::cerr, UD.first, this, UD.second);
1344 }
1345 std::cerr << "\n------------ uses ------------\n";
1346 for (auto&& DU : useInstList)
1347 {
1348 printDefUseImpl(std::cerr, this, DU.first, DU.second);
1349 }
1350 #endif
1351 }
1352
1353 namespace {
1354 // Customized def-use iterator comparison. Do not compare itself
1355 // but the content it is pointing to.
1356 struct def_less
1357 {
operator ()__anon352c3f950711::def_less1358 bool operator()(DEF_EDGE_LIST_ITER a, DEF_EDGE_LIST_ITER b) const
1359 {
1360 if (a->first < b->first)
1361 {
1362 return true;
1363 }
1364 else if ((a->first == b->first) && (a->second < b->second))
1365 {
1366 return true;
1367 }
1368 return false;
1369 }
1370 };
1371 }
1372
// Return the unique defining instruction of operand 'opndNum', or
// nullptr when the operand has zero or multiple defs.  When 'MakeUnique'
// is set, duplicate (inst, operand) entries in defInstList are removed
// first so that repeated edges to the same def do not make a genuinely
// single def look like multiple defs.
G4_INST *G4_INST::getSingleDef(Gen4_Operand_Number opndNum, bool MakeUnique)
{
    if (MakeUnique)
    {
        // def_less compares the pointed-to (inst, opnd) pairs, so this set
        // detects duplicate edges regardless of iterator identity.
        std::set<DEF_EDGE_LIST_ITER, def_less> found;
        for (auto I = def_begin(); I != def_end(); /* empty */)
        {
            if (!found.insert(I).second)
            {
                // Duplicate edge: erase returns the next valid iterator.
                I = defInstList.erase(I);
            }
            else
            {
                ++I;
            }
        }
    }

    // Scan for exactly one def of the requested operand.
    G4_INST *def = 0;
    unsigned def_count = 0;
    for (auto I = def_begin(), E = def_end(); I != E; ++I)
    {
        if (I->second == opndNum)
        {
            if (++def_count > 1) return 0;
            def = I->first;
        }
    }

    return def;
}
1404
1405 // add def-use between this instruction <--> inst[srcPos]
1406 // Note that this function does not check for duplicates
// Record that 'inst' uses, at operand position 'srcPos', the value this
// instruction defines: this->useInstList gains the (user, position) edge
// and inst->defInstList gains the mirrored (def, position) edge.
// Duplicate edges are not checked for.
void G4_INST::addDefUse(G4_INST* inst, Gen4_Operand_Number srcPos)
{
    // Only dst, src0-src7, pred, and the implicit acc source are valid
    // use positions.
    MUST_BE_TRUE(srcPos == Opnd_dst ||
        srcPos == Opnd_src0 || srcPos == Opnd_src1 ||
        srcPos == Opnd_src2 || srcPos == Opnd_src3 ||
        srcPos == Opnd_src4 || srcPos == Opnd_src5 ||
        srcPos == Opnd_src6 || srcPos == Opnd_src7 ||
        srcPos == Opnd_pred ||
        srcPos == Opnd_implAccSrc, "unexpected operand number");
    useInstList.emplace_back(inst, srcPos);
    inst->defInstList.emplace_back(this, srcPos);
}
1419
1420 // exchange def/use info of src0 and src1 after they are swapped.
// exchange def/use info of src0 and src1 after they are swapped.
// For every def edge on this instruction at position srcIxA/srcIxB the
// position is flipped, and the matching use edge on the defining
// instruction is flipped too.  Each defining instruction's use list is
// walked only once (handledDefInst) since one pass already flips both
// positions; visiting it again would swap them back.
void G4_INST::swapDefUse(Gen4_Operand_Number srcIxA, Gen4_Operand_Number srcIxB)
{
    DEF_EDGE_LIST_ITER iter = defInstList.begin();
    //To avoid redundant define and use items
    INST_LIST handledDefInst;

    // since ACC is only exposed in ARCTAN intrinsic translation, there is no instruction split with ACC
    while (iter != defInstList.end())
    {
        // Flip the operand position on this def edge, or skip edges that
        // reference neither swapped source.
        if ((*iter).second == srcIxB)
        {
            (*iter).second = srcIxA;
        }
        else if ((*iter).second == srcIxA)
        {
            (*iter).second = srcIxB;
        }
        else
        {
            iter++;
            continue;
        }
        // Don't re-walk a def instruction whose use list was already fixed.
        if (std::find(handledDefInst.begin(), handledDefInst.end(), (*iter).first) != handledDefInst.end())
        {
            iter++;
            continue;
        }
        handledDefInst.push_back((*iter).first);
        // change uselist of def inst
        USE_EDGE_LIST_ITER useIter = (*iter).first->useInstList.begin();
        for (; useIter != (*iter).first->useInstList.end(); useIter++)
        {
            // Only edges pointing back at this instruction are affected.
            if ((*useIter).first == this)
            {
                if ((*useIter).second == srcIxB)
                {
                    (*useIter).second = srcIxA;
                }
                else if ((*useIter).second == srcIxA)
                {
                    (*useIter).second = srcIxB;
                }
            }
        }
        iter++;
    }
}
1468
1469 // returns true if inst is a commutable binary instruction and its two sources can be swapped
// returns true if inst is a commutable binary instruction and its two sources can be swapped.
// Note: each restriction below is checked against the operand's
// POST-swap position — src0 is validated against src1's encoding rules
// (it will become src1 after the swap) and vice versa, which is why the
// comments appear "crossed".
bool G4_INST::canSwapSource() const
{
    if (getNumSrc() != 2)
    {
        return false;
    }

    if (!INST_COMMUTATIVE(opcode()))
    {
        return false;
    }

    G4_Operand* src0 = getSrc(0);
    G4_Operand* src1 = getSrc(1);
    // src1 restrictions: no ARF, no VXH
    // (applied to src0, which lands in src1 after the swap)
    if (src0->isSrcRegRegion())
    {
        G4_SrcRegRegion* src0Region = src0->asSrcRegRegion();
        if (src0Region->isAreg() || src0Region->getRegion()->isRegionWH())
        {
            return false;
        }
    }

    // src0 restrictions: no Imm
    // (applied to src1, which lands in src0 after the swap)
    if (src1->isImm() || src1->isAddrExp())
    {
        return false;
    }

    // special check for mul: don't put DW on src1
    if (opcode() == G4_mul)
    {
        if (IS_DTYPE(src0->getType()) && !IS_DTYPE(src1->getType()))
        {
            return false;
        }
    }

    return true;
}
1511 // fix src2 def/use to implicitSrc def/use
fixMACSrc2DefUse()1512 void G4_INST::fixMACSrc2DefUse()
1513 {
1514 if (op != G4_mac)
1515 {
1516 return;
1517 }
1518 for (DEF_EDGE_LIST_ITER iter = defInstList.begin();
1519 iter != defInstList.end();
1520 iter++)
1521 {
1522 if ((*iter).second == Opnd_src2)
1523 {
1524 (*iter).second = Opnd_implAccSrc;
1525 G4_INST* defInst = (*iter).first;
1526 for (USE_EDGE_LIST_ITER useIter = defInst->useInstList.begin();
1527 useIter != defInst->useInstList.end();
1528 ++useIter)
1529 {
1530 if (((*useIter).first == this) &&
1531 ((*useIter).second == Opnd_src2))
1532 {
1533 (*useIter).second = Opnd_implAccSrc;
1534 break;
1535 }
1536 }
1537 break;
1538 }
1539 }
1540 }
1541
1542 // a raw move is a move with
1543 // -- no saturation or src modifiers
1544 // -- same dst and src type
1545 // -- no conditional modifier (predicate is ok)
isRawMov() const1546 bool G4_INST::isRawMov() const
1547 {
1548 return op == G4_mov && !sat && dst->getType() == srcs[0]->getType() &&
1549 getCondMod() == NULL &&
1550 (srcs[0]->isImm() ||
1551 (srcs[0]->isSrcRegRegion() && srcs[0]->asSrcRegRegion()->getModifier() == Mod_src_undef));
1552 }
1553
hasACCSrc() const1554 bool G4_INST::hasACCSrc() const
1555 {
1556 if (implAccSrc ||
1557 (srcs[0] && srcs[0]->isSrcRegRegion() && srcs[0]->asSrcRegRegion()->isAccReg()))
1558 {
1559 return true;
1560 }
1561 return false;
1562 }
1563
1564 // check if acc is possibly used by this instruction
hasACCOpnd() const1565 bool G4_INST::hasACCOpnd() const
1566 {
1567 return (isAccWrCtrlInst() ||
1568 implAccSrc ||
1569 implAccDst ||
1570 (op == G4_mulh &&
1571 IS_DTYPE(srcs[0]->getType()) && IS_DTYPE(srcs[1]->getType())) ||
1572 (dst && dst->isAccReg()) ||
1573 (srcs[0] && srcs[0]->isAccReg()) ||
1574 (srcs[1] && srcs[1]->isAccReg()) ||
1575 (srcs[2] && srcs[2]->isAccReg()) ||
1576 op == G4_madw);
1577 }
1578
getOpExecType(int & extypesize)1579 G4_Type G4_INST::getOpExecType(int& extypesize)
1580 {
1581 G4_Type extype;
1582 if (isRawMov())
1583 {
1584 extype = srcs[0]->getType();
1585 }
1586 else
1587 {
1588 extype = getExecType2();
1589 }
1590 if (IS_VINTTYPE(extype))
1591 {
1592 extypesize = numEltPerGRF<Type_UB>()/2;
1593 }
1594 else if (IS_VFTYPE(extype))
1595 {
1596 extypesize = numEltPerGRF<Type_UB>();
1597 }
1598 else
1599 {
1600 extypesize = TypeSize(extype);
1601 }
1602
1603 return extype;
1604 }
1605
// Classify the conversion a MOV performs from 'srcTy' (with source
// modifier 'srcMod') to 'dstTy': Copy, ZExt/SExt/Trunc for integer
// pairs, FPUpConv/FPDownConv for float pairs, IntToFP/FPToInt across
// domains, or SuperMov when it cannot be classified for propagation.
// 'Inst' is consulted only to inspect an immediate src0's sign bit.
static G4_INST::MovType getMovType(
    const G4_INST* Inst, G4_Type dstTy, G4_Type srcTy, G4_SrcModifier srcMod)
{
    // COPY when dst & src types are the same.
    if (dstTy == srcTy)
        return G4_INST::Copy;

    bool dstIsFP = IS_TYPE_FLOAT_ALL(dstTy);
    bool srcIsFP = IS_TYPE_FLOAT_ALL(srcTy);

    // If dst & src are not both FPs or both Integers, that MOV must be
    // conversions from Integer to FP or vice versa.
    if (dstIsFP != srcIsFP) {
        if (dstIsFP)
            return G4_INST::IntToFP;

        ASSERT_USER(srcIsFP, "Unexpected source type!");
        return G4_INST::FPToInt;
    }

    // If they are both FPs, that MOV must be either up or down conversion.
    // Note it could not be a COPY as dst & src are different here.
    if (dstIsFP) {
        ASSERT_USER(srcIsFP, "Unexpected source type!");

        // TODO: Do we need to treat 'vf' differently?

        if (TypeSize(srcTy) < TypeSize(dstTy))
            return G4_INST::FPUpConv;

        ASSERT_USER(TypeSize(srcTy) > TypeSize(dstTy),
            "Unexpected FP source and destination type sizes!");
        return G4_INST::FPDownConv;
    }

    // They are both Integers. The destination signedness is ignored here to
    // detect the mov type as it really does not matter without saturation nor
    // condition modifier.

    ASSERT_USER(!IS_VINTTYPE(dstTy),
        "Unexpected immediate types are used as dst type!");

    // Always treat 'v' as SExt as they will always be extended even for
    // BYTE-sized types.
    if (srcTy == Type_V) {
        // If the sign bit is 0, then zext is the same as sext.
        // prefer zext as it allows more propagation.
        G4_Operand *Op0 = Inst->getSrc(0);
        if (Op0->isImm() && Op0->asImm()->isSignBitZero())
            return G4_INST::ZExt;
        return G4_INST::SExt;
    }

    // Always treat 'uv' as ZExt as they will always be extended even for
    // BYTE-sized types.
    if (srcTy == Type_UV)
        return G4_INST::ZExt;

    // Treat that mov as truncation.
    if (TypeSize(srcTy) > TypeSize(dstTy))
    {
        // A modifier (other than Not) on a signed source changes the value
        // before truncation, so such a mov cannot be classified as Trunc.
        if (IS_SIGNED_INT(srcTy) &&
            srcMod != Mod_src_undef &&
            srcMod != Mod_Not)
        {
            return G4_INST::SuperMov;
        }
        else
        {
            return G4_INST::Trunc;
        }
    }

    // Treat that mov as sign extend or zero extend based on the signedness of
    // the source type only.
    if (TypeSize(srcTy) < TypeSize(dstTy)) {
        if (IS_SIGNED_INT(srcTy)) {
            // Treat ABS as zero-extenstion.
            if (srcMod == Mod_Abs)
                return G4_INST::ZExt;
            // If the sign bit is 0, then zext is the same as sext.
            // prefer zext as it allows more propagation.
            G4_Operand *Op0 = Inst->getSrc(0);
            if (Op0->isImm() && Op0->asImm()->isSignBitZero())
                return G4_INST::ZExt;

            return G4_INST::SExt;
        }
        else if (srcMod == Mod_Minus || srcMod == Mod_Minus_Abs)
        { // SrcMod=negate means that number is signed
            return G4_INST::SExt;
        }
        return G4_INST::ZExt;
    }

    // Otherwise, treat it as COPY they are the same in bit size.
    // Treat ABS as zero-extenstion.
    if (IS_SIGNED_INT(srcTy) && srcMod == Mod_Abs)
        return G4_INST::ZExt;
    return G4_INST::Copy;
}
1707
1708 // check if this instruction can be propagated
// Determine whether this MOV may be copy-propagated into its uses.
// Returns the classified MovType (Copy, ZExt, SExt, Trunc, FPUpConv,
// FPDownConv/FPDownConvSafe) when propagation is possible, or SuperMov
// when it is not.  Per-use legality is checked separately in
// canPropagateTo().
G4_INST::MovType G4_INST::canPropagate() const
{
    G4_Declare* topDcl = NULL;

    if (dst == NULL)
    {
        return SuperMov;
    }

    topDcl = dst->getTopDcl();

    if (op != G4_mov
        // Do not eliminate if either sat or condMod is present.
        || getSaturate() || getCondMod()
        // Do not eliminate if there's no use (dead or side-effect code?)
        || useInstList.size() == 0
        // Do not eliminate stack call return value passing instructions.
        // Do not eliminate vars marked with Output attribute
        || (topDcl && topDcl->isOutput()))
    {
        return SuperMov;
    }

    // can't propagate stack call related variables (Arg, Retval, SP, FP)
    if (topDcl)
    {
        G4_Declare* rootDcl = topDcl->getRootDeclare();
        if (builder.isPreDefFEStackVar(rootDcl) || builder.isPreDefArg(rootDcl) ||
            builder.isPreDefRet(rootDcl))
        {
            return SuperMov;
        }
    }


    // Do not eliminate MOV/COPY to Acc/flag registers.
    if (dst->isAccReg() || dst->isFlag())
    {
        return SuperMov;
    }

    // Retain side effect of writing to debug register.
    if (dst->isDbgReg())
    {
        return SuperMov;
    }

    G4_Operand *src = srcs[0];

    // Relocation immediates are patched later; their value is unknown here.
    if (src->isRelocImm())
    {
        return SuperMov;
    }

    // only support flag propagation for simd1 copy moves
    if (src->isFlag())
    {
        if (getExecSize() != g4::SIMD1 || src->getType() != dst->getType())
        {
            return SuperMov;
        }
    }

    // Do not propagate through copy of `acc0` if its execution size does not match the native size,
    // as some latest passes (e.g., fixAddCSubb) rely on the acc0 copy move for correctness
    if (src->isAccReg() && getExecSize() != builder.getNativeExecSize())
    {
        return SuperMov;
    }

    // Globally-visible destinations must keep their write.
    if (builder.kernel.fg.globalOpndHT.isOpndGlobal(dst))
    {
        return SuperMov;
    }

    G4_Type dstType = dst->getType();
    G4_Type srcType = src->getType();

    // Without a byte-capable ALU, byte-typed movs cannot be folded away.
    if (!builder.hasByteALU()
        && (TypeSize(dstType) == 1 || TypeSize(srcType) == 1))
    {
        return SuperMov;
    }

    G4_SrcModifier srcMod = Mod_src_undef;
    if (src->isSrcRegRegion()) {
        srcMod = src->asSrcRegRegion()->getModifier();
    }

    MovType MT = getMovType(this, dstType, srcType, srcMod);

    //Disabling mix mode copy propogation
    if (!builder.hasMixMode() &&
        ((IS_TYPE_F32_F64(srcType) && isLowPrecisionFloatTy(dstType)) ||
        (isLowPrecisionFloatTy(srcType) && IS_TYPE_F32_F64(dstType))))
    {
        return SuperMov;
    }

    // Selectively enable copy propagation on the detected mov type.
    switch (MT) {
    default:
        return SuperMov;
    case Copy:
    case ZExt:
    case SExt:
        // COPY and integer extending are allowed.
        break;
    case Trunc: {
        if (!src->isSrcRegRegion())
            return SuperMov;
        G4_SrcRegRegion *src0 = src->asSrcRegRegion();
        if (src0->getRegion()->isContiguous(getExecSize())) {
            // Truncation is modeled by widening the horizontal stride.
            unsigned newHS = TypeSize(srcType) / TypeSize(dstType);
            if (newHS > 4) {
                // Rule out Q -> B. WHY?
                return SuperMov;
            }
        } else if (!src0->isScalar()) {
            return SuperMov;
        }
        break;
    }
    case FPUpConv:
        // For FPUpConv, only HF -> F is allowed.
        if (!(srcType == builder.getMixModeType() && dstType == Type_F))
            return SuperMov;
        break;
    case FPDownConv:
    {
        // Down-conversion is only "safe" to propagate for F/DF -> mix-mode
        // type with a single use, and only under the unsafe-CP option.
        if (IS_TYPE_F32_F64(srcType) &&
            builder.getMixModeType() == dstType &&
            builder.getOption(vISA_enableUnsafeCP_DF) &&
            useInstList.size() == 1)
            return FPDownConvSafe;
        break;
    }
        // TODO: Enable IntToFP or vice versa on constant.
    }

    return MT;
}
1851
// Check whether this binary add/mul is a candidate for folding into a
// ternary instruction (e.g. add3/mad).  Requires an unpredicated,
// unsaturated 2-source op with a direct, packed, 32/64-bit integer GRF
// destination that is not globally visible, and direct GRF or immediate
// (non-relocation) sources.
bool G4_INST::canPropagateBinaryToTernary() const
{
    if (opcode() != G4_add && opcode() != G4_mul)
        return false; // constrain just to a few ops for the moment
    else if (dst == nullptr)
        return false;
    else if (!dst->getBase()->isRegVar() && !dst->getBase()->isPhyGreg())
        return false; // must be GRF dst
    else if (dst->isIndirect())
        return false; // must not be indirect
    else if (dst->getHorzStride() != 1)
        return false; // must be <1>
    else if (
        dst->getType() != Type_D && dst->getType() != Type_UD &&
        dst->getType() != Type_Q && dst->getType() != Type_UQ)
        return false; // dst has to be :d or :ud (for now)
    else if (builder.kernel.fg.globalOpndHT.isOpndGlobal(dst))
        return false; // writes to globals must be visible
    else if (getNumSrc() != 2)
        return false; // must be binary
    else if (getPredicate())
        return false; // no predicates
    else if (getExecSize() != 1 && dst->getSubRegOff() != 0)
        return false; // must be dst.0 or SIMD1 to any subreg
    else if (getImplAccDst() || getImplAccSrc())
        return false; // no {AccWrEn}
    else if (getSaturate() || getCondMod())
        return false; // do not eliminate if either sat or condMod is present.
    else if (useInstList.size() == 0)
        return false; // do not eliminate if there's no use (dead or side-effect code?)

    G4_Declare* topDcl = dst->getTopDcl();
    if (topDcl) {
        // Do not eliminate stack call return value passing instructions.
        // Do not eliminate vars marked with Output attribute.
        if (topDcl->isOutput())
            return false;
        G4_Declare* rootDcl = topDcl->getRootDeclare();
        if (builder.isPreDefFEStackVar(rootDcl) || builder.isPreDefArg(rootDcl) ||
            builder.isPreDefRet(rootDcl))
        {
            // can't propagate stack call related variables (Arg, Retval, SP, FP)
            return false;
        }
    }

    // Both sources must be direct GRF regions or plain immediates.
    for (int srcIx = 0; srcIx < getNumSrc(); srcIx++) {
        G4_Operand *src = srcs[srcIx];

        if (!src->isSrcRegRegion() && !src->isImm()) {
            return false; // only GRF
        } else if (src->isRelocImm()) {
            return false;
        }
        if (src->isSrcRegRegion()) {
            const G4_SrcRegRegion *srr = src->asSrcRegRegion();
            if (!srr->getBase()->isRegVar() && !srr->getBase()->isPhyGreg()) {
                return false; // has to be GRF
            } else if (srr->isIndirect()) {
                return false; // has to be direct
            }
        }
    }

    return true;
}
1918
1919 // Check to see whether the given type is supported by this opcode + operand. Mainly focus on integer ops
1920 // This is used by copy propagation and def-hoisting to determine if the resulting instruction is legal
// Check whether 'type' is supported by this opcode at operand position
// 'opndNum' (dst or src).  Used by copy propagation and def-hoisting to
// decide if the rewritten instruction would still be legal; opcodes not
// listed are optimistically accepted.
bool G4_INST::isLegalType(G4_Type type, Gen4_Operand_Number opndNum) const
{
    bool isSrc = (opndNum == Opnd_src0 || opndNum == Opnd_src1 || opndNum == Opnd_src2);
    switch (op)
    {
    default:
        // ToDo: Make this function more complete by adding more opcodes
        // keep alphabetical order when adding to make it easier to maintain
        return true;
    case G4_addc:
        return type == Type_UD;
    case G4_bfe:
    case G4_bfi1:
    case G4_bfi2:
        // additionally check src and dst have same type
        return (type == Type_D || type == Type_UD) &&
            (isSrc ? type == dst->getType() : type == getSrc(0)->getType());
    case G4_bfrev:
        return type == Type_UD;
    case G4_cbit:
        return type == Type_UB || type == Type_UW || type == Type_UD;
    case G4_fbh:
        return type == Type_D || type == Type_UD;
    case G4_fbl:
        return type == Type_UD;
    case G4_lzd:
        return type == Type_D || type == Type_UD;
    case G4_sad2:
    case G4_sada2:
        return type == Type_B || type == Type_UB;
    case G4_subb:
        return type == Type_UD;
    case G4_mov:
        // Avoid mov r7.0<1>:hf  0x76543210:v
        if (IS_VINTTYPE(type) &&
            (IS_FTYPE(dst->getType()) || IS_HFTYPE(dst->getType())))
        {
            return false;
        }
        return true;
    case G4_bfn:
        // do not allow copy propagation to change BFN operand type
        if (isSrc && type != getOperand(opndNum)->getType())
        {
            return false;
        }
        // fall through
    case G4_add3:
        return type == Type_W || type == Type_UW || type == Type_D || type == Type_UD;
    }
}
1972
1973 // returns true if inst supports only F type for both src and dst
isFloatOnly() const1974 bool G4_INST::isFloatOnly() const
1975 {
1976 switch (op)
1977 {
1978 default:
1979 return false;
1980 case G4_dp2:
1981 case G4_dp3:
1982 case G4_dp4:
1983 case G4_dph:
1984 case G4_frc:
1985 case G4_line:
1986 case G4_lrp:
1987 case G4_pln:
1988 case G4_rndd:
1989 case G4_rnde:
1990 case G4_rndu:
1991 case G4_rndz:
1992 return true;
1993 }
1994 }
1995
1996 /// isSignSensitive() - Check whether this instruction is sign sensitive on the
1997 /// specified source operand.
// Check whether this instruction is sign sensitive on the source operand
// at 'opndNum', i.e. whether replacing that operand with one of a
// differently-signed (but same-sized) type could change the result.
bool G4_INST::isSignSensitive(Gen4_Operand_Number opndNum) const
{
    const G4_Operand *use = getOperand(opndNum);
    G4_Type useType = use->getType();
    G4_Type dstType = dst->getType();

    // If extending is required, most of insts are sign sensitive.
    if (TypeSize(dstType) > TypeSize(useType)) {
        return true;
    }

    switch (op) {
    case G4_asr:
        // asr is sign sensitive only on the value being shifted (src0),
        // not on the shift count.
        if (opndNum != Opnd_src0)
            break;
        // FALL THROUGH
    case G4_mach:
    case G4_fbh:
    case G4_mulh:
    case G4_sel:
    case G4_cmp:
    case G4_cmpn:
    case G4_madw:
        return true;
    case G4_mov:
        // inttofp is sign sensitive
        return IS_TYPE_INT(useType) && IS_TYPE_FLOAT_ALL(dstType);
    default:
        break;
    }
    // By default, inst is regarded as sign insensitive.
    return false;
}
2031
// Compute the operand type to use at position 'opndNum' of this (use)
// instruction if the value defined by the MOV 'mov' (classified as 'MT'
// by canPropagate) is propagated into it.  Returns Type_UNDEF when the
// propagation would not preserve semantics for that operand.
G4_Type G4_INST::getPropType(
    Gen4_Operand_Number opndNum, MovType MT, const G4_INST *mov) const
{
    const G4_Operand *use = getOperand(opndNum);
    G4_Type useType = use->getType();
    G4_Type srcType = mov->getSrc(0)->getType();

    G4_SrcModifier srcMod = Mod_src_undef;
    if (mov->getSrc(0)->isSrcRegRegion()) {
        srcMod = mov->getSrc(0)->asSrcRegRegion()->getModifier();
    }
    G4_SrcModifier useMod = Mod_src_undef;
    if (use->isSrcRegRegion()) {
        useMod = use->asSrcRegRegion()->getModifier();
    }

    bool useIsFP = IS_TYPE_FLOAT_ALL(useType);
    bool srcIsFP = IS_TYPE_FLOAT_ALL(srcType);
    // Different numeric type.
    bool diffNumTy = useIsFP != srcIsFP;

    // TODO: Once we handle IntToFp, this condition should be checked
    // individually for each MovType.

    switch (MT) {
    case Copy:
        // Different numeric type with src mod cannot be propagated.
        if (diffNumTy && srcMod != Mod_src_undef)
            return Type_UNDEF;
        // Fp is simply to use useType.
        if (useIsFP)
            return useType;
        // Int needs to consider whether the use is sign-sensitive and the src
        // modifier.
        if (isSignSensitive(opndNum)) {
            switch (srcMod) {
            case Mod_Not:
            case Mod_Minus:
            case Mod_Minus_Abs:
                if (IS_UNSIGNED_INT(useType))
                    return Type_UNDEF;
                // Assume the combination of srcMod/srcType is valid.
                // FALL THROUGH
            case Mod_Abs:
                return srcType;
            default:
                break;
            }
        }
        else if (srcMod == Mod_Abs && IS_UNSIGNED_INT(useType) &&
                 IS_SIGNED_INT(srcType))
            return srcType;
        return useType;
    case ZExt:
        // Different numeric type with src zero-extended cannot be propagated.
        if (diffNumTy)
            return Type_UNDEF;
        // (sext (zext x)) is equal to (zext x)
        return srcType;
    case SExt:
        // Different numeric type with src sign-extended cannot be propagated.
        if (diffNumTy)
            return Type_UNDEF;
        // (zext (sext x)) is not equal to (sext x)
        if (IS_UNSIGNED_INT(useType))
            return Type_UNDEF;
        // Check if there's any modifier on the use.
        switch (useMod) {
        case Mod_Not:
        case Mod_Minus:
        case Mod_Minus_Abs:
            if (IS_QTYPE(useType) && IS_DTYPE(srcType)) {
                // (- (sext x)) is not equal to (sext (-x)) due to the corner case
                // where x is INT_MIN and -x is still INT_MIN without being
                // extended.
                return Type_UNDEF;
            }
            // FALL THROUGH
        default:
            break;
        }
        return srcType;
    case Trunc:
        if (diffNumTy)
            return Type_UNDEF;
        // Truncation always use the useType but the original source operand.
        // As a result, region needs changing to access the truncated bits
        // only.
        return useType;
    case FPUpConv:
        // Different numeric type with src up-converted cannot be propagated.
        if (diffNumTy)
            return Type_UNDEF;
        return srcType;
    case FPDownConvSafe:
        return srcType;
    default:
        break;
    }

    return Type_UNDEF;
}
2134
isLegalImmType(G4_Type type)2135 static bool isLegalImmType(G4_Type type)
2136 {
2137 return type != Type_BF;
2138 return true;
2139 }
2140
2141 // cases that we do not propagate
2142 // 0. use inst does not support the type of the operand being propagated
2143 // 1. use inst is align16 instruction
2144 // 2. first source of line
2145 // 3. indirect source to compressed instructions or math instructions
2146 // 4. byte src to if/while instructions
2147 // 5. src with modifier to logic inst on BDW
2148 // 6. When useinst is lifetime.end
2149 // 7. use inst does not have dst
canPropagateTo(G4_INST * useInst,Gen4_Operand_Number opndNum,MovType MT,bool inSimdFlow,bool statelessAddr)2150 bool G4_INST::canPropagateTo(
2151 G4_INST *useInst, Gen4_Operand_Number opndNum, MovType MT, bool inSimdFlow, bool statelessAddr)
2152 {
2153 G4_Operand *src = srcs[0];
2154 bool indirectSrc = src->isSrcRegRegion() &&
2155 src->asSrcRegRegion()->getRegAccess() != Direct;
2156 bool hasModifier = src->isSrcRegRegion() &&
2157 src->asSrcRegRegion()->getModifier() != Mod_src_undef;
2158 G4_Type dstType = dst->getType();
2159 G4_Type srcType = src->getType();
2160
2161 G4_Operand *use = useInst->getOperand(opndNum);
2162 G4_Type useType = use->getType();
2163
2164 //If the operand to be copied is acc register, need to check if the use operand can use acc register
2165 if (src->isAccReg())
2166 {
2167 if (!useInst->canSrcBeAccBeforeHWConform(opndNum))
2168 {
2169 return false;
2170 }
2171 }
2172
2173 if (useInst->is2SrcAlign16())
2174 {
2175 // don't copy propagate for the legacy dp* instructions,
2176 // as we are missing some HW conformity checks for them
2177 return false;
2178 }
2179
2180 // Skip lifetime.
2181 if (useInst->isLifeTimeEnd())
2182 {
2183 return false;
2184 }
2185
2186 // Skip dpas as it has no region (maybe too conservative)
2187 if (useInst->isDpas())
2188 {
2189 return false;
2190 }
2191
2192 // skip the instruction has no dst. e.g. G4_pseudo_fcall
2193 if (useInst->getDst() == nullptr)
2194 return false;
2195
2196 // limit flag copy propagation to opcode known to work for now
2197 if (src->isFlag() && (useInst->opcode() != G4_not && useInst->opcode() != G4_and))
2198 {
2199 return false;
2200 }
2201
2202 if (isMixedMode())
2203 {
2204 // FIXME: what's this for?
2205 if (execSize < g4::SIMD16 && MT == FPDownConvSafe && useInst->execSize == g4::SIMD16 &&
2206 !useInst->isMixedMode())
2207 {
2208 return false;
2209 }
2210
2211 G4_opcode useOp = useInst->opcode();
2212
2213 if (useOp != G4_mov &&
2214 useOp != G4_mul &&
2215 useOp != G4_pseudo_mad &&
2216 useOp != G4_add &&
2217 useOp != G4_sel &&
2218 useOp != G4_cmp)
2219 {
2220 return false;
2221 }
2222 }
2223 else if (srcType != useType && (useInst->opcode() == G4_mulh || useInst->opcode() == G4_madw))
2224 {
2225 // don't propagate widening ops into a mul/mach
2226 // mov T:d SRC:w
2227 // ...
2228 // mach ... T:d ...
2229 // mach requires 32b types only
2230 return false;
2231 }
2232
2233
2234 // special checks for message desc/extended desc, which must be either a0 or imm
2235 if (useInst->isSend())
2236 {
2237 auto msgDescOpnd = useInst->isSplitSend() ? Opnd_src2 : Opnd_src1;
2238 if (opndNum == msgDescOpnd)
2239 {
2240 if (!src->isImm() && !src->isAddress())
2241 {
2242 return false;
2243 }
2244 }
2245 if (opndNum == Opnd_src3)
2246 {
2247 // there are some HW restrictions that prevent imm exdesc (e.g., on MRT write),
2248 // so we conservatively disable copy prop here
2249 return false;
2250 }
2251 }
2252
2253 // The following are copied from local dataflow analysis.
2254 // TODO: re-examine..
2255 if (((opndNum == Opnd_src0 && useInst->isSend()) && !statelessAddr) ||
2256 (opndNum == Opnd_src1 && useInst->isSplitSend()))
2257 {
2258 return false;
2259 }
2260
2261 auto isFloatPseudoMAD = [](G4_INST *inst)
2262 {
2263 return inst->opcode() == G4_pseudo_mad && IS_TYPE_FLOAT_ALL(inst->getDst()->getType());
2264 };
2265
2266 // mov (16|M0) r47.0 1:w
2267 // (W) add (16|M0) r49.0 r47.0 r45.0
2268 //
2269 // FIXME: remove this once DU/UD chain are computed correctly.
2270 //
2271 // Only skip when the defInst ('this') is defined in SIMD CF.
2272 if (useInst->isWriteEnableInst() && !isWriteEnableInst() && inSimdFlow)
2273 {
2274 return false;
2275 }
2276
2277 if (useInst->opcode() == G4_fcvt)
2278 {
2279 // fcvt is not allowed to have immediate src.
2280 if (src->isImm() ||
2281 !src->isSrcRegRegion() ||
2282 !(src->asSrcRegRegion()->getRegion()->isContiguous(useInst->getExecSize())))
2283 {
2284 return false;
2285 }
2286 }
2287 if (useInst->opcode() == G4_srnd)
2288 {
2289 // srnd rZ.0<1>:ub rX.0<1;1,0>:hf rY.0<1;1,0>:hf
2290 // operands should be packed.
2291 if (useInst->getDst()->getType() == Type_UB &&
2292 src->isSrcRegRegion() &&
2293 !(src->asSrcRegRegion()->getRegion()->isContiguous(useInst->getExecSize())))
2294 {
2295 return false;
2296 }
2297 }
2298
2299 if (src->isImm())
2300 {
2301 if (isFloatPseudoMAD(useInst) || useInst->opcode() == G4_math ||
2302 use->asSrcRegRegion()->hasModifier())
2303 {
2304 return false;
2305 }
2306 } else if (indirectSrc &&
2307 (isFloatPseudoMAD(useInst) || useInst->opcode() == G4_math))
2308 {
2309 return false;
2310 }
2311 if (getGRFSize() == 64 &&
2312 (useInst->opcode() == G4_dpas || useInst->opcode() == G4_dpasw) &&
2313 (opndNum == Opnd_src0 || opndNum == Opnd_src1))
2314 {
2315 uint32_t leftBoundInBytes = src->getLeftBound() * src->getTypeSize();
2316 // left bound should be 2grf aligned to propagate into dpas.
2317 if (leftBoundInBytes % (numEltPerGRF<Type_UB>()*2))
2318 {
2319 return false;
2320 }
2321 }
2322
2323 // FIXME: to add specific checks for other instructions.
2324 G4_opcode useInst_op = useInst->opcode();
2325
2326 if (useInst_op == G4_madm || (useInst->isMath() && useInst->asMathInst()->isIEEEMath()))
2327 {
2328 // do not propagate if useInst uses mme registers
2329 return false;
2330 }
2331 if ((useInst_op == G4_line && opndNum == Opnd_src0) ||
2332 (hasModifier && G4_Inst_Table[useInst_op].instType == InstTypeLogic))
2333 {
2334 return false;
2335 }
2336
2337 bool isVxHSrc = indirectSrc && src->asSrcRegRegion()->getRegion()->isRegionWH();
2338 if (isVxHSrc && (useInst->getExecSize() != execSize || execSize >= g4::SIMD8))
2339 {
2340 // copy propagating VxH region may result in address spills later so it's usually a net loss
2341 return false;
2342 }
2343
2344 if ((useInst_op == G4_asr || useInst_op == G4_shl || useInst_op == G4_shr) &&
2345 opndNum == Opnd_src0 && src->getTypeSize() < use->getTypeSize())
2346 {
2347 // Handle cases such as
2348 // mov A:q B:d
2349 // asr r:d A:q C:q
2350 // if C is immediate and its value is in 0:31 (for d), it is okay to prop;
2351 // otherwise, no.
2352 G4_Operand* src1 = useInst->getOperand(Opnd_src1);
2353 if (src1->isImm())
2354 {
2355 // shiftAmt is LSB[0:useTypeBits - 1]
2356 int64_t v = src1->asImm()->getImm();
2357 uint32_t shiftAmt = (uint32_t)((uint64_t)v & (use->getTypeSize()*8 - 1));
2358 uint32_t nbits = 8 * src->getTypeSize();
2359 if (shiftAmt >= nbits)
2360 {
2361 return false;
2362 }
2363 }
2364 else
2365 {
2366 return false;
2367 }
2368 }
2369
2370 // In general, to check whether that MOV could be propagated:
2371 //
2372 // dst/T1 = src/T0;
2373 // op(..., use/T2, ...);
2374 //
2375 // We need firstly check whether 'dst' and 'use' are exactly the same
2376 // variable regardless data type.
2377
2378 // Check T1 and T2 has the same bit/byte size. Otherwise, it's not legal to
2379 // be propagated.
2380 // TODO: Revisit later if exection mask is guaranteed to be NoMask.
2381 if (TypeSize(dstType) != TypeSize(useType) && !statelessAddr) {
2382 return false;
2383 }
2384
2385 // Do not propagate if def type is float and use type is int, or vice
2386 // versa.
2387 // NOTE: Such usage is possible from bitcast (not through this MOV but the
2388 // reference in the use insn) from one type to another.
2389 // TODO: Revisit this later to handle the case where this MOV insn is
2390 // exactly a COPY. The useType should be used instead.
2391 if (MT != Copy && ((IS_TYPE_FLOAT_ALL(dstType) && IS_TYPE_INT(useType)) ||
2392 (IS_TYPE_INT(dstType) && IS_TYPE_FLOAT_ALL(useType))))
2393 {
2394 return false;
2395 }
2396
2397 if (MT == Copy &&
2398 hasModifier &&
2399 dstType != useType)
2400 {
2401 return false;
2402 }
2403
2404 if (hasModifier && !useInst->canSupportSrcModifier())
2405 {
2406 return false;
2407 }
2408
2409 // Check 'dst' of MOV and 'use' are the same variable. Otherwise, it's not
2410 // legal to be propagated.
2411 G4_CmpRelation rel = dst->compareOperand(use);
2412 if (rel != Rel_eq)
2413 {
2414 return false;
2415 }
2416
2417 // Type to be used after propagation. Use srcType by default.
2418 G4_Type propType = useInst->getPropType(opndNum, MT, this);
2419
2420 if (propType == Type_UNDEF || (src->isImm() && !isLegalImmType(propType)))
2421 {
2422 return false;
2423 }
2424
2425 // bfloat specific checks
2426 if (propType == Type_BF)
2427 {
2428 // If the useInst is G4_pseudo_mad and the use operand has source modifier, a invalid bf->bf mov with source modifier
2429 // may be inserted in fixMADInst(). So avoid propagating to G4_pseudo_mad source with source modifier.
2430 // TODO: a mov is not always inserted for G4_pseudo_mad source with source modifier since gen mad inst supports source
2431 // modifier. So for the no mov inserted case, avoid propagating may miss this opotimize. So, do we need to check if a mov
2432 // is really needed for G4_pseudo_mad source here? But the same check code in fixMADInst() seems very complicated?
2433 if (use->asSrcRegRegion()->hasModifier() && (useInst->isMov() || useInst->opcode() == G4_pseudo_mad))
2434 {
2435 // BF_CVT does not like source modifier
2436 return false;
2437 }
2438 if (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar() &&
2439 useInst->opcode() != G4_mov)
2440 {
2441 // HW has bug with scalar bfloat in mix mode instructions
2442 return false;
2443 }
2444 if (useInst->getDst()->getType() != Type_F)
2445 {
2446 // we currently don't handle BF->HF or BF->DF conversion
2447 return false;
2448 }
2449 }
2450
2451 // Don't propagate unsupported propType.
2452 if (!useInst->isLegalType(propType, opndNum))
2453 {
2454 return false;
2455 }
2456
2457 // TODO: Revisit this later as IntToFp could be folded on specific insts,
2458 // such as add, cmp, and mul, when types of all source operands could be
2459 // consistent.
2460 if (!(useInst->isRawMov() && dstType == useType) &&
2461 !(MT == Copy && propType == useType) &&
2462 ((IS_FTYPE(dstType) && (IS_TYPE_INT(propType) || IS_VINTTYPE(propType))) ||
2463 (IS_TYPE_INT(dstType) && (IS_FTYPE(propType) || IS_VFTYPE(propType)))))
2464 {
2465 return false;
2466 }
2467
2468 if (useInst->getSingleDef(opndNum) == nullptr)
2469 {
2470 return false;
2471 }
2472
2473 // Cannot generally safely propagate replicated vectors.
2474 unsigned dstElSize = TypeSize(dstType);
2475 unsigned srcElSize = TypeSize(propType);
2476 unsigned useElSize = TypeSize(useType);
2477
2478 const RegionDesc *rd =
2479 src->isSrcRegRegion() ? src->asSrcRegRegion()->getRegion() : nullptr;
2480 G4_ExecSize newExecSize = useInst->getExecSize();
2481 if ((useElSize != dstElSize && !statelessAddr) &&
2482 (!src->isSrcRegRegion()
2483 || rd->isRepeatRegion(execSize)
2484 || !(rd->isFlatRegion() && rd->isPackedRegion())))
2485 {
2486 return false;
2487 }
2488
2489 // Skip propagate scalar copies into the additive operand (src2) of integer
2490 // pseudo mad.
2491 if (!builder.hasAlign1Ternary())
2492 {
2493 if (opndNum == Opnd_src2 && useInst->opcode() == G4_pseudo_mad &&
2494 IS_TYPE_INT(useType) && rd && rd->isScalar())
2495 return false;
2496 }
2497
2498 // Check repeat region
2499 bool sameDefUseELSize = (dstElSize == useElSize);
2500 bool sameExecSize = (execSize == newExecSize);
2501 const RegionDesc *useRd =
2502 use->isSrcRegRegion() ? use->asSrcRegRegion()->getRegion() : nullptr;
2503 bool repeatUseRegion = useRd && useRd->isRepeatRegion(newExecSize);
2504 bool scalarUse = useRd && useRd->isScalar();
2505 bool repeatSrcRegion = (rd && rd->isRepeatRegion(execSize));
2506 if (!sameExecSize && !statelessAddr &&
2507 !((sameDefUseELSize && scalarUse) ||
2508 (!repeatUseRegion && rd && rd->isFlatRegion() && rd->isPackedRegion()) ||
2509 (repeatUseRegion && sameDefUseELSize && (src->isImm() || !repeatSrcRegion))))
2510 {
2511 return false;
2512 }
2513
2514 // Be conserversative, do not bother to do complicated region compositions.
2515 // There are three variables to compute the composition:
2516 // (1) the dst stride
2517 // (2) the source region
2518 // (3) the use source region
2519
2520 // dStride, the dst stride
2521 // stride1, stride2 must be positive
2522 auto isComposable = [=](unsigned dStride, unsigned stride1,
2523 unsigned stride2) -> bool
2524 {
2525 MUST_BE_TRUE(stride1 && stride2, "scalar region not expected");
2526
2527 // composition is rd1 (or rd2).
2528 // If two variables are trivial, then the other variable could be
2529 // arbitrary. E.g.
2530 //
2531 // mov (8) V81(0,0)<1>:w V80(0,0)<1;1,0>:w
2532 // add (16) V82(0,0)<1>:w V81(0,0)<0;8,1>:w 0xa:w
2533 //
2534 // where rd1 has stride 1, dStride = 1, rd2 is non single strided.
2535 if ((stride1 == 1 && dStride == 1) || (stride2 == 1 && dStride == 1))
2536 return true;
2537
2538 // If either stride1 or stride2 equals UndefVal, then there is no easy
2539 // formula to do the composition unless dStride == 1 and the other has
2540 // stride 1. This case is covered by the first check.
2541 //
2542 // To be composable, both regions need to be single strided (i.e. value
2543 // != UndefVal). This check is simplified by the value UndefVal (64).
2544 return stride1 * stride2 * dStride <= 32;
2545 };
2546
2547 if (!sameExecSize && rd && useRd)
2548 {
2549 // the compoisition is also scalar.
2550 if (!rd->isScalar() && !useRd->isScalar())
2551 {
2552 G4_DstRegRegion *dstRegion = dst;
2553 uint16_t dstStride = dstRegion->getHorzStride();
2554
2555 // A value to indicate this region is non-single strided.
2556 // Make it larger than 32 to simplify/unify the checking.
2557 const uint16_t UndefVal = 64;
2558
2559 uint16_t stride1 = UndefVal;
2560 if (rd->isContiguous(execSize))
2561 stride1 = 1;
2562 else
2563 rd->isSingleNonUnitStride(execSize, stride1);
2564
2565 uint16_t stride2 = UndefVal;
2566 if (useRd->isContiguous(newExecSize))
2567 stride2 = 1;
2568 else
2569 useRd->isSingleNonUnitStride(newExecSize, stride2);
2570
2571 if (!isComposable(dstStride, stride1, stride2))
2572 return false;
2573 }
2574 }
2575
2576 // check data type alignment
2577 if ((srcElSize < useElSize) &&
2578 (dstElSize == srcElSize) &&
2579 (execSize > g4::SIMD1) &&
2580 (!src->isImm()) &&
2581 ((src->getByteOffset() % useElSize) != 0))
2582 {
2583 return false;
2584 }
2585
2586 if (src->isImm() && use->asSrcRegRegion()->hasModifier())
2587 {
2588 //FIXME: do we need to worry about signal bit in NaN being dropped?
2589 if (IS_TYPE_INT(srcType))
2590 {
2591 // we can't represent -(INT_MIN) or abs(INT_MIN)
2592 int64_t value = src->asImm()->getImm();
2593 switch (propType)
2594 {
2595 case Type_Q:
2596 case Type_UQ:
2597 return value != LLONG_MIN;
2598 default:
2599 return value != INT_MIN;
2600 }
2601 }
2602 }
2603
2604 return true;
2605 }
2606
2607 // check if this inst can be hoisted
2608 // assume only MOV inst is checked
// Check whether this MOV instruction is a candidate for def-hoisting,
// i.e. whether its src could in principle be replaced by retargeting the
// defining instruction(s)' dst. Per-def legality is checked separately
// (see canHoistTo).
//   simdBB - true if this block executes under divergent SIMD control flow
//   opt    - compilation options (currently unused in this check)
bool G4_INST::canHoist(bool simdBB, const Options *opt) const
{
    assert(op == G4_mov && "defHoisting only handles mov");
    if (dst == NULL)
    {
        return false;
    }

    G4_Operand *src = srcs[0];
    // check attributes of src and number of defs
    bool archRegSrc = (src->isFlag() || src->isAreg() || src->isAddress());
    bool indirectSrc = (src->getTopDcl() && src->getTopDcl()->getAddressed()) || src->getRegAccess() != Direct;
    // with multiple defs, give up if the MOV is predicated, writes
    // indirectly, or sits in divergent control flow
    bool noMultiDefOpt = ((defInstList.size() > 1) &&
        (predicate || (dst->getRegAccess() != Direct) || simdBB));
    if (src->isImm() ||
        archRegSrc ||
        indirectSrc ||
        (src->isSrcRegRegion() && src->asSrcRegRegion()->getModifier() != Mod_src_undef) ||
        (defInstList.size() == 0) ||
        noMultiDefOpt)
    {
        return false;
    }

    // check type
    G4_Type dstType, srcType;
    dstType = dst->getType();
    srcType = src->getType();

    // no dst type promotion after hoisting
    if (!Is_Type_Included(dstType, srcType, builder) ||
        // if multi def, src and dst should have the same type size
        (defInstList.size() > 1 &&
        (Operand_Type_Rank(srcType) != Operand_Type_Rank(dstType) ||
        // if multidef and used as a scalar, execution size should be one.
        (src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar() && execSize > g4::SIMD1))))
    {
        return false;
    }

    // no opt for repeat regions (vertStride smaller than the row footprint)
    unsigned short src_wd = src->asSrcRegRegion()->getRegion()->width;
    if ((src_wd != execSize &&
        (src->asSrcRegRegion()->getRegion()->vertStride < (src_wd * src->asSrcRegRegion()->getRegion()->horzStride))) ||
        // actually we can hoist if src is a scalar and target inst has no pred or cond mod.
        (execSize > g4::SIMD1 && src->asSrcRegRegion()->isScalar()))
    {
        return false;
    }

    // never hoist when the src variable is a kernel output
    if (src->getTopDcl() && src->getTopDcl()->isOutput())
    {
        return false;
    }

    return true;
}
2666
2667 // check if this instruction can be hoisted to defInst
// Check whether this MOV may be hoisted into a specific defining
// instruction, i.e. whether defInst's dst can legally be retargeted to this
// MOV's dst (with this MOV's dst type). The checks run in rough order of
// cost: structural/opcode restrictions first, then type-conversion rules,
// then region/stride compatibility, then platform-specific restrictions.
//   defInst - the single def whose dst would be rewritten
//   simdBB  - true if this block executes under divergent SIMD control flow
bool G4_INST::canHoistTo(const G4_INST *defInst, bool simdBB) const
{
    assert(op == G4_mov && "defHoisting only handles mov");
    bool indirect_dst = (dst->getRegAccess() != Direct);

    auto def_dst = defInst->getDst();

    if (!def_dst)
    {
        // can this actually happen?
        return false;
    }
    G4_Type defDstType = def_dst->getType();
    G4_Type dstType = dst->getType(), srcType = srcs[0]->getType();
    unsigned int srcElSize = TypeSize(srcType);
    unsigned int dstElSize = TypeSize(dstType);
    unsigned int defDstElSize = TypeSize(defDstType);

    // cannot hoist an accumulator access into an instruction
    // that doesn't have a dst hz stride that matches source
    //   def (..) T<1> .. acc:d
    //   use (..) ...<2>:d T<1>
    // ==>
    //   def2 (..) ...<2>:d ... acc
    //                  ^ dst stride mismatch means we mustn't hoist
    if (defInst->useAcc() && dst->getExecTypeSize() != srcElSize) {
        return false;
    }

    bool rawMovInst = isRawMov();
    // pseudo_mad blocks hoisting unless both dst types are float-family
    bool cantHoistMAD =
        (defInst->opcode() == G4_pseudo_mad &&
            !(IS_TYPE_FLOAT_ALL(dstType) && IS_TYPE_FLOAT_ALL(defDstType)));
    // structural restrictions: defInst must have exactly one use (this MOV),
    // and certain opcodes cannot change their dst type at all
    if ((defInst->useInstList.size() != 1) ||
        (defInst->opcode() == G4_sad2) ||
        (defInst->opcode() == G4_sada2) ||
        (defInst->opcode() == G4_cbit && dstType != defDstType) ||
        (defInst->opcode() == G4_dp4a && dstType != defDstType) ||
        ((cantHoistMAD || (defInst->opcode() == G4_math)) &&
            (indirect_dst || (dstType != defDstType && !rawMovInst))))
    {
        return false;
    }

    if (!defInst->isLegalType(dstType, Opnd_dst))
    {
        return false;
    }

    if (isMixedMode())
    {
        G4_opcode defOp = defInst->opcode();

        // only a small set of opcodes may appear in mixed mode
        if (defOp != G4_mov &&
            defOp != G4_mul &&
            defOp != G4_pseudo_mad &&
            defOp != G4_add &&
            defOp != G4_sel &&
            defOp != G4_cmp)
        {
            return false;
        }
        if (!builder.hasMixMode())
        {
            // normally we should disable the opt, but for the special case where
            // defInst is a move with integer source, we can still hoist since it
            // won't produce a mixed mode inst
            if (!(defInst->isMov() && IS_TYPE_INT(defInst->getSrc(0)->getType())))
            {
                return false;
            }
        }
        if (!builder.getOption(vISA_ignoreBFRounding) && dstType == Type_BF && defOp != G4_mov)
        {
            // F->BF move has RNE mode while mix mode BF uses RTZ due to HW bug
            // so we have to disallow the def-hoisting
            return false;
        }
    }

    if (dst->isAddress() && defInst->getNumSrc() == 3)
    {
        // no A0 dst for ternary instructions
        return false;
    }

    // compare boundaries: defInst may write only a subset of what this MOV
    // reads, in which case retargeting its dst would drop channels
    if ((def_dst->getLeftBound() < srcs[0]->getLeftBound()) ||
        (def_dst->getRightBound() > srcs[0]->getRightBound()))
    {
        return false;
    }

    if (getSaturate() && !defInst->canSupportSaturate())
    {
        return false;
    }

    // check mixed type conversion
    // TODO: cleanup this part since mixed type check of the first half is already checked in canHoist.
    if ((!(defInst->isRawMov() && (defDstType == srcType)) &&
        ((IS_FTYPE(dstType) && (IS_TYPE_INT(srcType) || IS_VINTTYPE(srcType))) ||
            ((IS_FTYPE(srcType) || IS_VFTYPE(srcType)) && IS_TYPE_INT(dstType)))) ||
        (!rawMovInst &&
            ((IS_FTYPE(defDstType) && IS_TYPE_INT(defInst->getExecType())) ||
                (IS_FTYPE(defInst->getExecType()) && IS_TYPE_INT(defDstType)))))
    {
        return false;
    }

    if (!rawMovInst && (defInst->getSrc(0) &&
        (IS_DFTYPE(defInst->getSrc(0)->getType()) || IS_FTYPE(defInst->getSrc(0)->getType()))) &&
        (IS_SIGNED_INT(defDstType) || IS_UNSIGNED_INT(defDstType)))
    {
        // Sequence that should not be optimized:
        //      mov V1:d    V2:df
        //      mov V3:uw   V1:d
        //
        // This is *NOT* a candidate for:
        //      mov V3:uw   V2:df
        //
        // In general, df/f->int performs saturation and unless value of
        // df/f is known, the result of mov may differ based on type
        // of dst.
        return false;
    }

    // no def hoisting for sends for now
    if (defInst->isSend())
    {
        return false;
    }

    if (defInst->opcode() == G4_mov && defInst->getSrc(0)->isFlag())
    {
        // TODO: check if use is a predicate, if not, can propagate?
        return false;
    }

    if (simdBB && (defInst->isWriteEnableInst() ^ isWriteEnableInst()))
    {
        // no opt if one is NoMask but the other is not
        return false;
    }

    if (defInst->getMaskOffset() != getMaskOffset() &&
        (simdBB || getPredicate() || getCondMod() ||
            defInst->getPredicate() || defInst->getCondMod()))
    {
        // no opt if their mask offsets do not match,
        // and mov/defInst has flags
        return false;
    }

    if ((getPredicate() || getCondMod()) && (defInst->getPredicate() || defInst->getCondMod()))
    {
        // can't have both insts using flags
        return false;
    }

    bool same_type_size = def_dst->getTypeSize() == TypeSize(srcType);
    bool scalarSrc = srcs[0]->asSrcRegRegion()->isScalar();
    // handle predicated MOV and float def
    if ((getPredicate() && (execSize > g4::SIMD1) && !same_type_size) ||
        (IS_FTYPE(defDstType) && (defDstType != srcType) && (dstType != srcType)))
    {
        return false;
    }

    // if used as scalar and repeated region, dst should be packed
    // add(2) v2<1>:w v3 v4
    // mov(2) v5<2>:d V2<0;1,0>:d
    if (scalarSrc && !same_type_size &&
        (execSize > g4::SIMD1) && (dst->getHorzStride() != 1))
    {
        return false;
    }

    // if indirect source is repeat region, or defhoisting will make it a repeat region,
    // no opt
    if (srcs[0]->asSrcRegRegion()->getRegion()->isRepeatRegion(execSize) &&
        !scalarSrc)
    {
        return false;
    }

    // check type conversion: a widening mov whose def flips signedness
    // would change the extension (zero vs. sign) semantics
    if (IS_SIGNED_INT(dstType) && (defInst->opcode() == G4_mov) &&
        (TypeSize(dstType) > srcElSize) &&
        ((IS_SIGNED_INT(defDstType) && IS_UNSIGNED_INT(defInst->getSrc(0)->getType())) ||
            (IS_UNSIGNED_INT(defDstType) && IS_SIGNED_INT(defInst->getSrc(0)->getType()))))
    {
        return false;
    }

    // check alignment and saturate
    if (((srcElSize > defDstElSize) || defInst->getSaturate()) && (srcType != dstType))
    {
        return false;
    }

    // region/stride compatibility between this MOV's src and dst
    uint16_t dstHS = dst->getHorzStride();
    uint16_t srcHS = 0;
    const RegionDesc *srcRd = srcs[0]->asSrcRegRegion()->getRegion();
    if (!srcRd->isSingleStride(execSize, srcHS))
    {
        return false;
    }
    if ((srcElSize < defDstElSize) && ((dstHS > 1) || (srcHS > 1)))
    {
        return false;
    }
    if ((dstElSize != defDstElSize) && (srcElSize == dstElSize) &&
        (indirect_dst || ((dst->getByteOffset() % defDstElSize) != 0) ||
            (dstHS != srcHS)))
    {
        return false;
    }

    // don't hoist stack-call related variables (Arg, Retval, SP, FP)
    if (defInst->getDst() && defInst->getDst()->getTopDcl())
    {
        G4_Declare* defDstDcl = defInst->getDst()->getTopDcl()->getRootDeclare();
        if (builder.isPreDefFEStackVar(defDstDcl) || builder.isPreDefArg(defDstDcl) ||
            builder.isPreDefRet(defDstDcl))
        {
            return false;
        }
    }

    // For mov HF F, we have to check if the def inst supports HF
    if (dstType != Type_F && defInst->isFloatOnly() && !isRawMov())
    {
        return false;
    }

    // Before:
    // or (8) V100(0,0)<1>:d ...
    // or (8) V100(1,0)<1>:d ...
    // mov (16) V101(0,0)<1>:b    V102(0,0)<16;16,1>:w <-- V102 is alias of V100
    // mov (16) V101(0,16)<1>:b   V102(1,0)<16;16,1>:w
    //
    // After (invalid optimization):
    // or (8) V100(0,0)<1>:d ...
    // or (8) V100(0,4)<1>:d ...
    if (defDstType != srcType)
    {
        if (isRawMov() == false)
        {
            return false;
        }
    }

    // As the dst type of a shift inst decides what shifting amt should be
    // used, make sure the shifting amt would not be changed after hoisting.
    //    shl (1) V100(0,0)<1>:q V101(0,0):w V102(0,0)<0;1,0>:q
    //    mov (1) V103(0,0)<1>:b V100(0,0)<0;1,0>:q
    // Cannot do it for this case.
    if (defInst->opcode() == G4_shl || defInst->opcode() == G4_shr || defInst->opcode() == G4_asr)
    {
        uint32_t defSrc0Bytes = defInst->getSrc(0)->getTypeSize();
        bool QMode = (defDstElSize == 8 || defSrc0Bytes == 8);
        if ((QMode && defSrc0Bytes != 8 && dstElSize != 8) ||
            (!QMode && dstElSize == 8))
        {
            // Disable it; otherwise the shift's mode is changed illegally!
            return false;
        }
    }

    // Cannot do hoisting if the use inst has a src modifier.
    if (getSrc(0)->asSrcRegRegion()->hasModifier())
    {
        return false;
    }

    if (getGRFSize() == 64 &&
        (defInst->opcode() == G4_dpas || defInst->opcode() == G4_dpasw))
    {
        uint32_t leftBoundInBytes = dst->getLeftBound() * dst->getTypeSize();
        // left bound should be 2-GRF aligned to hoist dst into dpas.
        if (leftBoundInBytes % (numEltPerGRF<Type_UB>() * 2))
        {
            return false;
        }
    }
    // never retarget the dst of fcvt or srnd
    if (defInst->opcode() == G4_fcvt)
    {
        return false;
    }
    if (defInst->opcode() == G4_srnd)
    {
        return false;
    }

    return true;
}
2965
2966 // check if the sources of an inst is commutative
2967 // besides the property shown in inst table, some INT MUL instructions
2968 // are not commutative due to HW restrictions
isCommutative() const2969 bool G4_INST::isCommutative() const
2970 {
2971 //TODO: we can invert condMod of cmp to swap sources
2972 if (!(G4_Inst_Table[op].attributes & ATTR_COMMUTATIVE) || op == G4_cmp)
2973 return false;
2974
2975 // for mul we can do D*W but not W*D
2976 if (op == G4_mul)
2977 {
2978 if (IS_DTYPE(srcs[0]->getType()))
2979 {
2980 return false;
2981 }
2982 }
2983 return true;
2984 }
2985
hasNULLDst() const2986 bool G4_INST::hasNULLDst() const
2987 {
2988 if (dst && dst->isNullReg())
2989 {
2990 return true;
2991 }
2992
2993 return false;
2994 }
2995
goodTwoGRFDst(bool & evenSplitDst)2996 bool G4_INST::goodTwoGRFDst(bool& evenSplitDst)
2997 {
2998 evenSplitDst = false;
2999 // The following applies to all platforms
3000 // The problem is , the first case is really an instruction with two destination registers.
3001 // in which case, hardware breaks into two operations. When this happens, hardware cannot update flag registers.
3002 // I.e., if execution size is 8 or less and the destination register is 2, flag updates are not supported.
3003 // -naveen
3004
3005 if (!dst || hasNULLDst())
3006 {
3007 evenSplitDst = true;
3008 return true;
3009 }
3010 else
3011 {
3012 evenSplitDst = dst->evenlySplitCrossGRF(execSize);
3013 // check if elements evenly split between two GRFs
3014 if (evenSplitDst)
3015 {
3016 return true;
3017 }
3018 else
3019 {
3020 return false;
3021 }
3022 }
3023 }
3024
3025 // check if there is WAW, WAR, RAW dependency between the passing-in inst and this instruction
3026 // there is no check for the case that two instructions are both send, since these checks are
3027 // only used in def-joisting and copy propagation
isWARdep(G4_INST * inst)3028 bool G4_INST::isWARdep(G4_INST* inst)
3029 {
3030 G4_Operand* msg0 = NULL;
3031 G4_Operand* src0_0 = inst->getSrc(0);
3032 G4_Operand* src0_1 = inst->getSrc(1);
3033 G4_Operand* src0_2 = inst->getSrc(2);
3034 G4_Operand* src0_3 = inst->getSrc(3);
3035 G4_Operand* implicitSrc0 = inst->getImplAccSrc();
3036 G4_Predicate* pred0 = inst->getPredicate();
3037
3038 G4_Operand* dst1 = dst;
3039
3040 if (dst1 && !hasNULLDst())
3041 {
3042
3043 if (
3044 (src0_0 && src0_0->compareOperand(dst1) != Rel_disjoint) ||
3045 (src0_1 && src0_1->compareOperand(dst1) != Rel_disjoint) ||
3046 (src0_2 && src0_2->compareOperand(dst1) != Rel_disjoint) ||
3047 (src0_3 && src0_3->compareOperand(dst1) != Rel_disjoint) ||
3048 (msg0 && (msg0->compareOperand(dst1) != Rel_disjoint)) ||
3049 (pred0 && (pred0->compareOperand(dst1) != Rel_disjoint)) ||
3050 (implicitSrc0 && (implicitSrc0->compareOperand(dst1) != Rel_disjoint)))
3051 {
3052 return true;
3053 }
3054 }
3055
3056 if (mod)
3057 {
3058 if ((pred0 && pred0->compareOperand(mod) != Rel_disjoint) ||
3059 (src0_0 && src0_0->isFlag() && src0_0->compareOperand(mod) != Rel_disjoint) ||
3060 (src0_1 && src0_1->isFlag() && src0_1->compareOperand(mod) != Rel_disjoint) ||
3061 (src0_2 && src0_2->isFlag() && src0_2->compareOperand(mod) != Rel_disjoint))
3062 {
3063 return true;
3064 }
3065 }
3066
3067 if (implAccDst)
3068 {
3069 if ((implicitSrc0 && implicitSrc0->compareOperand(implAccDst) != Rel_disjoint) ||
3070 (src0_0 && src0_0->isAccReg() && src0_0->compareOperand(implAccDst) != Rel_disjoint) ||
3071 (src0_1 && src0_1->isAccReg() && src0_1->compareOperand(implAccDst) != Rel_disjoint) ||
3072 (src0_2 && src0_2->isAccReg() && src0_2->compareOperand(implAccDst) != Rel_disjoint))
3073 {
3074 return true;
3075 }
3076 }
3077 return false;
3078 }
3079
isWAWdep(G4_INST * inst)3080 bool G4_INST::isWAWdep(G4_INST *inst)
3081 {
3082 G4_Operand *dst0 = inst->getDst();
3083 G4_Operand *dst1 = dst;
3084 G4_CondMod *cMod0 = inst->getCondMod();
3085 G4_CondMod *cMod1 = mod;
3086 G4_Operand *implicitDst0 = inst->getImplAccDst();
3087 G4_Operand *implicitDst1 = implAccDst;
3088
3089 bool NULLDst1 = !dst1 || hasNULLDst();
3090 if (dst0 && !inst->hasNULLDst())
3091 {
3092 if ((!NULLDst1 && dst1->compareOperand(dst0) != Rel_disjoint) ||
3093 (implicitDst1 && implicitDst1->compareOperand(dst0) != Rel_disjoint) ||
3094 (cMod1 && cMod1->getBase() && cMod1->compareOperand(dst0) != Rel_disjoint))
3095 {
3096 return true;
3097 }
3098 }
3099
3100 if (implicitDst0)
3101 {
3102 if ((!NULLDst1 && dst1->compareOperand(implicitDst0) != Rel_disjoint) ||
3103 (implicitDst1 && implicitDst1->compareOperand(implicitDst0) != Rel_disjoint))
3104 {
3105 return true;
3106 }
3107 }
3108
3109 if (cMod0 && cMod0->getBase())
3110 {
3111 if ((!NULLDst1 && dst1->compareOperand(cMod0) != Rel_disjoint) ||
3112 (cMod1 && cMod1->getBase() && cMod1->compareOperand(cMod0) != Rel_disjoint))
3113 {
3114 return true;
3115 }
3116 }
3117
3118 return false;
3119 }
isRAWdep(G4_INST * inst)3120 bool G4_INST::isRAWdep(G4_INST *inst)
3121 {
3122 G4_Operand *dst0 = inst->getDst();
3123 G4_CondMod *cMod0 = inst->getCondMod();
3124 G4_Operand *implicitDst0 = inst->getImplAccDst();
3125 G4_Operand *msg1 = NULL;
3126 G4_Predicate *pred1 = getPredicate();
3127 G4_Operand *src1_0 = getSrc(0);
3128 G4_Operand *src1_1 = getSrc(1);
3129 G4_Operand *src1_2 = getSrc(2);
3130 G4_Operand* src1_3 = getSrc(3);
3131 G4_Operand *implicitSrc1 = implAccSrc;
3132
3133 bool NULLSrc1 = (opcode() == G4_math && src1_1->isNullReg());
3134 if (dst0 && !inst->hasNULLDst())
3135 {
3136 if ((src1_0 && src1_0->compareOperand(dst0) != Rel_disjoint) ||
3137 (src1_1 && !NULLSrc1 && src1_1->compareOperand(dst0) != Rel_disjoint) ||
3138 (src1_2 && src1_2->compareOperand(dst0) != Rel_disjoint) ||
3139 (src1_3 && src1_3->compareOperand(dst0) != Rel_disjoint) ||
3140 (msg1 && msg1->compareOperand(dst0) != Rel_disjoint) ||
3141 (pred1 && pred1->compareOperand(dst0) != Rel_disjoint) ||
3142 (implicitSrc1 && implicitSrc1->compareOperand(dst0) != Rel_disjoint))
3143 {
3144 return true;
3145 }
3146 }
3147
3148 if (cMod0 && cMod0->getBase())
3149 {
3150 if ((pred1 && pred1->compareOperand(cMod0) != Rel_disjoint) ||
3151 (src1_0 && src1_0->isFlag() && src1_0->compareOperand(cMod0) != Rel_disjoint) ||
3152 (src1_2 && src1_2->isFlag() && src1_2->compareOperand(cMod0) != Rel_disjoint) ||
3153 (src1_1 && src1_1->isFlag() && src1_1->compareOperand(cMod0) != Rel_disjoint))
3154 {
3155 return true;
3156 }
3157 }
3158
3159 if (implicitDst0)
3160 {
3161 if ((implicitSrc1 && implicitSrc1->compareOperand(implicitDst0) != Rel_disjoint) ||
3162 (src1_0 && src1_0->isAccReg() && src1_0->compareOperand(implicitDst0) != Rel_disjoint) ||
3163 (src1_2 && src1_2->isAccReg() && src1_2->compareOperand(implicitDst0) != Rel_disjoint) ||
3164 (src1_1 && src1_1->isAccReg() && src1_1->compareOperand(implicitDst0) != Rel_disjoint))
3165 {
3166 return true;
3167 }
3168 }
3169 return false;
3170 }
3171
detectComprInst() const3172 bool G4_INST::detectComprInst() const
3173 {
3174 enum class ComprInstStates : unsigned char { U, T, F };
3175
3176 G4_Type execType = getExecType();
3177 ComprInstStates comprInst = ComprInstStates::U;
3178
3179 // Compressed instructions must have a minimum execution size of
3180 // at least 8.
3181 if (execSize < g4::SIMD8)
3182 {
3183 comprInst = ComprInstStates::F;
3184 }
3185
3186 // Compressed instructions must have a minimum execution size of
3187 // at least 16 if the execution type is less than DF.
3188 else if (dst &&
3189 dst->getHorzStride() != UNDEFINED_SHORT &&
3190 dst->getType() != Type_UNDEF)
3191 {
3192 if ((unsigned)execSize * dst->getTypeSize() * dst->getHorzStride() >
3193 numEltPerGRF<Type_UB>())
3194 {
3195 comprInst = ComprInstStates::T;
3196 }
3197 else
3198 {
3199 comprInst = ComprInstStates::F;
3200 }
3201 }
3202
3203 // Uncompressed instructions can only operate on a max of 4 DFs or
3204 // 8 DF4/F/DWs or 16 W/Bs (the only exception being packed byte
3205 // moves which always have destinations).
3206 else if ((unsigned)execSize * TypeSize(execType) > numEltPerGRF<Type_UB>())
3207 {
3208 comprInst = ComprInstStates::T;
3209 }
3210
3211 else
3212 {
3213 comprInst = ComprInstStates::F;
3214 }
3215
3216 return (comprInst == ComprInstStates::T);
3217 }
3218
3219 /*
3220 * Check to see if the interpretation of the i/p src region is unaffected by
3221 * virtue of it making it a src of the compressed op, as opposed to (if
3222 * possible) it appearing within a regular uncompressed op with the same exec
3223 * size.
3224 * Register-indirect operands are NOT compression invariant. The following 4 rules
3225 * are used to determine compression invariant register-direct opnds:
3226 * 1. constants, scalars, and ARF regions/registers are always compression invariant
3227 * 2. if both the dst region and the i/p source region are native packed
3228 * regions, and the GRF source region is additionally of type W/UW
3229 * 3. the src region covers (i.e. vs(region) * rows(region)) exactly two
3230 * registers (strides allowed), except when the dst region is a native
3231 * packed region and the GRF source has packed rows of type W/UW
3232 * 4. the first src of line op is always considered compression invariant
3233 * (this is a special case quadruple region of <0;4,1>)
3234 * e.g.
3235 * (Both srcs are compression invariant in the following examples)
3236 * add (16) r10.0<1>:d r12.0<0;1,0>:w 0x80:w {CC}
3237 * add (16) r10.0<2>:w r12.0<8;8,1>:d r14.0<16;8,2>:w {CC}
3238 * add (16) r10.0<1>:d r12.0<16;8,2>:w r14.0<32;8,4>:b {CC}
3239 * add (16) r10.0<1>:d r12.0<8;8,1>:w r14.0<8;8,1>:w {CC}
3240 * add (16) r10.0<1>:d r12.0<4;4,1>:w r14.0<4;4,1>:d {CC}
3241 * add (32) r10.0<1>:w r12.0<8;8,1>:w r14.0<16;8,2>:b {CC}
3242 * add (8) r10.0<1>:df r12.0<4;4,1>:df r14.0<4;4,1>:df {CC}
3243 * mov (8) r10.0<1>:df r12.0<4;4,1>:w {CC}
3244 * (Only the first src is compression invariant in the following examples)
3245 * add (16) r10.0<1>:d r12.0<8;8,1>:w r14.0<16;8,2>:b {CC}
3246 * add (16) r10.0<2>:w r14.0<32;8,1>:b r12.0<16;8,1>:w {CC}
3247 * add (16) r10.0<2>:w r12.0<4;4,1>:d r14.0<8;8,1>:w {CC}
3248 * add (32) r10.0<1>:w r12.0<8;8,1>:w r14.0<8;8,1>:b {CC}
3249 * (Neither src is compression invariant in the following examples)
3250 * add (16) r10.0<2>:w r12.0<8;8,1>:w r14.0<16;8,2>:b {CC}
3251 * add (32) r10.0<1>:w r12.0<8;8,1>:b r14.0<8;8,1>:b {CC}
3252 * mov (8) r10.0<1>:df r12.0<4;4,1>:dw {CC}
3253 * Inputs:
3254 * src - the i/p src operand region
3255 * src_pos - the position that the src operand appears in the list
3256 * of src operands
3257 * Assumptions:
3258 * - this function is only valid for compressed ops and it is invalid
3259 * to call it for uncompressed ops
3260 */
bool
G4_INST::isComprInvariantSrcRegion(G4_SrcRegRegion* src, int srcPos)
{
    // Rule 1: a missing src is trivially invariant.
    if (src == NULL)
    {
        return true;
    }
    // Rule 1: constants are invariant. (NOTE(review): src is typed as
    // G4_SrcRegRegion*, so the isImm() branch looks vestigial — confirm
    // whether callers can actually reach it.)
    else if (src->isImm() || src->isAddrExp())
    {
        return true;
    }
    // Register-indirect operands are never compression invariant.
    else if (src->getRegAccess() != Direct)
    {
        return false;
    }
    // Rule 1: ARF (non-GRF, non-input) regions are always invariant.
    else if (src->getBase()->asRegVar()->getDeclare()->getRegFile() != G4_GRF &&
             src->getBase()->asRegVar()->getDeclare()->getRegFile() != G4_INPUT)
    {
        return true;
    }

    const RegionDesc* region = src->getRegion();

    // Rule 4: the first src of the line op is always invariant
    // (a special-case quadruple region of <0;4,1>).
    if (opcode() == G4_line && srcPos == 0)
    {
        return true;
    }
    // Rule 1: scalars are invariant.
    else if (region->isScalar())
    {
        return true;
    }
    else
    {
        // Rule 2/3: compute the byte span the region covers.
        int num_rows = getExecSize() / src->getRegion()->width;
        int type_sz = (int)src->getTypeSize();
        int byte_size = src->getRegion()->vertStride * type_sz * num_rows;

        // Rule 2: native-packed dst with a W/UW native-packed src is
        // invariant; a packed-row (but not fully packed) src is not.
        if (getDst() && getDst()->isNativePackedRegion() &&
            IS_WTYPE(src->getType())) {
            if (src->isNativePackedRegion()) {
                return true;
            }
            else if (src->isNativePackedRowRegion()) {
                return false;
            }
        }
        // Rule 3: invariant iff the region covers exactly two registers.
        if (byte_size == 2 * numEltPerGRF<Type_UB>()) {
            return true;
        }
        else {
            return false;
        }
    }
}
3315
isPartialWrite() const3316 bool G4_INST::isPartialWrite() const
3317 {
3318 G4_Predicate* aPred = predicate;
3319 if (aPred && aPred->isSameAsNoMask())
3320 {
3321 // equivalent to NoMask (W) without predicate
3322 aPred = nullptr;
3323 }
3324
3325 return (aPred != NULL && op != G4_sel) || op == G4_smov;
3326 }
3327
isPartialWriteForSpill(bool inSIMDCF) const3328 bool G4_INST::isPartialWriteForSpill(bool inSIMDCF) const
3329 {
3330 if (!getDst() || hasNULLDst())
3331 {
3332 // inst does not write to GRF
3333 return false;
3334 }
3335
3336 if (isPartialWrite())
3337 {
3338 return true;
3339 }
3340
3341 if (inSIMDCF && !isWriteEnableInst())
3342 {
3343 if (builder.usesStack() || !(builder.hasMaskForScratchMsg() && getDst()->getElemSize() == 4))
3344 {
3345 // scratch message only supports DWord mask
3346 // also we can't use the scratch message when under stack call
3347 return true;
3348 }
3349 }
3350
3351 return false;
3352 }
3353
3354
isAccSrcInst() const3355 bool G4_INST::isAccSrcInst() const
3356 {
3357 if (srcs[0] && srcs[0]->isSrcRegRegion() && srcs[0]->asSrcRegRegion()->getBase()->isAccReg())
3358 {
3359 return true;
3360 }
3361 else if (getNumSrc() == 3 && srcs[1] != nullptr)
3362 {
3363 if (srcs[1]->isSrcRegRegion() && srcs[1]->asSrcRegRegion()->getBase()->isAccReg())
3364 {
3365 return true;
3366 }
3367 }
3368 return false;
3369 }
3370
3371 // Check if this instruction has an explicit acc destination
isAccDstInst() const3372 bool G4_INST::isAccDstInst() const
3373 {
3374 if (dst != NULL && dst->getBase()->isAccReg())
3375 {
3376 return true;
3377 }
3378 return false;
3379 }
3380
isArithAddr() const3381 bool G4_INST::isArithAddr() const
3382 {
3383 if (srcs[1] != NULL)
3384 return isArithmetic() && srcs[1]->isAddrExp();
3385 else
3386 return false;
3387 }
3388
isMovAddr() const3389 bool G4_INST::isMovAddr() const
3390 {
3391 if (srcs[0] != NULL)
3392 return isMov() && srcs[0]->isAddrExp();
3393 return false;
3394 }
3395
3396 //
3397 // Check if the operands of send instruction obey the symbolic register rule
3398 // ToDo: this is obsolete and should be removed
3399 //
isValidSymbolOperand(bool & dst_valid,bool * srcs_valid) const3400 bool G4_INST::isValidSymbolOperand(bool &dst_valid, bool *srcs_valid) const
3401 {
3402 MUST_BE_TRUE(srcs_valid, ERROR_INTERNAL_ARGUMENT);
3403
3404 bool obeyRule = true;
3405 if (dst && dst->getBase()->isRegVar())
3406 {
3407 dst_valid = dst->obeySymbolRegRule();
3408 if (!dst_valid)
3409 obeyRule = false;
3410 }
3411 else
3412 dst_valid = false; // does not change obeyRule for non-register-variable
3413
3414 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
3415 {
3416 G4_Operand* src = getSrc(i);
3417 if (src && src->isSrcRegRegion() && src->asSrcRegRegion()->getBase()->isRegVar())
3418 {
3419 srcs_valid[i] = src->asSrcRegRegion()->obeySymbolRegRule();
3420 if (!srcs_valid[i])
3421 obeyRule = false;
3422 }
3423 else
3424 srcs_valid[i] = false; // does not change obeyRule for non-register-variable
3425 }
3426
3427 return obeyRule;
3428 }
3429
getCondModBase() const3430 const G4_VarBase* G4_INST::getCondModBase() const
3431 {
3432 if (!getCondMod())
3433 return nullptr;
3434
3435 return getCondMod()->getBase();
3436 }
3437
isOptBarrier() const3438 bool G4_INST::isOptBarrier() const
3439 {
3440 if (op == G4_join)
3441 {
3442 return true;
3443 }
3444
3445 if (isIntrinsic() && asIntrinsicInst()->getIntrinsicId() == Intrinsic::MemFence)
3446 {
3447 return true;
3448 }
3449
3450 // any instructions that access special ARFs is considered a opt barrier
3451 // this includes any ARF that is not address/flag/acc
3452 if (dst != NULL)
3453 {
3454 if (dst->isAreg())
3455 {
3456 if (dst->isNReg() ||
3457 dst->isSrReg() ||
3458 dst->isCrReg() ||
3459 dst->isTmReg() ||
3460 dst->isTDRReg())
3461 {
3462 return true;
3463 }
3464 }
3465 }
3466
3467 for (int i = 0; i < getNumSrc(); i++)
3468 {
3469 if (getSrc(i))
3470 {
3471 if (getSrc(i)->isAreg())
3472 {
3473 if (getSrc(i)->isNReg() ||
3474 getSrc(i)->isSrReg() ||
3475 getSrc(i)->isCrReg() ||
3476 getSrc(i)->isTmReg() ||
3477 getSrc(i)->isTDRReg())
3478 {
3479 return true;
3480 }
3481 }
3482 }
3483 }
3484 return false;
3485 }
3486
3487
emitPredWrEn(std::ostream & output,G4_INST & inst)3488 static void emitPredWrEn(std::ostream& output, G4_INST &inst)
3489 {
3490 G4_Predicate *pred = inst.getPredicate();
3491 bool isNoMask = (inst.getOption() & InstOpt_WriteEnable) != 0;
3492
3493 if (pred) {
3494 output << "(";
3495 if (isNoMask)
3496 output << "W&";
3497 pred->emit_body(output, false);
3498 output << ") ";
3499 } else if (isNoMask) {
3500 output << "(W) ";
3501 } else {
3502 output << " "; // align for predication (.....)
3503 }
3504 }
3505
emitExecSize(std::ostream & output,const G4_INST & inst)3506 static void emitExecSize(std::ostream& output, const G4_INST &inst)
3507 {
3508 auto execSize = static_cast<int>(inst.getExecSize());
3509 if (inst.opcode() != G4_nop && inst.opcode() != G4_wait)
3510 {
3511 output << '(';
3512 if (execSize == UNDEFINED_EXEC_SIZE) {
3513 output << "??";
3514 } else {
3515 output << execSize;
3516 }
3517 if (int execOffset = inst.getMaskOffset()) {
3518 // non-zero channel offset
3519 output << "|M" << execOffset;
3520 }
3521 output << ") ";
3522 }
3523 }
3524
// The syntax column width of the beginning instruction info, e.g.
//   (P1.0) and (16) ...
//   nop
//   and (16|M0) ...
//       ^ aligns operand start to same place here
static const int INST_START_COLUMN_WIDTH = 24;
3531
3532 // emits the first part of an instruction in an aligned column
emitInstructionStartColumn(std::ostream & output,G4_INST & inst)3533 static void emitInstructionStartColumn(std::ostream& output, G4_INST &inst)
3534 {
3535 std::stringstream oupPfx;
3536 emitPredWrEn(oupPfx, inst);
3537
3538 oupPfx << G4_Inst_Table[inst.opcode()].str;
3539 if (inst.isIntrinsic())
3540 {
3541 oupPfx << "." << inst.asIntrinsicInst()->getName();
3542 if (inst.isSpillIntrinsic())
3543 {
3544 oupPfx << "." << inst.asSpillIntrinsic()->getNumRows();
3545 }
3546 else if (inst.isFillIntrinsic())
3547 {
3548 oupPfx << "." << inst.asFillIntrinsic()->getNumRows();
3549 }
3550 }
3551 else if (inst.opcode() == G4_goto)
3552 {
3553 oupPfx << (inst.asCFInst()->isBackward() ? ".bwd" : ".fwd");
3554 }
3555 else if (inst.isBfn()) {
3556 oupPfx << "." << fmtHex(inst.asBfnInst()->getBooleanFuncCtrl(), 2);
3557 }
3558 else if (inst.isMath() && inst.asMathInst()->getMathCtrl() != MATH_RESERVED)
3559 {
3560 oupPfx << "." << MathOpNames[inst.asMathInst()->getMathCtrl()];
3561 }
3562
3563 oupPfx << ' ';
3564 emitExecSize(oupPfx, inst);
3565
3566 G4_CondMod *mod = inst.getCondMod();
3567 if (mod) {
3568 oupPfx << ' ';
3569 mod->emit(oupPfx);
3570 }
3571
3572 std::string pfx = oupPfx.str();
3573 output << pfx;
3574 for (int i = 0; i < INST_START_COLUMN_WIDTH - (int)pfx.size(); i++)
3575 output << ' ';
3576 }
3577
3578
//
// Emit this instruction in assembly form. symbol_dst/symbol_srcs select
// symbolic vs. physical register printing per operand (false/NULL prints
// physical registers).
//
void G4_INST::emit_inst(std::ostream& output, bool symbol_dst, bool *symbol_srcs)
{
    if (isLabel())
    {
        // label pseudo-instruction: "<label>:"
        srcs[0]->emit(output);
        output << ":";
        if (((G4_Label*)srcs[0])->isStartLoopLabel())
            output << " // do";
    }
    else
    {
        // predication, opcode, execsize, condition, ...
        emitInstructionStartColumn(output, *this);

        // destination (spill intrinsics print their scratch area instead)
        if (isSpillIntrinsic())
        {
            output << ' ';
            output << "Scratch[" << asSpillIntrinsic()->getOffset() << "x" << numEltPerGRF<Type_UB>() << "]";
        }
        else if (dst)
        {
            output << ' ';
            if (sat)
                output << "(sat)";
            dst->emit(output, symbol_dst);
        } // else: may not have dst (e.g. branch)

        // source operands
        auto numSrcOpnds = getNumSrc();
        for (int i = 0; i < numSrcOpnds; i++)
        {
            if (getSrc(i))
            {
                output << " ";
                if (symbol_srcs != NULL)
                {
                    getSrc(i)->emit(output, symbol_srcs[i]); // emit symbolic/physical register depends on the flag
                }
                else
                {
                    getSrc(i)->emit(output, false); // emit physical register
                }
            }
        }

        // fill intrinsics read from their scratch area
        if (isFillIntrinsic())
        {
            output << " ";
            output << "Scratch[" << asFillIntrinsic()->getOffset() << "x" << numEltPerGRF<Type_UB>() << "] ";
        }

        // control-flow targets (JIP then UIP)
        if (isFlowControl() && asCFInst()->getJip())
        {
            output << " ";
            asCFInst()->getJip()->emit(output);
        }

        if (isFlowControl() && asCFInst()->getUip())
        {
            output << " ";
            asCFInst()->getUip()->emit(output);
        }

        // trailing {options} plus an id comment when a vISA offset exists
        emit_options(output);
        if (getCISAOff() != -1) {
            output << " // ";
            emitInstIds(output);
        }
    } // end: non-label
} // G4_INST::emit_inst
3648
3649
emitInstIds(std::ostream & output) const3650 void G4_INST::emitInstIds(std::ostream& output) const
3651 {
3652 int srcLine = getLineNo();
3653 if (srcLine != 0) {
3654 output << "#" << srcLine << ":";
3655 }
3656
3657 int vISAId = getCISAOff();
3658 if (vISAId != -1) {
3659 output << "$" << vISAId << ":";
3660 }
3661
3662 uint32_t genId = getLexicalId();
3663 if (genId != -1) {
3664 output << "&" << genId << ":";
3665 }
3666
3667 if (builder.hasSWSB())
3668 {
3669 unsigned tokenLocNum = getTokenLocationNum();
3670 for (unsigned i = 0; i < tokenLocNum; i++)
3671 {
3672 unsigned short token = 0;
3673 uint32_t depId = getTokenLoc(i, token);
3674 output << token << "." << depId << ":";
3675 }
3676 }
3677
3678 int64_t pc = getGenOffset();
3679 if (pc != -1) {
3680 output << "[" << fmtHexBody(pc, 5) << "]";
3681 }
3682 }
3683
3684
//
// Here we add a parameter symbolreg instead of using the global option Options::symbolReg,
// because we should output non-symbolic registers when dumping dot files
//
emit(std::ostream & output,bool symbolreg,bool dotStyle)3689 void G4_INST::emit(std::ostream& output, bool symbolreg, bool dotStyle)
3690 {
3691 bool dst_valid = true;
3692 bool srcs_valid[G4_MAX_SRCS];
3693
3694 if (symbolreg)
3695 {
3696 if (op==G4_nop || isLabel())
3697 {
3698 emit_inst(output, false, NULL);
3699 return;
3700 }
3701
3702 //
3703 // Emit as comment if there is invalid operand, then emit instruction
3704 // based on the situation of operand
3705 //
3706 if (!isValidSymbolOperand(dst_valid, srcs_valid))
3707 {
3708 if (!dotStyle)
3709 {
3710 output << "//";
3711 bool srcs_valid1[G4_MAX_SRCS];
3712 for (unsigned i = 0; i < G4_MAX_SRCS; i++)
3713 srcs_valid1[i] = true;
3714 emit_inst(output, true, srcs_valid1); // emit comments
3715 output << std::endl;
3716 }
3717 }
3718 emit_inst(output, dst_valid, srcs_valid); // emit instruction
3719 }
3720 else
3721 emit_inst(output, false, NULL); // emit instruction with physical register
3722 }
3723
// Stream-insertion helper: emits the instruction with physical registers,
// non-dot style.
std::ostream& operator<<(std::ostream& os, G4_INST& inst)
{
    inst.emit(os, false, false);
    return os;
}
3729
3730 // add instruction options; only wrap in braces {...}
3731 // if there's at least one option
3732 // instructions are assumed Align1 and only Align16 will be explicitly stated
emit_options(std::ostream & output) const3733 void G4_INST::emit_options(std::ostream& output) const
3734 {
3735 std::stringstream opts;
3736 bool first = true;
3737 auto emitOption = [&](const std::string &str) {
3738 if (first) {
3739 first = false;
3740 } else {
3741 opts << ",";
3742 }
3743 opts << str;
3744 };
3745
3746
3747 ////////////////////////////////////////////////////////////
3748 // SWSB options
3749 if (getDistance() != 0) {
3750 std::stringstream dists;
3751 switch (getDistanceTypeXe()) {
3752 case DistanceType::DIST: break;
3753 case DistanceType::DISTALL: dists << 'A'; break;
3754 case DistanceType::DISTINT: dists << 'I'; break;
3755 case DistanceType::DISTFLOAT: dists << 'F'; break;
3756 case DistanceType::DISTLONG: dists << 'L'; break;
3757 case DistanceType::DISTMATH: dists << 'M'; break;
3758 default: dists << "?"; break;
3759 }
3760 dists << '@' << (int)getDistance();
3761 emitOption(dists.str());
3762 }
3763
3764 std::stringstream tks;
3765 std::string tks1;
3766 auto id = getToken();
3767 SWSBTokenType tkType = getTokenType();
3768 switch (tkType) {
3769 case TOKEN_NONE:
3770 case SB_SET: break;
3771 case NoACCSBSet: tks1 = "NoACC"; break;
3772 case AFTER_READ: tks1 = ".R"; break;
3773 case AFTER_WRITE: tks1 = ".W"; break;
3774 case READ_ALL: tks1 = ".R*"; break;
3775 case WRITE_ALL: tks1 = ".W*"; break;
3776 default: tks1 = ".??"; break;
3777 }
3778 if (tkType != TOKEN_NONE)
3779 {
3780 if (tkType != NoACCSBSet)
3781 {
3782 tks << '$' << (int)id << tks1;
3783 }
3784
3785 if (tks1.size())
3786 {
3787 tks << tks1;
3788 }
3789 emitOption(tks.str());
3790 }
3791
3792 ////////////////////////////////////////////////
3793 // bitset options
3794 G4_InstOpts currOpts = option;
3795 if (isEOT()) {
3796 currOpts |= InstOpt_EOT;
3797 }
3798
3799 // strip out stuff we handle elsewhere
3800 currOpts &= ~(InstOpt_QuarterMasks | InstOpt_WriteEnable);
3801 unsigned short optIdx = 0;
3802 while (currOpts && 0xFFFFFFFF != InstOptInfo[optIdx].optMask)
3803 {
3804 if (currOpts & InstOptInfo[optIdx].optMask)
3805 {
3806 emitOption(InstOptInfo[optIdx].optStr);
3807 currOpts &= ~InstOptInfo[optIdx].optMask; // clear this bit
3808 }
3809 optIdx++;
3810 }
3811
3812 ////////////////////////////////////////////////
3813 // for older Align16-supporting platforms
3814 // absense implies Align1
3815 if (isAligned16Inst()) {
3816 emitOption("Align16");
3817 }
3818
3819 //////////////////////////////////////////////////
3820 // only include braces {...} if there's something
3821 auto optsStr = opts.str();
3822 if (!optsStr.empty())
3823 output << " {" << optsStr << "}";
3824 }
3825
3826
// Textual names for the operand-number enum (generated from OPND_NUM_ENUM),
// indexed by the operand id stored in the def/use lists below.
static const char* const operandString[] =
{
    OPND_NUM_ENUM(STRINGIFY)
};
3831
emitDefUse(std::ostream & output) const3832 void G4_INST::emitDefUse(std::ostream& output) const
3833 {
3834 output << "Def:\n";
3835 for (auto iter = defInstList.begin(), iterEnd = defInstList.end(); iter != iterEnd; ++iter)
3836 {
3837 G4_INST* inst = (*iter).first;
3838 inst->emit(output);
3839 output << "\t" << operandString[(*iter).second];
3840 output << "\n";
3841 }
3842 output << "Use:\n";
3843 for (auto iter = useInstList.begin(), iterEnd = useInstList.end(); iter != iterEnd; ++iter)
3844 {
3845 G4_INST* inst = (*iter).first;
3846 inst->emit(output);
3847 output << "\t" << operandString[(*iter).second];
3848 output << "\n";
3849 }
3850 }
3851
isMixedMode() const3852 bool G4_INST::isMixedMode() const
3853 {
3854 if (mayExceedTwoGRF() || !getDst())
3855 {
3856 return false;
3857 }
3858 for (int i = 0; i < getNumSrc(); ++i)
3859 {
3860 G4_Operand *tOpnd = getSrc(i);
3861 if (!tOpnd)
3862 {
3863 continue;
3864 }
3865
3866 G4_Type srcType = tOpnd->getType();
3867 G4_Type dstType = getDst()->getType();
3868
3869 if ((dstType == builder.getMixModeType() || srcType == builder.getMixModeType()) &&
3870 dstType != srcType)
3871 {
3872 // do not consider int<->float conversion as mixed type
3873 if (!IS_TYPE_INT(dstType) && !IS_TYPE_INT(srcType))
3874 {
3875 return true;
3876 }
3877 }
3878 }
3879
3880 return false;
3881 }
3882
// Installs a new message descriptor on this send and invalidates the
// cached right bounds of dst/src0, which depend on the descriptor.
void G4_InstSend::setMsgDesc(G4_SendDesc *in)
{
    assert(in && "null descriptor not expected");
#if defined(_DEBUG)
    if (in && in->getExecSize() == g4::SIMD_UNDEFINED)
    {
        DEBUG_MSG("Msg Desc has execSize undefined!\n");
    }
#endif
    msgDesc = in;
    // operand bounds are derived from the descriptor; force recomputation
    resetRightBound((G4_Operand*)dst);
    resetRightBound(srcs[0]);
}
3896
isDirectSplittableSend()3897 bool G4_InstSend::isDirectSplittableSend()
3898 {
3899 unsigned short elemSize = dst->getElemSize();
3900 SFID funcID = msgDesc->getSFID();
3901 const G4_SendDescRaw *desc = getMsgDescRaw();
3902 if (desc == nullptr) {
3903 // load/store messages are unsplittable for now
3904 return false;
3905 }
3906 switch (funcID)
3907 {
3908 case SFID::DP_DC1:
3909 switch (desc->getHdcMessageType())
3910 {
3911 case DC1_A64_SCATTERED_READ: //emask need be vertically cut.
3912 return false;
3913
3914 case DC1_A64_UNTYPED_SURFACE_READ: //SVM gather 4: emask can be reused if the per-channel data is larger than 1 GRF
3915 case DC1_UNTYPED_SURFACE_READ: //VISA gather 4
3916 case DC1_TYPED_SURFACE_READ: //Gather 4 typed
3917 if (elemSize * execSize > (int)numEltPerGRF<Type_UB>() &&
3918 elemSize * execSize % numEltPerGRF<Type_UB>() == 0)
3919 {
3920 return true;
3921 }
3922 else
3923 {
3924 return false;
3925 }
3926
3927 default: return false;
3928 }
3929 case SFID::DP_DC2:
3930 switch (desc->getHdcMessageType())
3931 {
3932 case DC2_UNTYPED_SURFACE_READ: //gather 4 scaled : emask can be reused if the per-channel data is larger than 1 GRF
3933 case DC2_A64_UNTYPED_SURFACE_READ: //SVM gather 4 scaled
3934 if (elemSize * execSize > (int)numEltPerGRF<Type_UB>() &&
3935 elemSize * execSize % numEltPerGRF<Type_UB>() == 0)
3936 {
3937 return true;
3938 }
3939 else
3940 {
3941 return false;
3942 }
3943
3944 case DC2_BYTE_SCATTERED_READ: //scaled byte scattered read: gather_scaled, handled as block read write, nomask
3945 return true;
3946
3947 default: return false;
3948 }
3949 case SFID::DP_DC0:
3950 switch (desc->getHdcMessageType())
3951 {
3952 case DC_DWORD_SCATTERED_READ: //dword scattered read: emask need be vertically cut according to splitting
3953 case DC_BYTE_SCATTERED_READ: //byte scattered read
3954 return false;
3955 case DC_ALIGNED_OWORD_BLOCK_READ: //Nomask
3956 case DC_OWORD_BLOCK_READ:
3957 return true;
3958 default: return false;
3959 }
3960 case SFID::SAMPLER:
3961 return true;
3962 default: return false;
3963 }
3964
3965 return false;
3966 }
3967
3968
3969 //
3970 // emit send instruction with symbolic/physical register operand depending on the operand check
3971 //
// Emits a send/sends instruction: start column, dst, src0 (and src1 for
// split sends), extended descriptor, then the message descriptor operand.
void G4_InstSend::emit_send(std::ostream& output, bool symbol_dst, bool *symbol_srcs)
{
    emitInstructionStartColumn(output, *this);

    output << ' ';
    dst->emit(output, symbol_dst);

    output << ' ';
    G4_Operand* currSrc = srcs[0];
    if (currSrc->isSrcRegRegion()) {
        // only output reg var & reg off; don't output region desc and type
        currSrc->asSrcRegRegion()->emitRegVarOff(output, false);
    } else {
        currSrc->emit(output, false); //emit CurrDst
    }
    output << ' ';

    if (isSplitSend())
    {
        // emit src1
        srcs[1]->asSrcRegRegion()->emitRegVarOff(output, false);
        output << ' ';
    }

    // emit exDesc if srcs[3] is not null.
    // It should always be a0.2 unless it was constant folded
    if (isSplitSend() && srcs[3])
    {
        srcs[3]->emit(output, false);
        output << ' ';
    }
    else
    {
        // otherwise print the immediate extended descriptor in hex,
        // restoring the stream's format flags afterwards
        if (getMsgDescRaw()) {
            std::ios::fmtflags outFlags(output.flags());
            output << fmtHex(getMsgDescRaw()->getExtendedDesc());
            output << ' ';
            output.flags(outFlags);
        }
    }

    // emit msgDesc (2 for sends and 1 for send). Last operand shown in asm.
    int msgDescId = isSplitSend() ? 2 : 1;
    srcs[msgDescId]->emit(output, false);

    emit_options(output);
}
4019
// Convenience overload: emit with physical registers only.
// (dotStyle is currently unused.)
void G4_InstSend::emit_send(std::ostream& output, bool dotStyle)
{
    emit_send(output, false, NULL);
}
4024
emit_send_desc(std::ostream & output)4025 void G4_InstSend::emit_send_desc(std::ostream& output)
4026 {
4027 const G4_INST* sendInst = this;
4028
4029 // Emit a text description of the descriptor if it is available
4030 G4_SendDesc* msgDesc = sendInst->getMsgDesc();
4031 output << " // ";
4032 if (getCISAOff() != -1) {
4033 emitInstIds(output);
4034 output << "; ";
4035 }
4036
4037 auto desc = msgDesc->getDescription();
4038 if (!desc.empty()) {
4039 output << msgDesc->getDescription();
4040 }
4041 if (const auto *rawDesc = sendInst->getMsgDescRaw()) {
4042 }
4043
4044 output << ", resLen=" << msgDesc->getDstLenRegs();
4045 output << ", msgLen=" << msgDesc->getSrc0LenRegs();
4046 if (isSplitSend())
4047 {
4048 output << ", extMsgLen=" << msgDesc->getSrc1LenRegs();
4049 }
4050
4051 if (msgDesc->isBarrier())
4052 {
4053 output << ", barrier";
4054 }
4055 }
4056
4057
// print r#  (symbolreg is unused for general registers)
void G4_Greg::emit(std::ostream& output, bool symbolreg)
{
    output << "r" << getRegNum();
}
4063
emit(std::ostream & output,bool symbolreg)4064 void G4_Areg::emit(std::ostream& output, bool symbolreg)
4065 {
4066 switch (getArchRegType())
4067 {
4068 case AREG_NULL: output << "null"; break;
4069 case AREG_A0: output << "a0"; break;
4070 case AREG_ACC0: output << "acc0"; break;
4071 case AREG_ACC1: output << "acc1"; break;
4072 case AREG_MASK0: output << "ce0"; break;
4073 case AREG_MS0: output << "ms0"; break;
4074 case AREG_DBG: output << "dbg0"; break;
4075 case AREG_SR0: output << "sr0"; break;
4076 case AREG_CR0: output << "cr0"; break;
4077 case AREG_TM0: output << "tm0"; break;
4078 case AREG_N0: output << "n0"; break;
4079 case AREG_N1: output << "n1"; break;
4080 case AREG_IP: output << "ip"; break;
4081 case AREG_F0: output << "f0"; break;
4082 case AREG_F1: output << "f1"; break;
4083 case AREG_TDR0: output << "tdr0"; break;
4084 case AREG_SP: output << "sp"; break;
4085 case AREG_F2: output << "f2"; break;
4086 case AREG_F3: output << "f3"; break;
4087 default:
4088 output << "unknown architecture reg";
4089 MUST_BE_TRUE(false, ERROR_UNKNOWN);
4090 }
4091 }
4092
//
// initialize all values identical to rgn's
//
G4_SrcRegRegion::G4_SrcRegRegion(G4_SrcRegRegion &rgn)
    : G4_Operand(G4_Operand::srcRegRegion), acc(rgn.acc), regOff(rgn.regOff), subRegOff(rgn.subRegOff)
{
    base = rgn.base;
    mod = rgn.mod;
    immAddrOff = rgn.immAddrOff;
    desc = rgn.desc;
    type = rgn.type;
    // copy swizzle value (strcpy-style: characters up to and including
    // the terminating NUL)
    char *sw1 = swizzle, *sw2 = rgn.swizzle;
    while (*sw2) *sw1++ = *sw2++;
    *sw1 = *sw2;
    accRegSel = rgn.accRegSel;

    // FIXME: it's rather suspicious that we are copying internal fields this way
    bitVec[0] = rgn.bitVec[0];
    bitVec[1] = rgn.bitVec[1];

    // cached bound/offset state is copied as-is so the clone does not
    // trigger recomputation
    top_dcl = rgn.top_dcl;
    left_bound = rgn.left_bound;
    right_bound = rgn.right_bound;
    byteOffset = rgn.byteOffset;
    rightBoundSet = rgn.rightBoundSet;
}
4120
//
// return true if rgn and this have the same reg region
//
sameSrcRegRegion(G4_SrcRegRegion & rgn)4124 bool G4_SrcRegRegion::sameSrcRegRegion(G4_SrcRegRegion& rgn)
4125 {
4126 return base == rgn.base &&
4127 acc == rgn.acc &&
4128 mod == rgn.mod &&
4129 strcmp(swizzle,rgn.swizzle) == 0 &&
4130 desc == rgn.desc &&
4131 regOff == rgn.regOff &&
4132 subRegOff == rgn.subRegOff &&
4133 immAddrOff == rgn.immAddrOff &&
4134 type == rgn.type &&
4135 accRegSel == rgn.accRegSel;
4136 }
4137
// compute max execution size starting from the current pos.
// power of two. crossing the GRF boundary is allowed if the region is evenly split;
// crossing a half-GRF boundary should guarantee an even split
// Returns the maximum execution size usable for this source region when
// starting at element `pos`, capped at maxExSize. On return, vs/wd hold
// the vertical stride and width describing the chosen sub-region, and
// twoGRFsrc reports whether the chosen span crosses a GRF boundary
// (permitted only when evenly split across the two GRFs).
uint8_t G4_SrcRegRegion::getMaxExecSize(int pos, uint8_t maxExSize, bool allowCrossGRF, uint16_t &vs, uint16_t &wd, bool &twoGRFsrc)
{
    // bounds must be available before any GRF-crossing computation below
    if (isRightBoundSet() == false)
    {
        getInst()->computeRightBound(this);
    }

    twoGRFsrc = false;
    vs = 0;
    wd = 0;
    if (isScalar())
    {
        // a scalar region supports any exec size
        vs = 0;
        wd = 1;
        return maxExSize;
    }
    else if (acc != Direct)
    {
        // assume this operand is kosher (i.e., does not cross GRF) as the vISA spec requires it
        vs = desc->vertStride;
        wd = desc->width;
        return roundDownPow2(maxExSize);
    }

    // align16 operands
    if (desc->isRegionV())
    {
        vs = desc->vertStride;
        wd = desc->width;
        if (desc->horzStride == 0)
        {
            return roundDownPow2(maxExSize);
        }

        uint32_t elSize = getTypeSize();
        uint8_t maxSize = 0;

        uint32_t prevPos = pos * elSize;
        uint8_t numEleInFristGRF = 0, numEleInSecondGRF = 0;
        uint32_t newLB = getLeftBound() + prevPos;
        bool crossGRF = (newLB / numEltPerGRF<Type_UB>() != getRightBound() / numEltPerGRF<Type_UB>()),
            inFirstGRF = true;

        // advance 4 elements at a time (align16 channel granularity)
        for (int i = pos + 4; i < (pos + maxExSize); i += 4)
        {
            uint32_t currPos = i * elSize;

            // check cross GRF boundary
            if (crossGRF && inFirstGRF)
            {
                uint32_t newRB = getLeftBound() + currPos - 1;
                uint32_t leftGRF = newLB / numEltPerGRF<Type_UB>(), rightGRF = newRB / numEltPerGRF<Type_UB>();
                if (leftGRF != rightGRF)
                {
                    inFirstGRF = false;
                    numEleInFristGRF = maxSize;
                    newLB = newRB;
                }
            }

            maxSize += 4;

            if (numEleInFristGRF)
            {
                numEleInSecondGRF += 4;
                if (numEleInSecondGRF == numEleInFristGRF)
                {
                    // evenly split across two GRFs: acceptable
                    twoGRFsrc = true;
                    break;
                }
            }
        }
        if (numEleInSecondGRF < numEleInFristGRF)
        {
            // uneven split: restrict to the elements in the first GRF
            twoGRFsrc = false;
            maxSize = numEleInFristGRF;
        }
        return maxSize;
    }

    // align1 direct
    uint32_t elSize = TypeSize(type);
    uint8_t maxSize = 1;

    bool alignToRow = pos % desc->width == 0;

    // region may not be contiguous/single stride depending on the start position
    bool contRegion = desc->isContiguous(maxExSize + (pos % desc->width));

    uint16_t vStride = 1;
    if (contRegion || desc->isSingleNonUnitStride(maxExSize + (pos % desc->width), vStride))
    {
        // apparently the old code actually allows GRF-crossing as long as it's evenly divided
        // (the function comment lied), so we have to try all exec sizes from the largest possible. sigh..
        vs = vStride;
        wd = 1;
        // we need to be careful with start byte here since maxExSize may not be same as inst exec size
        // e.g., say this is called on
        // mov (16) V44_m(2,0)<1>:f V43_in(1,19)<16;8,1>:ub
        // with pos 8 and maxExSize 8
        // the region is considered single stride in this case, but is not with the original exsize (16),
        // so we can't just multiply stride with type size to get starting offset
        uint32_t startByte = (getLeftBound() + getByteOffset(pos)) % numEltPerGRF<Type_UB>();
        int retExecSize = 1;
        int execTypeSize = vStride * getElemSize();
        int exSizes[] = { 32, 16, 8, 4, 2 };

        // try candidate exec sizes from largest to smallest
        for (auto size : exSizes)
        {
            if (maxExSize < size)
            {
                continue;
            }
            if (startByte + (size - 1) * execTypeSize + getElemSize() <= numEltPerGRF<Type_UB>())
            {
                // no GRF crossing (we don't count the padding bytes after the last element)
                retExecSize = size;
                break;
            }
            else if (allowCrossGRF)
            {
                int numEltInFirstGRF = (numEltPerGRF<Type_UB>() - startByte) / execTypeSize;
                // startByte may not be aligned to exec type size (e.g., r1.1<2;1,0>:b). We need to increment by 1 in this case
                if ((numEltPerGRF<Type_UB>() - startByte) % execTypeSize != 0)
                {
                    numEltInFirstGRF += 1;
                }
                if (numEltInFirstGRF == size - numEltInFirstGRF)
                {
                    // evenly split across the GRF boundary: allowed
                    twoGRFsrc = true;
                    retExecSize = size;
                    break;
                }
            }
        }

        return (uint8_t)retExecSize;
    }

    // conservative.
    // Here we assume that no cross width if row size is larger than width
    // mul (16) V112(0,0)<1>:f V111(0,0)<16;16,1>:f r1.0<1;4,0>:f
    if (!alignToRow && !contRegion && desc->vertStride != 0 && desc->horzStride != 0)
    {
        // clamp to the largest power of two that stays within the current row
        wd = vs = (uint16_t)roundDownPow2((pos/desc->width + 1) * desc->width - pos);

        // Need to check whether this subregion crosses grf or not.
        // E.g. the second half does cross a grf:
        // mov (8) V41(0, 9)<1> V58(2, 8)<32;8,4>
        //
        // Given a linearized index, compute its byte offset relative to the
        // first element (index 0).
        auto computeOffset = [=](unsigned index) -> unsigned {
            unsigned typeSize = TypeSize(type);
            unsigned offset = (index % desc->width) * desc->horzStride * typeSize;
            offset += (index / desc->width) * desc->vertStride * typeSize;
            return offset;
        };

        // Since a single element cannot cross a grf, checking the first byte of the
        // first and last element is sufficient.
        // FIXME: fix other places with this logic.
        unsigned firstPos = getLeftBound() + computeOffset((unsigned)pos);
        unsigned lastPos = getLeftBound() + computeOffset((unsigned)(pos + wd - 1));
        twoGRFsrc = firstPos / numEltPerGRF<Type_UB>() != lastPos / numEltPerGRF<Type_UB>();

        return (uint8_t)wd;
    }

    uint8_t posInFirstRow = pos%desc->width, eleInRow = 1, eleInFirstRow = desc->width - posInFirstRow;
    uint8_t pow2 = roundDownPow2(eleInFirstRow);

    if (eleInFirstRow != pow2 && !contRegion)
    {
        // non-power-of-2 elements remain in the first row: clamp there
        wd = pow2;
        vs = wd * desc->horzStride;
        return pow2;
    }

    uint32_t prevPos = (pos/desc->width * desc->vertStride + posInFirstRow * desc->horzStride) * elSize;
    uint8_t numEleInFristGRF = 0, numEleInSecondGRF = 0;
    bool crossRow = false;
    uint32_t newLB = getLeftBound() + prevPos;
    bool crossGRF = (newLB / numEltPerGRF<Type_UB>() != getRightBound() / numEltPerGRF<Type_UB>()),
        inFirstGRF = true;
    bool negVS = (desc->vertStride < desc->horzStride * desc->width);

    // walk element by element, tracking row and GRF boundary crossings
    for (int i = pos + 1; i < (pos + maxExSize); i++)
    {
        uint8_t posInRow = i % desc->width;
        uint32_t currPos = ((i / desc->width) * desc->vertStride + posInRow * desc->horzStride) * elSize;

        // check cross row boundary
        if ((!contRegion || desc->vertStride == 0) && posInRow == 0)
        {
            uint8_t pow2Val = roundDownPow2(eleInRow);
            if (pow2Val != eleInRow ||
                ((desc->vertStride == 0 || negVS) && !alignToRow))
            {
                // this happens in the first row
                wd = maxSize = pow2Val;
                vs = wd * desc->horzStride;
                break;
            }
            else if (wd == 0)
            {
                // <2;4,1>
                wd = eleInRow;
                if (alignToRow)
                {
                    vs= desc->vertStride;
                }
                else
                {
                    vs = (currPos - prevPos) / elSize;
                }
            }
            crossRow = true;
            eleInRow = 0;
        }

        // check cross GRF boundary
        if (crossGRF && inFirstGRF)
        {
            uint32_t newRB = getLeftBound() + currPos + elSize - 1;
            uint32_t leftGRF = newLB / numEltPerGRF<Type_UB>(), rightGRF = newRB / numEltPerGRF<Type_UB>();
            if (leftGRF != rightGRF)
            {
                inFirstGRF = false;
                uint8_t pow2Val = roundDownPow2(maxSize);

                // if number of element in first GRF is not power of 2, or
                // subregister offset of two GRFs are different and not contiguous(too conservative?)
                if (pow2Val != maxSize ||
                    (!contRegion && !(alignToRow && maxSize <= desc->width) && newLB % numEltPerGRF<Type_UB>() != (getLeftBound() + currPos) % numEltPerGRF<Type_UB>()))
                {
                    maxSize = pow2Val;
                    if (wd == 0)
                    {
                        wd = pow2Val;
                        vs = wd * desc->horzStride;
                    }
                    break;
                }
                else if (wd == 0)
                {
                    wd = maxSize < desc->width ? maxSize : desc->width;
                    vs = (currPos - prevPos) / elSize;
                }
                numEleInFristGRF = maxSize;
                newLB = newRB;
            }
        }

        maxSize++;
        eleInRow++;
        // make sure the number of elements in two rows are the same
        if (crossRow && eleInRow == eleInFirstRow && !alignToRow && !contRegion)
        {
            break;
        }

        if (numEleInFristGRF)
        {
            numEleInSecondGRF++;
            if (numEleInSecondGRF == numEleInFristGRF)
            {
                // evenly split across two GRFs: acceptable
                twoGRFsrc = true;
                break;
            }
        }
    }
    if (wd == 0)
    {
        // contiguous region
        wd = pow2;
        vs = wd * desc->horzStride;
    }
    if (numEleInSecondGRF < numEleInFristGRF)
    {
        // uneven split: restrict to the elements in the first GRF
        maxSize = numEleInFristGRF;
    }
    return maxSize;
}
4425
4426 //
4427 // output (Var+refOff).subRegOff
4428 //
void printRegVarOff(std::ostream& output,
    G4_Operand* opnd,
    short regOff, // base+regOff is the starting register
    short subRegOff, // sub reg offset
    short immAddrOff, // imm addr offset
    G4_Type type,
    bool symbolreg,
    bool printSubReg)
//
// Emits the register-file portion of an operand: "(Var+refOff).subRegOff".
//
// symbolreg == false, output physical register operand
// symbolreg == true, output symbolic register operand
// Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
// then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
// between these two states, that may have potential side effects.
//
{
    // UNDEFINED_SHORT means "no sub-register": treat as 0 wherever an offset is added.
    short subRegOffset = (subRegOff != (short) UNDEFINED_SHORT) ? subRegOff : 0;

    G4_RegAccess acc = opnd->getRegAccess();
    G4_VarBase* base = opnd->getBase();
    if (acc == Direct)
    {
        MUST_BE_TRUE(regOff != (short) UNDEFINED_SHORT,
            ERROR_INTERNAL_ARGUMENT);

        if (base->isRegVar())
        {
            G4_RegVar* baseVar = static_cast<G4_RegVar*>(base);
            int declOpSize = baseVar->getDeclare()->getElemSize();
            uint16_t thisOpSize = TypeSize(type);

            if (baseVar->isPhyRegAssigned())
            {

                if (symbolreg && !base->isFlag())
                {
                    //
                    // No matter the type of register and if the allocation succeeded, we output format <symbol>(RegOff, SubRegOff)
                    // Note: we have checked if the register allocation succeeded when emitting the declare!
                    //
                    output << base->asRegVar()->getName() << "(" << regOff << "," << subRegOff << ")";
                    return;
                }

                if (baseVar->getPhyReg()->isGreg())
                {
                    int regNum = 0, subRegNum = 0;
                    uint32_t byteAddress = opnd->getLinearizedStart();

                    if (baseVar->getDeclare()->getGRFBaseOffset() == 0)
                    {
                        // This is before RA and getLinearizedStart() only contains the left bound
                        // we have to add the declare's phyreg
                        byteAddress += baseVar->getPhyReg()->asGreg()->getRegNum() * getGRFSize() + baseVar->getPhyRegOff() * TypeSize(type);
                    }

                    // Convert the flat byte address into rN.subreg, with the subreg in
                    // units of this operand's type size.
                    regNum = byteAddress / getGRFSize();
                    subRegNum = (byteAddress % getGRFSize()) / TypeSize(type);


                    output << "r" << regNum;
                    if (printSubReg)
                    {
                        output << "." << subRegNum;
                    }
                }
                else if (baseVar->getPhyReg()->isAreg())
                {
                    (static_cast<G4_Areg*>(baseVar->getPhyReg()))->emit(output);
                    if (!baseVar->isNullReg())
                    {
                        unsigned ArfSubRegNum = baseVar->getPhyRegOff();

                        //ArfSubRegNum is in unit of declOpSize
                        //transform ArfSubRegNum to unit of thisOpSize
                        if (thisOpSize != declOpSize)
                        {
                            if (!opnd->getInst()->isPseudoKill())
                            {
                                MUST_BE_TRUE((ArfSubRegNum * declOpSize) % thisOpSize == 0,
                                    ERROR_DATA_RANGE("ARF sub-register number"));
                            }
                            ArfSubRegNum = (ArfSubRegNum * declOpSize) / thisOpSize;
                        }

                        unsigned subreg = ArfSubRegNum + subRegOffset;
                        output << '.' << subreg;
                    }
                }
                else
                    MUST_BE_TRUE(false, ERROR_UNKNOWN);
            }
            else // physical register not allocated
            {
                baseVar->emit(output);
                output << '(' << regOff << ',' << subRegOff << ')';
            }
        }
        else //This is not a RegVar
        {
            // Accumulator with a nonzero register offset is printed as the offset acc
            // register itself (e.g. acc0 with regOff 1 prints as acc1).
            if (base->isAccReg() && regOff != 0)
            {
                bool valid;
                int regNum = base->ExRegNum(valid);
                output << "acc" << regNum + regOff;
            }
            else
            {
                base->emit(output);
            }
            if (!base->isNullReg() && !base->isIpReg() && !base->isNReg() && subRegOff != (short) UNDEFINED_SHORT && printSubReg)
            {
                output << '.' << subRegOff;
            }
        }
    }
    else //This is an indirect access
    {
        if (acc == IndirGRF)
        {
            output << "r[";
        }
        else //Unknown access type
        {
            MUST_BE_TRUE(false, ERROR_UNKNOWN);
        }

        if (base->isRegVar())
        {
            MUST_BE_TRUE(regOff == 0, ERROR_INTERNAL_ARGUMENT);
            G4_RegVar* baseVar = static_cast<G4_RegVar*>(base);
            if (baseVar->isPhyRegAssigned())
            {
                // The base of an indirect access must be an address register.
                MUST_BE_TRUE(baseVar->getPhyReg()->isAreg(), ERROR_UNKNOWN);

                if (symbolreg)
                {
                    output << baseVar->getName();
                    output << '(' << regOff << ',' << subRegOffset << ")," << immAddrOff << ']';
                }
                else
                {
                    (static_cast<G4_Areg*>(baseVar->getPhyReg()))->emit(output);
                    output << '.' << (baseVar->getPhyRegOff() + subRegOffset);
                    {
                        output << ", " << immAddrOff << ']';
                    }
                }
            }
            else //No register assigned yet
            {
                baseVar->emit(output);
                output << '(' << regOff << ',' << subRegOff << ')';
                output << ", " << immAddrOff << ']';
            }
        }
        else if (base->isAreg())
        {
            (static_cast<G4_Areg*>(base))->emit(output);
            output << '.' << subRegOffset;
            {
                output << ", " << immAddrOff << ']';
            }
        }
        else
        {
            MUST_BE_TRUE(false, "Unknown base variable type for indirect access");
        }
    }
}
4599
4600 //
4601 // output <modifier>(Var+refOff).subRegOff<16;16,1>.xxyy
4602 //
4603 // symbolreg == false, output <modifier>(Var+refOff).subRegOff<16;16,1>.xxyy
4604 // symbolreg == true, output <modifier><symbol>(RegOff, SubRegOff)<16;16,1> in symbolic register emit
4605 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
4606 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
4607 // between these two states, that may have potential side effects.
4608 //
void G4_SrcRegRegion::emit(std::ostream& output, bool symbolreg)
{
    // Source modifier (-, (abs), -(abs)) precedes the register.
    if (mod != Mod_src_undef)
    {
        output << SrcModifierStr[mod];
    }

    //
    // output Var(refOff,subRegOff)
    //
    emitRegVarOff(output, symbolreg);
    //
    // output <vertStride;width,horzStride>
    //
    // do not emit region for null reg
    // do not emit region for macro madm
    if (desc && !base->isNullReg() && !base->isNReg() && !isAccRegValid())// rgn == NULL, the default region is used
    {
        // Align1 ternary (3-src, ICLLP+) instructions use the two-value <V;H> syntax.
        bool align1ternary = inst && inst->getNumSrc() == 3 && inst->getPlatform() >= GENX_ICLLP &&
            !inst->isSend() && inst->isAligned1Inst();

        // RegionV is invalid for SRC operands
        if (desc->isRegionWH())
        {
            output << "<" << desc->width << "," << desc->horzStride << ">";
        }
        else if (desc->isRegionSW()) // support <0/4> for Src of Align16 instruction
        {
            output << "<" << desc->vertStride << ">";
        }
        else if (desc->vertStride == UNDEFINED_SHORT && desc->width == UNDEFINED_SHORT)
        {
            output << "<" << desc->horzStride << ">";
        }
        else
        {
            if (align1ternary)
            {
                // format is <V;H> with W derived from V and H
                output << "<" << desc->vertStride << ";" << desc->horzStride << ">";
            }
            else if (!isWithSwizzle())
            {
                // do not print region for align16 sources
                output << "<" << desc->vertStride << ";" << desc->width << "," << desc->horzStride << ">";
            }
        }
    }

    if (isAccRegValid())
    {
        // no vertical stride for 3-source instruction
        if (inst->getNumSrc() != 3 && desc)
        {
            output << "<" << desc->vertStride << ">";
        }

        // output acc2~acc9
        if (getAccRegSel() == NOACC)
        {
            output << ".noacc";
        }
        else
        {
            output <<".acc"<< (getAccRegSel()+2);
        }
    }
    else if (*swizzle)
    {
        // Align16 swizzle suffix, e.g. ".xyzw".
        output << "." << swizzle;
    }

    if (Type_UNDEF != type)
    {
        if (!symbolreg || acc != Direct) // can output register data type for indirect addressing in any time
            output << ':' << TypeSymbol(type);
    }
}
4687
4688 //
4689
4690 // return true if this src is a scalar
4691
4692 // V82(1,0)<0>.xxxx:f
4693
4694 // V82(1,0)<0;1,0>:f --- detect via
4695
4696 //
4697
isScalar() const4698 bool G4_SrcRegRegion::isScalar() const
4699
4700 {
4701
4702 if (!isWithSwizzle())
4703 {
4704
4705 return getRegion()->isScalar(); // check <0;1,0>
4706 }
4707 else
4708 {
4709 return swizzle[0] == 'r';
4710 }
4711
4712 }
4713
4714
4715 //
4716 // This function is used to check if the src operand obey the rule of symbolic register. We need this function to check the operand before we emit an instruction
4717 //
obeySymbolRegRule() const4718 bool G4_SrcRegRegion::obeySymbolRegRule() const
4719 {
4720 if (!base->isRegVar()) // only for reg var
4721 return false;
4722
4723 if (base->asRegVar()->getDeclare()->isSpilled())
4724 {
4725 return false;
4726 }
4727
4728 //
4729 // Rule-3: No swizzle .xyzw
4730 //
4731 if (*swizzle)
4732 {
4733 return false;
4734 }
4735 //
4736 // Rule-4: do not support date type redefinition in direct addressing
4737 //
4738 if (Type_UNDEF != type)
4739 {
4740 if (base->isRegVar() && acc == Direct && base->asRegVar()->getDeclare()->getElemType() != type) // check if the data type is the same as in declare
4741 {
4742 return false;
4743 }
4744 }
4745
4746 return true;
4747 }
4748
4749 //
4750 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
4751 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
4752 // between these two states, that may have potential side effects.
4753 //
emitRegVarOff(std::ostream & output,bool symbolreg)4754 void G4_SrcRegRegion::emitRegVarOff(std::ostream& output, bool symbolreg)
4755 {
4756 bool printSubReg = true;
4757 if (inst && inst->isSend())
4758 {
4759 printSubReg = false;
4760 }
4761 printRegVarOff(output, this, regOff,subRegOff,immAddrOff,type, symbolreg, printSubReg);
4762 }
4763
4764 //
4765 // initial all values idential to rgn's
4766 //
// Copy constructor: initialize all values identical to rgn's.
G4_DstRegRegion::G4_DstRegRegion(G4_DstRegRegion &rgn)
    : G4_Operand(G4_Operand::dstRegRegion)
{
    // Region description fields.
    acc = rgn.acc;
    base = rgn.base;
    regOff = rgn.regOff;
    subRegOff = rgn.subRegOff;
    immAddrOff = rgn.immAddrOff;
    horzStride = rgn.horzStride;
    type = rgn.type;
    writeMask = rgn.writeMask;
    accRegSel = rgn.accRegSel;

    // Cached bound/footprint state is copied verbatim rather than recomputed.
    top_dcl = rgn.top_dcl;
    left_bound = rgn.left_bound;
    right_bound = rgn.right_bound;
    bitVec[0] = rgn.bitVec[0];
    bitVec[1] = rgn.bitVec[1];
    byteOffset = rgn.byteOffset;
    rightBoundSet = rgn.rightBoundSet;
}
4788
// Computes left_bound and byteOffset for this dst region.
// Flag registers are bounded in bits; all other register files in bytes.
void G4_DstRegRegion::computeLeftBound()
{
    top_dcl = NULL;
    uint32_t newregoff = regOff, offset = 0;
    if (base && base->isRegVar())
    {
        top_dcl = base->asRegVar()->getDeclare();
        if (!top_dcl && base->asRegVar()->isGreg())
        {
            // No declare: base is a physical GRF, use its register number directly.
            newregoff = base->asRegVar()->asGreg()->getRegNum();
        }
    }

    if (top_dcl)
    {
        // Walk the alias chain to the root declare, accumulating byte offsets.
        while (top_dcl->getAliasDeclare())
        {
            offset += top_dcl->getAliasOffset();
            top_dcl = top_dcl->getAliasDeclare();
        }
    }

    if (base && base->isFlag())
    {
        if (base->isRegVar())
        {
            if (base->asRegVar()->getPhyReg())
            {
                left_bound = base->asRegVar()->getPhyRegOff() * 16; // the bound of flag register is in unit of BIT
                left_bound += subRegOff * 16;
                left_bound += base->asRegVar()->getPhyReg()->asAreg()->getFlagNum() * 32;
            }
            else
            {
                left_bound = subRegOff * 16;
            }
        }
        else
        {
            left_bound = subRegOff * 16;
            left_bound += base->asAreg()->getFlagNum() * 32;
        }

        byteOffset = left_bound / 8;
    }
    else if (base != NULL && base->isAccReg())
    {
        left_bound = subRegOff * TypeSize(type);
        // acc1 (or a regOff of 1 on acc0) starts one GRF into the accumulator file.
        if (base->asAreg()->getArchRegType() == AREG_ACC1 || regOff == 1)
        {
            left_bound += getGRFSize();
        }
        byteOffset = left_bound;
    } else if (top_dcl) {
        if (acc == Direct) {
            left_bound = offset + newregoff * numEltPerGRF<Type_UB>() + subRegOff * TypeSize(type);
            if (top_dcl->getTotalElems() * top_dcl->getElemSize() >= (int)numEltPerGRF<Type_UB>()) {
                byteOffset = left_bound;
            }
            else {
                // Sub-GRF declares: fold the sub-register alignment padding into byteOffset.
                unsigned alignOff = TypeSize(type) > TypeSize(Type_W) ?
                    TypeSize(type) : TypeSize(Type_W);

                if (top_dcl->getSubRegAlign() == Even_Word || top_dcl->getSubRegAlign() >= Four_Word) {
                    alignOff = top_dcl->getSubRegAlign() * 2;
                }

                byteOffset = left_bound + alignOff;
            }
        }
        else {
            // Indirect: the bound tracks the address register, not the target GRF.
            left_bound = subRegOff * TypeSize(ADDR_REG_TYPE);
            byteOffset = TypeSize(type);
        }
    } else { // arch reg
        left_bound = 0;
        byteOffset = left_bound;
    }
}
4868 //
4869 // Initialize all values idential to rgn's, except for the base operand.
4870 // Caller is responsible for allocating base operand and making sure it doesn't
4871 // mess up the operands' hash table.
4872 //
// Initialize all values identical to rgn's, except for the base operand.
// Caller is responsible for allocating the new base and making sure it doesn't
// mess up the operands' hash table.
G4_DstRegRegion::G4_DstRegRegion(G4_DstRegRegion &rgn, G4_VarBase *new_base)
    : G4_Operand(G4_Operand::dstRegRegion)
{
    acc = rgn.acc;
    regOff = rgn.regOff;
    subRegOff = rgn.subRegOff;
    immAddrOff = rgn.immAddrOff;
    horzStride = rgn.horzStride;
    type = rgn.type;
    writeMask = rgn.writeMask;
    base = new_base;

    // Bounds depend on the base, so recompute instead of copying from rgn;
    // the right bound is left unset until someone asks for it.
    computeLeftBound();
    rightBoundSet = false;
}
4888
setDstBitVec(uint8_t exec_size)4889 void G4_DstRegRegion::setDstBitVec(uint8_t exec_size)
4890 {
4891 // byte level footprint computing bit vectors.
4892 uint64_t footprint0 = 0;
4893 uint64_t footprint1 = 0;
4894
4895 unsigned short type_size = getTypeSize();
4896 unsigned short s_size = horzStride * type_size;
4897
4898 // General cases.
4899 uint64_t bit_seq = TypeFootprint(type);
4900 for (uint8_t i = 0; i < exec_size; ++i)
4901 {
4902 int eltOffset = i * s_size;
4903 // no element can cross 64-byte boundary
4904 if (eltOffset >= 64)
4905 {
4906 footprint1 |= bit_seq << (eltOffset - 64);
4907 }
4908 else
4909 {
4910 footprint0 |= bit_seq << eltOffset;
4911 }
4912 }
4913
4914 bitVec[0] = footprint0;
4915 bitVec[1] = footprint1;
4916
4917 return;
4918 }
4919
// Computes (and caches) the right bound of this dst for the given execution size,
// also filling in the byte/bit footprint vectors.
unsigned G4_DstRegRegion::computeRightBound(uint8_t exec_size)
{
    bitVec[0] = 0;
    bitVec[1] = 0;

    if (base->isFlag()) {
        // Flag dst: bounds are in units of bits.
        unsigned int totalBits = 0;
        if (G4_Inst_Table[inst->opcode()].instType != InstTypePseudoLogic)
        {
            // mov (1) f0.1<1>:uw ...
            // subreg is 1 if it's a 32 bit flag and we want to set the upper 16 bits
            left_bound = subRegOff * 16;
            totalBits = TypeBitSize(type);
        }
        else
        {
            /*
                we need to set leftBound for pseudo instruction
                so that it creates use/def links correctly in the control flow graph between
                cmp instruction and pseudo instruction.
                This matters when we break up SIMD32 instruction in to two SIMD16 with H1/H2 masks.
                The bound for compare for H2 will be [15,31], and this has to match.
                Without this no use/def link was created which caused issues in logic optimization.
                Also it produce incorrect behavior in any operation that relies on compareOperand.
            */
            left_bound = inst->getMaskOffset();
            totalBits = exec_size;
        }

        right_bound = left_bound + totalBits - 1;

        // 32-bit case is special-cased to avoid an undefined full-width shift.
        bitVec[0] = totalBits == 32 ? 0xFFFFFFFF : (1 << totalBits) - 1;
    }
    else
    {
        // For call, the return addr is always set as if simd2.
        if (inst->isCall() || inst->isFCall())
        {
            exec_size = 2;
        }

        if (acc == Direct)
        {
            setDstBitVec(exec_size);

            unsigned short type_size = TypeSize(type);
            unsigned short s_size = horzStride * type_size;
            unsigned totalBytes = (exec_size - 1) * s_size + type_size;

            // For wide dst instructions like madw opcode, the dst(SOA layout) size should be the sum of low result size and high
            // result size, and also both low and high results are GRF-aligned.
            if (INST_WIDE_DST(inst->opcode()))
            {
                unsigned totalBytesDstLow = (totalBytes + getGRFSize() - 1) & (~(getGRFSize() - 1)); // GRF-aligned
                totalBytes = totalBytesDstLow * 2;
            }

            right_bound = left_bound + totalBytes - 1;
        }
        else
        {
            // indirect: the footprint is just the 2-byte address register.
            bitVec[0] |= 0x3;
            right_bound = left_bound + TypeSize(ADDR_REG_TYPE) - 1;
        }
    }
    rightBoundSet = true;
    return right_bound;
}
4989
4990 /// compare regRegion to opnd
4991 /// regRegion is either a SrcRegRegion or DstRegRegion, opnd can be any G4_operand
4992 /// We put this in a separate function since G4_DstRegRegion and G4_SrcRegRegion
4993 /// should have (nearly) identical code for compareOperand
compareRegRegionToOperand(G4_Operand * regRegion,G4_Operand * opnd)4994 static G4_CmpRelation compareRegRegionToOperand(G4_Operand* regRegion, G4_Operand* opnd)
4995 {
4996 assert((regRegion->isSrcRegRegion() || regRegion->isDstRegRegion()) && "expect either src or dst regRegion");
4997 bool legal_opnd = opnd->isSrcRegRegion() || opnd->isDstRegRegion() || opnd->isPredicate() || opnd->isCondMod() || opnd->isAddrExp();
4998 G4_VarBase* myBase = regRegion->getBase();
4999 G4_VarBase *opndBase = opnd->getBase();
5000 G4_RegAccess myAcc = regRegion->getRegAccess();
5001 G4_RegAccess opndAcc = opnd->getRegAccess();
5002 G4_Declare* myDcl = regRegion->getTopDcl();
5003 G4_Declare* opndDcl = opnd->getTopDcl();
5004 if (opnd->isAddrExp())
5005 {
5006 opndBase = opnd->asAddrExp()->getRegVar()->getBaseRegVar();
5007 opndDcl = opnd->asAddrExp()->getRegVar()->getDeclare();
5008 }
5009
5010 if (regRegion->isAddrExp())
5011 {
5012 myBase = opnd->asAddrExp()->getRegVar()->getBaseRegVar();
5013 myDcl = opnd->asAddrExp()->getRegVar()->getDeclare();
5014 }
5015
5016 if (!legal_opnd || myBase == nullptr || opndBase == nullptr)
5017 {
5018 // a null base operand can never interfere with anything
5019 return Rel_disjoint;
5020 }
5021
5022 if (myDcl == opndDcl && opndDcl != nullptr)
5023 {
5024 // special checks for pseudo kills
5025 G4_INST* myInst = regRegion->getInst();
5026 G4_INST* opndInst = opnd->getInst();
5027 if (myInst && (myInst->isPseudoKill() || myInst->isLifeTimeEnd()))
5028 {
5029 return Rel_interfere;
5030 }
5031
5032 if (opndInst && (opndInst->isPseudoKill() || opndInst->isLifeTimeEnd()))
5033 {
5034 return Rel_interfere;
5035 }
5036
5037 if (opnd->isAddrExp() || regRegion->isAddrExp())
5038 {
5039 return Rel_interfere;
5040 }
5041 }
5042
5043 if (opndAcc == myAcc && myAcc != Direct)
5044 {
5045 // two indirect are assumed to interfere in the absence of pointer analysis
5046 return Rel_interfere;
5047 }
5048 else if (opndAcc != myAcc)
5049 {
5050 // direct v. indirect
5051 // the two may inteferce if the direct operand is either an address-taken GRF or an address operand
5052 // we could make the check tighter by considering the offsets of the address operand,
5053 // but it won't much difference in practice
5054 auto mayInterfereWithIndirect = [](G4_Operand* direct, G4_Operand* indirect)
5055 {
5056 assert((direct->getRegAccess() == Direct && indirect->getRegAccess() == IndirGRF) &&
5057 "first opereand should be direct and second indirect");
5058 return (direct->getTopDcl() && direct->getTopDcl()->getAddressed()) ||
5059 (direct->isAddress() && direct->getTopDcl() == indirect->getTopDcl());
5060 };
5061
5062 if ((opndAcc != Direct && mayInterfereWithIndirect(regRegion, opnd)) ||
5063 (myAcc != Direct && mayInterfereWithIndirect(opnd, regRegion)))
5064 {
5065 return Rel_interfere;
5066 }
5067 return Rel_disjoint;
5068 }
5069
5070 // both are physically assigned.
5071 G4_VarBase *myPhyReg = myBase->isRegVar() ? myBase->asRegVar()->getPhyReg() : myBase;
5072 G4_VarBase *opndPhyReg = opndBase->isRegVar() ? opndBase->asRegVar()->getPhyReg() : opndBase;
5073 if (myPhyReg && opndPhyReg)
5074 {
5075 assert(myPhyReg->isPhyReg() && opndPhyReg->isPhyReg());
5076 if (myPhyReg->getKind() != opndPhyReg->getKind())
5077 return Rel_disjoint;
5078
5079 if (myPhyReg->isPhyAreg())
5080 {
5081 if (myPhyReg->asAreg()->getArchRegType() == AREG_NULL)
5082 {
5083 //like NaN, a null ARF is disjoint to everyone including itself
5084 return Rel_disjoint;
5085 }
5086
5087 // TODO: this is not accurate for flag/acc/address.
5088 return (myPhyReg->asAreg()->getArchRegType() ==
5089 opndPhyReg->asAreg()->getArchRegType()) ? Rel_eq : Rel_disjoint;
5090 }
5091
5092 // TODO: handle physically assigned GRF reg. Right now this should
5093 // not happen prior to RA.
5094 }
5095
5096 if (myBase->getKind() != opndBase->getKind())
5097 {
5098 return Rel_disjoint;
5099 }
5100
5101 if (myDcl != opndDcl)
5102 {
5103 return Rel_disjoint;
5104 }
5105
5106 unsigned int left_bound2 = opnd->getLeftBound(), right_bound2 = opnd->getRightBound();
5107 uint32_t myLeftBound = regRegion->getLeftBound();
5108 uint32_t myRightBound = regRegion->getRightBound();
5109
5110 {
5111 uint64_t opndBitVecL = opnd->getBitVecL(), opndBitVecH = opnd->getBitVecH();
5112 uint64_t myBitVecL = regRegion->getBitVecL(), myBitVecH = regRegion->getBitVecH();
5113 if (myRightBound < left_bound2 || right_bound2 < myLeftBound)
5114 {
5115 return Rel_disjoint;
5116 }
5117 else if (myLeftBound == left_bound2 &&
5118 myRightBound == right_bound2 &&
5119 myBitVecL == opndBitVecL && myBitVecH == opndBitVecH)
5120 {
5121 return Rel_eq;
5122 }
5123 else
5124 {
5125 // First consider if any operand is > two GRFs. If so we just compare the bound
5126 // as such operands are assumed to touch every element within the bound.
5127 bool meExceedTwoGRF = (myRightBound - myLeftBound) > 2u * getGRFSize();
5128 bool opndExceedTwoGRF = (right_bound2 - left_bound2) > 2u * getGRFSize();
5129 if (meExceedTwoGRF || opndExceedTwoGRF)
5130 {
5131 if (left_bound2 >= myLeftBound && right_bound2 <= myRightBound)
5132 {
5133 return Rel_gt;
5134 }
5135 else if (myLeftBound >= left_bound2 && myRightBound <= right_bound2)
5136 {
5137 return Rel_lt;
5138 }
5139 return Rel_interfere;
5140 }
5141
5142 // Now both operands are within two GRFs, compare their footprint to get precise relations
5143 int maskSize = 2 * getGRFSize();
5144 if (myDcl)
5145 {
5146 maskSize = myDcl->getRegVar()->isFlag() ? myDcl->getNumberFlagElements()
5147 : myDcl->getByteSize();
5148 }
5149 BitSet myBitSet(maskSize, false);
5150 BitSet otherBitSet(maskSize, false);
5151 regRegion->updateFootPrint(myBitSet, true);
5152 opnd->updateFootPrint(otherBitSet, true);
5153
5154 BitSet tmp = myBitSet;
5155 myBitSet &= otherBitSet;
5156 if (myBitSet.isEmpty())
5157 {
5158 return Rel_disjoint;
5159 }
5160
5161 myBitSet = tmp;
5162 myBitSet -= otherBitSet;
5163 if (myBitSet.isEmpty())
5164 {
5165 return Rel_lt;
5166 }
5167 otherBitSet -= tmp;
5168 return otherBitSet.isEmpty() ? Rel_gt : Rel_interfere;
5169 }
5170 }
5171 }
5172
// Footprint comparison of this dst against an arbitrary operand;
// delegates to the shared src/dst helper above.
G4_CmpRelation G4_DstRegRegion::compareOperand(G4_Operand *opnd)
{
    return compareRegRegionToOperand(this, opnd);
}
5177
isNativeType() const5178 bool G4_DstRegRegion::isNativeType() const
5179 {
5180 G4_Type type = getType();
5181
5182 if (IS_WTYPE(type) || IS_DTYPE(type) || IS_FTYPE(type) || type == Type_DF) {
5183 return true;
5184 }
5185 else {
5186 return false;
5187 }
5188 }
5189
isNativePackedRowRegion() const5190 bool G4_DstRegRegion::isNativePackedRowRegion() const
5191 {
5192 if (isNativeType()) {
5193 return horzStride == 1;
5194 }
5195 else {
5196 return false;
5197 }
5198 }
5199
// For a dst (single-row) region, packed-region reduces to packed-row-region.
bool G4_DstRegRegion::isNativePackedRegion() const
{
    return isNativePackedRowRegion();
}
5204
coverGRF(uint16_t numGRF,uint8_t execSize)5205 bool G4_DstRegRegion::coverGRF(uint16_t numGRF, uint8_t execSize)
5206 {
5207 uint32_t size = numEltPerGRF<Type_UB>() * numGRF;
5208 uint32_t range = getRightBound() - getLeftBound() + 1;
5209 if (acc == Direct)
5210 {
5211 if (range == size)
5212 {
5213 return true;
5214 }
5215 if (horzStride > 1)
5216 {
5217 if (size == execSize * horzStride * TypeSize(type))
5218 {
5219 return true;
5220 }
5221 }
5222 }
5223 else
5224 {
5225 if (size == execSize * horzStride * TypeSize(type))
5226 {
5227 return true;
5228 }
5229 }
5230 return false;
5231 }
5232
5233 // Check if dst satisfies the following conditions(for platforms before BDW):
5234 //The destination region is entirely contained in the lower OWord of a register.
5235 //The destination region is entirely contained in the upper OWord of a register.
5236 //The destination elements are evenly split between the two OWords of a register.
5237
// Returns true if this one-GRF dst satisfies the pre-BDW restrictions listed above:
// contained in a single half-GRF, or evenly split between the two halves.
bool G4_DstRegRegion::goodOneGRFDst(uint8_t execSize)
{
    // Indirect dst: legal iff the strided footprint is exactly one GRF.
    if (acc != Direct)
    {
        return horzStride * TypeSize(type) * execSize == numEltPerGRF<Type_UB>();
    }
    // Midpoint of the region, padding the span up to a full stride.
    uint32_t halfSize = (getRightBound() - getLeftBound() + 1 + (horzStride - 1) * getTypeSize()) / 2;
    uint32_t middle = getLeftBound() + halfSize;
    // Legal if both bounds fall in the same half-GRF, or the region spans two
    // adjacent half-GRFs with the midpoint at the same in-half offset as the
    // start (i.e. elements are evenly split between the two halves).
    if (getLeftBound()/(numEltPerGRF<Type_UB>()/2) == getRightBound()/(numEltPerGRF<Type_UB>()/2) ||
        (getLeftBound()/(numEltPerGRF<Type_UB>()/2) == (getRightBound()/(numEltPerGRF<Type_UB>()/2) - 1) &&
        getLeftBound()%(numEltPerGRF<Type_UB>()/2) == middle%(numEltPerGRF<Type_UB>()/2)))
    {
        return true;
    }
    return false;
}
5254
// A two-GRF dst is legal iff its elements split evenly across the GRF boundary.
bool G4_DstRegRegion::goodtwoGRFDst(uint8_t execSize)
{
    return evenlySplitCrossGRF(execSize);
}
5259
5260 // this is true if dst crosses GRF and has same number of elements in both GRFs
5261 // (i.e, the middle element has same GRF offset as the start element)
evenlySplitCrossGRF(uint8_t execSize)5262 bool G4_DstRegRegion::evenlySplitCrossGRF(uint8_t execSize)
5263 {
5264 // check number of elements in first GRF.
5265 MUST_BE_TRUE(acc == Direct, "Indirect operand can not cross GRF boundary.");
5266
5267 if (execSize == 1)
5268 {
5269 return false;
5270 }
5271
5272 int halfBytes = left_bound + horzStride * TypeSize(type) * (execSize / 2);
5273 int halfOffset = halfBytes % numEltPerGRF<Type_UB>();
5274 int startOffset = left_bound % numEltPerGRF<Type_UB>();
5275 return halfOffset == startOffset;
5276 }
5277
5278 /*
5279 * check if the input opnd is align to GRF
5280 * if the first level dcl is not aligned to GRF or sub register offset of this opnd is not multiple GRFs, including 0,
5281 * return true.
5282 */
// Returns true if this dst operand is known to start on a GRF boundary:
// the sub-register byte offset, every alias offset on the declare chain, and
// the root declare's own alignment must all be GRF-aligned.
bool G4_DstRegRegion::checkGRFAlign() const
{
    bool GRF_aligned = false;
    unsigned byte_subregoff = subRegOff * TypeSize(type);

    // Sub-register offset must itself land on a GRF boundary.
    if (byte_subregoff % numEltPerGRF<Type_UB>() != 0)
    {
        return false;
    }

    if (base)
    {
        if (base->isRegVar())
        {
            G4_Declare *dcl = base->asRegVar()->getDeclare();

            if (dcl)
            {
                G4_Declare *aliasdcl = dcl;

                // Fold in offsets along the alias chain; any non-GRF-multiple
                // alias offset breaks alignment.
                unsigned aliasOffset = 0;
                while (aliasdcl->getAliasDeclare())
                {
                    aliasOffset += aliasdcl->getAliasOffset();
                    aliasdcl = aliasdcl->getAliasDeclare();
                }
                if (aliasOffset % numEltPerGRF<Type_UB>() != 0)
                {
                    return false;
                }

                // Root declare is aligned if it is explicitly GRF-aligned or large
                // enough to occupy at least a full GRF.
                // NOTE(review): the size term multiplies getElemSize() twice; it
                // looks like it was meant to be rows * element-count * element-size
                // (cf. getTotalElems()*getElemSize() in computeLeftBound) — confirm
                // before relying on this branch for small declares.
                if (aliasdcl->getSubRegAlign() >= GRFALIGN ||
                    aliasdcl->getNumRows() * aliasdcl->getElemSize() * aliasdcl->getElemSize() >= (int)numEltPerGRF<Type_UB>()) {
                    return true;
                }
            }
            else if (base->asRegVar()->isPhyRegAssigned() &&
                base->asRegVar()->getByteAddr() % numEltPerGRF<Type_UB>() == 0)
            {
                // No declare, but the physical byte address is GRF-aligned.
                return true;
            }
        }
    }

    return GRF_aligned;
}
5329
5330 //
5331 // returns true if this operand (must be either Src or DstRegRegion) has a fixed subreg offset.
5332 // This is true only if
5333 // -- operand is direct,
5334 // -- operand has assigned GRF (i.e., input), or
5335 // -- base declare is a GRF variable that is GRF-aligned
5336 // if true, the subreg offset is also returned via offset in bytes
5337 // Note this always returns false for ARFs (flag, addr, etc.)
5338 //
regionHasFixedSubreg(G4_Operand * opnd,uint32_t & offset)5339 static bool regionHasFixedSubreg(G4_Operand* opnd, uint32_t& offset)
5340 {
5341 assert(opnd->isSrcRegRegion() || opnd->isDstRegRegion());
5342 short subRegOff = 0;
5343 if (opnd->isSrcRegRegion())
5344 {
5345 if (opnd->asSrcRegRegion()->getRegAccess() != Direct)
5346 {
5347 return false;
5348 }
5349 subRegOff = opnd->asSrcRegRegion()->getSubRegOff();
5350 }
5351 else if (opnd->isDstRegRegion())
5352 {
5353 if (opnd->asDstRegRegion()->getRegAccess() != Direct)
5354 {
5355 return false;
5356 }
5357 subRegOff = opnd->asDstRegRegion()->getSubRegOff();
5358 }
5359
5360 G4_VarBase* base = opnd->getBase();
5361
5362 if (base == NULL || !base->isRegVar() || !base->asRegVar()->getDeclare()->useGRF())
5363 {
5364 return false;
5365 }
5366
5367 if (base->asRegVar()->isPhyRegAssigned())
5368 {
5369 offset = (subRegOff + base->asRegVar()->getPhyRegOff()) * TypeSize(opnd->getType());
5370 offset %= getGRFSize();
5371 return true;
5372 }
5373
5374 uint32_t subregByte = 0;
5375 G4_Declare *rootDcl = base->asRegVar()->getDeclare()->getRootDeclare(subregByte);
5376 subregByte += subRegOff * TypeSize(opnd->getType());
5377
5378 if (rootDcl->getSubRegAlign() < GRFALIGN)
5379 {
5380 return false;
5381 }
5382 offset = subregByte % numEltPerGRF<Type_UB>();
5383
5384 return true;
5385 }
5386
5387
// Returns true (with the byte offset in 'offset') if this dst has a fixed
// sub-register offset; see regionHasFixedSubreg above for the conditions.
bool G4_DstRegRegion::hasFixedSubregOffset(uint32_t& offset)
{
    return regionHasFixedSubreg(this, offset);
}
5392
5393 // compute max execution size starting from the current pos.
5394 // power of two. no cross GRF boundary is allowed now.
5395 // TODO: cross GRF is allowed in BDW+.
5396 // cross half-GRF should guaranttee evenly split
// Computes the max legal execution size for this dst starting at channel 'pos',
// capped by maxExSize. Result is a power of two; GRF-crossing is only kept when
// the elements split evenly, and half-GRF crossing is checked when the sources
// span two GRFs (twoGRFsrc).
uint8_t G4_DstRegRegion::getMaxExecSize(int pos, uint8_t maxExSize, bool twoGRFsrc)
{
    // Indirect dst: no GRF-boundary information, just round down to a power of two.
    if (acc != Direct)
    {
        return roundDownPow2(maxExSize);
    }

    uint8_t elSize = (uint8_t)getTypeSize();
    // Bytes advanced per channel.
    uint8_t exTypeSize = horzStride * elSize;
    uint8_t maxSize = roundDownPow2(maxExSize);
    uint32_t newLB = getLeftBound() + pos * exTypeSize,
        newRB = newLB + (maxExSize - 1) * exTypeSize + elSize - 1;
    uint32_t leftGRF = newLB / numEltPerGRF<Type_UB>(), rightGRF = newRB / numEltPerGRF<Type_UB>();
    // pre-BDW does not allow cross GRF dst except full 2-GRF dst.
    // BDW+ allows if elements are evenly split between two GRFs
    bool crossGRF = false;
    if (isCrossGRFDst())
    {
        // check cross GRF boundary
        uint8_t byteInFirstGRF = ((leftGRF + 1) * numEltPerGRF<Type_UB>() - newLB);
        uint8_t eleInFirstGRF = byteInFirstGRF / exTypeSize +
            // v20(0,17)<2>:ub and simd size is 16
            ((byteInFirstGRF % exTypeSize != 0) && (byteInFirstGRF % exTypeSize >= elSize) ? 1 : 0);

        if (leftGRF != rightGRF)
        {
            uint8_t pow2 = roundDownPow2(eleInFirstGRF);
            if (pow2 != eleInFirstGRF)
            {
                // First GRF holds a non-power-of-two element count: stop at the boundary.
                maxSize = pow2;
                newRB = newLB + (maxSize - 1) * exTypeSize + elSize - 1;
            }
            else
            {
                // number of elements in first GRF is power of 2 and HS is not used to cross GRF
                // search into second GRF
                // if number of elements in second GRF >= number of elements in first GRF
                uint8_t byteInSecondGRF = (newRB + 1) % numEltPerGRF<Type_UB>();
                uint8_t eleInSecondGRF = byteInSecondGRF / exTypeSize + (horzStride > 1 ? 1 : 0);
                if (eleInSecondGRF >= eleInFirstGRF)
                {
                    // Evenly split: keep the full two-GRF execution size.
                    crossGRF = true;
                    maxSize = eleInFirstGRF * 2;
                }
            }
        }
    }
    // check if cross half-GRF boundary
    // FIXME: if we know that the new srcs are all in one GRF, we do not have to do the following check.
    if (!crossGRF && twoGRFsrc)
    {
        uint32_t halfGRFSize = numEltPerGRF<Type_UB>() / 2;
        if (newLB / halfGRFSize != newRB / halfGRFSize)
        {
            uint32_t middlePoint = (newRB + (horzStride - 1) * elSize - newLB + 1) / 2;
            // check middle point
            if ((middlePoint + newLB) % halfGRFSize != 0)
            {
                // check size before half-GRF
                uint8_t sizeBeforeMidGRF = ((leftGRF * numEltPerGRF<Type_UB>() + halfGRFSize) - newLB + exTypeSize - 1) / exTypeSize;
                uint8_t pow2Size = roundDownPow2(sizeBeforeMidGRF);
                // V36(0,1)<4>:ud is split into 2x2
                if (sizeBeforeMidGRF <= (maxSize >> 1) && pow2Size == sizeBeforeMidGRF)
                {
                    // Even split around the half-GRF point is allowed.
                    maxSize = 2 * pow2Size;
                }
                else
                {
                    maxSize = pow2Size;
                }
            }
        }
    }

    return maxSize;
}
5473 //
5474 // output (Var+refOff).subRegOff<1><WriteMask>
5475 //
5476 // symbolreg == false, output <modifier>(Var+refOff).subRegOff<16;16,1>.xxyy
5477 // symbolreg == true, output <modifier><symbol>(RegOff, SubRegOff)<16;16,1> in symbolic register emit
5478 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
5479 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
5480 // between these two states, that may have potential side effects.
5481 //
void G4_DstRegRegion::emit(std::ostream& output, bool symbolreg)
{
    //
    // output Var(refOff,subRegOff)
    //
    emitRegVarOff(output, symbolreg);

    //
    // output <horzStride>
    // NOTE: the branches below are order-dependent -- split sends, align16
    // operands (writeMask set), and madm operands (acc select valid) all
    // suppress the region before horzStride is consulted.
    //
    if (inst != NULL && inst->isSplitSend())
    {
        // do nothing for sends
    }
    else if (writeMask != NoChannelEnable)
    {
        // do nothing for align16 instructions
    }
    else if (isAccRegValid())
    {
        // do nothing for madm
    }
    else if (horzStride != UNDEFINED_SHORT)
    {
        output << '<' << horzStride << '>';
    }
    else if (base->isAreg())
    {
        // architecture registers default to a unit stride
        output << "<1>";
    }
    else if (base->isNullReg())
    {
        // do not emit region for null reg
    }
    else if (base->isFlag())
    {
        output << "<1>";
    }
    else
    {
        MUST_BE_TRUE(false, "No default region specified");
    }

    // madm operands carry an explicit accumulator select instead of a region
    if (isAccRegValid())
    {
        // output acc2~acc9
        if (getAccRegSel() == NOACC)
        {
            output << ".noacc";
        }
        else
        {
            output <<".acc"<< (getAccRegSel()+2);
        }
    }
    else if (writeMask != NoChannelEnable)
    {
        // align16 write mask, e.g. ".xyzw"
        output << "." << getChannelEnableStr(writeMask);
    }

    if (Type_UNDEF != type)
    {
        if (!symbolreg || acc != Direct) // can output register data type for indirect addressing in any time
            output << ':' << TypeSymbol(type);
    }
}
5548
5549 //
5550 // This function is used to check if the src operand obey the rule of symbolic register. We need this function to check the operand before we emit an instruction
5551 //
obeySymbolRegRule() const5552 bool G4_DstRegRegion::obeySymbolRegRule() const
5553 {
5554 if (!base->isRegVar()) // only for reg var
5555 return false;
5556 if (base->asRegVar()->getDeclare()->isSpilled())
5557 {
5558 return false;
5559 }
5560 //
5561 // For dst operand, we do not have Rule-2
5562 // Rule-2: must have register region or default register region
5563 //
5564 // Rule-3: No swizzle .xyzw
5565 //
5566 if (writeMask != NoChannelEnable)
5567 {
5568 return false;
5569 }
5570 //
5571 // Rule-4: do not support date type redefinition in direct addressing
5572 //
5573 if (Type_UNDEF != type)
5574 {
5575 if (base->isRegVar() && acc == Direct && base->asRegVar()->getDeclare()->getElemType() != type) // check if the data type is the same as in declare
5576 {
5577 return false;
5578 }
5579 }
5580
5581 return true;
5582 }
5583
5584 //
5585 // Here we use symbolreg instead of Options::symbolReg, because sometimes we need to emit an instruction with invalid symbolic register as comment and
5586 // then emit a workable instruction with physical register. In this case, we do not want to re-set the global option variable Options::symbolReg to switch
5587 // between these two states, that may have potential side effects.
5588 //
emitRegVarOff(std::ostream & output,bool symbolreg)5589 void G4_DstRegRegion::emitRegVarOff(std::ostream& output, bool symbolreg)
5590 {
5591 bool printSubReg = true;
5592 if (inst != NULL && inst->isSplitSend())
5593 {
5594 printSubReg = false;
5595 }
5596 printRegVarOff(output, this, regOff,subRegOff,immAddrOff,type, symbolreg, printSubReg);
5597 }
5598
5599 //
5600 // return true if prd and this are the same inst predicate
5601 //
samePredicate(const G4_Predicate & prd) const5602 bool G4_Predicate::samePredicate(const G4_Predicate& prd) const
5603 {
5604 return getBase() == prd.getBase() &&
5605 state == prd.state &&
5606 subRegOff == prd.subRegOff &&
5607 control == prd.control;
5608 }
5609 //
5610 // return true if mod and this are the same condition modifier
5611 //
sameCondMod(const G4_CondMod & m) const5612 bool G4_CondMod::sameCondMod(const G4_CondMod& m) const
5613 {
5614 return getBase() == m.getBase() &&
5615 mod == m.mod &&
5616 subRegOff == m.subRegOff;
5617 }
5618
5619 //
5620 // create all physical register operands
5621 //
PhyRegPool::PhyRegPool(Mem_Manager& m, unsigned int maxRegisterNumber)
{
    maxGRFNum = maxRegisterNumber;

    // All register objects are placement-allocated from the arena 'm';
    // they are reclaimed with the arena and never freed individually.
    GRF_Table = (G4_Greg**)m.alloc(sizeof(G4_Greg*) * maxGRFNum);
    // create General Registers
    for (unsigned int i = 0; i < maxGRFNum; i++)
        GRF_Table[i] = new (m) G4_Greg(i);

    // null out every slot first; not every AREG_* value is populated below
    for (unsigned i = 0; i < AREG_LAST; i++)
    {
        ARF_Table[i] = nullptr;
    }

    // create Architecture Registers
    ARF_Table[AREG_NULL] = new (m) G4_Areg(AREG_NULL);
    ARF_Table[AREG_A0] = new (m) G4_Areg(AREG_A0);
    ARF_Table[AREG_ACC0] = new (m) G4_Areg(AREG_ACC0);
    ARF_Table[AREG_ACC1] = new (m) G4_Areg(AREG_ACC1);
    ARF_Table[AREG_MASK0] = new (m) G4_Areg(AREG_MASK0);
    ARF_Table[AREG_MS0] = new (m) G4_Areg(AREG_MS0);
    ARF_Table[AREG_DBG] = new (m) G4_Areg(AREG_DBG);
    ARF_Table[AREG_SR0] = new (m) G4_Areg(AREG_SR0);
    ARF_Table[AREG_CR0] = new (m) G4_Areg(AREG_CR0);
    ARF_Table[AREG_TM0] = new (m) G4_Areg(AREG_TM0);
    ARF_Table[AREG_N0] = new (m) G4_Areg(AREG_N0);
    ARF_Table[AREG_N1] = new (m) G4_Areg(AREG_N1);
    ARF_Table[AREG_IP] = new (m) G4_Areg(AREG_IP);
    ARF_Table[AREG_F0] = new (m) G4_Areg(AREG_F0);
    ARF_Table[AREG_F1] = new (m) G4_Areg(AREG_F1);
    ARF_Table[AREG_TDR0] = new (m) G4_Areg(AREG_TDR0);
    ARF_Table[AREG_SP] = new (m)G4_Areg(AREG_SP);
    ARF_Table[AREG_F2] = new (m) G4_Areg(AREG_F2);
    ARF_Table[AREG_F3] = new (m) G4_Areg(AREG_F3);
}
5657
rebuildRegPool(Mem_Manager & m,unsigned int numRegisters)5658 void PhyRegPool::rebuildRegPool(Mem_Manager& m, unsigned int numRegisters)
5659 {
5660 maxGRFNum = numRegisters;
5661
5662 GRF_Table = (G4_Greg**)m.alloc(sizeof(G4_Greg*) * maxGRFNum);
5663 // create General Registers
5664 for (unsigned int i = 0; i < maxGRFNum; i++)
5665 GRF_Table[i] = new (m) G4_Greg(i);
5666 }
5667
// Forward to the underlying register variable: request even-GRF alignment.
void G4_Declare::setEvenAlign()
{
    regVar->setEvenAlign();
}
5672
// Forward to the underlying register variable: set subregister alignment.
void G4_Declare::setSubRegAlign(G4_SubReg_Align subAl)
{
    regVar->setSubRegAlignment(subAl);
}
5677
// Query the underlying register variable's even-GRF alignment flag.
bool G4_Declare::isEvenAlign() const
{
    return regVar->isEvenAlign();
}
5682
// Query the underlying register variable's subregister alignment.
G4_SubReg_Align G4_Declare::getSubRegAlign() const
{
    return regVar->getSubRegAlignment();
}
5687
copyAlign(G4_Declare * dcl)5688 void G4_Declare::copyAlign(G4_Declare* dcl)
5689 {
5690 if (dcl->isEvenAlign())
5691 {
5692 setEvenAlign();
5693 }
5694 regVar->setSubRegAlignment(dcl->getSubRegAlign());
5695 }
5696
// Print this declare as a "//.declare" comment line: name, register file,
// size, type, alias, alignment, and (when known) the assigned physical
// register or spill location, plus liveness attributes.
void G4_Declare::emit(std::ostream &output) const
{

    output << "//.declare " << name;
    output << " rf=";
    // register-file letter: r = GRF, a = address, s = scalar, f = flag
    if (useGRF())
    {
        output << 'r';
    }
    else if (regFile == G4_ADDRESS)
    {
        output << 'a';
    }
    else if (regFile == G4_SCALAR)
    {
        output << 's';
    }
    else if (regFile == G4_FLAG)
    {
        output << 'f';
    }
    else
    {
        MUST_BE_TRUE(false, ERROR_UNKNOWN); //unhandled case
    }

    output << " size=" << getByteSize();
    if (Type_UNDEF != elemType)
    {
        output << " type=" << TypeSymbol(elemType);
    }
    if (AliasDCL)
    {
        output << " alias=" << AliasDCL->getName() << "+" << getAliasOffset();
    }
    output << " align=" << getSubRegAlign() << " words";
    if (regVar->isPhyRegAssigned())
    {
        // print the assigned physical register and subregister offset
        G4_VarBase* phyreg = regVar->getPhyReg();
        if (phyreg->isGreg())
        {
            output << " (r" << phyreg->asGreg()->getRegNum() << "." << regVar->getPhyRegOff() << ")";
        }
        else if (phyreg->isAddress())
        {
            output << " (a0." << regVar->getPhyRegOff() << ")";
        }
        else if (phyreg->isFlag())
        {
            bool valid = false;
            output << " (f" << phyreg->asAreg()->ExRegNum(valid) << "." << regVar->getPhyRegOff() << ")";
        }
    }
    else if (isSpilled())
    {
        if (spillDCL)
        {
            // flag/addr spill
            output << " (spilled -> " << spillDCL->getName() << ")";
        }
        else
        {
            // GRF spill
            auto GRFOffset = getRegVar()->getDisp() / getGRFSize();
            if (!AliasDCL)
            {
                output << " (spilled -> Scratch[" << GRFOffset << "x" << (int)getGRFSize() << "])";
            }
            else
            {
                output << " (spilled)";
            }
        }
    }

    // liveness attributes for kernel inputs/outputs
    if (liveIn && liveOut)
    {
        output << " Input_Output";
    }
    else if (liveIn)
    {
        output << " Input";
    }
    else if (liveOut)
    {
        output << " Output";
    }

    output << "\n";
}
5787
// Print the predicate wrapped in parentheses and followed by a space,
// e.g. "(!f0.0.any4h) ".
void G4_Predicate::emit(std::ostream& output, bool symbolreg)
{
    output << "(";
    emit_body(output, symbolreg);
    output << ") ";
}
5794
emit_body(std::ostream & output,bool symbolreg)5795 void G4_Predicate::emit_body(std::ostream& output, bool symbolreg)
5796 {
5797 static const char* align16ControlNames[] =
5798 {
5799 "",
5800 "xyzw",
5801 "x",
5802 "y",
5803 "z",
5804 "w"
5805 "any4h",
5806 "all4h"
5807 };
5808
5809 if (state == PredState_Minus)
5810 {
5811 output << '!';
5812 }
5813
5814 if (getBase()->asRegVar()->isPhyRegAssigned())
5815 {
5816 getBase()->asRegVar()->getPhyReg()->emit(output);
5817 output << "." << getBase()->asRegVar()->getPhyRegOff();
5818 }
5819 else
5820 {
5821 getBase()->emit(output);
5822 if (subRegOff != UNDEFINED_SHORT)
5823 {
5824 output << '.' << subRegOff;
5825 }
5826 }
5827
5828 if (align16Control != PRED_ALIGN16_DEFAULT)
5829 {
5830 output << "." << align16ControlNames[align16Control];
5831 }
5832 else
5833 {
5834 if (control != PRED_DEFAULT)
5835 {
5836 output << '.';
5837 switch (control)
5838 {
5839 case PRED_ANY2H:
5840 output << "any2h";
5841 break;
5842 case PRED_ANY4H:
5843 output << "any4h";
5844 break;
5845 case PRED_ANY8H:
5846 output << "any8h";
5847 break;
5848 case PRED_ANY16H:
5849 output << "any16h";
5850 break;
5851 case PRED_ANY32H:
5852 output << "any32h";
5853 break;
5854 case PRED_ALL2H:
5855 output << "all2h";
5856 break;
5857 case PRED_ALL4H:
5858 output << "all4h";
5859 break;
5860 case PRED_ALL8H:
5861 output << "all8h";
5862 break;
5863 case PRED_ALL16H:
5864 output << "all16h";
5865 break;
5866 case PRED_ALL32H:
5867 output << "all32h";
5868 break;
5869 case PRED_ANYV:
5870 output << "anyv";
5871 break;
5872 case PRED_ALLV:
5873 output << "allv";
5874 break;
5875 default:
5876 // do nothing
5877 break;
5878 }
5879 }
5880 }
5881 }
5882
// Copy constructor: field-by-field copy. The flag base is shared (passed to
// the G4_Operand constructor), not deep-copied.
G4_Predicate::G4_Predicate(G4_Predicate &prd)
    : G4_Operand(G4_Operand::predicate, prd.getBase())
{
    state = prd.state;
    subRegOff = prd.subRegOff;
    control = prd.control;
    align16Control = prd.align16Control;

    // copy the cached bound/footprint state as well
    top_dcl = prd.top_dcl;
    left_bound = prd.left_bound;
    right_bound = prd.right_bound;
    bitVec[0] = prd.bitVec[0];
    bitVec[1] = prd.bitVec[1];
    byteOffset = prd.byteOffset;
    rightBoundSet = prd.rightBoundSet;
    isPredicateSameAsNoMask = prd.isPredicateSameAsNoMask;
}
5900
// Compute the bit bounds of this predicate for the given execution size and
// cache them. Bounds are in units of flag BITS, offset by the instruction's
// mask offset (e.g. M16).
unsigned G4_Predicate::computeRightBound(uint8_t exec_size)
{
    rightBoundSet = true;
    bitVec[0] = 0;
    bitVec[1] = 0;

    // a predicate-control group covers at least group_size bits even when
    // the instruction's exec size is smaller
    uint16_t group_size = (uint16_t)getPredCtrlGroupSize();
    uint16_t totalBits = (exec_size > group_size) ? exec_size : group_size;

    if (inst)
        left_bound = inst->getMaskOffset();

    right_bound = left_bound + totalBits - 1;

    // NOTE(review): the footprint uses exec_size while the bound uses
    // totalBits -- presumably intentional (only exec_size channels are
    // actually read), but worth confirming.
    bitVec[0] = exec_size == 32 ? 0xFFFFFFFF : (1 << exec_size) - 1;

    return right_bound;
}
5919
compareBound(uint32_t myLB,uint32_t myRB,uint32_t otherLB,uint32_t otherRB)5920 static G4_CmpRelation compareBound(uint32_t myLB, uint32_t myRB, uint32_t otherLB, uint32_t otherRB)
5921 {
5922 if (myLB == otherLB && myRB == otherRB)
5923 {
5924 return Rel_eq;
5925 }
5926 else if (myRB < otherLB || otherRB < myLB)
5927 {
5928 return Rel_disjoint;
5929 }
5930 else if (myLB <= otherLB && myRB >= otherRB)
5931 {
5932 return Rel_gt;
5933 }
5934 else if (myLB >= otherLB && myRB <= otherRB)
5935 {
5936 return Rel_lt;
5937 }
5938 else
5939 {
5940 return Rel_interfere;
5941 }
5942 }
5943
5944 /// compare flag to opnd
5945 /// flag is either a G4_Predicate or G4_CondMod, opnd can be any G4_operand
5946 /// We put this in a separate function since G4_Predicate and G4_CondMod
5947 /// should have identical code for compareOperand
static G4_CmpRelation compareFlagToOperand(G4_Operand* flag, G4_Operand* opnd)
{
    assert((flag->isPredicate() || flag->isCondMod()) && "expect either predicate or conditional modifier");

    // only reg regions, predicates, and cond-mods can possibly overlap a flag
    bool legalOpnd = opnd->isSrcRegRegion() || opnd->isDstRegRegion() || opnd->isPredicate() || opnd->isCondMod();
    G4_VarBase* myBase = flag->getBase();
    G4_VarBase *opndBase = opnd->getBase();

    if (!legalOpnd || myBase == nullptr || opndBase == nullptr || !opndBase->isFlag())
    {
        return Rel_disjoint;
    }

    // flags with different base declare definitely do not interfere (we do not consider physical flags here)
    if (flag->getTopDcl() != opnd->getTopDcl())
    {
        return Rel_disjoint;
    }

    // Do we generate pseudo kill on flags?
    // Conservatively treat pseudo-kill / lifetime-end as interfering.
    G4_INST* opndInst = opnd->getInst();
    if (opndInst && (opndInst->isPseudoKill() || opndInst->isLifeTimeEnd()))
    {
        return Rel_interfere;
    }

    // otherwise compare the cached bit bounds
    return compareBound(flag->getLeftBound(), flag->getRightBound(), opnd->getLeftBound(), opnd->getRightBound());
}
5976
// Delegate to the shared flag-vs-operand comparison (identical for
// G4_Predicate and G4_CondMod).
G4_CmpRelation G4_Predicate::compareOperand(G4_Operand *opnd)
{
    return compareFlagToOperand(this, opnd);
}
5981
5982 // remove half of the bitvector and change right bound
void G4_Predicate::splitPred()
{
    // Shrink this predicate's bound/footprint to the lower portion after an
    // instruction split.
    // NOTE(review): shiftLen is range/4 (range >> 2) even though the comment
    // above says "half" -- confirm whether range >> 1 was intended; this
    // matches G4_CondMod::splitCondMod exactly.
    uint16_t range = getRightBound() - getLeftBound() + 1;
    uint16_t shiftLen = range >> 2;
    right_bound = getLeftBound() + shiftLen - 1;

    bitVec[0] = ((uint32_t)getBitVecL()) >> shiftLen;
}
5991
// Print the conditional modifier as "(cc)flag.subreg", e.g. "(gt)f0.1".
// A null base prints the default flag f0.0.
void G4_CondMod::emit(std::ostream& output, bool symbolreg)
{
    // indexed by G4_CondModifier (mod)
    static const char* const CondModStr[Mod_cond_undef] =
    {
        "ze", // zero
        "eq", // equal
        "nz", // not zero
        "ne", // not equal
        "gt", // greater
        "ge", // greater or equal
        "lt", // less
        "le", // less or equal
        "ov", // overflow
        "ri", // round increment
        "un", // unorder (NaN)
    };
    output << "(" << CondModStr[mod] << ")";
    if (getBase() == nullptr)
    {
        output << "f0.0";
    } else if (getBase()->asRegVar()->isPhyRegAssigned()) {
        // print the assigned physical flag register
        getBase()->asRegVar()->getPhyReg()->emit(output);
        output << "." << getBase()->asRegVar()->getPhyRegOff();
    } else {
        getBase()->emit(output);
        if (subRegOff != UNDEFINED_SHORT)
        {
            output << '.' << subRegOff;
        }
    }
}
// Copy constructor: field-by-field copy. The flag base is shared (passed to
// the G4_Operand constructor), not deep-copied.
G4_CondMod::G4_CondMod(G4_CondMod &cMod)
    : G4_Operand(G4_Operand::condMod, cMod.getBase())
{
    mod = cMod.mod;
    subRegOff = cMod.subRegOff;

    // copy the cached bound/footprint state as well
    top_dcl = cMod.top_dcl;
    left_bound = cMod.left_bound;
    right_bound = cMod.right_bound;
    bitVec[0] = cMod.bitVec[0];
    bitVec[1] = cMod.bitVec[1];
    byteOffset = cMod.byteOffset;
    rightBoundSet = cMod.rightBoundSet;
}
6037
// Compute and cache this cond-mod's bit bounds: one flag bit per channel,
// starting at the instruction's mask offset.
unsigned G4_CondMod::computeRightBound(uint8_t exec_size)
{
    bitVec[0] = 0;
    bitVec[1] = 0;
    rightBoundSet = true;

    if (inst)
        left_bound = inst->getMaskOffset();

    right_bound = left_bound + exec_size - 1;

    // exec_size == 32 is special-cased to avoid shifting by 32 (UB)
    bitVec[0] = exec_size == 32 ? 0xFFFFFFFF : (1 << exec_size) - 1;

    return right_bound;
}
6053
6054 /// same as G4_Predicate::compareOperand
// Delegate to the shared flag-vs-operand comparison (same as G4_Predicate).
G4_CmpRelation G4_CondMod::compareOperand(G4_Operand *opnd)
{
    return compareFlagToOperand(this, opnd);
}
6059
6060
6061 // remove half of the bitvector and change right bound
void G4_CondMod::splitCondMod()
{
    // Shrink this cond-mod's bound/footprint to the lower portion after an
    // instruction split.
    // NOTE(review): shiftLen is range/4 (range >> 2) even though the comment
    // above says "half" -- mirrors G4_Predicate::splitPred; confirm intent.
    uint16_t range = getRightBound() - getLeftBound() + 1;
    uint16_t shiftLen = range >> 2;
    right_bound = getLeftBound() + shiftLen - 1;

    bitVec[0] = ((uint32_t)getBitVecL()) >> shiftLen;
}
isEqualTo(G4_Imm & imm1) const6070 bool G4_Imm::isEqualTo(G4_Imm& imm1) const
6071 {
6072 return (imm1.getType() == type) && (imm1.getImm() == imm.num);
6073 }
6074
6075 // check if an immedate is in the range of type
isInTypeRange(int64_t imm,G4_Type ty)6076 bool G4_Imm::isInTypeRange(int64_t imm, G4_Type ty)
6077 {
6078 switch (ty)
6079 {
6080 case Type_D:
6081 return imm >= (int)MIN_DWORD_VALUE && imm <= (int)MAX_DWORD_VALUE;
6082 case Type_Q:
6083 return true;
6084 case Type_UQ:
6085 return imm >= 0;
6086 case Type_UD:
6087 return (imm >= (unsigned)MIN_UDWORD_VALUE && imm <= (unsigned)MAX_UDWORD_VALUE);
6088 case Type_W:
6089 return (imm >= (int)MIN_WORD_VALUE && imm <= (int)MAX_WORD_VALUE);
6090 case Type_UW:
6091 return (imm >= (int)MIN_UWORD_VALUE && imm <= (int)MAX_UWORD_VALUE);
6092 case Type_B:
6093 return (imm >= (int)MIN_CHAR_VALUE && imm <= (int)MAX_CHAR_VALUE);
6094 case Type_UB:
6095 return (imm >= (int)MIN_UCHAR_VALUE && imm <= (int)MAX_UCHAR_VALUE);
6096 default:
6097 break;
6098 }
6099
6100 return false;
6101 }
6102
isZero() const6103 bool G4_Imm::isZero() const
6104 {
6105 if (IS_TYPE_F32_F64(type))
6106 {
6107 if (type == Type_F)
6108 {
6109 return (imm.fp32 == 0.0f);
6110 }
6111 return (imm.fp == 0.0);
6112 }
6113 return (imm.num == 0);
6114 }
6115
// Return true only when the sign bit(s) of this immediate are provably zero;
// unhandled types conservatively return false.
bool G4_Imm::isSignBitZero() const
{
    G4_Type Ty = getType();
    int64_t val = getInt();
    switch (Ty) {
    case Type_B:
    case Type_W:
    case Type_D:
    case Type_Q:
        // NOTE(review): val > 0 excludes zero even though zero's sign bit is
        // also clear -- presumably a deliberate conservative choice; confirm
        // before relying on it for val == 0.
        return val > 0;
    case Type_V:
        // packed 8x4-bit vector: test the sign bit of every nibble
        return ((uint64_t)val & 0x88888888) == 0;
    default:
        break;
    }
    return false;
}
6133
compareOperand(G4_Operand * opnd)6134 G4_CmpRelation G4_Imm::compareOperand(G4_Operand *opnd)
6135 {
6136 G4_CmpRelation rel = Rel_disjoint;
6137 if (opnd->isImm() && isEqualTo(opnd->asImm()))
6138 {
6139 return Rel_eq;
6140 }
6141 return rel;
6142 }
6143
// Print this immediate in hex (raw bit pattern for float types), followed by
// ":<type>". Stream flags are saved and restored around the hex formatting.
void G4_Imm::emit(std::ostream& output, bool symbolreg)
{
    //
    // we only emit hex in this function
    //
    std::ios::fmtflags outFlags(output.flags());
    output.flags(std::ios_base::hex | std::ios_base::showbase);

    short word;
    if (type == Type_DF)
    {
        // raw 64-bit pattern of the double
        output << (uint64_t)imm.num;
    }
    else if (type == Type_F)
    {
        // raw 32-bit pattern of the float
        output << imm.num32;
    }
    else if (type == Type_W || type == Type_UW || type == Type_B || type == Type_UB)
    {
        word = (short)imm.num;
        output << word;
    }
    else if (type == Type_D || type == Type_UD)
    {
        // 32-bit int
        output << (int)imm.num;
    }
    else
    {
        // 64-bit int
        output << imm.num;
    }

    output.flags(outFlags);

    if (Type_UNDEF != type)
    {
        output << ':' << TypeSymbol(type);
    }
}
6184
6185 // emit number, automatically select the format according to its original format
// emit number, automatically selecting the format from the type: floats as
// decimal floating point, signed ints as signed, everything else unsigned;
// always followed by ":<type>".
void G4_Imm::emitAutoFmt(std::ostream& output)
{
    if (Type_F == type)
    {
        output << imm.fp32;
    }
    else if (Type_DF == type)
    {
        output << imm.fp;
    }
    else if (Type_W == type || Type_B == type)
    {
        output << (short)imm.num;
    }
    else if (Type_D == type)
    {
        output << imm.num;
    }
    else //unsigned value
    {
        output << (unsigned)imm.num;
    }

    if (Type_UNDEF != type)
    {
        output << ':' << TypeSymbol(type);
    }
}
6214
typecastVals(int64_t value,G4_Type type)6215 int64_t G4_Imm::typecastVals(int64_t value, G4_Type type)
6216 {
6217 int64_t retVal = 0;
6218 switch (type)
6219 {
6220 case Type_UD:
6221 case Type_UV:
6222 case Type_VF:
6223 {
6224 retVal = (int64_t)((unsigned)value);
6225 break;
6226 }
6227 case Type_D:
6228 case Type_V:
6229 {
6230 retVal = (int64_t)((int)value);
6231 break;
6232 }
6233 case Type_UW:
6234 {
6235 retVal = (int64_t)((uint16_t)value);
6236 break;
6237 }
6238 case Type_W:
6239 {
6240 retVal = (int64_t)((int16_t)value);
6241 break;
6242 }
6243 case Type_UB:
6244 {
6245 retVal = (int64_t)((uint8_t)value);
6246 break;
6247 }
6248 case Type_B:
6249 {
6250 retVal = (int64_t)((int8_t)value);
6251 break;
6252 }
6253 default:
6254 {
6255 // Dont do float conversions
6256 retVal = value;
6257 }
6258 }
6259 return retVal;
6260 }
6261
6262 G4_RegVar *
getNonTransientBaseRegVar()6263 G4_RegVarTransient::getNonTransientBaseRegVar ()
6264 {
6265 G4_RegVar * base;
6266 for (base = getBaseRegVar (); base->isRegVarTransient (); base = base->getBaseRegVar ());
6267 return base;
6268 }
6269
6270 G4_RegVar *
getAbsBaseRegVar()6271 G4_RegVarTransient::getAbsBaseRegVar ()
6272 {
6273 G4_RegVar * base;
6274 for (base = getBaseRegVar (); base->getBaseRegVar () != base; base = base->getBaseRegVar ());
6275 return base;
6276 }
6277
6278 G4_RegVar *
getAbsBaseRegVar()6279 G4_RegVarTmp::getAbsBaseRegVar ()
6280 {
6281 G4_RegVar * base;
6282 for (base = getBaseRegVar (); base->getBaseRegVar () != base; base = base->getBaseRegVar ());
6283 return base;
6284 }
6285
// Print the declare's name; once a physical register is assigned, also print
// "(phyreg.subreg:type)".
void
G4_RegVar::emit(std::ostream& output, bool symbolreg)
{
    output << decl->getName();
    if (reg.phyReg != NULL)
    {
        output << "(";
        reg.phyReg->emit(output);
        output << '.' << reg.subRegOff << ':' <<
            TypeSymbol(getDeclare()->getElemType()) << ")";
    }
}
// Evaluate this address expression to a byte address: the addressed
// register's location (or its spill-fill replacement's GRF base offset when
// the addressed range was spilled) plus the expression's byte offset.
int G4_AddrExp::eval()
{
    int byteAddr = 0;

    if (m_addressedReg->getPhyReg() == NULL)
    {
        // address taken range is spilled
        G4_Declare* addrTakenSpillFillDcl = m_addressedReg->getDeclare()->getAddrTakenSpillFill();
        MUST_BE_TRUE(addrTakenSpillFillDcl != NULL, "No addr taken spill fill register found!");
        byteAddr = addrTakenSpillFillDcl->getGRFBaseOffset();
    }
    else
    {
        byteAddr = m_addressedReg->getByteAddr(); //let's assume the unsigned=>int won't overflow for now.
    }

    // byteAddr += offsetInEle * addressedReg->getDeclare()->getElemSize();
    byteAddr += m_offset;

    return byteAddr;
}
// Print this address expression as "&reg+offset".
void G4_AddrExp::emit(std::ostream& output, bool symbolreg)
{
    output << '&';
    m_addressedReg->emit(output);
    output << '+' << m_offset;
}
6326
// Compute the left bound (and, where possible, the byte offset / right
// bound) of this source region. Bounds are bytes from the start of the root
// declare, except flag operands, whose bounds are in BITS.
void G4_SrcRegRegion::computeLeftBound()
{
    top_dcl = NULL;
    unsigned newregoff = regOff, offset = 0;

    if (base)
    {
        if (base->isRegVar())
        {
            top_dcl = base->asRegVar()->getDeclare();
            if (!top_dcl && base->asRegVar()->isGreg())
            {
                // pre-assigned physical GRF with no declare: use its number
                newregoff = base->asRegVar()->asGreg()->getRegNum();
            }
        }
    }

    if (top_dcl)
    {
        // walk to the root declare, accumulating alias offsets
        while (top_dcl->getAliasDeclare())
        {
            offset += top_dcl->getAliasOffset();
            top_dcl = top_dcl->getAliasDeclare();
        }
    }

    if (base != NULL && base->isFlag())
    {
        if (base->isRegVar())
        {
            if (base->asRegVar()->getPhyReg())
            {
                left_bound = base->asRegVar()->getPhyRegOff() * 16; // the bound of flag register is in unit of BIT
                left_bound += subRegOff * 16;
                left_bound += base->asRegVar()->getPhyReg()->asAreg()->getFlagNum() * 32;
            }
            else
            {
                left_bound = subRegOff * 16;
            }
        }
        else
        {
            left_bound = subRegOff * 16;
            left_bound += base->asAreg()->getFlagNum() * 32;
        }

        right_bound = 0;
    }
    else if (base != NULL && base->isAccReg())
    {
        left_bound = subRegOff * TypeSize(type);
        if (base->asAreg()->getArchRegType() == AREG_ACC1)
        {
            left_bound += 32; // TODO: size of ACC is assumed to be 32 BYTEs.
        }
        byteOffset = left_bound;
    }
    else if (top_dcl)
    {
        if (acc == Direct)
        {
            left_bound = offset + newregoff * numEltPerGRF<Type_UB>() + subRegOff * TypeSize(type);
            if (top_dcl->getTotalElems() * top_dcl->getElemSize() >= (int)numEltPerGRF<Type_UB>())
            {
                byteOffset = left_bound;
            }
            else
            {
                // small declares: account for subregister alignment padding
                unsigned alignOff = TypeSize(type) > TypeSize(Type_W) ?
                    TypeSize(type) : TypeSize(Type_W);
                if (top_dcl->getSubRegAlign() == Even_Word || top_dcl->getSubRegAlign() >= Four_Word)
                {
                    alignOff = top_dcl->getSubRegAlign() * 2;
                }
                byteOffset = left_bound + alignOff;
            }
        }
        else
        {
            // indirect: the bound is within the address register
            left_bound = subRegOff * TypeSize(ADDR_REG_TYPE);
            byteOffset = TypeSize(type);
        }

        if (desc && desc->isScalar())
        {
            right_bound = left_bound + TypeSize(type) - 1;
        }
        else
        {
            right_bound = 0;
            // for other cases, we need execution size and instruction compression attr, so we just set
            // partial value here, which will be patched later
            // right_bound = desc->horzStride * TypeSize(type);
            // patch it with *exec_size + left_bound
            // if vertical stride == 0 and width < exec_size, divide it by 2
        }
    }
    else
    { //arch reg
        left_bound = 0;
        byteOffset = left_bound;
    }
}
6431
// Build the byte-footprint bit vector for this source region: one bit per
// byte touched, relative to the left bound. bitVec[0] covers the first 64
// bytes, bitVec[1] the next 64 (a region spans at most 2 GRFs).
void G4_SrcRegRegion::setSrcBitVec(uint8_t exec_size)
{
    // per-element footprint for this type (e.g. 0xF for a dword)
    uint64_t bit_seq = TypeFootprint(type);
    unsigned short typeSize = TypeSize(type);

    uint64_t footPrint0 = 0;
    uint64_t footPrint1 = 0;

    MUST_BE_TRUE(exec_size >= desc->width, "exec size must be >= width");
    if (desc->isScalar())
    {
        footPrint0 = bit_seq;
    }
    else if (desc->isContiguous(exec_size))
    {
        // fast path
        int totalBytes = exec_size * typeSize;
        MUST_BE_TRUE(totalBytes <= 2 * getGRFSize(), "total bytes exceed 2 GRFs");

        footPrint0 = totalBytes < 64 ? (1ULL << totalBytes) - 1 : ULLONG_MAX;
        if (totalBytes > 64)
        {
            footPrint1 = totalBytes == 128 ? ULLONG_MAX : (1ULL << (totalBytes - 64)) - 1;
        }
    }
    else
    {
        // general strided region: mark each element's bytes individually
        for (int i = 0, numRows = exec_size / desc->width; i < numRows; ++i)
        {
            for (int j = 0; j < desc->width; ++j)
            {
                int eltOffset = i * desc->vertStride * typeSize + j * desc->horzStride * typeSize;
                // no element can cross 64-byte boundary
                if (eltOffset >= 64)
                {
                    footPrint1 |= bit_seq << (eltOffset - 64);
                }
                else
                {
                    footPrint0 |= bit_seq << eltOffset;
                }
            }
        }
    }

    bitVec[0] = footPrint0;
    bitVec[1] = footPrint1;
}
6480
// Compute and cache this source region's right bound and byte/bit footprint
// for the given execution size. Flag operands are handled in BITS; direct
// regions via setSrcBitVec; indirect regions via the address subregisters.
unsigned G4_SrcRegRegion::computeRightBound(uint8_t exec_size)
{
    // scalar regions behave as <0;1,0>
    unsigned short hs = desc->isScalar() ? 1 : desc->horzStride;
    unsigned short vs = desc->isScalar() ? 0 : desc->vertStride;
    rightBoundSet = true;
    unsigned short typeSize = TypeSize(type);

    bitVec[0] = 0;
    bitVec[1] = 0;
    if (base->isFlag())
    {
        unsigned int totalBits = 0;
        if (G4_Inst_Table[inst->opcode()].instType != InstTypePseudoLogic)
        {
            // mov (1) ... fx.1<0;1,0>:uw
            left_bound = subRegOff * 16;
            totalBits = base->asRegVar()->getDeclare()->getNumberFlagElements() < TypeBitSize(type) ?
                base->asRegVar()->getDeclare()->getNumberFlagElements() : TypeBitSize(type);
        }
        else
        {
            /*
                we need to set leftBound for pseudo instruction
                so that it creates use/def links correctly in the control flow graph between
                cmp instruction and pseudo instruction.
                This matters when we break up SIMD32 instruction in to two SIMD16 with H1/H2 masks.
                The bound for compare for H2 will be [15,31], and this has to match.
                Without this no use/def link was created which caused issues in logic optimization.
                Also it produce incorrect behavior in any operation that relies on compareOperand.
            */
            left_bound = inst->getMaskOffset();
            totalBits = exec_size;
        }

        right_bound = left_bound + totalBits - 1;

        bitVec[0] = totalBits == 32 ? 0xFFFFFFFF : (1 << totalBits) - 1;
    }
    else
    {
        if (acc == Direct)
        {
            if (inst->isReturn() || inst->isFReturn())
            {
                // return reads the 2-dword return address
                exec_size = 2;
            }

            setSrcBitVec(exec_size);

            if (desc->isScalar())
            {
                right_bound = left_bound + typeSize - 1;
            }
            else
            {
                int num_rows = exec_size / desc->width;
                if (num_rows > 0)
                {
                    right_bound =
                        left_bound +
                        (num_rows - 1) * vs * typeSize +
                        hs * (desc->width - 1) * typeSize +
                        typeSize - 1;
                }
                else
                {
                    // this fix applies to implicit acc src
                    // usually when we compute new rb after inst splitting,
                    // the region is still the old one.
                    // exec_size may be smaller than width
                    right_bound =
                        left_bound +
                        hs * (exec_size - 1) * typeSize +
                        typeSize - 1;
                }
            }
        }
        else
        {
            // indirect: footprint covers the address subregisters read
            unsigned short numAddrSubReg = 1;
            if (desc->isRegionWH())
            {
                numAddrSubReg = exec_size/desc->width;
            }
            for (uint16_t i = 0; i < numAddrSubReg; i++)
            {
                bitVec[0] |= ((uint64_t) 0x3) << (i * 2);
            }
            right_bound = left_bound + TypeSize(ADDR_REG_TYPE) * numAddrSubReg - 1;
        }
    }
    return right_bound;
}
6574
// Delegate to the shared reg-region comparison, which classifies overlap
// between this source region and any other operand.
G4_CmpRelation G4_SrcRegRegion::compareOperand(G4_Operand *opnd)
{
    return compareRegRegionToOperand(this, opnd);
}
6579
isNativeType() const6580 bool G4_SrcRegRegion::isNativeType() const
6581 {
6582 G4_Type type = getType();
6583
6584 if (IS_WTYPE(type) || IS_DTYPE(type) || IS_FTYPE(type) || type == Type_DF) {
6585 return true;
6586 }
6587 else {
6588 return false;
6589 }
6590 }
6591
isNativePackedRowRegion() const6592 bool G4_SrcRegRegion::isNativePackedRowRegion() const
6593 {
6594 if (isNativeType())
6595 {
6596 // A single element row is always packed.
6597 return (desc->horzStride == 1) ||
6598 (desc->width == 1 && desc->horzStride == 0);
6599 }
6600
6601 return false;
6602 }
6603
// Fully packed region: each row is packed AND rows are adjacent
// (vertStride == width), i.e. all elements are contiguous.
bool G4_SrcRegRegion::isNativePackedRegion() const
{
    return isNativePackedRowRegion() && desc->vertStride == desc->width;
}
6608
// Return true if this region spans exactly two full GRFs with elements
// distributed uniformly (row-contiguous strided layout or fully contiguous).
bool G4_SrcRegRegion::coverTwoGRF()
{
    uint16_t range = getRightBound() - getLeftBound() + 1;
    if (range < numEltPerGRF<Type_UB>())
        return false;
    if (desc->horzStride > 1)
    {
        // pad the range with the trailing stride gap after the last element
        range += (desc->horzStride - 1) * TypeSize(type);
    }
    if (range == numEltPerGRF<Type_UB>() * 2 &&
        (desc->vertStride == desc->horzStride * desc->width ||
         desc->isContiguous(getInst()->getExecSize())))
    {
        return true;
    }
    return false;
}
// Assumption: the operand crosses a GRF boundary (caller guarantees this).
// Determine whether the region's elements split evenly between the two GRFs.
// Outputs (by reference):
//   sameSubRegOff  - second GRF's first element has the same sub-reg offset
//                    as the first GRF's first element
//   vertCrossGRF   - every row fits entirely within a single GRF
//   contRegion     - region has a single uniform stride for this exec size
//   eleInFirstGRF  - number of elements residing in the first GRF
// Returns true when exactly half of execSize elements land in the first GRF.
bool G4_SrcRegRegion::evenlySplitCrossGRF(uint8_t execSize, bool &sameSubRegOff,
    bool &vertCrossGRF, bool &contRegion, uint8_t &eleInFirstGRF)
{
    // always return true since all align16 instructions are generated by JIT
    // later on when we have other execution types for align16 instructions,
    // fix the following if to check src element distribution.
    // FIXME: do we need to check HS here?
    if (desc->isRegionV())
    {
        sameSubRegOff = true;
        vertCrossGRF = true;
        contRegion = true;
        return true;
    }
    vertCrossGRF = true;
    contRegion = desc->isSingleStride(getInst()->getExecSize());
    MUST_BE_TRUE(acc == Direct, "Indirect operand can not cross GRF boundary.");
    uint8_t firstSubRegOff = getLeftBound() % numEltPerGRF<Type_UB>();
    uint8_t left = firstSubRegOff;
    uint8_t typeSize = (uint8_t)TypeSize(type);
    // byte distance between two consecutive elements within a row
    uint8_t execTySize = (desc->horzStride == 0 ? 1 : desc->horzStride) * typeSize;
    // byte span of one full row: from the start of its first element through
    // the end of its last element
    uint8_t lastEltEndByte = desc->horzStride * (desc->width - 1) * typeSize + typeSize;
    uint8_t realRowSize = lastEltEndByte;
    // check number of elements in first GRF.
    eleInFirstGRF = 0;
    while (left < numEltPerGRF<Type_UB>())
    {
        if (left + realRowSize <= (int)numEltPerGRF<Type_UB>())
        {
            // whole row fits in the first GRF
            // realRowSize is used to handle V12(0,17)<32;8,2>:b
            eleInFirstGRF += desc->width;
            left += desc->vertStride * TypeSize(type);
        }
        else
        {
            // row straddles the GRF boundary: count its elements one by one
            vertCrossGRF = false;
            // V12(0,17)<32;8,2>:b is a good two GRF source
            eleInFirstGRF++;
            uint8_t newLeft = left + typeSize;
            newLeft += execTySize;
            while (newLeft < numEltPerGRF<Type_UB>())
            {
                eleInFirstGRF++;
                newLeft += execTySize;
            }
            if (newLeft == numEltPerGRF<Type_UB>())
            {
                // last element of this row ends exactly at the GRF boundary
                eleInFirstGRF++;
                if (eleInFirstGRF % desc->width == 0)
                {
                    // finished a whole row; step to the next one by vstride
                    left += desc->vertStride * TypeSize(type);
                }
                else
                {
                    left = newLeft + (execTySize - typeSize);
                }
            }
            else if (eleInFirstGRF % desc->width == 0)
            {
                left += desc->vertStride * TypeSize(type);
            }
            else if (typeSize == execTySize)
            {
                left = newLeft;
            }
            else
            {
                left = newLeft - typeSize;
            }
        }
    }
    uint8_t secondSubRegOff = left % numEltPerGRF<Type_UB>();

    sameSubRegOff = (firstSubRegOff == secondSubRegOff);
    // TODO: this guarantees that there are an equal number of elements in each
    // GRF, but not the distribution of elements within each of them.
    if (eleInFirstGRF * 2 == execSize)
    {
        return true;
    }
    return false;
}
6709
// Assumption: the operand crosses a GRF boundary.
// Returns true when the region's elements split evenly between the two GRFs
// (equal element counts; distribution within each GRF is not checked).
bool G4_SrcRegRegion::evenlySplitCrossGRF(uint8_t execSize)
{
    // check number of elements in first GRF.
    MUST_BE_TRUE(acc == Direct, "Indirect operand can not cross GRF boundary.");
    // bytes remaining in the first GRF starting at this operand's left bound
    uint16_t sizeInFirstGRF = numEltPerGRF<Type_UB>() - getLeftBound() % numEltPerGRF<Type_UB>();
    // byte span of one vertical stride step
    uint16_t vertSize = desc->vertStride * getElemSize();
    // byte distance between consecutive elements within a row
    uint16_t execTypeSize = desc->horzStride == 0 ? getElemSize() : desc->horzStride * getElemSize();
    // upper estimate of elements fitting in the first GRF
    uint16_t numEle = (sizeInFirstGRF + execTypeSize - 1)/ execTypeSize;
    uint16_t rowSize = desc->horzStride == 0 ? execTypeSize : desc->width * execTypeSize,
        numRows = desc->vertStride == 0 ? 1 : execSize/desc->width,
        numElePerRow = rowSize / execTypeSize,
        numExecEmePerRow = desc->horzStride == 0 ? 1 : desc->width;

    if (sizeInFirstGRF <= vertSize)
    {
        // at most one row starts in the first GRF: cap at the row width
        if (numEle >= desc->width)
        {
            numEle = desc->width;
        }
    }
    else if (desc->vertStride > desc->width)
    {
        // strided rows: whole rows plus the elements of a partial last row
        numEle = sizeInFirstGRF/vertSize * numExecEmePerRow +
            ((sizeInFirstGRF%vertSize > rowSize) ? numExecEmePerRow : (sizeInFirstGRF%vertSize + execTypeSize - 1) / execTypeSize);
    }

    // total element count: derived from rows when rows are separated by the
    // vertical stride, otherwise from the overall byte footprint
    uint16_t totalNumEle = (desc->vertStride >= numElePerRow) ? (numRows * numExecEmePerRow) :
        (getRightBound() - getLeftBound() + 1) / execTypeSize;

    // TODO: this guarantees that there are equal number of elements in each GRF, but not the distribution of elements in each of them.
    if (numEle * 2 == totalNumEle)
    {
        return true;
    }
    return false;
}
6746
6747 /*
6748 * check if the input opnd is align to GRF
6749 * if the first level dcl is not aligned to GRF or sub register offset of this opnd is not multiple GRFs, including 0,
6750 * return true.
6751 */
checkGRFAlign()6752 bool G4_SrcRegRegion::checkGRFAlign() {
6753
6754 bool GRF_aligned = false;
6755 uint32_t byte_subregoff = subRegOff * getTypeSize();
6756
6757 if (byte_subregoff % numEltPerGRF<Type_UB>() != 0) {
6758 return false;
6759 }
6760
6761 if (base) {
6762 if (base->isRegVar()) {
6763 G4_Declare *dcl = base->asRegVar()->getDeclare();
6764
6765 if (dcl) {
6766 G4_Declare *aliasdcl = dcl;
6767
6768 unsigned aliasOffset = 0;
6769 while (aliasdcl->getAliasDeclare())
6770 {
6771 aliasOffset += aliasdcl->getAliasOffset();
6772 aliasdcl = aliasdcl->getAliasDeclare();
6773 }
6774 if (aliasOffset % numEltPerGRF<Type_UB>() != 0)
6775 {
6776 return false;
6777 }
6778
6779 if (aliasdcl->getSubRegAlign() >= GRFALIGN ||
6780 aliasdcl->getNumRows() * aliasdcl->getElemSize() * aliasdcl->getElemSize() >= (int)numEltPerGRF<Type_UB>()) {
6781 return true;
6782 }
6783 }else if (base->asRegVar()->isPhyRegAssigned() &&
6784 base->asRegVar()->getByteAddr() % numEltPerGRF<Type_UB>() == 0) {
6785 return true;
6786 }
6787 }
6788 }
6789
6790 return GRF_aligned;
6791 }
6792
//
// Returns true if this SrcRegRegion has a fixed subreg offset (in bytes).
// This is true only if
// -- src is direct
// -- base declare is a GRF variable that is GRF-aligned
// If true, the subreg offset is also returned via 'offset'.
// Note this always returns false for ARFs (flag, addr, etc.)
//
bool G4_SrcRegRegion::hasFixedSubregOffset(uint32_t& offset)
{
    return regionHasFixedSubreg(this, offset);
}
6805
6806 /*
6807 * Return true if the src operand has a native type and has a packed (stride
6808 * of 1) region.
6809 */
isNativePackedSrcRegion()6810 bool G4_SrcRegRegion::isNativePackedSrcRegion()
6811 {
6812 return isNativePackedRowRegion() &&
6813 (desc->vertStride == desc->width);
6814 }
6815
emit(std::ostream & output) const6816 void RegionDesc::emit(std::ostream& output) const
6817 {
6818 if (isRegionV())
6819 {
6820 output << '<' << horzStride << '>';
6821 }
6822 else if (isRegionWH())
6823 {
6824 output << '<' << width << ',' << horzStride << '>';
6825 }
6826 else
6827 {
6828 output << '<' << vertStride << ';' << width << ',' << horzStride << '>';
6829 }
6830 }
6831
// Emit the label's name. The 'symbolreg' flag is unused for labels.
void G4_Label::emit(std::ostream& output, bool symbolreg)
{
    output << label;
}
6836
getByteAddr() const6837 unsigned G4_RegVar::getByteAddr() const
6838 {
6839 MUST_BE_TRUE(reg.phyReg != NULL, ERROR_UNKNOWN);
6840 if (reg.phyReg->isGreg())
6841 {
6842 return reg.phyReg->asGreg()->getRegNum() * numEltPerGRF<Type_UB>() +
6843 reg.subRegOff * decl->getElemSize();
6844 }
6845 if (reg.phyReg->isA0())
6846 {
6847 return reg.subRegOff * TypeSize(Type_UW);
6848 }
6849
6850 MUST_BE_TRUE(false, ERROR_UNKNOWN);
6851 return 0;
6852 }
6853
// Tighten this variable's sub-register alignment requirement.
// Alignment may only become more restrictive than a prior setting; when the
// current alignment is already stricter, it is kept unchanged.
void G4_RegVar::setSubRegAlignment(G4_SubReg_Align subAlg)
{
    // sub reg alignment can only be more restricted than prior setting
    MUST_BE_TRUE(subAlign == Any || subAlign == subAlg || subAlign % 2 == 0,
        ERROR_UNKNOWN);
    if (subAlign > subAlg)
    {
        // the existing (stricter) alignment must subsume the requested one
        MUST_BE_TRUE(subAlign % subAlg == 0, "Sub reg alignment conflict");
        // do nothing; keep the original alignment (more restricted)
    }
    else
    {
        // the requested alignment must subsume the existing one
        MUST_BE_TRUE(subAlg % subAlign == 0, "Sub reg alignment conflict");
        subAlign = subAlg;
    }
}
6870
6871 // For implicit Acc operands, left bound depends on
6872 // a) Inst execution type
6873 // b) Qtr control
6874 //
6875 // This function handles relevant cases, including hw intricacies
6876 // and updates left bound only.
6877 //
computeLeftBoundForImplAcc(G4_Operand * opnd)6878 void G4_INST::computeLeftBoundForImplAcc(G4_Operand* opnd)
6879 {
6880 if (opnd != NULL)
6881 {
6882 G4_Type extype;
6883 int extypesize;
6884 extype = getOpExecType(extypesize);
6885
6886 if ((IS_WTYPE(extype) || IS_DTYPE(extype)))
6887 {
6888 // This condition is a result of HW Conformity requirement
6889 // that for exec type = D/DW, only acc0 is used even when
6890 // qtr control is set to Q2/H2
6891 opnd->setLeftBound(0);
6892 }
6893 else
6894 {
6895 if (opnd->isSrcRegRegion())
6896 {
6897 opnd->asSrcRegRegion()->computeLeftBound();
6898 }
6899 else if (opnd->isDstRegRegion())
6900 {
6901 opnd->asDstRegRegion()->computeLeftBound();
6902 }
6903 }
6904 }
6905 }
6906
//
// Normalize an operand's bitvec footprint based on its left bound
// and update the given bitset.
// If isSet is true, we set all bits that are covered by this operand.
// If isSet is false, we clear all bits that are covered by this operand.
//
void G4_Operand::updateFootPrint(BitSet& footprint, bool isSet)
{
    unsigned N = NUM_BITS_PER_ELT;
    unsigned lb = getLeftBound();
    unsigned rb = getRightBound();
    const bool doFastPath = true; // for debugging

    if (doFastPath && lb % N == 0 && (rb + 1) % N == 0)
    {
        // lb is 32-byte aligned, set one dword at a time
        unsigned idx = lb / N;
        unsigned endIdx = rb / N;
        // get the precise footprint for the first two GRF
        for (int i = 0; i < 2 && idx <= endIdx; ++i, ++idx)
        {
            uint64_t bits = getBitVecL();
            // the 64-bit vector maps onto two dword elements; pick the
            // low or high half depending on which element we are filling
            uint32_t bitVal = (uint32_t)(i % 2 ? bits >> N : bits);
            if (isSet)
            {
                footprint.setElt(idx, bitVal);
            }
            else
            {
                footprint.resetElt(idx, bitVal);
            }
        }
        if (getGRFSize() > 32)
        {
            // larger GRFs keep the second GRF's precise footprint in the
            // high bit vector
            for (int i = 0; i < 2 && idx <= endIdx; ++i, ++idx)
            {
                uint64_t bits = getBitVecH();
                uint32_t bitVal = (uint32_t)(i % 2 ? bits >> N : bits);
                if (isSet)
                {
                    footprint.setElt(idx, bitVal);
                }
                else
                {
                    footprint.resetElt(idx, bitVal);
                }
            }
        }

        // beyond the first two GRF we assume every byte is touched
        while (idx <= endIdx)
        {
            if (isSet)
            {
                footprint.setElt(idx, 0xFFFFFFFF);
            }
            else
            {
                footprint.resetElt(idx, 0xFFFFFFFF);
            }
            idx++;
        }
    }
    else
    {
        // handle unaligned case: walk the bit vectors bit by bit
        uint64_t mask0 = getBitVecL();
        unsigned j = lb;
        for (unsigned i = 0; i < 64 && j <= rb; ++i, ++j)
        {
            if (mask0 & (1ULL << i))
                footprint.set(j, isSet);
        }
        if (getGRFSize() > 32)
        {
            uint64_t mask1 = getBitVecH();
            for (unsigned i = 0; i < 64 && j <= rb; ++i, ++j)
            {
                if (mask1 & (1ULL << i))
                    footprint.set(j, isSet);
            }
        }
        // beyond the tracked vectors, assume every byte is touched
        while (j++ <= rb)
            footprint.set(j, isSet);
    }
}
6993
6994 // update bit vector for this operand based on it size
6995 // We assume all bytes are touched
setBitVecFromSize(uint32_t NBytes)6996 void G4_Operand::setBitVecFromSize(uint32_t NBytes)
6997 {
6998 bitVec[0] = NBytes < 64 ? (1ULL << NBytes) - 1 : ULLONG_MAX;
6999 bitVec[1] = 0;
7000 if (getGRFSize() > 32 && NBytes >= 64)
7001 {
7002 bitVec[1] = (NBytes < 64 * 2) ? (1ULL << (NBytes - 64)) - 1 : ULLONG_MAX;
7003 }
7004 }
7005
// Left and right bound for every operand is based off
// the top most dcl.
// For flag register as dst/src/pred/cond mod, each bit of
// the bitset represents the corresponding bit of the flag.
// For indirect access, right bound is set to the sum of the
// left bound and 15. The constant 15 is derived from the
// fact that the address register is accessed as Type_UW which
// means 16 bits. Right bound represents a closed interval
// so 1 is subtracted.
// For direct access of GRF, each bit of the bitset represents
// the corresponding byte of the operand.
void G4_INST::computeRightBound(G4_Operand* opnd)
{
    associateOpndWithInst(opnd, this);

    if (opnd &&
        opnd->isImm() == false &&
        opnd->isNullReg() == false)
    {
        bool done = false;

        if (done == false && op == G4_pln && opnd == srcs[1])
        {
            // pln's src1 spans a wider region; for exec sizes above SIMD8 the
            // right bound is doubled after the normal computation
            opnd->computeRightBound(execSize > g4::SIMD8 ? execSize : execSize * 2);
            if (execSize > g4::SIMD8)
            {
                opnd->setRightBound(opnd->right_bound * 2 - opnd->getLeftBound() + 1);
            }

            done = true;
        }
        else if (done == false && (isPseudoKill() || isPseudoUse()))
        {
            // pseudo kills/use write/read entire variable
            G4_Declare* topdcl = opnd->getBase()->asRegVar()->getDeclare()->getRootDeclare();
            opnd->setRightBound(topdcl->getByteSize() - 1);

            done = true;
        }
        else if (done == false && isFillIntrinsic())
        {
            // fill intrinsics have their own bound rules
            asFillIntrinsic()->computeRightBound(opnd);
            done = true;
        }
        else if (done == false && isSpillIntrinsic())
        {
            // spill intrinsics have their own bound rules
            asSpillIntrinsic()->computeRightBound(opnd);
            done = true;
        }

        if (done == false)
        {
            opnd->computeRightBound(execSize);

            if (getMaskOffset() > 0 &&
                ((opnd == getImplAccSrc()) ||
                (opnd == getImplAccDst())))
            {
                // for ARF (flag, acc) we have to adjust its bound based on the emask
                // We have to reset LB since the original instruction may have a non default emask
                opnd->setLeftBound(0);
                opnd->computeRightBound(execSize);
                unsigned int multiplicationFactor = 1;
                bool exceptionBoundsComputation = false;
                if (opnd->isAccReg())
                {
                    // Right bound granularity is in terms of
                    // bytes for Acc registers
                    multiplicationFactor = 4;
                }

                if (opnd == getImplAccDst() || opnd == getImplAccSrc())
                {
                    G4_Type extype;
                    int extypesize;
                    extype = getOpExecType(extypesize);

                    if ((IS_WTYPE(extype) || IS_DTYPE(extype)))
                    {
                        // This condition is a result of HW Conformity requirement
                        // that for exec type = D/DW, only acc0 is used even when
                        // qtr control is set to Q2/H2
                        opnd->setLeftBound(0);
                        opnd->setRightBound(31);
                        exceptionBoundsComputation = true;
                    }
                }

                if (exceptionBoundsComputation == false)
                {
                    // Update left/right bound as per inst mask offset
                    opnd->setLeftBound(opnd->left_bound + (getMaskOffset() * multiplicationFactor));
                    opnd->setRightBound(opnd->right_bound + (getMaskOffset () * multiplicationFactor));
                }
            }

            done = true;
        }
    }
}
7106
// Right-bound computation for send instructions: message payload and
// response sizes come from the message descriptor (in GRF units) rather
// than from the execution size.
void G4_InstSend::computeRightBound(G4_Operand* opnd)
{
    associateOpndWithInst(opnd, this);

    if (opnd && !opnd->isImm() && !opnd->isNullReg())
    {
        // bound an operand to numReg GRFs, clamped to the variable's size
        auto computeSendOperandBound = [](G4_Operand* opnd, int numReg)
        {
            if (numReg == 0)
            {
                return;
            }

            // Sends read/write in units of GRF. With a narrower simd width,
            // the variable may have size smaller than one GRF, or smaller
            // than the response or message length. In this case, limit the
            // right bound up to the variable size.
            unsigned LB = opnd->left_bound;
            unsigned RB = std::min(opnd->getTopDcl()->getByteSize(),
                LB + numReg * numEltPerGRF<Type_UB>()) - 1;

            unsigned NBytes = RB - LB + 1;
            opnd->setBitVecFromSize(NBytes);
            opnd->setRightBound(RB);
        };

        if (srcs[0] == opnd || (isSplitSend() && srcs[1] == opnd))
        {
            // For send instruction's msg operand rightbound depends
            // on msg descriptor
            uint16_t numReg = (srcs[0] == opnd) ?
                getMsgDesc()->getSrc0LenRegs() : getMsgDesc()->getSrc1LenRegs();
            computeSendOperandBound(opnd, numReg);
        }
        else if (dst == opnd)
        {
            // Compute right bound for dst operand
            const auto *desc = getMsgDesc();
            uint32_t dstBytes = desc->getDstLenBytes();
            if (dstBytes < getGRFSize()) {
                // sub-GRF response, e.g. OWord block read x1
                opnd->setBitVecL((1ULL << dstBytes) - 1);
                opnd->setRightBound(opnd->left_bound + dstBytes - 1);

            } else {
                uint16_t numReg = desc->getDstLenRegs();
                computeSendOperandBound(opnd, numReg);
            }
        }
        else
        {
            // other operands fall back to the generic computation
            opnd->computeRightBound(execSize);
        }
    }

}
7163
// Recompute right bounds for all ARF operands of this instruction:
// predicate, conditional modifier, and the implicit acc src/dst.
void G4_INST::computeARFRightBound()
{
    computeRightBound(predicate);
    computeRightBound(mod);
    computeRightBound(implAccSrc);
    computeRightBound(implAccDst);
}
7171
7172
7173 // This function should only be invoked after computePReg() function
7174 // has been invoked. The function computePReg() is invoked by computePhyReg()
7175 // just before scheduling and post-RA.
7176 // For GRF type variables this function returns linearized byte offset into
7177 // GRF file. So if a variable is assigned r1 and its left bound is 0, this
7178 // function will return (1 * 32) + 0 = 32.
7179 // For non-GRF variables, GRF base offset value is 0 so value returned will
7180 // be left bound.
7181 // This function works for both, G4_SrcRegRegion as well as G4_DstRegRegion.
getLinearizedStart()7182 unsigned int G4_Operand::getLinearizedStart()
7183 {
7184 unsigned linearizedStart = getLeftBound();
7185 G4_VarBase* base = getBase();
7186
7187 if (base && base->isRegVar())
7188 {
7189 // LB is computed based on the root variable, so we have to go all the way up
7190 G4_Declare* dcl = base->asRegVar()->getDeclare();
7191 linearizedStart += dcl->getGRFBaseOffset();
7192 linearizedStart -= dcl->getOffsetFromBase();
7193 }
7194
7195 return linearizedStart;
7196 }
7197
7198 // Just like getLinearizedStart(), this function returns linearized byte
7199 // offset of end of variable. For eg, if a variable is assigned r1 and
7200 // region is type dword with inst exec size = 16, linearized end will be
7201 // (63 - 0 + 32) = 95.
7202 // Here, right bound is 63 since the region accesses 64 bytes,
7203 // left bound is 0 since region access begins at byte 0,
7204 // linearizedStart() will return 32 since r1 is allocated to the region.
7205 // This function works for both, G4_SrcRegRegion as well as G4_DstRegRegion.
getLinearizedEnd()7206 unsigned int G4_Operand::getLinearizedEnd()
7207 {
7208 return (getRightBound() - getLeftBound() + getLinearizedStart());
7209 }
7210
// Print this operand to stderr (debug builds only).
// emit() is non-const in the IR, hence the const_cast.
void G4_Operand::dump() const
{
#if _DEBUG
    const_cast<G4_Operand *>(this)->emit(std::cerr, false);
#endif
}
7217
setPredicate(G4_Predicate * p)7218 void G4_INST::setPredicate(G4_Predicate* p)
7219 {
7220 if (predicate && predicate->getInst() == this)
7221 {
7222 predicate->setInst(NULL);
7223 }
7224
7225 predicate = p;
7226
7227 associateOpndWithInst(p, this);
7228 computeRightBound(p);
7229 }
7230
setSrc(G4_Operand * opnd,unsigned i)7231 void G4_INST::setSrc(G4_Operand* opnd, unsigned i)
7232 {
7233 if (isPseudoAddrMovIntrinsic())
7234 {
7235 asIntrinsicInst()->setIntrinsicSrc(opnd, i);
7236 return;
7237 }
7238
7239 MUST_BE_TRUE(i < G4_MAX_SRCS, ERROR_INTERNAL_ARGUMENT);
7240
7241 if (srcs[i] != NULL)
7242 {
7243 if ((srcs[0] == srcs[i] && i != 0) ||
7244 (srcs[1] == srcs[i] && i != 1) ||
7245 (srcs[2] == srcs[i] && i != 2) ||
7246 (srcs[3] == srcs[i] && i != 3))
7247 {
7248 // opnd is present in some other
7249 // index of srcs so dont set its
7250 // inst to NULL
7251 }
7252 else
7253 {
7254 if (srcs[i]->getInst() == this)
7255 {
7256 srcs[i]->setInst(NULL);
7257 }
7258 }
7259 }
7260
7261 srcs[i] = opnd;
7262
7263 associateOpndWithInst(opnd, this);
7264 resetRightBound(opnd);
7265 }
7266
setDest(G4_DstRegRegion * opnd)7267 void G4_INST::setDest(G4_DstRegRegion* opnd)
7268 {
7269 if (dst != NULL && dst->getInst() == this)
7270 {
7271 dst->setInst(NULL);
7272 }
7273
7274 dst = opnd;
7275
7276 associateOpndWithInst(opnd, this);
7277 resetRightBound(opnd);
7278 }
7279
setCondMod(G4_CondMod * m)7280 void G4_INST::setCondMod(G4_CondMod* m)
7281 {
7282 if (mod && mod->getInst() == this)
7283 {
7284 mod->setInst(NULL);
7285 }
7286
7287 mod = m;
7288
7289 associateOpndWithInst(m, this);
7290 computeRightBound(m);
7291 }
7292
setImplAccSrc(G4_Operand * opnd)7293 void G4_INST::setImplAccSrc(G4_Operand* opnd)
7294 {
7295 if (implAccSrc != NULL && implAccSrc->getInst() == this)
7296 {
7297 implAccSrc->setInst(NULL);
7298 }
7299
7300 implAccSrc = opnd;
7301
7302 associateOpndWithInst(opnd, this);
7303 computeRightBound(opnd);
7304 }
7305
setImplAccDst(G4_DstRegRegion * opnd)7306 void G4_INST::setImplAccDst(G4_DstRegRegion* opnd)
7307 {
7308 if (implAccDst != NULL && implAccDst->getInst() == this)
7309 {
7310 implAccDst->setInst(NULL);
7311 }
7312
7313 implAccDst = opnd;
7314
7315 associateOpndWithInst(opnd, this);
7316 computeRightBound(opnd);
7317 }
7318
7319 // get simd lane mask for this instruction. For example,
7320 // add (8|M8) ...
7321 // will have 0xFF00, which lane 8-15
getExecLaneMask() const7322 unsigned G4_INST::getExecLaneMask() const
7323 {
7324 unsigned maskbits = (1 << getExecSize()) - 1;
7325 unsigned chanOffset = getMaskOffset();
7326 return (maskbits << chanOffset);
7327 }
7328
print(std::ostream & OS) const7329 void G4_INST::print(std::ostream& OS) const
7330 {
7331 G4_INST& inst = const_cast<G4_INST&>(*this);
7332 if (!inst.isLabel())
7333 OS << "\t";
7334 inst.emit(OS, false, false);
7335 OS << "\n";
7336 }
7337
// Print this instruction to stderr (debug aid).
void G4_INST::dump() const
{
    print(std::cerr);
}
7342
canSupportSaturate() const7343 bool G4_INST::canSupportSaturate() const
7344 {
7345 if (op == G4_mul || op == G4_pseudo_mad)
7346 {
7347 for (int i = 0, numSrc = getNumSrc(); i < numSrc; ++i)
7348 {
7349 if (IS_DTYPE(getSrc(i)->getType()))
7350 {
7351 return false;
7352 }
7353 }
7354 return true;
7355 }
7356
7357 if (isIntrinsic() || op == G4_mulh || op == G4_madw)
7358 {
7359 return false;
7360 }
7361
7362 // note that IGA will return false for any opcode it does not recognize
7363 // If your psuedo opcode needs to support saturation you must add explicit check before this
7364 return InstSupportsSaturationIGA(getPlatform(), *this, builder);
7365 }
7366
canSupportCondMod() const7367 bool G4_INST::canSupportCondMod() const
7368 {
7369 if (!builder.hasCondModForTernary() && getNumSrc() == 3)
7370 {
7371 return false;
7372 }
7373
7374 if (op == G4_mul)
7375 {
7376 // can't support conditional modifiers if source is DW and dst is not QW
7377 bool dwordSrc = false;
7378 for (int i = 0, numSrc = getNumSrc(); i < numSrc; ++i)
7379 {
7380 if (IS_DTYPE(getSrc(i)->getType()))
7381 {
7382 dwordSrc = true;
7383 break;
7384 }
7385 }
7386 if (dwordSrc && !IS_QTYPE(getDst()->getType()))
7387 {
7388 return false;
7389 }
7390 return true;
7391 }
7392 else if (op == G4_pseudo_mad)
7393 {
7394 // no cond mod for D * W
7395 G4_Operand* src0 = getSrc(0);
7396 G4_Operand* src1 = getSrc(1);
7397 if (IS_DTYPE(src0->getType()) || IS_DTYPE(src1->getType()))
7398 {
7399 return false;
7400 }
7401 return true;
7402 }
7403
7404 if (op == G4_mov)
7405 {
7406 return dst->getType() != Type_BF && getSrc(0)->getType() != Type_BF;
7407 }
7408
7409 // ToDo: replace with IGA model
7410 return ((op == G4_add) ||
7411 (op == G4_and) ||
7412 (op == G4_addc) ||
7413 (op == G4_asr) ||
7414 (op == G4_avg) ||
7415 (op == G4_dp2) ||
7416 (op == G4_dp3) ||
7417 (op == G4_dp4) ||
7418 (op == G4_dp4a) ||
7419 (op == G4_dph) ||
7420 (op == G4_dp4a) ||
7421 (op == G4_frc) ||
7422 (op == G4_line) ||
7423 (op == G4_lrp) ||
7424 (op == G4_lzd) ||
7425 (op == G4_mac) ||
7426 (op == G4_mach) ||
7427 (op == G4_mad) ||
7428 (op == G4_mov) ||
7429 (op == G4_mul) ||
7430 (op == G4_not) ||
7431 (op == G4_or) ||
7432 (op == G4_pln) ||
7433 (op == G4_rndd) ||
7434 (op == G4_rnde) ||
7435 (op == G4_rndu) ||
7436 (op == G4_rndz) ||
7437 (op == G4_sad2) ||
7438 (op == G4_sada2) ||
7439 (op == G4_shl) ||
7440 (op == G4_shr) ||
7441 (op == G4_subb) ||
7442 (op == G4_xor));
7443 }
7444
canSupportSrcModifier() const7445 bool G4_INST::canSupportSrcModifier() const
7446 {
7447 if (opcode() == G4_mov)
7448 {
7449 if (getDst()->getType() == Type_BF)
7450 {
7451 return false;
7452 }
7453 }
7454
7455 if (opcode() == G4_pseudo_mad)
7456 {
7457 return true;
7458 }
7459
7460 // note that IGA will return false for any opcode it does not recognize
7461 // If your psuedo opcode needs to support source modifier you must add
7462 // explicit check before this
7463 return InstSupportsSrcModifierIGA(getPlatform(), *this, builder);
7464 }
7465
7466 // convert (execsize, offset) into emask option
7467 // if no such mask option exists, return InstOpt_NoOpt
offsetToMask(int execSize,int offset,bool nibOk)7468 G4_InstOption G4_INST::offsetToMask(int execSize, int offset, bool nibOk)
7469 {
7470 switch (execSize)
7471 {
7472 case 32:
7473 return InstOpt_M0;
7474 case 16:
7475 switch (offset)
7476 {
7477 case 0:
7478 return InstOpt_M0;
7479 case 16:
7480 return InstOpt_M16;
7481 default:
7482 return InstOpt_NoOpt;
7483 }
7484 case 8:
7485 switch (offset)
7486 {
7487 case 0:
7488 return InstOpt_M0;
7489 case 8:
7490 return InstOpt_M8;
7491 case 16:
7492 return InstOpt_M16;
7493 case 24:
7494 return InstOpt_M24;
7495 default:
7496 return InstOpt_NoOpt;
7497 }
7498 case 4:
7499 if (nibOk)
7500 {
7501 switch (offset)
7502 {
7503 case 0:
7504 return InstOpt_M0;
7505 case 4:
7506 return InstOpt_M4;
7507 case 8:
7508 return InstOpt_M8;
7509 case 12:
7510 return InstOpt_M12;
7511 case 16:
7512 return InstOpt_M16;
7513 case 20:
7514 return InstOpt_M20;
7515 case 24:
7516 return InstOpt_M24;
7517 case 28:
7518 return InstOpt_M28;
7519 default:
7520 return InstOpt_NoOpt;
7521 }
7522 }
7523 else
7524 {
7525 return InstOpt_NoOpt;
7526 }
7527 default:
7528 return InstOpt_NoOpt;
7529 }
7530 }
7531
// Set the align16 channel-enable (write mask) for this destination.
void G4_DstRegRegion::setWriteMask(ChannelEnable wm)
{
    writeMask = wm;
}
7536
// Copy the swizzle string into this region's fixed-size swizzle buffer;
// the string must fit (length < max_swizzle).
void G4_SrcRegRegion::setSwizzle(const char* sw)
{
    MUST_BE_TRUE((int)strlen(sw) < max_swizzle, ERROR_INTERNAL_ARGUMENT);
    strcpy_s(swizzle, max_swizzle, sw);
}
7542
7543 // convert contiguous regions to <N;N,1> form subject to the requirment
7544 // that width is not used to cross GRF
7545 // This is done because <1;1,0>/<2;2,1> require crossbar muxes and thus incur a performance penalty
7546 // This should only be called after RA when we know the actual subreg offset
rewriteContiguousRegion(IR_Builder & builder,uint16_t opNum)7547 void G4_SrcRegRegion::rewriteContiguousRegion(IR_Builder& builder, uint16_t opNum)
7548 {
7549 int execSize = inst->getExecSize();
7550 if (execSize == 1 || !desc->isContiguous(execSize))
7551 {
7552 return;
7553 }
7554 uint32_t eltSize = getTypeSize();
7555 uint32_t subRegOffset = getLinearizedStart() % numEltPerGRF<Type_UB>();
7556 uint32_t endOffset = subRegOffset + inst->getExecSize() * eltSize;
7557
7558 bool isAlign1Ternary = builder.hasAlign1Ternary() && inst->getNumSrc() == 3;
7559
7560 if (builder.doNotRewriteContiguousRegion())
7561 {
7562 // 2-src and 3-src src0/1: normalize region to <1;1,0>
7563 // 3-src src2: normalize region to <2;2,1> since it only supports horz stride
7564 setRegion(isAlign1Ternary && opNum == 2 ? builder.createRegionDesc(2, 2, 1) : builder.getRegionStride1(), true);
7565 return;
7566 }
7567
7568 if (inst->getNumSrc() < 3)
7569 {
7570 // do <16;16,1> for HF/W if possible
7571 if (subRegOff == 0 && execSize == 16 && eltSize == 2)
7572 {
7573 setRegion(builder.createRegionDesc(16, 16, 1), true);
7574 return;
7575 }
7576 }
7577
7578 // Find a width that does not cross GRF from <8;8,1>, <4;4,1>, to <2;2,1>
7579 auto getWidth = [=](unsigned offset, unsigned eltSize) -> unsigned
7580 {
7581 unsigned Widths[] = { 8, 4, 2 };
7582 for (auto w : Widths)
7583 {
7584 if (w > inst->getExecSize())
7585 continue;
7586
7587 if (w * eltSize > numEltPerGRF<Type_UB>())
7588 {
7589 // <8;8,1> is not allowed for 64-bit type
7590 continue;
7591 }
7592
7593 if (endOffset <= numEltPerGRF<Type_UB>() ||
7594 subRegOffset % (w * eltSize) == 0)
7595 {
7596 return w;
7597 }
7598 }
7599
7600 // width >= 2 crosses GRF
7601 return 0;
7602 };
7603
7604 unsigned short w = (unsigned short)getWidth(subRegOffset, eltSize);
7605
7606 if (builder.newTernaryStride() && isAlign1Ternary && (w == 2 || w == 0) && opNum != 2)
7607 {
7608 setRegion(builder.getRegionStride1(), true);
7609 return;
7610 }
7611
7612 if (w)
7613 {
7614 setRegion(builder.createRegionDesc(w, w, 1), true);
7615 }
7616 else if (isAlign1Ternary)
7617 {
7618 // binary encoding asserts on <1;1,0> region for 3-src inst, so force change it to <2;2,1>
7619 setRegion(builder.createRegionDesc(2, 2, 1), true);
7620 }
7621 }
7622
getLiveIntervals(std::vector<std::pair<uint32_t,uint32_t>> & intervals)7623 void LiveIntervalInfo::getLiveIntervals(std::vector<std::pair<uint32_t, uint32_t>>& intervals)
7624 {
7625 for (auto&& it : liveIntervals)
7626 {
7627 intervals.push_back(it);
7628 }
7629 }
7630
// Record the closed interval [start, end] as live, merging with and/or
// subsuming any existing sub-intervals it touches. liveIntervals is kept
// sorted and disjoint; intervals within 1 unit of each other are coalesced.
void LiveIntervalInfo::addLiveInterval(uint32_t start, uint32_t end)
{
    if (liveIntervals.size() == 0)
    {
        // first interval
        liveIntervals.emplace_back(start, end);
    }
    else if (start - liveIntervals.back().second <= 1)
    {
        // extends (or abuts) the last interval
        liveIntervals.back().second = end;
    }
    else if (liveIntervals.back().second < start)
    {
        // entirely after the last interval
        liveIntervals.emplace_back(start, end);
    }
    else if (liveIntervals.front().first >= start && liveIntervals.back().second <= end)
    {
        // new interval covers everything recorded so far
        liveIntervals.clear();
        liveIntervals.emplace_back(start, end);
    }
    else
    {
        // general case: splice [start, end] into the sorted list
        bool inserted = false;
        uint32_t newEnd = end;
        for (auto liveIt = liveIntervals.begin(); liveIt != liveIntervals.end();)
        {
            auto& lr = (*liveIt);

            if (!inserted)
            {
                if (lr.first <= start && lr.second >= newEnd)
                {
                    // fully contained in an existing interval; nothing to do
                    inserted = true;
                    break;
                }
                else if (lr.first <= start && lr.second > start && lr.second <= newEnd)
                {
                    // Extend existing sub-interval
                    lr.second = newEnd;
                    inserted = true;
                    ++liveIt;
                    continue;
                }
                else if ((start - lr.second) <= 1u)
                {
                    // abuts the end of an existing interval; extend it
                    lr.second = newEnd;
                    inserted = true;
                    ++liveIt;
                    continue;
                }
                else if (lr.first > start)
                {
                    // Insert new sub-interval before this one
                    liveIntervals.insert(liveIt, std::make_pair(start, newEnd));
                    inserted = true;
                    continue;
                }
            }
            else
            {
                // after insertion/extension: fold in intervals overlapped by
                // [start, newEnd]
                if (lr.first > newEnd)
                    break;
                else if (lr.first == newEnd)
                {
                    // exactly adjacent; fuse with the preceding interval
                    newEnd = lr.second;
                    auto newLRIt = liveIt;
                    --newLRIt;
                    (*newLRIt).second = newEnd;
                    liveIt = liveIntervals.erase(liveIt);
                    continue;
                }
                else if (lr.second <= newEnd)
                {
                    // fully covered; drop it
                    liveIt = liveIntervals.erase(liveIt);
                    continue;
                }
                else if(lr.first < newEnd && lr.second > newEnd)
                {
                    // partial overlap; absorb its tail and stop
                    auto newLRIt = liveIt;
                    --newLRIt;
                    (*newLRIt).second = lr.second;
                    liveIntervals.erase(liveIt);
                    break;
                }
            }
            ++liveIt;
        }

        if (!inserted)
        {
            // ran off the end without inserting; append or extend the tail
            if (start - liveIntervals.back().second <= 1)
                liveIntervals.back().second = end;
            else
                liveIntervals.emplace_back(start, end);
        }
    }
}
7727
// Mark the single program point cisaOff as live: extend an adjacent or
// covering sub-interval, or insert a new degenerate one, then merge
// neighbors that become adjacent.
void LiveIntervalInfo::liveAt(uint32_t cisaOff)
{
    if (cisaOff == UNMAPPABLE_VISA_INDEX)
        return;

    // Now iterate over all intervals and check which one should
    // be extended. If none, start a new one.
    bool added = false;
    auto prev = liveIntervals.begin();

    for (auto it = liveIntervals.begin(), itEnd = liveIntervals.end();
        it != itEnd;
        prev = it++)
    {
        auto& item = (*it);

        if (added)
        {
            // Check whether prev and current one can be merged
            if (((*prev).second == item.first) ||
                ((*prev).second == item.first - 1))
            {
                prev->second = item.second;
                it = liveIntervals.erase(it);
                break;
            }
            else
            {
                break;
            }
        }

        // point immediately precedes this interval: grow it downward
        if (item.first == cisaOff + 1)
        {
            item.first = cisaOff;
            added = true;
            break;
        }

        // point immediately follows this interval: grow it upward and keep
        // going so it may be merged with the next interval
        if (item.second == cisaOff - 1)
        {
            item.second = cisaOff;
            added = true;
            continue;
        }

        // point already covered by this interval
        if (!added &&
            item.first <= cisaOff &&
            item.second >= cisaOff)
        {
            added = true;
            break;
        }

        // passed the insertion position: start a new degenerate interval
        if (item.first > cisaOff)
        {
            liveIntervals.insert(it, std::make_pair(cisaOff, cisaOff));
            added = true;
            break;
        }
    }

    if (!added)
    {
        // point lies beyond all recorded intervals
        liveIntervals.push_back(std::make_pair(cisaOff, cisaOff));
    }
}
7795
supportsNullDst() const7796 bool G4_INST::supportsNullDst() const
7797 {
7798 if (isSend())
7799 {
7800 return true;
7801 }
7802 if (builder.getPlatform() >= GENX_PVC && dst->getTypeSize() == 1)
7803 {
7804 // null:b not supported
7805 return false;
7806 }
7807 return getNumSrc() != 3 && !(op == G4_pln && !builder.doPlane());
7808 }
7809
isAlign1Ternary() const7810 bool G4_INST::isAlign1Ternary() const
7811 {
7812 return builder.hasAlign1Ternary() && getNumSrc() == 3 && !mayExceedTwoGRF();
7813 }
7814
7815 // Detect packed low-precision instruction. This is used by the scheduler.
7816 // - all src and dst are GRF and of :hf type and "packed".
7817 // (src is also packed when it is replicated scalar).
7818 // Two cases are possible:
7819 // 1) add (16) r1.0<1>:hf r2.0<8;8,1>:hf r3.0<8;8,1>:hf { Align1, H1 }
7820 // 2) add (16) r1.0<1>:hf r2.0<8;8,1>:hf r3.0<0;1,0>:hf { Align1, H1 }
isFastHFInstruction(void) const7821 bool G4_INST::isFastHFInstruction(void) const {
7822 if (getExecSize() < g4::SIMD16) {
7823 return false;
7824 }
7825 bool isHF = false;
7826 for (int op_i = 0, op_e = getNumSrc(); op_i < op_e; ++op_i) {
7827 G4_Operand *opnd = getSrc(op_i);
7828 if (! opnd) {
7829 continue;
7830 }
7831 if (!IS_HFTYPE(opnd->getType())) {
7832 return false;
7833 }
7834 if (opnd->isSrcRegRegion()) {
7835 G4_SrcRegRegion *srcRgn = opnd->asSrcRegRegion();
7836 if (! srcRgn->getRegion()->isContiguous(getExecSize())) {
7837 return false;
7838 }
7839 }
7840 isHF = true;
7841 }
7842 return isHF;
7843 }
7844
7845
prepareForRealloc(G4_Kernel * kernel)7846 void G4_Declare::prepareForRealloc(G4_Kernel* kernel)
7847 {
7848 // Reset allocated register if this dcl is not an input
7849 // or a pre-defined variable.
7850 auto& builder = kernel->fg.builder;
7851
7852 setGRFBaseOffset(0);
7853
7854 if (getRegFile() != G4_RegFileKind::G4_INPUT &&
7855 getRegVar()->isPhyRegAssigned() &&
7856 !getRegVar()->isAreg() &&
7857 this != builder->getBuiltinA0() &&
7858 this != builder->getBuiltinR0() &&
7859 this != builder->getBuiltinA0Dot2() &&
7860 this != builder->getBuiltinBindlessSampler() &&
7861 this != builder->getBuiltinHWTID() &&
7862 this != builder->getBuiltinT252() &&
7863 this != builder->getStackCallRet() &&
7864 this != builder->getStackCallArg() &&
7865 this != builder->getBEFP() &&
7866 this != builder->getBESP() &&
7867 this != kernel->fg.getScratchRegDcl() &&
7868 this != kernel->fg.getStackPtrDcl() &&
7869 this != kernel->fg.getFramePtrDcl())
7870 {
7871 getRegVar()->resetPhyReg();
7872 getRegVar()->setDisp(UINT_MAX);
7873 }
7874 }
7875
mayExpandToAccMacro() const7876 bool G4_INST::mayExpandToAccMacro() const
7877 {
7878 auto isDMul = [](const G4_INST *Inst) {
7879 return Inst->opcode() == G4_mul && (IS_QTYPE(Inst->getDst()->getType()) ||
7880 (IS_DTYPE(Inst->getSrc(0)->getType()) &&
7881 IS_DTYPE(Inst->getSrc(1)->getType())));
7882 };
7883
7884 auto mayBeMAC = [&](const G4_INST *Inst) {
7885 if (Inst->opcode() != G4_pseudo_mad)
7886 return false;
7887 if (IS_TYPE_FLOAT_ALL(Inst->getDst()->getType()) &&
7888 builder.getOption(vISA_forceFPMAD))
7889 return false;
7890 return true;
7891 };
7892
7893 return opcode() == G4_mach ||
7894 opcode() == G4_mulh ||
7895 opcode() == G4_madw ||
7896 isDMul(this) ||
7897 mayBeMAC(this) ||
7898 (opcode() == G4_pln && !builder.doPlane());
7899 }
7900
canExecSizeBeAcc(Gen4_Operand_Number opndNum) const7901 bool G4_INST::canExecSizeBeAcc(Gen4_Operand_Number opndNum) const
7902 {
7903 switch (dst->getType())
7904 {
7905 case Type_HF:
7906 case Type_BF:
7907 if (builder.relaxedACCRestrictions())
7908 {
7909 if (!((isMixedMode() && getExecSize() == g4::SIMD8) ||
7910 (getExecSize() == g4::SIMD16)))
7911 {
7912 return false;
7913 }
7914 }
7915 else
7916 {
7917 if (getExecSize() != G4_ExecSize(builder.getNativeExecSize() * 2))
7918 {
7919 return false;
7920 }
7921 }
7922 break;
7923 case Type_W:
7924 case Type_UW:
7925 if (getExecSize() != G4_ExecSize(builder.getNativeExecSize() * 2))
7926 {
7927 return false;
7928 }
7929 break;
7930 case Type_F:
7931 if (getExecSize() != G4_ExecSize(builder.getNativeExecSize() * 2) &&
7932 getExecSize() != builder.getNativeExecSize())
7933 {
7934 return false;
7935 }
7936 break;
7937 case Type_DF:
7938 if (!builder.useAccForDF())
7939 {
7940 return false;
7941 }
7942 if (getExecSize() != builder.getNativeExecSize() &&
7943 getExecSize() != G4_ExecSize(builder.getNativeExecSize() / 2))
7944 {
7945 return false;
7946 }
7947 break;
7948 case Type_D:
7949 case Type_UD:
7950 if (getExecSize() != builder.getNativeExecSize())
7951 {
7952 return false;
7953 }
7954 if (opndNum != Opnd_dst && isSignSensitive(opndNum))
7955 {
7956 return false;
7957 }
7958 break;
7959 default:
7960 return false;
7961 }
7962
7963 return true;
7964 }
7965
7966 // returns true if dst may be replaced by an explicit acc
7967 // in addition to opcode-specific checks, we require
7968 // -- dst must be GRF
7969 // -- contiguous regions
7970 // -- simd8 for D/UD, simd8/16 for F, simd16 for HF/W, other types not allowed
// Returns true if this instruction's dst may be replaced by an explicit acc
// register (see the requirements listed in the comment above).
bool G4_INST::canDstBeAcc() const
{
    if (mayExpandToAccMacro())
    {
        // while this should not prevent dst from becoming acc (mul/plane macros use
        // acc as temp so should not affect final dst), later HW conformity is not equipped
        // to deal with such code so we disable the substitution
        return false;
    }

    // dst must be a packed (unit horizontal stride) region of a real declare.
    if (dst == nullptr || dst->getTopDcl() == nullptr || dst->getHorzStride() != 1)
    {
        return false;
    }

    // dst must live in the GRF file.
    if (dst->getTopDcl()->getRegFile() != G4_GRF)
    {
        return false;
    }

    // :df dst requires FP64 accumulator support.
    if (!builder.hasFP64Acc() && dst->getType() == Type_DF)
    {
        return false;
    }

    // src0 may not have indirect regioning
    if (!builder.accDstforIndirectSrc() && getSrc(0) && getSrc(0)->isSrcRegRegion())
    {
        auto src0Region = getSrc(0)->asSrcRegRegion();
        if (src0Region->getRegAccess() == IndirGRF)
        {
            return false;
        }
    }

    // Execution size must match what acc supports for dst's type.
    if (!canExecSizeBeAcc(Opnd_dst))
    {
        return false;
    }

    // Saturation on an integer-typed acc dst is not supported.
    if (getSaturate() && IS_INT(dst->getType()))
    {
        return false;
    }

    if (!builder.relaxedACCRestrictions())
    {
        if (dst->getType() == builder.getMixModeType() && isMixedMode())
        {
            // acc can't be used as packed f16 for mix mode instruction as it doesn't support regioning
            return false;
        }
    }

    // Some platforms disallow acc dst when any source uses indirect access.
    if (builder.avoidAccDstWithIndirectSource())
    {
        for (int i = 0, numSrc = getNumSrc(); i < numSrc; ++i)
        {
            bool indirectSrc = getSrc(i) && getSrc(i)->isSrcRegRegion() &&
                getSrc(i)->asSrcRegRegion()->getRegAccess() != Direct;
            if (indirectSrc)
            {
                return false;
            }
        }
    }

    if (isMath())
    {
        return builder.hasMathAcc();
    }

    // Opcode-specific rules.
    switch (opcode())
    {
    case G4_add:
    case G4_and:
    case G4_asr:
    case G4_avg:
    case G4_frc:
    case G4_lzd:
    case G4_mul:
    case G4_not:
    case G4_or:
    case G4_rndd:
    case G4_rnde:
    case G4_rndu:
    case G4_rndz:
    case G4_shr:
    case G4_smov:
    case G4_xor:
    case G4_rol:
    case G4_ror:
        return true;
    case G4_sel:
        // sel seems to fail with int acc for some strange reason (sign extension?)
        return getCondMod() ? IS_TYPE_FLOAT_ALL(dst->getType()) : true;
    case G4_cmp:
    case G4_cmpn:
        // disable for now since it's causing some SKL tests to fail
        return false;
    case G4_mov:
        // mov with format conversion obeys platform-specific acc restrictions.
        if (builder.hasFormatConversionACCRestrictions())
        {
            const bool allowedICombination = (IS_DTYPE(getSrc(0)->getType()) || getSrc(0)->getType() == Type_W || getSrc(0)->getType() == Type_UW) &&
                (IS_DTYPE(dst->getType()) || dst->getType() == Type_W || dst->getType() == Type_UW);
            const bool allowedFCombination = (getSrc(0)->getType() == Type_F || getSrc(0)->getType() == Type_HF) &&
                (dst->getType() == Type_F || dst->getType() == Type_HF);
            const bool allowedDFCombination = getSrc(0)->getType() == Type_DF &&
                dst->getType() == Type_DF;

            if (builder.restrictedACCRestrictions() && allowedFCombination)
            {
                uint16_t dstStride = dst->getHorzStride();
                uint16_t srcStride = 0;
                if (getSrc(0)->isSrcRegRegion())
                {
                    G4_SrcRegRegion* src = getSrc(0)->asSrcRegRegion();
                    const RegionDesc* region = src->getRegion();

                    if (!region->isSingleStride(execSize, srcStride))
                    {
                        return false;
                    }

                    //The bitmapping is model by the element size * element stride.
                    // No matter dst is float or half float.
                    // Pack and un-pack happen only in the destination register, so no matter dst is F or HF, it's not allowed to be replaced with ACC if bitmapping swizzles.
                    // If both dst and src are HF type, swizzle is not allowed as well.
                    //FIXME, mov (16|M0) acc0.0<1>:f, r28<1;1,0>:hf can be passed in HW.
                    if ((dst->getType() != src->getType() || dst->getType() == Type_HF) &&
                        (dstStride * dst->getTypeSize() != srcStride * src->getTypeSize()))
                    {
                        return false;
                    }
                }
            }

            if (!allowedICombination && !allowedFCombination && !allowedDFCombination)
            {
                return false;
            }
        }
        // acc-to-acc mov is only allowed with relaxed restrictions.
        return builder.relaxedACCRestrictions() || !getSrc(0)->isAccReg();
    case G4_pln:
        // we can't use acc if plane will be expanded
        return builder.doPlane();
    case G4_madm:
        return builder.useAccForMadm();
    case G4_mad:
    case G4_csel:
        return builder.canMadHaveAcc();
    case G4_dp4a:
        return builder.relaxedACCRestrictions2();
    case G4_bfn:
    case G4_add3:
        return true;
    default:
        return false;
    }
}
8130
8131 // returns true if src0 may be replaced by an explicit acc
8132 // in addition to opcode-specific checks, we require
8133 // -- contiguous regions
8134 // -- simd8 for D/UD, simd8/16 for F, simd16 for HF/W, other types not allowed
// Checks the acc-substitution constraints for a source operand that can be
// evaluated before HW conformity runs (type, region, modifier, and
// opcode-specific rules). See canSrcBeAccAfterHWConform for placement checks.
bool G4_INST::canSrcBeAccBeforeHWConform(Gen4_Operand_Number opndNum) const
{
    int srcId = getSrcNum(opndNum);
    assert((srcId == 0 || srcId == 1 || srcId == 2) && "must be either src0, src1 or src2");

    // src2 may only be acc on platforms with the corresponding relaxation.
    if (!builder.relaxedACCRestrictions3() && srcId == 2)
    {
        return false;
    }

    // Only register-region sources can be replaced by acc.
    if (getSrc(srcId) == nullptr || !getSrc(srcId)->isSrcRegRegion())
    {
        return false;
    }

    // Instructions that expand to acc-using macros must keep acc free.
    if (mayExpandToAccMacro())
    {
        return false;
    }

    G4_SrcRegRegion* src = getSrc(srcId)->asSrcRegRegion();
    if (srcId == 1 && src->hasModifier())
    {
        // some platforms allow float src1 acc modifiers,
        // while some don't allow src1 acc modifier at all.
        if (!IS_TYPE_FLOAT_ALL(src->getType()) || !builder.relaxedACCRestrictions())
        {
            return false;
        }
    }
    // acc sources must be packed (contiguous region).
    if (!src->getRegion()->isContiguous(getExecSize()))
    {
        return false;
    }

    // Low-precision float sources of mixed-mode instructions are excluded
    // under the relaxed-restriction platforms.
    if (builder.relaxedACCRestrictions() &&
        isMixedMode() &&
        isLowPrecisionFloatTy(src->getType()))
    {
        return false;
    }

    // Execution size must match what acc supports for this operand.
    if (!canExecSizeBeAcc(opndNum))
    {
        return false;
    }

    if (opcode() == G4_mad && srcId == 0 &&
        !builder.canMadHaveSrc0Acc())
    {
        // mac's implicit acc gets its region from dst, so we have to check src and
        // dst have the same type
        if (src->getType() != dst->getType())
        {
            return false;
        }
    }

    if (IS_TYPE_FLOAT_ALL(src->getType()) ^ IS_TYPE_FLOAT_ALL(getDst()->getType()))
    {
        // no float <-> int conversion for acc source
        return false;
    }

    if (isMath())
    {
        return builder.hasMathAcc();
    }

    // Opcode-specific rules.
    switch (opcode())
    {
    case G4_add:
    case G4_asr:
    case G4_avg:
    case G4_cmp:
    case G4_cmpn:
    case G4_frc:
    case G4_lzd:
    case G4_rndd:
    case G4_rnde:
    case G4_rndu:
    case G4_rndz:
    case G4_sel:
    case G4_shl:
    case G4_shr:
    case G4_smov:
    case G4_rol:
    case G4_ror:
        return true;
    case G4_mov:
        // mov with format conversion obeys platform-specific acc restrictions.
        if (builder.hasFormatConversionACCRestrictions())
        {
            const bool allowedICombination = (IS_DTYPE(src->getType()) || src->getType() == Type_W || src->getType() == Type_UW) &&
                (IS_DTYPE(dst->getType()) || dst->getType() == Type_W || dst->getType() == Type_UW);
            const bool allowedFCombination = (src->getType() == Type_F || src->getType() == Type_HF) &&
                (dst->getType() == Type_F || dst->getType() == Type_HF);
            const bool allowedDFCombination = src->getType() == Type_DF &&
                dst->getType() == Type_DF;

            if (builder.restrictedACCRestrictions() && allowedFCombination)
            {
                uint16_t dstStride = dst->getHorzStride();
                uint16_t srcStride = 0;
                const RegionDesc* region = src->getRegion();

                if (!region->isSingleStride(execSize, srcStride))
                {
                    return false;
                }

                //The bitmapping is model by the element size * element stride.
                //When dst type is different with src type, or both are HF type.
                //FIXME, currently, r35 in following case cannot be replaced with acc
                // mov (16|M0)   r35.0<1>:f  r25.0<1;1,0>:f
                // mov (16|M0)   r36.0<1>:hf r35.0<1;1,0>:f
                // the restriction may be relaxed after validation of HW team
                if ((dst->getType() != src->getType() || dst->getType() == Type_HF) &&
                    (dstStride * dst->getTypeSize() != srcStride * src->getTypeSize()))
                {
                    return false;
                }
            }

            if (!allowedICombination && !allowedFCombination && !allowedDFCombination)
            {
                return false;
            }
        }
        // acc-to-acc mov is only allowed with relaxed restrictions.
        return builder.relaxedACCRestrictions() || !getDst()->isAccReg();
    case G4_madm:
        return builder.useAccForMadm();
    case G4_mad:
        // no int acc if it's used as mul operand
        return builder.canMadHaveAcc() &&
            ((srcId == 1 && (IS_FTYPE(src->getType()) || (src->getType() == Type_DF))) ||
                (srcId == 0 && src->getModifier() == Mod_src_undef) ||
                (srcId == 0 && builder.relaxedACCRestrictions_1()) ||
                (srcId == 2 && (IS_FTYPE(src->getType()) || (src->getType() == Type_DF))));
    case G4_csel:
        return builder.canMadHaveAcc();
    case G4_mul:
        return IS_TYPE_FLOAT_ALL(src->getType());
    case G4_and:
    case G4_not:
    case G4_or:
    case G4_xor:
        // logic ops accept acc sources only without source modifiers.
        return src->getModifier() == Mod_src_undef;
    case G4_pln:
        return builder.doPlane() && src->getModifier() == Mod_src_undef;
    case G4_dp4a:
        if (builder.restrictedACCRestrictions())
        {
            return srcId == 0;
        }
        return builder.relaxedACCRestrictions2();
    case G4_bfn:
    case G4_add3:
        return true;
    default:
        return false;
    }
}
8296
canSrcBeAccAfterHWConform(Gen4_Operand_Number opndNum) const8297 bool G4_INST::canSrcBeAccAfterHWConform(Gen4_Operand_Number opndNum) const
8298 {
8299 int srcId = getSrcNum(opndNum);
8300 G4_SrcRegRegion* src = getSrc(srcId)->asSrcRegRegion();
8301
8302 // dst must be GRF-aligned
8303 if ((getDst()->getLinearizedStart() % numEltPerGRF<Type_UB>()) != 0)
8304 {
8305 if (!(isMixedMode() && builder.getPlatform() == XeHP_SDV))
8306 return false;
8307 }
8308
8309 // check that src0 and dst have the same type/alignment
8310 auto dstEltSize = getDst()->getHorzStride() * getDst()->getTypeSize();
8311 if (dstEltSize > TypeSize(src->getType()))
8312 {
8313 return false;
8314 }
8315 else if (isLowPrecisionFloatTy(getDst()->getType()) && src->getType() == Type_F &&
8316 dstEltSize == 2)
8317 {
8318 if (builder.relaxedACCRestrictions())
8319 {
8320 //When source is float or half float from accumulator register and destination is half float with a stride of 1,
8321 //the source must register aligned. i.e., source must have offset zero.
8322 if ((src->getLinearizedStart() % numEltPerGRF<Type_UB>()) != 0)
8323 {
8324 return false;
8325 }
8326 }
8327 else
8328 {
8329 // no acc for mix mode inst with packed HF dst
8330 return false;
8331 }
8332 }
8333
8334 return true;
8335 }
8336
canSrcBeAcc(Gen4_Operand_Number opndNum) const8337 bool G4_INST::canSrcBeAcc(Gen4_Operand_Number opndNum) const
8338 {
8339 return canSrcBeAccBeforeHWConform(opndNum) && canSrcBeAccAfterHWConform(opndNum);
8340 }
8341
getPlatform() const8342 TARGET_PLATFORM G4_INST::getPlatform() const
8343 {
8344 return builder.getPlatform();
8345 }
8346
cloneInst()8347 G4_INST* G4_INST::cloneInst()
8348 {
8349 // return nullptr if new derived class hasnt implemented
8350 // its own cloneInst()
8351 if (!isBaseInst() && !isCFInst())
8352 return nullptr;
8353
8354 // Return a clone of current instruction.
8355 // This functionality is expected to be used by optimizations
8356 // such as rematerialization that require creating a copy
8357 // of instructions.
8358 auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8359 G4_INST* newInst = nullptr;
8360 auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8361 auto condMod = nonConstBuilder->duplicateOperand(getCondMod());
8362 auto dst = nonConstBuilder->duplicateOperand(getDst());
8363 auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8364 auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8365 auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));
8366 auto accSrc = nonConstBuilder->duplicateOperand(getImplAccSrc());
8367 auto accDst = nonConstBuilder->duplicateOperand(getImplAccDst());
8368
8369 if (isSend())
8370 {
8371 MUST_BE_TRUE(false, "cloning send not yet supported");
8372 }
8373 else
8374 {
8375 newInst = nonConstBuilder->createInternalInst(prd, op, condMod, getSaturate(), getExecSize(),
8376 dst, src0, src1, option);
8377
8378 if (src2)
8379 newInst->setSrc(src2, 2);
8380
8381 if (accSrc)
8382 newInst->setImplAccSrc(accSrc);
8383
8384 if (accDst)
8385 newInst->setImplAccDst(accDst);
8386 }
8387
8388 return newInst;
8389 }
8390
cloneInst()8391 G4_INST* G4_InstSend::cloneInst()
8392 {
8393 auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8394 G4_INST* newInst = nullptr;
8395 auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8396 auto dst = nonConstBuilder->duplicateOperand(getDst());
8397 auto src0 = nonConstBuilder->duplicateOperand(getSrc(0))->asSrcRegRegion();
8398
8399 if (isSplitSend())
8400 {
8401 // desc -> src2, extDesc -> src3
8402 auto src1 = nonConstBuilder->duplicateOperand(getSrc(1))->asSrcRegRegion();
8403 auto desc = nonConstBuilder->duplicateOperand(getSrc(2));
8404 auto extDesc = nonConstBuilder->duplicateOperand(getSrc(3));
8405 newInst = nonConstBuilder->createInternalSplitSendInst(getExecSize(), dst, src0, src1, desc,
8406 getOption(), getMsgDescRaw(), extDesc);
8407 if (prd)
8408 {
8409 newInst->setPredicate(prd);
8410 }
8411 }
8412 else
8413 {
8414 auto desc = nonConstBuilder->duplicateOperand(getSrc(1));
8415 // desc -> src1, no extDesc (must be imm and stored in SendMsgDesc)
8416 newInst = nonConstBuilder->createInternalSendInst(prd, op, getExecSize(),
8417 dst, src0, desc, getOption(), getMsgDesc());
8418 }
8419
8420 return newInst;
8421 }
8422
G4_InstIntrinsic(const IR_Builder & builder,G4_Predicate * prd,Intrinsic intrinId,G4_ExecSize execSize,G4_DstRegRegion * d,G4_Operand * s0,G4_Operand * s1,G4_Operand * s2,G4_Operand * s3,G4_Operand * s4,G4_Operand * s5,G4_Operand * s6,G4_Operand * s7,G4_InstOpts opt)8423 G4_InstIntrinsic::G4_InstIntrinsic(
8424 const IR_Builder& builder,
8425 G4_Predicate* prd,
8426 Intrinsic intrinId,
8427 G4_ExecSize execSize,
8428 G4_DstRegRegion* d,
8429 G4_Operand* s0,
8430 G4_Operand* s1,
8431 G4_Operand* s2,
8432 G4_Operand* s3,
8433 G4_Operand* s4,
8434 G4_Operand* s5,
8435 G4_Operand* s6,
8436 G4_Operand* s7,
8437 G4_InstOpts opt) :
8438 G4_INST(builder, prd, G4_intrinsic, nullptr, g4::NOSAT, execSize, d, nullptr, nullptr, nullptr, opt),
8439 intrinsicId(intrinId), tmpGRFStart(-1), tmpAddrStart(-1), tmpFlagStart(-1)
8440 {
8441 srcs[0] = s0;
8442 srcs[1] = s1;
8443 srcs[2] = s2;
8444 srcs[3] = s3;
8445 srcs[4] = s4;
8446 srcs[5] = s5;
8447 srcs[6] = s6;
8448 srcs[7] = s7;
8449
8450 resetRightBound(s0);
8451 resetRightBound(s1);
8452 resetRightBound(s2);
8453 resetRightBound(s3);
8454 resetRightBound(s4);
8455 resetRightBound(s5);
8456 resetRightBound(s6);
8457 resetRightBound(s7);
8458
8459 associateOpndWithInst(s0, this);
8460 associateOpndWithInst(s1, this);
8461 associateOpndWithInst(s2, this);
8462 associateOpndWithInst(s3, this);
8463 associateOpndWithInst(s4, this);
8464 associateOpndWithInst(s5, this);
8465 associateOpndWithInst(s6, this);
8466 associateOpndWithInst(s7, this);
8467 }
8468
getIntrinsicSrc(unsigned i) const8469 G4_Operand* G4_InstIntrinsic::getIntrinsicSrc(unsigned i) const
8470 {
8471 MUST_BE_TRUE(i < G4_MAX_INTRINSIC_SRCS, ERROR_INTERNAL_ARGUMENT);
8472 return srcs[i];
8473 }
8474
getOperand(Gen4_Operand_Number opnd_num) const8475 G4_Operand* G4_InstIntrinsic::getOperand(Gen4_Operand_Number opnd_num) const
8476 {
8477 switch (opnd_num) {
8478 case Opnd_src0: return srcs[0];
8479 case Opnd_src1: return srcs[1];
8480 case Opnd_src2: return srcs[2];
8481 case Opnd_src3: return srcs[3];
8482 case Opnd_src4: return srcs[4];
8483 case Opnd_src5: return srcs[5];
8484 case Opnd_src6: return srcs[6];
8485 case Opnd_src7: return srcs[7];
8486 default:
8487 MUST_BE_TRUE(0, "Operand number is out of range.");
8488 break;
8489 }
8490 return NULL;
8491 }
8492
setIntrinsicSrc(G4_Operand * opnd,unsigned i)8493 void G4_InstIntrinsic::setIntrinsicSrc(G4_Operand* opnd, unsigned i)
8494 {
8495 MUST_BE_TRUE(i < G4_MAX_INTRINSIC_SRCS, ERROR_INTERNAL_ARGUMENT);
8496
8497 if (srcs[i] != NULL)
8498 {
8499 if (srcs[i]->getInst() == (G4_INST *)this)
8500 {
8501 srcs[i]->setInst(NULL);
8502 }
8503 }
8504 srcs[i] = opnd;
8505
8506 associateOpndWithInst(opnd, (G4_INST*)this);
8507 resetRightBound(opnd);
8508 }
8509
// Clone this intrinsic with duplicated predicate/dst/src operands.
// NOTE(review): only src0..src2 are duplicated here even though an intrinsic
// may carry up to G4_MAX_INTRINSIC_SRCS (8) sources (see the constructor) --
// confirm that no cloneable intrinsic uses src3..src7.
G4_INST* G4_InstIntrinsic::cloneInst()
{
    auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
    auto prd = nonConstBuilder->duplicateOperand(getPredicate());
    auto dst = nonConstBuilder->duplicateOperand(getDst());
    auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
    auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
    auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));

    return nonConstBuilder->createInternalIntrinsicInst(prd, getIntrinsicId(), getExecSize(), dst,
        src0, src1, src2, option);
}
8522
isLegal(unsigned vs,unsigned w,unsigned hs)8523 bool RegionDesc::isLegal(unsigned vs, unsigned w, unsigned hs)
8524 {
8525 auto isPositiveAndLegal = [](unsigned val, unsigned high) {
8526 if (val == UNDEFINED_SHORT)
8527 return true;
8528 if (val > high || val == 0)
8529 return false;
8530 return ((val - 1) & val) == 0;
8531 };
8532 return isPositiveAndLegal(w, 16) &&
8533 (vs == 0 || isPositiveAndLegal(vs, 32)) &&
8534 (hs == 0 || isPositiveAndLegal(hs, 16));
8535 }
8536
getRegionDescKind(uint16_t size,uint16_t vstride,uint16_t width,uint16_t hstride)8537 RegionDesc::RegionDescKind RegionDesc::getRegionDescKind(
8538 uint16_t size, uint16_t vstride,
8539 uint16_t width, uint16_t hstride)
8540 {
8541 // Skip special cases.
8542 if (vstride == UNDEFINED_SHORT || width == UNDEFINED_SHORT ||
8543 hstride == UNDEFINED_SHORT)
8544 return RK_Other;
8545
8546 // <0;1,0>
8547 if (size == 1 || (vstride == 0 && hstride == 0) ||
8548 (vstride == 0 && width == 1))
8549 return RK_Stride0;
8550
8551 // <1;1,0>
8552 if ((vstride == 1 && width == 1) || (size <= width && hstride == 1) ||
8553 (vstride == width && hstride == 1))
8554 return RK_Stride1;
8555
8556 // <N;1,0>
8557 uint16_t stride = 0;
8558 if (vstride == width * hstride || width == size)
8559 {
8560 stride = hstride;
8561 }
8562 else if (width == 1 && hstride == 0)
8563 {
8564 stride = vstride;
8565 }
8566
8567 return (stride == 2) ? RK_Stride2 : (stride == 4) ? RK_Stride4
8568 : RK_Other;
8569 }
8570
isContiguous(unsigned ExSize) const8571 bool RegionDesc::isContiguous(unsigned ExSize) const
8572 {
8573 if (vertStride == 1 && width == 1)
8574 return true;
8575 if (vertStride == width && horzStride == 1)
8576 return true;
8577
8578 return (ExSize == 1) ||
8579 (ExSize <= (unsigned)width && horzStride == 1);
8580 }
isSingleNonUnitStride(uint32_t execSize,uint16_t & stride) const8581 bool RegionDesc::isSingleNonUnitStride(uint32_t execSize, uint16_t& stride) const
8582 {
8583 if (isScalar() || isContiguous(execSize))
8584 {
8585 return false;
8586 }
8587
8588 if (vertStride == width * horzStride || width == execSize)
8589 {
8590 stride = horzStride;
8591 return true;
8592 }
8593
8594 if (horzStride == 0 && width == 1)
8595 {
8596 stride = vertStride;
8597 return true;
8598 }
8599
8600 return false;
8601 }
8602
isSingleStride(uint32_t execSize,uint16_t & stride) const8603 bool RegionDesc::isSingleStride(uint32_t execSize, uint16_t &stride) const
8604 {
8605 if (isScalar())
8606 {
8607 stride = 0;
8608 return true;
8609 }
8610 if (isContiguous(execSize))
8611 {
8612 stride = 1;
8613 return true;
8614 }
8615
8616 return isSingleNonUnitStride(execSize, stride);
8617 }
8618
cloneInst()8619 G4_INST* G4_InstMath::cloneInst()
8620 {
8621 auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8622 auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8623 auto dst = nonConstBuilder->duplicateOperand(getDst());
8624 auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8625 auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8626
8627 return nonConstBuilder->createInternalMathInst(
8628 prd, getSaturate(), getExecSize(),
8629 dst, src0, src1, getMathCtrl(), option);
8630 }
8631
cloneInst()8632 G4_INST* G4_InstBfn::cloneInst()
8633 {
8634 auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8635 auto prd = nonConstBuilder->duplicateOperand(getPredicate());
8636 auto condMod = nonConstBuilder->duplicateOperand(getCondMod());
8637 auto dst = nonConstBuilder->duplicateOperand(getDst());
8638 auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8639 auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8640 auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));
8641 return nonConstBuilder->createInternalBfnInst(
8642 getBooleanFuncCtrl(), prd, condMod, getSaturate(), getExecSize(),
8643 dst, src0, src1, src2, option);
8644 }
8645
cloneInst()8646 G4_INST* G4_InstDpas::cloneInst()
8647 {
8648 auto nonConstBuilder = const_cast<IR_Builder*>(&builder);
8649 auto dst = nonConstBuilder->duplicateOperand(getDst());
8650 auto src0 = nonConstBuilder->duplicateOperand(getSrc(0));
8651 auto src1 = nonConstBuilder->duplicateOperand(getSrc(1));
8652 auto src2 = nonConstBuilder->duplicateOperand(getSrc(2));
8653 auto src3 = nonConstBuilder->duplicateOperand(getSrc(3));
8654 return nonConstBuilder->createInternalDpasInst(
8655 op, getExecSize(),
8656 dst, src0, src1, src2, src3, option,
8657 getSrc2Precision(), getSrc1Precision(), getSystolicDepth(), getRepeatCount());
8658 }
8659
isInt() const8660 bool G4_InstDpas::isInt() const
8661 {
8662 // Check Src1 is enough.
8663 switch (Src1Precision)
8664 {
8665 case GenPrecision::S8:
8666 case GenPrecision::U8:
8667 case GenPrecision::S4:
8668 case GenPrecision::U4:
8669 case GenPrecision::S2:
8670 case GenPrecision::U2:
8671 return true;
8672 default:
8673 break;
8674 }
8675 return false;
8676 }
8677
is2xInt8() const8678 bool G4_InstDpas::is2xInt8() const
8679 {
8680 if ((Src1Precision == GenPrecision::S4 || Src1Precision == GenPrecision::U4 ||
8681 Src1Precision == GenPrecision::S2 || Src1Precision == GenPrecision::U2)
8682 &&
8683 (Src2Precision == GenPrecision::S4 || Src2Precision == GenPrecision::U4 ||
8684 Src2Precision == GenPrecision::S2 || Src2Precision == GenPrecision::U2))
8685 {
8686 return true;
8687 }
8688 return false;
8689 }
8690
getOpsPerChan() const8691 uint8_t G4_InstDpas::getOpsPerChan() const
8692 {
8693 if (isBF16() || isFP16())
8694 return OPS_PER_CHAN_2;
8695 else if (isTF32())
8696 return OPS_PER_CHAN_1;
8697 else if (isBF8())
8698 return OPS_PER_CHAN_4;
8699 else if (is2xInt8())
8700 return OPS_PER_CHAN_8;
8701 // int8
8702 return OPS_PER_CHAN_4;
8703 }
8704
// Compute the right bound (byte footprint) of a dpas operand. dpas operands
// have fixed footprints derived from the systolic depth (D), repeat count (C),
// native execution size, and the per-lane data sizes -- not from the usual
// region-based rules.
void G4_InstDpas::computeRightBound(G4_Operand* opnd)
{
    associateOpndWithInst(opnd, this);
    if (opnd && !opnd->isImm() && !opnd->isNullReg())
    {
        G4_InstDpas* dpasInst = asDpasInst();
        uint8_t D = dpasInst->getSystolicDepth();
        uint8_t C = dpasInst->getRepeatCount();

        // Set both the footprint bit-vector and the right bound from a
        // [leftBound, rightBound] byte range.
        auto computeDpasOperandBound = [](G4_Operand* opnd, unsigned leftBound, unsigned rightBound)
        {
            unsigned NBytes = rightBound - leftBound + 1;
            opnd->setBitVecFromSize(NBytes);
            opnd->setRightBound(rightBound);
        };

        if (opnd == dst || (opnd == srcs[0] && !opnd->isNullReg()))
        {
            // dst and src0 are always packed, and RB is exec_size * type_size * rep_count
            auto opndSize = builder.getNativeExecSize() * opnd->getTypeSize() * C;
            computeDpasOperandBound(opnd, opnd->left_bound, opnd->left_bound + opndSize - 1);
        }
        else if (opnd == srcs[1])
        {
            uint32_t bytesPerLane = dpasInst->getSrc1SizePerLaneInByte();
            uint8_t src1_D = D;

            // Each lanes needs (src1_D * bytesPerLane) bytes, and it's multiple of DW!
            uint32_t bytesPerLaneForAllDepth = bytesPerLane * src1_D;
            // Round up to the next dword multiple.
            bytesPerLaneForAllDepth = ((bytesPerLaneForAllDepth + 3) / 4) * 4;

            uint32_t bytes = bytesPerLaneForAllDepth * builder.getNativeExecSize();
            computeDpasOperandBound(opnd, opnd->left_bound, opnd->left_bound + bytes - 1);
        }
        else if (opnd == srcs[2])
        {
            // src2 is uniform.
            uint32_t bytesPerLane = dpasInst->getSrc2SizePerLaneInByte();
            uint32_t bytes = bytesPerLane * D * C;
            // dpasw shares src2 between paired EUs, so it reads half the rows
            // (rounded up).
            if (op == G4_dpasw) {
                bytes = bytesPerLane * D * ((C + 1) / 2);
            }
            computeDpasOperandBound(opnd, opnd->left_bound, opnd->left_bound + bytes - 1);
        }
    }
}
8751
inheritDIFrom(const G4_INST * inst)8752 void G4_INST::inheritDIFrom(const G4_INST* inst)
8753 {
8754 // Copy over debug info from inst
8755 setLocation(inst->getLocation());
8756 setCISAOff(getCISAOff() == UndefinedCisaOffset ? inst->getCISAOff() : getCISAOff());
8757 }
8758
inheritSWSBFrom(const G4_INST * inst)8759 void G4_INST::inheritSWSBFrom(const G4_INST* inst)
8760 {
8761 // Copy the SWSB info
8762 setDistance(inst->getDistance());
8763 setLexicalId(inst->getLexicalId());
8764
8765 setDistanceTypeXe(inst->getDistanceTypeXe());
8766 unsigned short token = inst->getToken();
8767 setToken(token);
8768 SWSBTokenType type = inst->getTokenType();
8769 setTokenType(type);
8770 }
8771